diff --git a/.gitattributes b/.gitattributes index 51ff1103f..725bde8d4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -73,7 +73,6 @@ proguard-rules.pro text *.bacpac binary *.class binary *.bmp binary -*.ico binary *.jar binary *.jpg binary *.mp3 binary diff --git a/README.md b/README.md index bad27cc01..0886af907 100644 --- a/README.md +++ b/README.md @@ -166,7 +166,6 @@ Samples for using the Speech Service REST API (no Speech SDK installation requir | --- | --- | | [Batch transcription](https://github.com/Azure-Samples/cognitive-services-speech-sdk/tree/master/samples/batch/) | Demonstrates usage of batch transcription from different programming languages | | [Batch synthesis](https://github.com/Azure-Samples/cognitive-services-speech-sdk/tree/master/samples/batch-synthesis/) | Demonstrates usage of batch synthesis from different programming languages | -| [Custom voice](https://github.com/Azure-Samples/cognitive-services-speech-sdk/tree/master/samples/custom-voice/) | Demonstrates usage of custom voice from different programming languages | ## Tools diff --git a/ThirdPartyNotices.md b/ThirdPartyNotices.md index d45bca2fa..4e56e5a7e 100644 --- a/ThirdPartyNotices.md +++ b/ThirdPartyNotices.md @@ -3796,40 +3796,19 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI --------------------------------------------------------- -registry.npmjs.org/diff 5.2.0 - BSD 3-Clause + org.webjars.npm/difflib 0.2.4 - MIT -BSD 3-Clause License - -Copyright (c) 2009-2015, Kevin Decker -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. + MIT License -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. +Copyright (c) -3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --------------------------------------------------------- @@ -4161,22 +4140,4 @@ java-diff-utils/java-diff-utils 4.12 - Apache-2.0 distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. - ---------------------------------------------------------- - ---------------------------------------------------------- - - github.com/uuidjs/uuid 9.0.1 - MIT - - - - The MIT License (MIT) - -Copyright (c) 2010-2020 Robert Kieffer and other contributors - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file + limitations under the License. \ No newline at end of file diff --git a/quickstart/cpp/linux/from-microphone/README.md b/quickstart/cpp/linux/from-microphone/README.md index e4b46b019..fcb44b479 100644 --- a/quickstart/cpp/linux/from-microphone/README.md +++ b/quickstart/cpp/linux/from-microphone/README.md @@ -17,16 +17,6 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic sudo apt-get install build-essential libssl-dev libasound2 wget ``` -* On RHEL or CentOS, install these packages to build and run this sample: - - ```sh - sudo yum update - sudo yum groupinstall "Development tools" - sudo yum install alsa-lib openssl wget - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). 
- ## Build the sample * [Download the sample code to your development PC.](/README.md#get-the-samples) diff --git a/quickstart/cpp/linux/text-to-speech/README.md b/quickstart/cpp/linux/text-to-speech/README.md index beb5a5fff..62cd4f1ac 100644 --- a/quickstart/cpp/linux/text-to-speech/README.md +++ b/quickstart/cpp/linux/text-to-speech/README.md @@ -17,16 +17,6 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic sudo apt-get install build-essential libssl-dev libasound2 wget ``` -* On RHEL or CentOS, install these packages to build and run this sample: - - ```sh - sudo yum update - sudo yum groupinstall "Development tools" - sudo yum install alsa-lib openssl wget - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - ## Build the sample * [Download the sample code to your development PC.](/README.md#get-the-samples) diff --git a/quickstart/cpp/linux/text-to-speech/helloworld.cpp b/quickstart/cpp/linux/text-to-speech/helloworld.cpp index 07a0cb60d..b0411f1d1 100644 --- a/quickstart/cpp/linux/text-to-speech/helloworld.cpp +++ b/quickstart/cpp/linux/text-to-speech/helloworld.cpp @@ -17,7 +17,7 @@ void synthesizeSpeech() auto config = SpeechConfig::FromSubscription("YourSubscriptionKey", "YourServiceRegion"); // Set the voice name, refer to https://aka.ms/speech/voices/neural for full list. - config->SetSpeechSynthesisVoiceName("en-US-AvaMultilingualNeural"); + config->SetSpeechSynthesisVoiceName("en-US-AriaNeural"); // Creates a speech synthesizer using the default speaker as audio output. The default spoken language is "en-us". auto synthesizer = SpeechSynthesizer::FromConfig(config); diff --git a/quickstart/cpp/macos/text-to-speech/helloworld.cpp b/quickstart/cpp/macos/text-to-speech/helloworld.cpp index ee5aaa08f..2d2ac551b 100644 --- a/quickstart/cpp/macos/text-to-speech/helloworld.cpp +++ b/quickstart/cpp/macos/text-to-speech/helloworld.cpp @@ -17,7 +17,7 @@ void synthesizeSpeech() auto config = SpeechConfig::FromSubscription("YourSubscriptionKey", "YourServiceRegion"); // Set the voice name, refer to https://aka.ms/speech/voices/neural for full list. - config->SetSpeechSynthesisVoiceName("en-US-AvaMultilingualNeural"); + config->SetSpeechSynthesisVoiceName("en-US-AriaNeural"); // Creates a speech synthesizer using the default speaker as audio output. The default spoken language is "en-us". auto synthesizer = SpeechSynthesizer::FromConfig(config); diff --git a/quickstart/cpp/windows/from-file/README.md b/quickstart/cpp/windows/from-file/README.md index efa523066..5bbb58f14 100644 --- a/quickstart/cpp/windows/from-file/README.md +++ b/quickstart/cpp/windows/from-file/README.md @@ -1,12 +1,12 @@ # Quickstart: Recognize speech from a file in C++ for Windows This sample demonstrates how to recognize speech using wave file as an input with C++ using the Speech SDK for Windows. -See the [accompanying article](https://docs.microsoft.com/azure/cognitive-services/speech-service/quickstarts/speech-to-text-from-microphone?tabs=dotnet%2Cx-android%2CWindows%2Cjava-runtime%2Cwindowsinstall&pivots=programming-language-cpp) on the SDK documentation page which describes how to build corresponding sample from scratch in Visual Studio 2022. 
+See the [accompanying article](https://docs.microsoft.com/azure/cognitive-services/speech-service/quickstarts/speech-to-text-from-microphone?tabs=dotnet%2Cx-android%2CWindows%2Cjava-runtime%2Cwindowsinstall&pivots=programming-language-cpp) on the SDK documentation page which describes how to build corresponding sample from scratch in Visual Studio 2017. ## Prerequisites * A subscription key for the Speech service. See [Try the speech service for free](https://docs.microsoft.com/azure/cognitive-services/speech-service/get-started). -* [Microsoft Visual Studio 2022](https://www.visualstudio.com/), Community Edition or higher. +* [Microsoft Visual Studio 2017](https://www.visualstudio.com/), Community Edition or higher. * The **Desktop development with C++** workload in Visual Studio and the **NuGet package manager** component in Visual Studio. You can enable both in **Tools** \> **Get Tools and Features**, under the **Workloads** and **Individual components** tabs, respectively. @@ -14,7 +14,7 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic * **By building this sample you will download the Microsoft Cognitive Services Speech SDK. By downloading you acknowledge its license, see [Speech SDK license agreement](https://aka.ms/csspeech/license).** * [Download the sample code to your development PC.](/README.md#get-the-samples) -* Start Microsoft Visual Studio 2022 and select **File** \> **Open** \> **Project/Solution**. +* Start Microsoft Visual Studio 2017 and select **File** \> **Open** \> **Project/Solution**. * Navigate to the folder containing this sample, and select the solution file contained within it. * Edit the `helloworld.cpp` source: * Replace the string `YourSubscriptionKey` with your own subscription key. diff --git a/quickstart/cpp/windows/from-file/helloworld/helloworld.vcxproj b/quickstart/cpp/windows/from-file/helloworld/helloworld.vcxproj index 3105645a7..7574b1107 100644 --- a/quickstart/cpp/windows/from-file/helloworld/helloworld.vcxproj +++ b/quickstart/cpp/windows/from-file/helloworld/helloworld.vcxproj @@ -170,12 +170,12 @@ - + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. - + \ No newline at end of file diff --git a/quickstart/cpp/windows/from-file/helloworld/packages.config b/quickstart/cpp/windows/from-file/helloworld/packages.config index 2d8ad1382..b2fd9e72f 100644 --- a/quickstart/cpp/windows/from-file/helloworld/packages.config +++ b/quickstart/cpp/windows/from-file/helloworld/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/quickstart/cpp/windows/from-microphone/helloworld/helloworld.vcxproj b/quickstart/cpp/windows/from-microphone/helloworld/helloworld.vcxproj index 70ecb0518..3d0e8a9aa 100644 --- a/quickstart/cpp/windows/from-microphone/helloworld/helloworld.vcxproj +++ b/quickstart/cpp/windows/from-microphone/helloworld/helloworld.vcxproj @@ -166,12 +166,12 @@ - + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. 
- + \ No newline at end of file diff --git a/quickstart/cpp/windows/from-microphone/helloworld/packages.config b/quickstart/cpp/windows/from-microphone/helloworld/packages.config index 2d8ad1382..b2fd9e72f 100644 --- a/quickstart/cpp/windows/from-microphone/helloworld/packages.config +++ b/quickstart/cpp/windows/from-microphone/helloworld/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/quickstart/cpp/windows/intent-recognition/helloworld/helloworld.vcxproj b/quickstart/cpp/windows/intent-recognition/helloworld/helloworld.vcxproj index 70ecb0518..3d0e8a9aa 100644 --- a/quickstart/cpp/windows/intent-recognition/helloworld/helloworld.vcxproj +++ b/quickstart/cpp/windows/intent-recognition/helloworld/helloworld.vcxproj @@ -166,12 +166,12 @@ - + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. - + \ No newline at end of file diff --git a/quickstart/cpp/windows/intent-recognition/helloworld/packages.config b/quickstart/cpp/windows/intent-recognition/helloworld/packages.config index 2d8ad1382..b2fd9e72f 100644 --- a/quickstart/cpp/windows/intent-recognition/helloworld/packages.config +++ b/quickstart/cpp/windows/intent-recognition/helloworld/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/quickstart/cpp/windows/multi-device-conversation/helloworld/helloworld.vcxproj b/quickstart/cpp/windows/multi-device-conversation/helloworld/helloworld.vcxproj index e2ebeb3e5..b6320d8df 100644 --- a/quickstart/cpp/windows/multi-device-conversation/helloworld/helloworld.vcxproj +++ b/quickstart/cpp/windows/multi-device-conversation/helloworld/helloworld.vcxproj @@ -158,12 +158,12 @@ - + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. - + \ No newline at end of file diff --git a/quickstart/cpp/windows/multi-device-conversation/helloworld/packages.config b/quickstart/cpp/windows/multi-device-conversation/helloworld/packages.config index 2d8ad1382..b2fd9e72f 100644 --- a/quickstart/cpp/windows/multi-device-conversation/helloworld/packages.config +++ b/quickstart/cpp/windows/multi-device-conversation/helloworld/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/quickstart/cpp/windows/speaker-recognition/helloworld/helloworld.vcxproj b/quickstart/cpp/windows/speaker-recognition/helloworld/helloworld.vcxproj index cc98de892..c3921b397 100644 --- a/quickstart/cpp/windows/speaker-recognition/helloworld/helloworld.vcxproj +++ b/quickstart/cpp/windows/speaker-recognition/helloworld/helloworld.vcxproj @@ -175,12 +175,12 @@ - + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. 
- + \ No newline at end of file diff --git a/quickstart/cpp/windows/speaker-recognition/helloworld/packages.config b/quickstart/cpp/windows/speaker-recognition/helloworld/packages.config index 2d8ad1382..b2fd9e72f 100644 --- a/quickstart/cpp/windows/speaker-recognition/helloworld/packages.config +++ b/quickstart/cpp/windows/speaker-recognition/helloworld/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/quickstart/cpp/windows/text-to-speech/helloworld/helloworld.cpp b/quickstart/cpp/windows/text-to-speech/helloworld/helloworld.cpp index 368d19030..73332c95d 100644 --- a/quickstart/cpp/windows/text-to-speech/helloworld/helloworld.cpp +++ b/quickstart/cpp/windows/text-to-speech/helloworld/helloworld.cpp @@ -18,7 +18,7 @@ void synthesizeSpeech() auto config = SpeechConfig::FromSubscription("YourSubscriptionKey", "YourServiceRegion"); // Set the voice name, refer to https://aka.ms/speech/voices/neural for full list. - config->SetSpeechSynthesisVoiceName("en-US-AvaMultilingualNeural"); + config->SetSpeechSynthesisVoiceName("en-US-AriaNeural"); // Creates a speech synthesizer using the default speaker as audio output. The default spoken language is "en-us". auto synthesizer = SpeechSynthesizer::FromConfig(config); diff --git a/quickstart/cpp/windows/text-to-speech/helloworld/helloworld.vcxproj b/quickstart/cpp/windows/text-to-speech/helloworld/helloworld.vcxproj index 70ecb0518..3d0e8a9aa 100644 --- a/quickstart/cpp/windows/text-to-speech/helloworld/helloworld.vcxproj +++ b/quickstart/cpp/windows/text-to-speech/helloworld/helloworld.vcxproj @@ -166,12 +166,12 @@ - + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. - + \ No newline at end of file diff --git a/quickstart/cpp/windows/text-to-speech/helloworld/packages.config b/quickstart/cpp/windows/text-to-speech/helloworld/packages.config index 2d8ad1382..b2fd9e72f 100644 --- a/quickstart/cpp/windows/text-to-speech/helloworld/packages.config +++ b/quickstart/cpp/windows/text-to-speech/helloworld/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/quickstart/cpp/windows/translate-speech-to-text/helloworld/helloworld.vcxproj b/quickstart/cpp/windows/translate-speech-to-text/helloworld/helloworld.vcxproj index 429b99da8..49b6502e4 100644 --- a/quickstart/cpp/windows/translate-speech-to-text/helloworld/helloworld.vcxproj +++ b/quickstart/cpp/windows/translate-speech-to-text/helloworld/helloworld.vcxproj @@ -167,12 +167,12 @@ - + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. 
- + \ No newline at end of file diff --git a/quickstart/cpp/windows/translate-speech-to-text/helloworld/packages.config b/quickstart/cpp/windows/translate-speech-to-text/helloworld/packages.config index 18dbd83b1..3edf4f372 100644 --- a/quickstart/cpp/windows/translate-speech-to-text/helloworld/packages.config +++ b/quickstart/cpp/windows/translate-speech-to-text/helloworld/packages.config @@ -1,4 +1,4 @@ - + diff --git a/quickstart/csharp/dotnet/conversation-transcription/helloworld/helloworld.csproj b/quickstart/csharp/dotnet/conversation-transcription/helloworld/helloworld.csproj index c37840866..aff3194c9 100644 --- a/quickstart/csharp/dotnet/conversation-transcription/helloworld/helloworld.csproj +++ b/quickstart/csharp/dotnet/conversation-transcription/helloworld/helloworld.csproj @@ -102,7 +102,7 @@ - 1.38.0 + 1.40.0 13.0.1 diff --git a/quickstart/csharp/dotnet/from-file/helloworld/helloworld.csproj b/quickstart/csharp/dotnet/from-file/helloworld/helloworld.csproj index a59a4c381..97954495d 100644 --- a/quickstart/csharp/dotnet/from-file/helloworld/helloworld.csproj +++ b/quickstart/csharp/dotnet/from-file/helloworld/helloworld.csproj @@ -95,7 +95,7 @@ - 1.38.0 + 1.40.0 diff --git a/quickstart/csharp/dotnet/from-file/helloworld/packages.config b/quickstart/csharp/dotnet/from-file/helloworld/packages.config index 30e57d47b..9c68fa2c0 100644 --- a/quickstart/csharp/dotnet/from-file/helloworld/packages.config +++ b/quickstart/csharp/dotnet/from-file/helloworld/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/quickstart/csharp/dotnet/from-microphone/helloworld/helloworld.csproj b/quickstart/csharp/dotnet/from-microphone/helloworld/helloworld.csproj index a59a4c381..97954495d 100644 --- a/quickstart/csharp/dotnet/from-microphone/helloworld/helloworld.csproj +++ b/quickstart/csharp/dotnet/from-microphone/helloworld/helloworld.csproj @@ -95,7 +95,7 @@ - 1.38.0 + 1.40.0 diff --git a/quickstart/csharp/dotnet/from-microphone/helloworld/packages.config b/quickstart/csharp/dotnet/from-microphone/helloworld/packages.config index 30e57d47b..9c68fa2c0 100644 --- a/quickstart/csharp/dotnet/from-microphone/helloworld/packages.config +++ b/quickstart/csharp/dotnet/from-microphone/helloworld/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/quickstart/csharp/dotnet/intent-recognition/helloworld/helloworld.csproj b/quickstart/csharp/dotnet/intent-recognition/helloworld/helloworld.csproj index 1ba44bca8..dab185482 100644 --- a/quickstart/csharp/dotnet/intent-recognition/helloworld/helloworld.csproj +++ b/quickstart/csharp/dotnet/intent-recognition/helloworld/helloworld.csproj @@ -97,7 +97,7 @@ - 1.38.0 + 1.40.0 diff --git a/quickstart/csharp/dotnet/intent-recognition/helloworld/packages.config b/quickstart/csharp/dotnet/intent-recognition/helloworld/packages.config index 30e57d47b..9c68fa2c0 100644 --- a/quickstart/csharp/dotnet/intent-recognition/helloworld/packages.config +++ b/quickstart/csharp/dotnet/intent-recognition/helloworld/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/quickstart/csharp/dotnet/meeting-transcription/helloworld/helloworld.csproj b/quickstart/csharp/dotnet/meeting-transcription/helloworld/helloworld.csproj index 1da8142c4..48d09c732 100644 --- a/quickstart/csharp/dotnet/meeting-transcription/helloworld/helloworld.csproj +++ b/quickstart/csharp/dotnet/meeting-transcription/helloworld/helloworld.csproj @@ -110,7 +110,7 @@ - 1.38.0 + 1.40.0 13.0.1 diff --git 
a/quickstart/csharp/dotnet/multi-device-conversation/helloworld/helloworld.csproj b/quickstart/csharp/dotnet/multi-device-conversation/helloworld/helloworld.csproj index 749fac4b0..3797d319a 100644 --- a/quickstart/csharp/dotnet/multi-device-conversation/helloworld/helloworld.csproj +++ b/quickstart/csharp/dotnet/multi-device-conversation/helloworld/helloworld.csproj @@ -74,7 +74,7 @@ - 1.38.0 + 1.40.0 diff --git a/quickstart/csharp/dotnet/multi-device-conversation/helloworld/packages.config b/quickstart/csharp/dotnet/multi-device-conversation/helloworld/packages.config index 60a08bdad..f6b884fdc 100644 --- a/quickstart/csharp/dotnet/multi-device-conversation/helloworld/packages.config +++ b/quickstart/csharp/dotnet/multi-device-conversation/helloworld/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/quickstart/csharp/dotnet/speaker-recognition/helloworld/helloworld.csproj b/quickstart/csharp/dotnet/speaker-recognition/helloworld/helloworld.csproj index 88882c8b6..87aeac31b 100644 --- a/quickstart/csharp/dotnet/speaker-recognition/helloworld/helloworld.csproj +++ b/quickstart/csharp/dotnet/speaker-recognition/helloworld/helloworld.csproj @@ -105,7 +105,7 @@ - 1.38.0 + 1.40.0 diff --git a/quickstart/csharp/dotnet/speaker-recognition/helloworld/packages.config b/quickstart/csharp/dotnet/speaker-recognition/helloworld/packages.config index 30e57d47b..9c68fa2c0 100644 --- a/quickstart/csharp/dotnet/speaker-recognition/helloworld/packages.config +++ b/quickstart/csharp/dotnet/speaker-recognition/helloworld/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/quickstart/csharp/dotnet/text-to-speech/helloworld/Program.cs b/quickstart/csharp/dotnet/text-to-speech/helloworld/Program.cs index 779faf154..1848bc19f 100644 --- a/quickstart/csharp/dotnet/text-to-speech/helloworld/Program.cs +++ b/quickstart/csharp/dotnet/text-to-speech/helloworld/Program.cs @@ -27,7 +27,7 @@ public static async Task SynthesisToSpeakerAsync() var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion"); // Set the voice name, refer to https://aka.ms/speech/voices/neural for full list. - config.SpeechSynthesisVoiceName = "en-US-AvaMultilingualNeural"; + config.SpeechSynthesisVoiceName = "en-US-AriaNeural"; // Creates a speech synthesizer using the default speaker as audio output. 
using (var synthesizer = new SpeechSynthesizer(config)) diff --git a/quickstart/csharp/dotnet/text-to-speech/helloworld/helloworld.csproj b/quickstart/csharp/dotnet/text-to-speech/helloworld/helloworld.csproj index 4315dec74..8bae726ed 100644 --- a/quickstart/csharp/dotnet/text-to-speech/helloworld/helloworld.csproj +++ b/quickstart/csharp/dotnet/text-to-speech/helloworld/helloworld.csproj @@ -94,7 +94,7 @@ - 1.38.0 + 1.40.0 diff --git a/quickstart/csharp/dotnet/text-to-speech/helloworld/packages.config b/quickstart/csharp/dotnet/text-to-speech/helloworld/packages.config index 30e57d47b..9c68fa2c0 100644 --- a/quickstart/csharp/dotnet/text-to-speech/helloworld/packages.config +++ b/quickstart/csharp/dotnet/text-to-speech/helloworld/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/quickstart/csharp/dotnet/translate-speech-to-text/helloworld/helloworld.csproj b/quickstart/csharp/dotnet/translate-speech-to-text/helloworld/helloworld.csproj index 94ad635af..8761ceb1a 100644 --- a/quickstart/csharp/dotnet/translate-speech-to-text/helloworld/helloworld.csproj +++ b/quickstart/csharp/dotnet/translate-speech-to-text/helloworld/helloworld.csproj @@ -76,7 +76,7 @@ - 1.38.0 + 1.40.0 diff --git a/quickstart/csharp/dotnet/translate-speech-to-text/helloworld/packages.config b/quickstart/csharp/dotnet/translate-speech-to-text/helloworld/packages.config index 4c7deb2f9..d6b210000 100644 --- a/quickstart/csharp/dotnet/translate-speech-to-text/helloworld/packages.config +++ b/quickstart/csharp/dotnet/translate-speech-to-text/helloworld/packages.config @@ -1,4 +1,4 @@ - + diff --git a/quickstart/csharp/dotnetcore/from-microphone/README.md b/quickstart/csharp/dotnetcore/from-microphone/README.md index afd19ed7a..5f05d22e0 100644 --- a/quickstart/csharp/dotnetcore/from-microphone/README.md +++ b/quickstart/csharp/dotnetcore/from-microphone/README.md @@ -24,15 +24,6 @@ This sample demonstrates how to recognize speech with C# under .NET 6.0 (Windows sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib dotnet-sdk-6.0 openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - ## Build the sample * **By building this sample you will download the Microsoft Cognitive Services Speech SDK. 
By downloading you acknowledge its license, see [Speech SDK license agreement](https://aka.ms/csspeech/license).** diff --git a/quickstart/csharp/dotnetcore/from-microphone/helloworld/helloworld.csproj b/quickstart/csharp/dotnetcore/from-microphone/helloworld/helloworld.csproj index 396c2d1fb..058e48805 100644 --- a/quickstart/csharp/dotnetcore/from-microphone/helloworld/helloworld.csproj +++ b/quickstart/csharp/dotnetcore/from-microphone/helloworld/helloworld.csproj @@ -7,7 +7,7 @@ - + diff --git a/quickstart/csharp/dotnetcore/text-to-speech/README.md b/quickstart/csharp/dotnetcore/text-to-speech/README.md index d18234722..929ec7b89 100644 --- a/quickstart/csharp/dotnetcore/text-to-speech/README.md +++ b/quickstart/csharp/dotnetcore/text-to-speech/README.md @@ -25,15 +25,6 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib dotnet-sdk-6.0 openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - ## Build the sample * **By building this sample you will download the Microsoft Cognitive Services Speech SDK. By downloading you acknowledge its license, see [Speech SDK license agreement](https://aka.ms/csspeech/license).** diff --git a/quickstart/csharp/dotnetcore/text-to-speech/helloworld/Program.cs b/quickstart/csharp/dotnetcore/text-to-speech/helloworld/Program.cs index 41dadb9de..571fa9970 100644 --- a/quickstart/csharp/dotnetcore/text-to-speech/helloworld/Program.cs +++ b/quickstart/csharp/dotnetcore/text-to-speech/helloworld/Program.cs @@ -27,7 +27,7 @@ public static async Task SynthesisToSpeakerAsync() var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion"); // Set the voice name, refer to https://aka.ms/speech/voices/neural for full list. - config.SpeechSynthesisVoiceName = "en-US-AvaMultilingualNeural"; + config.SpeechSynthesisVoiceName = "en-US-AriaNeural"; // Creates a speech synthesizer using the default speaker as audio output. diff --git a/quickstart/csharp/dotnetcore/text-to-speech/helloworld/helloworld.csproj b/quickstart/csharp/dotnetcore/text-to-speech/helloworld/helloworld.csproj index 396c2d1fb..058e48805 100644 --- a/quickstart/csharp/dotnetcore/text-to-speech/helloworld/helloworld.csproj +++ b/quickstart/csharp/dotnetcore/text-to-speech/helloworld/helloworld.csproj @@ -7,7 +7,7 @@ - + diff --git a/quickstart/csharp/dotnetcore/translate-speech-to-text/README.md b/quickstart/csharp/dotnetcore/translate-speech-to-text/README.md index b4ee7015d..7860f3996 100644 --- a/quickstart/csharp/dotnetcore/translate-speech-to-text/README.md +++ b/quickstart/csharp/dotnetcore/translate-speech-to-text/README.md @@ -25,15 +25,6 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib dotnet-sdk-6.0 openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - ## Build the sample * **By building this sample you will download the Microsoft Cognitive Services Speech SDK. 
By downloading you acknowledge its license, see [Speech SDK license agreement](https://aka.ms/csspeech/license).** diff --git a/quickstart/csharp/dotnetcore/translate-speech-to-text/helloworld/helloworld.csproj b/quickstart/csharp/dotnetcore/translate-speech-to-text/helloworld/helloworld.csproj index 396c2d1fb..058e48805 100644 --- a/quickstart/csharp/dotnetcore/translate-speech-to-text/helloworld/helloworld.csproj +++ b/quickstart/csharp/dotnetcore/translate-speech-to-text/helloworld/helloworld.csproj @@ -7,7 +7,7 @@ - + diff --git a/quickstart/csharp/uwp/from-microphone/helloworld/helloworld.csproj b/quickstart/csharp/uwp/from-microphone/helloworld/helloworld.csproj index 440fe7316..58e21afdd 100644 --- a/quickstart/csharp/uwp/from-microphone/helloworld/helloworld.csproj +++ b/quickstart/csharp/uwp/from-microphone/helloworld/helloworld.csproj @@ -106,7 +106,7 @@ - 1.38.0 + 1.40.0 6.2.8 diff --git a/quickstart/csharp/uwp/keyword-recognizer/helloworld/helloworld.csproj b/quickstart/csharp/uwp/keyword-recognizer/helloworld/helloworld.csproj index bfd45f77e..b7507c297 100644 --- a/quickstart/csharp/uwp/keyword-recognizer/helloworld/helloworld.csproj +++ b/quickstart/csharp/uwp/keyword-recognizer/helloworld/helloworld.csproj @@ -154,7 +154,7 @@ - 1.38.0 + 1.40.0 6.2.9 diff --git a/quickstart/csharp/uwp/text-to-speech/helloworld/helloworld.csproj b/quickstart/csharp/uwp/text-to-speech/helloworld/helloworld.csproj index 62af75f8b..5893e577d 100644 --- a/quickstart/csharp/uwp/text-to-speech/helloworld/helloworld.csproj +++ b/quickstart/csharp/uwp/text-to-speech/helloworld/helloworld.csproj @@ -106,7 +106,7 @@ - 1.38.0 + 1.40.0 6.2.8 diff --git a/quickstart/csharp/uwp/translate-speech-to-text/helloworld/helloworld.csproj b/quickstart/csharp/uwp/translate-speech-to-text/helloworld/helloworld.csproj index 3456c316f..d753bffd4 100644 --- a/quickstart/csharp/uwp/translate-speech-to-text/helloworld/helloworld.csproj +++ b/quickstart/csharp/uwp/translate-speech-to-text/helloworld/helloworld.csproj @@ -106,7 +106,7 @@ - 1.38.0 + 1.40.0 6.2.8 diff --git a/quickstart/csharp/uwp/virtual-assistant/helloworld.csproj b/quickstart/csharp/uwp/virtual-assistant/helloworld.csproj index c46f3133f..ebfdfa108 100644 --- a/quickstart/csharp/uwp/virtual-assistant/helloworld.csproj +++ b/quickstart/csharp/uwp/virtual-assistant/helloworld.csproj @@ -153,7 +153,7 @@ - 1.38.0 + 1.40.0 6.2.9 diff --git a/quickstart/csharp/xamarin/helloworld/helloworld.Android/Properties/AndroidManifest.xml b/quickstart/csharp/xamarin/helloworld/helloworld.Android/Properties/AndroidManifest.xml index 937d3e830..525de1cb0 100644 --- a/quickstart/csharp/xamarin/helloworld/helloworld.Android/Properties/AndroidManifest.xml +++ b/quickstart/csharp/xamarin/helloworld/helloworld.Android/Properties/AndroidManifest.xml @@ -1,6 +1,6 @@ - + diff --git a/quickstart/csharp/xamarin/helloworld/helloworld.Android/helloworld.Android.csproj b/quickstart/csharp/xamarin/helloworld/helloworld.Android/helloworld.Android.csproj index 7b4650fdd..f8a52e211 100644 --- a/quickstart/csharp/xamarin/helloworld/helloworld.Android/helloworld.Android.csproj +++ b/quickstart/csharp/xamarin/helloworld/helloworld.Android/helloworld.Android.csproj @@ -17,7 +17,7 @@ Resources Assets false - v12.0 + v10.0 true true Xamarin.Android.Net.AndroidClientHandler @@ -54,7 +54,7 @@ - + diff --git a/quickstart/csharp/xamarin/helloworld/helloworld.UWP/helloworld.UWP.csproj b/quickstart/csharp/xamarin/helloworld/helloworld.UWP/helloworld.UWP.csproj index 70eb0a3a5..6251d6c54 100644 --- 
a/quickstart/csharp/xamarin/helloworld/helloworld.UWP/helloworld.UWP.csproj +++ b/quickstart/csharp/xamarin/helloworld/helloworld.UWP/helloworld.UWP.csproj @@ -147,7 +147,7 @@ - + diff --git a/quickstart/csharp/xamarin/helloworld/helloworld.iOS/helloworld.iOS.csproj b/quickstart/csharp/xamarin/helloworld/helloworld.iOS/helloworld.iOS.csproj index 94518b9bd..cc3a0c0cf 100644 --- a/quickstart/csharp/xamarin/helloworld/helloworld.iOS/helloworld.iOS.csproj +++ b/quickstart/csharp/xamarin/helloworld/helloworld.iOS/helloworld.iOS.csproj @@ -124,7 +124,7 @@ - + diff --git a/quickstart/csharp/xamarin/helloworld/helloworld/helloworld.csproj b/quickstart/csharp/xamarin/helloworld/helloworld/helloworld.csproj index 24e797e89..27bd2887e 100644 --- a/quickstart/csharp/xamarin/helloworld/helloworld/helloworld.csproj +++ b/quickstart/csharp/xamarin/helloworld/helloworld/helloworld.csproj @@ -11,7 +11,7 @@ - + diff --git a/quickstart/java/android/from-microphone/app/build.gradle b/quickstart/java/android/from-microphone/app/build.gradle index df2999e0d..8a368cd0b 100644 --- a/quickstart/java/android/from-microphone/app/build.gradle +++ b/quickstart/java/android/from-microphone/app/build.gradle @@ -25,7 +25,7 @@ dependencies { implementation fileTree(include: ['*.jar'], dir: 'libs') // Speech SDK - implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.38.0' + implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.40.0' implementation 'androidx.appcompat:appcompat:1.3.1' implementation 'androidx.constraintlayout:constraintlayout:2.1.0' diff --git a/quickstart/java/android/intent-recognition/app/build.gradle b/quickstart/java/android/intent-recognition/app/build.gradle index df2999e0d..8a368cd0b 100644 --- a/quickstart/java/android/intent-recognition/app/build.gradle +++ b/quickstart/java/android/intent-recognition/app/build.gradle @@ -25,7 +25,7 @@ dependencies { implementation fileTree(include: ['*.jar'], dir: 'libs') // Speech SDK - implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.38.0' + implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.40.0' implementation 'androidx.appcompat:appcompat:1.3.1' implementation 'androidx.constraintlayout:constraintlayout:2.1.0' diff --git a/quickstart/java/android/keyword-recognizer-stream/.settings/org.eclipse.buildship.core.prefs b/quickstart/java/android/keyword-recognizer-stream/.settings/org.eclipse.buildship.core.prefs index bdf050953..e6a7ce8ff 100644 --- a/quickstart/java/android/keyword-recognizer-stream/.settings/org.eclipse.buildship.core.prefs +++ b/quickstart/java/android/keyword-recognizer-stream/.settings/org.eclipse.buildship.core.prefs @@ -1,11 +1,11 @@ -arguments= +arguments=--init-script C\:\\Users\\brianem\\AppData\\Roaming\\Code\\User\\globalStorage\\redhat.java\\1.33.0\\config_win\\org.eclipse.osgi\\55\\0\\.cp\\gradle\\init\\init.gradle --init-script C\:\\Users\\brianem\\AppData\\Roaming\\Code\\User\\globalStorage\\redhat.java\\1.33.0\\config_win\\org.eclipse.osgi\\55\\0\\.cp\\gradle\\protobuf\\init.gradle auto.sync=false build.scans.enabled=false connection.gradle.distribution=GRADLE_DISTRIBUTION(WRAPPER) connection.project.dir= eclipse.preferences.version=1 gradle.user.home= -java.home=C\:/Program Files/Eclipse Foundation/jdk-8.0.302.8-hotspot +java.home=C\:/Program Files/Microsoft/jdk-11.0.16.101-hotspot jvm.arguments= offline.mode=false override.workspace.settings=true diff --git a/quickstart/java/android/keyword-recognizer-stream/app/build.gradle 
b/quickstart/java/android/keyword-recognizer-stream/app/build.gradle index df2999e0d..8a368cd0b 100644 --- a/quickstart/java/android/keyword-recognizer-stream/app/build.gradle +++ b/quickstart/java/android/keyword-recognizer-stream/app/build.gradle @@ -25,7 +25,7 @@ dependencies { implementation fileTree(include: ['*.jar'], dir: 'libs') // Speech SDK - implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.38.0' + implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.40.0' implementation 'androidx.appcompat:appcompat:1.3.1' implementation 'androidx.constraintlayout:constraintlayout:2.1.0' diff --git a/quickstart/java/android/keyword-recognizer/.settings/org.eclipse.buildship.core.prefs b/quickstart/java/android/keyword-recognizer/.settings/org.eclipse.buildship.core.prefs index bdf050953..e6a7ce8ff 100644 --- a/quickstart/java/android/keyword-recognizer/.settings/org.eclipse.buildship.core.prefs +++ b/quickstart/java/android/keyword-recognizer/.settings/org.eclipse.buildship.core.prefs @@ -1,11 +1,11 @@ -arguments= +arguments=--init-script C\:\\Users\\brianem\\AppData\\Roaming\\Code\\User\\globalStorage\\redhat.java\\1.33.0\\config_win\\org.eclipse.osgi\\55\\0\\.cp\\gradle\\init\\init.gradle --init-script C\:\\Users\\brianem\\AppData\\Roaming\\Code\\User\\globalStorage\\redhat.java\\1.33.0\\config_win\\org.eclipse.osgi\\55\\0\\.cp\\gradle\\protobuf\\init.gradle auto.sync=false build.scans.enabled=false connection.gradle.distribution=GRADLE_DISTRIBUTION(WRAPPER) connection.project.dir= eclipse.preferences.version=1 gradle.user.home= -java.home=C\:/Program Files/Eclipse Foundation/jdk-8.0.302.8-hotspot +java.home=C\:/Program Files/Microsoft/jdk-11.0.16.101-hotspot jvm.arguments= offline.mode=false override.workspace.settings=true diff --git a/quickstart/java/android/keyword-recognizer/app/build.gradle b/quickstart/java/android/keyword-recognizer/app/build.gradle index df2999e0d..8a368cd0b 100644 --- a/quickstart/java/android/keyword-recognizer/app/build.gradle +++ b/quickstart/java/android/keyword-recognizer/app/build.gradle @@ -25,7 +25,7 @@ dependencies { implementation fileTree(include: ['*.jar'], dir: 'libs') // Speech SDK - implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.38.0' + implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.40.0' implementation 'androidx.appcompat:appcompat:1.3.1' implementation 'androidx.constraintlayout:constraintlayout:2.1.0' diff --git a/quickstart/java/android/text-to-speech/app/build.gradle b/quickstart/java/android/text-to-speech/app/build.gradle index df2999e0d..8a368cd0b 100644 --- a/quickstart/java/android/text-to-speech/app/build.gradle +++ b/quickstart/java/android/text-to-speech/app/build.gradle @@ -25,7 +25,7 @@ dependencies { implementation fileTree(include: ['*.jar'], dir: 'libs') // Speech SDK - implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.38.0' + implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.40.0' implementation 'androidx.appcompat:appcompat:1.3.1' implementation 'androidx.constraintlayout:constraintlayout:2.1.0' diff --git a/quickstart/java/jre/conversation-transcription/README.md b/quickstart/java/jre/conversation-transcription/README.md index 57198966f..5ce3f8081 100644 --- a/quickstart/java/jre/conversation-transcription/README.md +++ b/quickstart/java/jre/conversation-transcription/README.md @@ -20,15 +20,6 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic sudo apt-get install libssl-dev libasound2 ``` 
-* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib java-11-openjdk-devel openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - 1. This sample has not been verified with Eclipse on ARM platforms. ## Build the sample diff --git a/quickstart/java/jre/conversation-transcription/pom.xml b/quickstart/java/jre/conversation-transcription/pom.xml index 3f120415c..503771327 100644 --- a/quickstart/java/jre/conversation-transcription/pom.xml +++ b/quickstart/java/jre/conversation-transcription/pom.xml @@ -39,7 +39,7 @@ com.microsoft.cognitiveservices.speech client-sdk - 1.38.0 + 1.40.0 com.google.code.gson diff --git a/quickstart/java/jre/from-microphone/README.md b/quickstart/java/jre/from-microphone/README.md index 8a0f1c518..8507e4128 100644 --- a/quickstart/java/jre/from-microphone/README.md +++ b/quickstart/java/jre/from-microphone/README.md @@ -19,15 +19,6 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib java-1.8.0-openjdk-devel openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - 1. This sample has not been verified with Eclipse on ARM platforms. ## Build the sample diff --git a/quickstart/java/jre/from-microphone/pom.xml b/quickstart/java/jre/from-microphone/pom.xml index e8e3bf9c3..cf5896191 100644 --- a/quickstart/java/jre/from-microphone/pom.xml +++ b/quickstart/java/jre/from-microphone/pom.xml @@ -62,7 +62,7 @@ com.microsoft.cognitiveservices.speech client-sdk - 1.38.0 + 1.40.0 diff --git a/quickstart/java/jre/intent-recognition/README.md b/quickstart/java/jre/intent-recognition/README.md index 45af254c0..b30900e4f 100644 --- a/quickstart/java/jre/intent-recognition/README.md +++ b/quickstart/java/jre/intent-recognition/README.md @@ -19,15 +19,6 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib java-1.8.0-openjdk-devel openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - 1. This sample has not been verified with Eclipse on ARM platforms. 
## Build the sample diff --git a/quickstart/java/jre/intent-recognition/pom.xml b/quickstart/java/jre/intent-recognition/pom.xml index 57eae56c6..47a41891b 100644 --- a/quickstart/java/jre/intent-recognition/pom.xml +++ b/quickstart/java/jre/intent-recognition/pom.xml @@ -39,7 +39,7 @@ com.microsoft.cognitiveservices.speech client-sdk - 1.38.0 + 1.40.0 diff --git a/quickstart/java/jre/meeting-transcription/README.md b/quickstart/java/jre/meeting-transcription/README.md index 6775529c7..69a88a11d 100644 --- a/quickstart/java/jre/meeting-transcription/README.md +++ b/quickstart/java/jre/meeting-transcription/README.md @@ -20,15 +20,6 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib java-11-openjdk-devel openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - 1. This sample has not been verified with Eclipse on ARM platforms. ## Build the sample diff --git a/quickstart/java/jre/meeting-transcription/pom.xml b/quickstart/java/jre/meeting-transcription/pom.xml index c8c7efc25..56c95c1df 100644 --- a/quickstart/java/jre/meeting-transcription/pom.xml +++ b/quickstart/java/jre/meeting-transcription/pom.xml @@ -39,7 +39,7 @@ com.microsoft.cognitiveservices.speech client-sdk - 1.38.0 + 1.40.0 com.google.code.gson diff --git a/quickstart/java/jre/speaker-recognition/README.md b/quickstart/java/jre/speaker-recognition/README.md index c4a96d058..b48e29df7 100644 --- a/quickstart/java/jre/speaker-recognition/README.md +++ b/quickstart/java/jre/speaker-recognition/README.md @@ -21,15 +21,6 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib java-1.8.0-openjdk-devel openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - 1. This sample has not been verified with Eclipse on ARM platforms. ## Build the sample diff --git a/quickstart/java/jre/speaker-recognition/pom.xml b/quickstart/java/jre/speaker-recognition/pom.xml index 5009d4a47..253cdfbdc 100644 --- a/quickstart/java/jre/speaker-recognition/pom.xml +++ b/quickstart/java/jre/speaker-recognition/pom.xml @@ -36,7 +36,7 @@ com.microsoft.cognitiveservices.speech client-sdk - 1.38.0 + 1.40.0 diff --git a/quickstart/java/jre/text-to-speech/README.md b/quickstart/java/jre/text-to-speech/README.md index b2f20eb3f..5e77f5070 100644 --- a/quickstart/java/jre/text-to-speech/README.md +++ b/quickstart/java/jre/text-to-speech/README.md @@ -19,15 +19,6 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib java-1.8.0-openjdk-devel openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - 1. 
This sample has not been verified with Eclipse on ARM platforms. ## Build the sample diff --git a/quickstart/java/jre/text-to-speech/pom.xml b/quickstart/java/jre/text-to-speech/pom.xml index c50bf8e4b..ff509bd64 100644 --- a/quickstart/java/jre/text-to-speech/pom.xml +++ b/quickstart/java/jre/text-to-speech/pom.xml @@ -62,7 +62,7 @@ com.microsoft.cognitiveservices.speech client-sdk - 1.38.0 + 1.40.0 diff --git a/quickstart/java/jre/text-to-speech/src/speechsdk/quickstart/Main.java b/quickstart/java/jre/text-to-speech/src/speechsdk/quickstart/Main.java index e2f0476ea..4f09cd3cd 100644 --- a/quickstart/java/jre/text-to-speech/src/speechsdk/quickstart/Main.java +++ b/quickstart/java/jre/text-to-speech/src/speechsdk/quickstart/Main.java @@ -30,7 +30,7 @@ public static void main(String[] args) { try (SpeechConfig config = SpeechConfig.fromSubscription(speechSubscriptionKey, serviceRegion)) { // Set the voice name, refer to https://aka.ms/speech/voices/neural for full // list. - config.setSpeechSynthesisVoiceName("en-US-AvaMultilingualNeural"); + config.setSpeechSynthesisVoiceName("en-US-AriaNeural"); try (SpeechSynthesizer synth = new SpeechSynthesizer(config)) { assert (config != null); diff --git a/quickstart/java/jre/translate-speech-to-text/README.md b/quickstart/java/jre/translate-speech-to-text/README.md index 710116f50..fe9aa87f4 100644 --- a/quickstart/java/jre/translate-speech-to-text/README.md +++ b/quickstart/java/jre/translate-speech-to-text/README.md @@ -19,15 +19,6 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib java-1.8.0-openjdk-devel openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - 1. This sample has not been verified with Eclipse on ARM platforms. ## Build the sample diff --git a/quickstart/java/jre/translate-speech-to-text/pom.xml b/quickstart/java/jre/translate-speech-to-text/pom.xml index aca39991c..6f4afacb8 100644 --- a/quickstart/java/jre/translate-speech-to-text/pom.xml +++ b/quickstart/java/jre/translate-speech-to-text/pom.xml @@ -20,7 +20,7 @@ com.microsoft.cognitiveservices.speech client-sdk - 1.38.0 + 1.40.0 diff --git a/quickstart/java/jre/virtual-assistant/README.md b/quickstart/java/jre/virtual-assistant/README.md index fc146660d..055fa0c90 100644 --- a/quickstart/java/jre/virtual-assistant/README.md +++ b/quickstart/java/jre/virtual-assistant/README.md @@ -20,15 +20,6 @@ See the [accompanying article](https://docs.microsoft.com/azure/cognitive-servic sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib java-1.8.0-openjdk-devel openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - 1. This sample has not been verified with Eclipse on ARM platforms. 
## Build the sample diff --git a/quickstart/java/jre/virtual-assistant/pom.xml b/quickstart/java/jre/virtual-assistant/pom.xml index 7c885ee09..7e8783c34 100644 --- a/quickstart/java/jre/virtual-assistant/pom.xml +++ b/quickstart/java/jre/virtual-assistant/pom.xml @@ -26,7 +26,7 @@ com.microsoft.cognitiveservices.speech client-sdk - 1.38.0 + 1.40.0 org.slf4j diff --git a/quickstart/javascript/node/conversation-transcription/package.json b/quickstart/javascript/node/conversation-transcription/package.json index 619ef23e3..40c7319d6 100644 --- a/quickstart/javascript/node/conversation-transcription/package.json +++ b/quickstart/javascript/node/conversation-transcription/package.json @@ -12,6 +12,6 @@ "license": "MIT", "dependencies": { "https-proxy-agent": "^3.0.0", - "microsoft-cognitiveservices-speech-sdk": "^1.38.0" + "microsoft-cognitiveservices-speech-sdk": "^1.40.0" } } diff --git a/quickstart/javascript/node/from-file/package.json b/quickstart/javascript/node/from-file/package.json index 619ef23e3..40c7319d6 100644 --- a/quickstart/javascript/node/from-file/package.json +++ b/quickstart/javascript/node/from-file/package.json @@ -12,6 +12,6 @@ "license": "MIT", "dependencies": { "https-proxy-agent": "^3.0.0", - "microsoft-cognitiveservices-speech-sdk": "^1.38.0" + "microsoft-cognitiveservices-speech-sdk": "^1.40.0" } } diff --git a/quickstart/javascript/node/meeting-transcription/package.json b/quickstart/javascript/node/meeting-transcription/package.json index 619ef23e3..40c7319d6 100644 --- a/quickstart/javascript/node/meeting-transcription/package.json +++ b/quickstart/javascript/node/meeting-transcription/package.json @@ -12,6 +12,6 @@ "license": "MIT", "dependencies": { "https-proxy-agent": "^3.0.0", - "microsoft-cognitiveservices-speech-sdk": "^1.38.0" + "microsoft-cognitiveservices-speech-sdk": "^1.40.0" } } diff --git a/quickstart/javascript/node/speaker-recognition/identification/package.json b/quickstart/javascript/node/speaker-recognition/identification/package.json index 804346801..20203cb6a 100644 --- a/quickstart/javascript/node/speaker-recognition/identification/package.json +++ b/quickstart/javascript/node/speaker-recognition/identification/package.json @@ -12,6 +12,6 @@ "license": "MIT", "dependencies": { "https-proxy-agent": "^3.0.0", - "microsoft-cognitiveservices-speech-sdk": "^1.38.0" + "microsoft-cognitiveservices-speech-sdk": "^1.40.0" } } diff --git a/quickstart/javascript/node/speaker-recognition/verification/package.json b/quickstart/javascript/node/speaker-recognition/verification/package.json index 4dcb8b3dc..458f5baa7 100644 --- a/quickstart/javascript/node/speaker-recognition/verification/package.json +++ b/quickstart/javascript/node/speaker-recognition/verification/package.json @@ -12,6 +12,6 @@ "license": "MIT", "dependencies": { "https-proxy-agent": "^3.0.0", - "microsoft-cognitiveservices-speech-sdk": "^1.38.0" + "microsoft-cognitiveservices-speech-sdk": "^1.40.0" } } diff --git a/quickstart/javascript/node/text-to-speech/package.json b/quickstart/javascript/node/text-to-speech/package.json index 0d21c86c1..b18dc9818 100644 --- a/quickstart/javascript/node/text-to-speech/package.json +++ b/quickstart/javascript/node/text-to-speech/package.json @@ -12,7 +12,7 @@ "license": "MIT", "dependencies": { "https-proxy-agent": "^3.0.0", - "microsoft-cognitiveservices-speech-sdk": "^1.38.0", + "microsoft-cognitiveservices-speech-sdk": "^1.40.0", "readline": "^1.3.0" } } diff --git a/quickstart/objectivec/macos/from-microphone/helloworld/Podfile 
b/quickstart/objectivec/macos/from-microphone/helloworld/Podfile index 774b470c4..6f5257e09 100644 --- a/quickstart/objectivec/macos/from-microphone/helloworld/Podfile +++ b/quickstart/objectivec/macos/from-microphone/helloworld/Podfile @@ -1,4 +1,4 @@ target 'helloworld' do platform :osx, '10.13' - pod 'MicrosoftCognitiveServicesSpeech-macOS', '~> 1.38.0' + pod 'MicrosoftCognitiveServicesSpeech-macOS', '~> 1.40.0' end diff --git a/quickstart/objectivec/macos/text-to-speech/helloworld/Podfile b/quickstart/objectivec/macos/text-to-speech/helloworld/Podfile index 774b470c4..6f5257e09 100644 --- a/quickstart/objectivec/macos/text-to-speech/helloworld/Podfile +++ b/quickstart/objectivec/macos/text-to-speech/helloworld/Podfile @@ -1,4 +1,4 @@ target 'helloworld' do platform :osx, '10.13' - pod 'MicrosoftCognitiveServicesSpeech-macOS', '~> 1.38.0' + pod 'MicrosoftCognitiveServicesSpeech-macOS', '~> 1.40.0' end diff --git a/quickstart/python/from-microphone/README.md b/quickstart/python/from-microphone/README.md index d2b510c41..d34ec1125 100644 --- a/quickstart/python/from-microphone/README.md +++ b/quickstart/python/from-microphone/README.md @@ -17,14 +17,6 @@ Before you get started, here's a list of prerequisites: sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - ```sh - sudo yum update - sudo yum install alsa-lib openssl python3 - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - * On Windows you need the [Microsoft Visual C++ Redistributable for Visual Studio 2017](https://support.microsoft.com/help/2977003/the-latest-supported-visual-c-downloads) for your platform. ## Get the Speech SDK Python Package diff --git a/quickstart/python/intent-recognition/README.md b/quickstart/python/intent-recognition/README.md index a7e3997a4..9c95fbb8a 100644 --- a/quickstart/python/intent-recognition/README.md +++ b/quickstart/python/intent-recognition/README.md @@ -17,14 +17,6 @@ Before you get started, here's a list of prerequisites: sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - ```sh - sudo yum update - sudo yum install alsa-lib openssl python3 - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - * On Windows you need the [Microsoft Visual C++ Redistributable for Visual Studio 2017](https://support.microsoft.com/help/2977003/the-latest-supported-visual-c-downloads) for your platform. ## Get the Speech SDK Python Package diff --git a/quickstart/python/text-to-speech/README.md b/quickstart/python/text-to-speech/README.md index a2cfc5049..329d3c58b 100644 --- a/quickstart/python/text-to-speech/README.md +++ b/quickstart/python/text-to-speech/README.md @@ -17,14 +17,6 @@ Before you get started, here's a list of prerequisites: sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - ```sh - sudo yum update - sudo yum install alsa-lib openssl python3 - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). 
- * On Windows you need the [Microsoft Visual C++ Redistributable for Visual Studio 2017](https://support.microsoft.com/help/2977003/the-latest-supported-visual-c-downloads) for your platform. ## Get the Speech SDK Python Package diff --git a/quickstart/python/text-to-speech/quickstart.py b/quickstart/python/text-to-speech/quickstart.py index a94b1f065..823248e11 100644 --- a/quickstart/python/text-to-speech/quickstart.py +++ b/quickstart/python/text-to-speech/quickstart.py @@ -10,7 +10,7 @@ speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region) # Set the voice name, refer to https://aka.ms/speech/voices/neural for full list. -speech_config.speech_synthesis_voice_name = "en-US-AvaMultilingualNeural" +speech_config.speech_synthesis_voice_name = "en-US-AriaNeural" # Creates a speech synthesizer using the default speaker as audio output. speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config) diff --git a/sampledata/customspeech/en-US/biology/testing/audio-and-trans.zip b/sampledata/customspeech/en-US/biology/testing/audio-and-trans.zip index 277120af9..4d8ff4c66 100644 Binary files a/sampledata/customspeech/en-US/biology/testing/audio-and-trans.zip and b/sampledata/customspeech/en-US/biology/testing/audio-and-trans.zip differ diff --git a/sampledata/customspeech/en-US/display formatting/testing/audio-and-trans.zip b/sampledata/customspeech/en-US/display formatting/testing/audio-and-trans.zip index f99d2dbb2..7def754d0 100644 Binary files a/sampledata/customspeech/en-US/display formatting/testing/audio-and-trans.zip and b/sampledata/customspeech/en-US/display formatting/testing/audio-and-trans.zip differ diff --git a/samples/batch-avatar/README.md b/samples/batch-avatar/README.md index c521a8459..7ad14102e 100644 --- a/samples/batch-avatar/README.md +++ b/samples/batch-avatar/README.md @@ -3,21 +3,14 @@ The Batch avatar synthesis API (Preview) provides asynchronous synthesis of talking avatar to generate avatar video content with the text input. The functionality is exposed through a REST API and is easy to access from many programming languages. The samples here do **NOT** require the installation of the Cognitive Service Speech SDK, but use the REST API directly instead. -For a detailed explanation see the [batch avatar synthesis documentation](https://learn.microsoft.com/azure/ai-services/speech-service/text-to-speech-avatar/batch-synthesis-avatar) and the `README.md` in the language specific subdirectories. +For a detailed explanation see the [batch synthesis documentation](https://docs.microsoft.com/azure/cognitive-services/speech-service/batch-synthesis) and the `README.md` in the language specific subdirectories. Available samples: | Language | Directory | Description | | ---------- | -------- | ----------- | | Python | [python](python) | Python client calling batch avatar synthesis REST API | -| C# | [csharp](csharp) | C# client calling batch avatar REST API. | ## Note Refer to [this](../js/browser/avatar/README.md) for real time avatar synthesis. - - -## Resources - -1. [Batch avatar synthesis request properties](https://learn.microsoft.com/azure/ai-services/speech-service/text-to-speech-avatar/batch-synthesis-avatar-properties) -2. 
[The OPENAPI specification for the Batch avatar synthesis API](https://github.com/Azure/azure-rest-api-specs/blob/main/specification/cognitiveservices/data-plane/Speech/BatchAvatar/preview/2024-04-15-preview/batchavatar.json) diff --git a/samples/batch-avatar/csharp/BatchAvatarSample/.editorconfig b/samples/batch-avatar/csharp/BatchAvatarSample/.editorconfig deleted file mode 100644 index e25d487b5..000000000 --- a/samples/batch-avatar/csharp/BatchAvatarSample/.editorconfig +++ /dev/null @@ -1,5 +0,0 @@ -[*] -charset = utf-8 - -[*.cs] -file_header_template = \nCopyright (c) Microsoft. All rights reserved.\nLicensed under the MIT license. See LICENSE.md file in the project root for full license information.\n \ No newline at end of file diff --git a/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample.sln b/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample.sln deleted file mode 100644 index 385ee940b..000000000 --- a/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample.sln +++ /dev/null @@ -1,24 +0,0 @@ -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 17 -VisualStudioVersion = 17.9.34728.123 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BatchAvatarSample", "BatchAvatarSample\BatchAvatarSample.csproj", "{CF5ED996-4313-480A-8A3D-0BB8C3D0B73A}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Any CPU = Debug|Any CPU - Release|Any CPU = Release|Any CPU - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {CF5ED996-4313-480A-8A3D-0BB8C3D0B73A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {CF5ED996-4313-480A-8A3D-0BB8C3D0B73A}.Debug|Any CPU.Build.0 = Debug|Any CPU - {CF5ED996-4313-480A-8A3D-0BB8C3D0B73A}.Release|Any CPU.ActiveCfg = Release|Any CPU - {CF5ED996-4313-480A-8A3D-0BB8C3D0B73A}.Release|Any CPU.Build.0 = Release|Any CPU - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {B0A9639E-340F-46FF-B5AE-92569CFF2A72} - EndGlobalSection -EndGlobal diff --git a/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample/BatchAvatarSample.csproj b/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample/BatchAvatarSample.csproj deleted file mode 100644 index 91b464afe..000000000 --- a/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample/BatchAvatarSample.csproj +++ /dev/null @@ -1,10 +0,0 @@ - - - - Exe - net8.0 - enable - enable - - - diff --git a/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample/Program.cs b/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample/Program.cs deleted file mode 100644 index d6833294c..000000000 --- a/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample/Program.cs +++ /dev/null @@ -1,185 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -using BatchAvatarSample.dto; -using System.Net.Http.Json; -using System.Text.Json; -using System.Text.Json.Serialization; - -internal class Program -{ - private const string ApiVersion = "2024-08-01"; - - // The endpoint (and key) could be gotten from the Keys and Endpoint page in the Speech service resource. 
- // The endpoint would be like: https://.api.cognitive.microsoft.com or https://.cognitiveservices.azure.com - private static readonly string subscriptionKey = "SPEECH_KEY"; - private static readonly string speechEndpoint = "SPEECH_ENDPOINT"; - - private static readonly JsonSerializerOptions defaultJsonSerializerOptions = new(JsonSerializerDefaults.Web) - { - DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull, - }; - - private static async Task Main(string[] args) - { - using var httpClient = new HttpClient(); - httpClient.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", subscriptionKey); - - // Create a job ID. - // Job ID must be unique within a speech resource. - var jobId = $"batchavatar-{Guid.NewGuid()}"; - - // - // Submit a batch avatar job - // - var jobUri = $"{speechEndpoint}/avatar/batchsyntheses/{jobId}?api-version={ApiVersion}"; - - try - { - var job = await CreateBatchAvatarJob(httpClient, jobUri); - Console.WriteLine($"Batch avatar synthesis job {jobId} created."); - - // - // Get job status - // - job = await GetJobAsync(httpClient, jobUri); - Console.WriteLine($"Batch avatar synthesis job {job.Id} is in {job.Status} status."); - - // - // Poll until job completes - // - while (job.Status is not ("Succeeded" or "Failed")) - { - Console.WriteLine($"Batch avatar synthesis job {job.Id} is still running."); - await Task.Delay(5000); - job = await GetJobAsync(httpClient, jobUri); - } - - if (job.Status is "Failed") - { - Console.WriteLine($"Batch avatar synthesis job {job.Id} failed."); - return; - } - - Console.WriteLine($"Batch avatar synthesis job {job.Id} completed successfully."); - - // - // Get outputs - // - Console.WriteLine("Video file can be downloaded from:"); - Console.WriteLine(job.Outputs!.Result); - Console.WriteLine("Summary file can be downloaded from:"); - Console.WriteLine(job.Outputs!.Summary); - - // - // Delete a job - // - await DeleteJobAsync(httpClient, jobUri); - Console.WriteLine($"Batch avatar synthesis job {job.Id} deleted."); - - // - // List jobs - // - var maxpagesize = 10; - var listUri = $"{speechEndpoint}/avatar/batchsyntheses?maxpagesize={maxpagesize}&api-version={ApiVersion}"; - var allJobs = await ListJobsAsync(httpClient, listUri); - Console.WriteLine($"Listed {allJobs.Count} jobs."); - } - catch (HttpRequestException exception) - { - Console.Error.WriteLine(exception.Message); - return; - } - } - - - private static async Task CreateBatchAvatarJob(HttpClient httpClient, string jobUri) - { - // To use SSML as input, please refer to RequestExamples.SsmlRequest - // To use your custom neural voice, please refer to RequestExamples.CustomVoiceRequest - var requestBody = new BatchAvatarRequest - { - InputKind = "PlainText", - Inputs = - [ - new BatchAvatarInput - { - Content = "Hi, I'm a virtual assistant created by Microsoft.", - }, - ], - SynthesisConfig = new() - { - Voice = "en-US-AvaMultilingualNeural", - }, - AvatarConfig = new() - { - TalkingAvatarCharacter = "lisa", // Avatar character - TalkingAvatarStyle = "graceful-sitting", // Avatar style, required for prebuilt avatar, optional for custom avatar - VideoFormat = "mp4", // mp4 or webm, webm is required for transparent background - VideoCodec = "h264", // hevc, h264 or vp9, vp9 is required for transparent background; default is hevc - SubtitleType = "soft_embedded", - BackgroundColor = "#FFFFFFFF", // background color in RGBA format, default is white; can be set to 'transparent' for transparent background - Customized = false, // Set to true if you want to use custom 
avatar - }, - }; - - var response = await httpClient.PutAsJsonAsync(jobUri, requestBody, defaultJsonSerializerOptions); - await PrintResponseOnError(response); - response.EnsureSuccessStatusCode(); - var createdJob = await response.Content.ReadFromJsonAsync(defaultJsonSerializerOptions); - return createdJob!; - } - - private static async Task GetJobAsync(HttpClient httpClient, string jobUri) - { - var response = await httpClient.GetAsync(jobUri); - await PrintResponseOnError(response); - response.EnsureSuccessStatusCode(); - var job = await response.Content.ReadFromJsonAsync(defaultJsonSerializerOptions); - return job!; - } - - private static async Task DeleteJobAsync(HttpClient httpClient, string jobUri) - { - var response = await httpClient.DeleteAsync(jobUri); - await PrintResponseOnError(response); - response.EnsureSuccessStatusCode(); - } - - private static async Task> ListJobsAsync(HttpClient httpClient, string listUri) - { - var allJobs = new List(); - var response = await httpClient.GetAsync(listUri); - await PrintResponseOnError(response); - response.EnsureSuccessStatusCode(); - - var pagedJobs = await response.Content.ReadFromJsonAsync>(defaultJsonSerializerOptions); - allJobs.AddRange(pagedJobs!.Value); - while (pagedJobs!.NextLink is not null) - { - response = await httpClient.GetAsync(pagedJobs.NextLink); - await PrintResponseOnError(response); - response.EnsureSuccessStatusCode(); - - pagedJobs = await response.Content.ReadFromJsonAsync>(defaultJsonSerializerOptions); - allJobs.AddRange(pagedJobs!.Value); - } - - return allJobs; - } - - private static async Task PrintResponseOnError(HttpResponseMessage response) - { - if (response.IsSuccessStatusCode) - { - return; - } - - var responseBody = await response.Content.ReadAsStringAsync(); - var requestId = response.Headers.GetValues("apim-request-id").FirstOrDefault(); - Console.Error.WriteLine(responseBody); - Console.Error.WriteLine($"Request ID: {requestId}"); - } -} \ No newline at end of file diff --git a/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample/RequestExamples.cs b/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample/RequestExamples.cs deleted file mode 100644 index bbf35dfa6..000000000 --- a/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample/RequestExamples.cs +++ /dev/null @@ -1,68 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -namespace BatchAvatarSample; - -using BatchAvatarSample.dto; -using System; -using System.Collections.Generic; - -internal static class RequestExamples -{ - public static BatchAvatarRequest SsmlRequest = new() - { - InputKind = "Ssml", - Inputs = - [ - new BatchAvatarInput - { - Content = """ - hello, this is my talking avatar - """, - }, - ], - AvatarConfig = new() - { - TalkingAvatarCharacter = "lisa", - TalkingAvatarStyle = "graceful-sitting", - VideoFormat = "mp4", - VideoCodec = "h264", - SubtitleType = "soft_embedded", - BackgroundColor = "#FFFFFFFF", - }, - }; - - public static BatchAvatarRequest CustomVoiceRequest = new() - { - InputKind = "PlainText", - Inputs = - [ - new BatchAvatarInput - { - Content = "Hi, I'm a virtual assistant created by Microsoft.", - }, - ], - // Replace with your custom voice name and deployment ID if you want to use custom voice. - // Multiple voices are supported, the mixture of custom voices and platform voices is allowed. - // Invalid voice name or deployment ID will be rejected. 
- CustomVoices = new Dictionary - { - ["YOUR_CUSTOM_VOICE_NAME"] = Guid.Parse("YOUR_CUSTOM_VOICE_DEPLOYMENT_ID"), - }, - SynthesisConfig = new() - { - Voice = "YOUR_CUSTOM_VOICE_NAME", - }, - AvatarConfig = new() - { - TalkingAvatarCharacter = "lisa", - TalkingAvatarStyle = "graceful-sitting", - VideoFormat = "mp4", - VideoCodec = "h264", - SubtitleType = "soft_embedded", - BackgroundColor = "#FFFFFFFF", - }, - }; -} diff --git a/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample/dto/BatchAvatarJob.cs b/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample/dto/BatchAvatarJob.cs deleted file mode 100644 index 34e465c3a..000000000 --- a/samples/batch-avatar/csharp/BatchAvatarSample/BatchAvatarSample/dto/BatchAvatarJob.cs +++ /dev/null @@ -1,165 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -namespace BatchAvatarSample.dto; -using System; - -public class BatchAvatarRequest -{ - /// - /// The synthesis text kind, `SSML` or `PlainText`. - /// - public required string InputKind { get; set; } - - public required IList Inputs { get; set; } - - public IReadOnlyDictionary? CustomVoices { get; set; } - - public BatchSynthesisconfig? SynthesisConfig { get; set; } - - public BatchSynthesisProperties? Properties { get; set; } - - public required BatchAvatarconfig AvatarConfig { get; set; } -} - -public class BatchAvatarJob -{ - public required string Id { get; set; } - - public string? Description { get; set; } - - public required string Status { get; set; } - - public DateTime CreatedDateTime { get; set; } - - public DateTime LastActionDateTime { get; set; } - - public IReadOnlyDictionary? CustomVoices { get; set; } - - public BatchSynthesisconfig? SynthesisConfig { get; set; } - - public BatchSynthesisProperties? Properties { get; set; } - - public required BatchAvatarconfig AvatarConfig { get; set; } - - public BatchSynthesisOutputs? Outputs { get; set; } -} - - -public class BatchAvatarInput -{ - public required string Content { get; set; } -} - - -public class BatchSynthesisProperties -{ - public int TimeToLiveInHours { get; set; } - - public Uri? DestinationContainerUrl { get; set; } - - public string? DestinationPath { get; set; } - - public int? sizeInBytes { get; set; } - - public int? succeededCount { get; set; } - - public int? failedCount { get; set; } - - public int? durationInMilliseconds { get; set; } - - public Billingdetails? billingDetails { get; set; } - - public Error? Error { get; set; } -} - -public class Billingdetails -{ - public int neuralCharacters { get; set; } - public int talkingAvatarDurationSeconds { get; set; } -} - -public class BatchSynthesisconfig -{ - public required string Voice { get; set; } - - public string? Style { get; set; } - - public string? Rate { get; set; } - - public string? Pitch { get; set; } - - public string? Volume { get; set; } -} - -public class BatchAvatarconfig -{ - public required string TalkingAvatarCharacter { get; set; } - - public string? TalkingAvatarStyle { get; set; } - - public string? VideoFormat { get; set; } - - public string? VideoCodec { get; set; } - - public string? SubtitleType { get; set; } - - public string? BackgroundColor { get; set; } - - public Uri? BackgroundImage { get; set; } - - public Coordinate? AvatarPosition { get; set; } - - public Coordinate? AvatarSize { get; set; } - - public Videocrop? VideoCrop { get; set; } - - public int? 
BitrateKbps { get; set; } - - public bool? Customized { get; set; } -} - -public class Videocrop -{ - public required Coordinate TopLeft { get; set; } - - public required Coordinate BottomRight { get; set; } -} - -public record Coordinate(int X, int Y); - -public class BatchSynthesisOutputs -{ - public required string Result { get; set; } - - public required string Summary { get; set; } -} - -public class BillingDetails -{ - public long NeuralCharacters { get; set; } - - public long CustomNeuralCharacters { get; set; } - - public long talkingAvatarDurationSeconds { get; set; } - - public long AoaiCharacters { get; set; } - - public long AoaiHDCharacters { get; set; } -} - -public class Error -{ - public required string Code { get; set; } - - public required string Message { get; set; } -} - -public class PaginatedResults -{ - public required IList Value { get; set; } - - public Uri? NextLink { get; set; } -} \ No newline at end of file diff --git a/samples/batch-avatar/python/README.md b/samples/batch-avatar/python/README.md index 94c859a25..bdb16c2b8 100644 --- a/samples/batch-avatar/python/README.md +++ b/samples/batch-avatar/python/README.md @@ -8,28 +8,20 @@ The sample uses the `requests` library. You can install it with the command pip install requests ``` -We recommend using a passwordless authentication provided by the `azure-identity` library. -You can install it with the command - -```sh -pip install azure-identity -``` - ## Run the sample code The sample code itself is [synthesis.py](synthesis.py) and can be run using Python 3.8 or higher. You will need to adapt the following information to run the sample: -1. Your Azure AI Speech Services. +1. Your Cognitive Services subscription key and region. Some notes: - - We recommend using a passwordless authentication provided by the `azure-identity` library. Your Microsoft Entra user account is need to be assigned with `Cognitive Services User` or `Cognitive Services Speech User` role. - - Alternatively, you can get the subscription key from the "Keys and Endpoint" tab on your Azure AI Speech resource in the Azure Portal. + - You can get the subscription key from the "Keys and Endpoint" tab on your Cognitive Services or Speech resource in the Azure Portal. - Batch avatar synthesis is only available for paid subscriptions, free subscriptions are not supported. - Batch avatar synthesis is only available in these service regions: `West US 2`, `West Europe` and `South East Asia` -2. (Optional:) The relationship between custom voice names and deployment ID, if you want to use custom voices. -3. (Optional:) The URI of a writable Azure blob container, if you want to store the synthesized files in your own Azure storage. +1. (Optional:) The relationship between custom voice names and deployment ID, if you want to use custom voices. +2. (Optional:) The URI of a writable Azure blob container, if you want to store the audio files in your own Azure storage. You can use a development environment like PyCharm or VS Code to edit, debug, and execute the sample. 
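As a quick orientation before the `synthesis.py` hunk that follows, here is a minimal sketch of submitting a batch avatar synthesis job with a subscription key against the 3.1-preview1 `talkingavatar` endpoint, mirroring the request shape this sample uses; the display name, description, input text, and avatar settings are illustrative placeholders, and `SPEECH_KEY`/`SPEECH_REGION` are assumed to be set as environment variables.

```python
# Minimal sketch (not the sample itself): submit a batch avatar synthesis job
# using the 3.1-preview1 talkingavatar REST endpoint and a subscription key,
# mirroring the payload shape shown in this sample's synthesis.py.
import json
import os

import requests

SUBSCRIPTION_KEY = os.environ["SPEECH_KEY"]      # assumed environment variable
SERVICE_REGION = os.environ["SPEECH_REGION"]     # assumed environment variable
SERVICE_HOST = "customvoice.api.speech.microsoft.com"

url = (f"https://{SERVICE_REGION}.{SERVICE_HOST}"
       "/api/texttospeech/3.1-preview1/batchsynthesis/talkingavatar")
headers = {
    "Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY,
    "Content-Type": "application/json",
}
payload = {
    "displayName": "batch avatar sample",        # placeholder value
    "description": "sample description",          # placeholder value
    "textType": "PlainText",
    "synthesisConfig": {"voice": "en-US-JennyNeural"},
    "inputs": [{"text": "Hi, I'm a virtual assistant created by Microsoft."}],
    "properties": {
        "customized": False,                      # True for a custom avatar
        "talkingAvatarCharacter": "lisa",
        "talkingAvatarStyle": "graceful-sitting",
        "videoFormat": "webm",                    # webm is required for transparency
        "videoCodec": "vp9",                      # vp9 is required for transparency
        "subtitleType": "soft_embedded",
        "backgroundColor": "transparent",
    },
}

response = requests.post(url, data=json.dumps(payload), headers=headers)
response.raise_for_status()
print("Submitted batch avatar synthesis job:", response.json()["id"])
```

The returned job ID is then passed to the status and list endpoints, as the sample's `get_synthesis` and `list_synthesis_jobs` functions show in the hunk below.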
diff --git a/samples/batch-avatar/python/synthesis.py b/samples/batch-avatar/python/synthesis.py index 2b7f21acd..44856a44e 100644 --- a/samples/batch-avatar/python/synthesis.py +++ b/samples/batch-avatar/python/synthesis.py @@ -9,15 +9,16 @@ import os import sys import time -import uuid +from pathlib import Path -from azure.identity import DefaultAzureCredential import requests logging.basicConfig(stream=sys.stdout, level=logging.INFO, # set to logging.DEBUG for verbose output format="[%(asctime)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p %Z") logger = logging.getLogger(__name__) +# Your Speech resource key and region +# This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION" # The endpoint (and key) could be gotten from the Keys and Endpoint page in the Speech service resource. # The endpoint would be like: https://.api.cognitive.microsoft.com or https://.cognitiveservices.azure.com @@ -28,37 +29,17 @@ API_VERSION = "2024-08-01" -def _create_job_id(): - # the job ID must be unique in current speech resource - # you can use a GUID or a self-increasing number - return uuid.uuid4() - - -def _authenticate(): - if PASSWORDLESS_AUTHENTICATION: - # Refer to https://learn.microsoft.com/python/api/overview/azure/identity-readme?view=azure-python#defaultazurecredential - # for more information about Azure Identity - # For example, your app can authenticate using your Azure CLI sign-in credentials with when developing locally. - # Your app can then use a managed identity once it has been deployed to Azure. No code changes are required for this transition. - - # When developing locally, make sure that the user account that is accessing batch avatar synthesis has the right permission. - # You'll need Cognitive Services User or Cognitive Services Speech User role to submit batch avatar synthesis jobs. 
- credential = DefaultAzureCredential() - token = credential.get_token('https://cognitiveservices.azure.com/.default') - return {'Authorization': f'Bearer {token.token}'} - else: - SUBSCRIPTION_KEY = os.environ.get('SPEECH_KEY') - return {'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY} - - -def submit_synthesis(job_id: str): - url = f'{SPEECH_ENDPOINT}/avatar/batchsyntheses/{job_id}?api-version={API_VERSION}' +def submit_synthesis(): + url = f'https://{SERVICE_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis/talkingavatar' header = { + 'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY, 'Content-Type': 'application/json' } - header.update(_authenticate()) payload = { + 'displayName': NAME, + 'description': DESCRIPTION, + "textType": "PlainText", 'synthesisConfig': { "voice": "en-US-JennyNeural", }, @@ -68,37 +49,36 @@ def submit_synthesis(job_id: str): 'customVoices': { # "YOUR_CUSTOM_VOICE_NAME": "YOUR_CUSTOM_VOICE_ID" }, - "inputKind": "PlainText", # PlainText or SSML "inputs": [ { - "content": "Hi, I'm a virtual assistant created by Microsoft.", + "text": "Hi, I'm a virtual assistant created by Microsoft.", }, ], - "avatarConfig": { + "properties": { "customized": False, # set to True if you want to use customized avatar "talkingAvatarCharacter": "lisa", # talking avatar character "talkingAvatarStyle": "graceful-sitting", # talking avatar style, required for prebuilt avatar, optional for custom avatar - "videoFormat": "mp4", # mp4 or webm, webm is required for transparent background - "videoCodec": "h264", # hevc, h264 or vp9, vp9 is required for transparent background; default is hevc + "videoFormat": "webm", # mp4 or webm, webm is required for transparent background + "videoCodec": "vp9", # hevc, h264 or vp9, vp9 is required for transparent background; default is hevc "subtitleType": "soft_embedded", - "backgroundColor": "#FFFFFFFF", # background color in RGBA format, default is white; can be set to 'transparent' for transparent background - # "backgroundImage": "https://samples-files.com/samples/Images/jpg/1920-1080-sample.jpg", # background image URL, only support https, either backgroundImage or backgroundColor can be set + "backgroundColor": "transparent", } } - response = requests.put(url, json.dumps(payload), headers=header) + response = requests.post(url, json.dumps(payload), headers=header) if response.status_code < 400: logger.info('Batch avatar synthesis job submitted successfully') logger.info(f'Job ID: {response.json()["id"]}') - return True + return response.json()["id"] else: - logger.error(f'Failed to submit batch avatar synthesis job: [{response.status_code}], {response.text}') + logger.error(f'Failed to submit batch avatar synthesis job: {response.text}') def get_synthesis(job_id): - url = f'{SPEECH_ENDPOINT}/avatar/batchsyntheses/{job_id}?api-version={API_VERSION}' - header = _authenticate() - + url = f'https://{SERVICE_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis/talkingavatar/{job_id}' + header = { + 'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY + } response = requests.get(url, headers=header) if response.status_code < 400: logger.debug('Get batch synthesis job successfully') @@ -110,11 +90,12 @@ def get_synthesis(job_id): logger.error(f'Failed to get batch synthesis job: {response.text}') -def list_synthesis_jobs(skip: int = 0, max_page_size: int = 100): +def list_synthesis_jobs(skip: int = 0, top: int = 100): """List all batch synthesis jobs in the subscription""" - url = 
f'{SPEECH_ENDPOINT}/avatar/batchsyntheses?api-version={API_VERSION}&skip={skip}&maxpagesize={max_page_size}' - header = _authenticate() - + url = f'https://{SERVICE_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis/talkingavatar?skip={skip}&top={top}' + header = { + 'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY + } response = requests.get(url, headers=header) if response.status_code < 400: logger.info(f'List batch synthesis jobs successfully, got {len(response.json()["values"])} jobs') @@ -124,8 +105,8 @@ def list_synthesis_jobs(skip: int = 0, max_page_size: int = 100): if __name__ == '__main__': - job_id = _create_job_id() - if submit_synthesis(job_id): + job_id = submit_synthesis() + if job_id is not None: while True: status = get_synthesis(job_id) if status == 'Succeeded': diff --git a/samples/batch-synthesis/README.md b/samples/batch-synthesis/README.md index e90ef50ba..d1d9a74e7 100644 --- a/samples/batch-synthesis/README.md +++ b/samples/batch-synthesis/README.md @@ -1,8 +1,8 @@ # Examples to use Batch Synthesis -The Batch synthesis API provides asynchronous synthesis of long-form text to speech. The functionality is exposed through a REST API and is easy to access from many programming languages. The samples here do **NOT** require the installation of the Cognitive Service Speech SDK, but use the REST API directly instead. +The Batch synthesis API (Preview) provides asynchronous synthesis of long-form text to speech. The functionality is exposed through a REST API and is easy to access from many programming languages. The samples here do **NOT** require the installation of the Cognitive Service Speech SDK, but use the REST API directly instead. -For a detailed explanation see the [batch synthesis documentation](https://learn.microsoft.com/azure/ai-services/speech-service/batch-synthesis) and the `README.md` in the language specific subdirectories. +For a detailed explanation see the [batch synthesis documentation](https://docs.microsoft.com/azure/cognitive-services/speech-service/batch-synthesis) and the `README.md` in the language specific subdirectories. Available samples: diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/BatchSynthesisClient.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/BatchSynthesisClient.cs index cd1987d18..cef3c844f 100644 --- a/samples/batch-synthesis/csharp/BatchSynthesisSample/BatchSynthesisClient.cs +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/BatchSynthesisClient.cs @@ -3,8 +3,8 @@ // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
// -using System.Text; -using System.Text.Json; +using Newtonsoft.Json; +using System.Net.Http.Formatting; public class BatchSynthesisClient { @@ -13,16 +13,14 @@ public class BatchSynthesisClient private readonly string hostName; private readonly string baseUri; private readonly string subscriptionKey; - private readonly string apiVersion; private readonly HttpClient client; - public BatchSynthesisClient(string hostName, string key, string apiVersion) + public BatchSynthesisClient(string hostName, string key) { this.hostName = hostName; this.subscriptionKey = key; - this.baseUri = $"{this.hostName}/texttospeech/batchsyntheses"; - this.apiVersion = apiVersion; + this.baseUri = $"{this.hostName}/api/texttospeech/3.1-preview1/batchsynthesis"; this.client = new HttpClient(); client.DefaultRequestHeaders.Add(OcpApimSubscriptionKey, this.subscriptionKey); @@ -31,8 +29,7 @@ public BatchSynthesisClient(string hostName, string key, string apiVersion) public async Task> GetAllSynthesesAsync() { var syntheses = new List(); - var uri = new Uri($"{this.baseUri}?api-version={this.apiVersion}"); - + var uri = new Uri(this.baseUri); do { var response = await this.client.GetAsync(uri).ConfigureAwait(false); @@ -44,7 +41,7 @@ public async Task> GetAllSynthesesAsync() } var pagedSyntheses = await response.Content.ReadAsAsync>().ConfigureAwait(false); - syntheses.AddRange(pagedSyntheses.Value); + syntheses.AddRange(pagedSyntheses.Values); uri = pagedSyntheses.NextLink; } while (uri != null); @@ -52,9 +49,9 @@ public async Task> GetAllSynthesesAsync() return syntheses; } - public async Task GetSynthesisAsync(string id) + public async Task GetSynthesisAsync(Guid id) { - var uri = new Uri(this.baseUri + $"/{id}?api-version={this.apiVersion}"); + var uri = new Uri(this.baseUri + $"/{id}"); var response = await this.client.GetAsync(uri).ConfigureAwait(false); if (!response.IsSuccessStatusCode) { @@ -65,9 +62,9 @@ public async Task GetSynthesisAsync(string id) return await response.Content.ReadAsAsync().ConfigureAwait(false); } - public async Task DeleteSynthesisAsync(string id) + public async Task DeleteSynthesisAsync(Guid id) { - var uri = new Uri(this.baseUri + $"/{id}?api-version={this.apiVersion}"); + var uri = new Uri(this.baseUri + $"/{id}"); var response = await this.client.DeleteAsync(uri).ConfigureAwait(false); if (!response.IsSuccessStatusCode) { @@ -75,40 +72,44 @@ public async Task DeleteSynthesisAsync(string id) } } - public async Task CreateSynthesisAsync( - string id, + public async Task CreateSynthesisAsync( string voiceName, + string displayName, + string description, string script, bool isSsml) { - var uri = new Uri($"{this.baseUri}/{id}?api-version={this.apiVersion}"); + var uri = new Uri(this.baseUri); var batchSynthesis = new BatchSynthesis { - InputKind = isSsml ? "Ssml" : "PlainText", - SynthesisConfig = new SynthesisConfig + DisplayName = displayName, + Description = description, + TextType = isSsml ? 
"Ssml" : "PlainText", + SynthesisConfig = new BatchSynthesisConfig { Voice = voiceName }, - Inputs = new List { new BatchSynthesisInputDefinition { Content = script } } + Inputs = new List { new BatchSynthesisInputDefinition { Text = script } } }; - // Create JsonSerializer instance with configured options - string jsonString = JsonSerializer.Serialize(batchSynthesis, new JsonSerializerOptions { IgnoreNullValues = true }); - var response = await this.client.PutAsync(uri, new StringContent(jsonString, Encoding.UTF8, "application/json")).ConfigureAwait(false); + StringContent content = new StringContent(JsonConvert.SerializeObject(batchSynthesis)); + content.Headers.ContentType = JsonMediaTypeFormatter.DefaultMediaType; + var response = await this.client.PostAsync(uri, content).ConfigureAwait(false); if (!response.IsSuccessStatusCode) { await HandleErrorResponse(response); + return null; } + + var location = response.Headers.GetValues("Location").FirstOrDefault(); + return new Uri(location); + } private static async Task HandleErrorResponse(HttpResponseMessage response) { var content = await response.Content.ReadAsStringAsync().ConfigureAwait(false); Console.WriteLine(content); - if (response.Headers.TryGetValues("apim-request-id", out var traceRequestId)) - { - Console.WriteLine($"Trace request ID: {traceRequestId.FirstOrDefault()}."); - } } } diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/Program.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/Program.cs index 8ad96afe8..2c88aaa27 100644 --- a/samples/batch-synthesis/csharp/BatchSynthesisSample/Program.cs +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/Program.cs @@ -8,7 +8,6 @@ // This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION" string speechRegion; string speechKey; -string apiVersion = "2024-04-01"; if (Environment.GetEnvironmentVariable("SPEECH_REGION") is string regionValue) { @@ -16,7 +15,7 @@ } else { - throw new ArgumentException($"Please set SPEECH_REGION environment variable."); + throw new ArgumentException($"Please set the SPEECH_REGION environment variable to set speech resource region."); } if (Environment.GetEnvironmentVariable("SPEECH_KEY") is string keyValue) @@ -28,52 +27,50 @@ throw new ArgumentException($"Please set the SPEECH_KEY environment variable to set speech resource key."); } -var host = $"https://{speechRegion}.api.cognitive.microsoft.com"; +var host = $"https://{speechRegion}.customvoice.api.speech.microsoft.com"; var sampleScript = await File.ReadAllTextAsync("Gatsby-chapter1.txt").ConfigureAwait(false); -var synthesisClient = new BatchSynthesisClient(host, speechKey, apiVersion); +var synthesisClient = new BatchSynthesisClient(host, speechKey); -// Get all synthesis jobs. +// Get all synthesis tasks. var synthesisJobs = await synthesisClient.GetAllSynthesesAsync().ConfigureAwait(false); -Console.WriteLine($"Found {synthesisJobs.Count()} jobs."); -var newJobId = $"SimpleJob-{DateTime.Now.ToString("u").Replace(":", "-").Replace(" ", "-")}"; +// Create a new synthesis task with plain text +var newSynthesisUri = await synthesisClient.CreateSynthesisAsync( + "en-US-JennyNeural", + "sample batch synthesis", + "sample description", + sampleScript, + false).ConfigureAwait(false); -// Create a new synthesis job with plain text -await synthesisClient.CreateSynthesisAsync( - newJobId, - "AvaNeural", - sampleScript, - false).ConfigureAwait(false); +var newSynthesisId = Guid.Parse(newSynthesisUri.Segments.Last()); -// Get a synthesis job. 
-var synthesis = await synthesisClient.GetSynthesisAsync(newJobId).ConfigureAwait(false); +// Get a synthesis task. +var synthesis = await synthesisClient.GetSynthesisAsync(newSynthesisId).ConfigureAwait(false); // Poll the synthesis until it completes var terminatedStates = new[] { "Succeeded", "Failed" }; while (!terminatedStates.Contains(synthesis.Status)) { - Console.WriteLine($"Synthesis {newJobId}. Status: {synthesis.Status}"); + Console.WriteLine($"Synthesis {newSynthesisId}. Status: {synthesis.Status}"); await Task.Delay(TimeSpan.FromSeconds(30)).ConfigureAwait(false); - synthesis = await synthesisClient.GetSynthesisAsync(newJobId).ConfigureAwait(false); + synthesis = await synthesisClient.GetSynthesisAsync(newSynthesisId).ConfigureAwait(false); } -Console.WriteLine($"Synthesis {newJobId}. Status: {synthesis.Status}"); - // Get outputs of the synthesis -if (!string.IsNullOrEmpty(synthesis.Outputs?.Result)) +if (!string.IsNullOrEmpty(synthesis.Outputs.Result)) { Console.WriteLine("Please download result from this URL before you delete the synthesis."); Console.WriteLine(synthesis.Outputs.Result); } -if (!string.IsNullOrEmpty(synthesis.Outputs?.Summary)) +if (!string.IsNullOrEmpty(synthesis.Outputs.Summary)) { Console.WriteLine("Please download summary file from this URL before you delete the synthesis."); Console.WriteLine(synthesis.Outputs.Summary); } // Delete a specific synthesis -await synthesisClient.DeleteSynthesisAsync(newJobId); -Console.WriteLine($"Deleted synthesis {newJobId}."); \ No newline at end of file +await synthesisClient.DeleteSynthesisAsync(newSynthesisId); +Console.WriteLine($"Deleted synthesis {newSynthesisId}."); diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/TimeSpanConverter.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/TimeSpanConverter.cs new file mode 100644 index 000000000..293e43366 --- /dev/null +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/TimeSpanConverter.cs @@ -0,0 +1,30 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. +// + +using System.Xml; +using Newtonsoft.Json; + +public class TimeSpanConverter : JsonConverter +{ + public override void WriteJson(JsonWriter writer, TimeSpan value, JsonSerializer serializer) + { + if (writer == null) + { + throw new ArgumentNullException(nameof(writer)); + } + + writer.WriteValue(XmlConvert.ToString(value)); + } + + public override TimeSpan ReadJson(JsonReader reader, Type objectType, TimeSpan existingValue, bool hasExistingValue, JsonSerializer serializer) + { + if (reader == null) + { + throw new ArgumentNullException(nameof(reader)); + } + + return XmlConvert.ToTimeSpan((string)reader.Value); + } +} diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesis.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesis.cs index 904a0ab89..9f2fbf20d 100644 --- a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesis.cs +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesis.cs @@ -8,22 +8,22 @@ public class BatchSynthesis : BatchSynthesisDefinition /// /// The identifier of this batch synthesis. /// - public string Id { get; set; } + public Guid Id { get; set; } /// /// The display name of the batch synthesis. /// - public string DisplayName { get; set; } + public string? DisplayName { get; set; } /// /// The description of the batch synthesis. 
/// - public string Description { get; set; } + public string? Description { get; set; } /// /// The status of the batch synthesis. /// - public string Status { get; set; } + public string? Status { get; set; } /// /// The time-stamp when the object was created. @@ -49,18 +49,3 @@ public class BatchSynthesis : BatchSynthesisDefinition /// public BatchSynthesisOutputs Outputs { get; set; } } - -public class BatchSynthesisOutputs -{ - /// - /// The sas url of the result file. - /// It would be relative path when "destinationContainerUrl" is specified. - /// - public string Result { get; set; } - - /// - /// The sas url of the summary file. - /// It would be relative path when "destinationContainerUrl" is specified. - /// - public string Summary { get; set; } -} diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisBillingDetails.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisBillingDetails.cs new file mode 100644 index 000000000..d9ab75501 --- /dev/null +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisBillingDetails.cs @@ -0,0 +1,17 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. +// + +public class BatchSynthesisBillingDetails +{ + /// + /// The number of billable characters for custom neural voices in the input file. + /// + public long CustomNeural { get; set; } + + /// + /// The number of billable characters for prebuild neural voices in the input file. + /// + public long Neural { get; set; } +} diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/SynthesisConfig.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisConfig.cs similarity index 61% rename from samples/batch-synthesis/csharp/BatchSynthesisSample/dto/SynthesisConfig.cs rename to samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisConfig.cs index a407717d0..6cbb4b490 100644 --- a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/SynthesisConfig.cs +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisConfig.cs @@ -3,18 +3,13 @@ // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // -public class SynthesisConfig +public class BatchSynthesisConfig { /// /// The voice name. /// public string Voice { get; set; } - /// - /// The speaker profile ID of target personal voice. - /// - public string SpeakerProfileId { get; init; } - /// /// The role name. /// @@ -44,20 +39,4 @@ public class SynthesisConfig /// The volume value. /// string Volume { get; set; } - - /// - /// The background audio configuration. - /// - public BackgroundAudioConfig BackgroundAudio { get; init; } -} - -public class BackgroundAudioConfig -{ - public Uri Src { get; init; } - - public long? Fadein { get; init; } - - public long? Fadeout { get; init; } - - public double? Volume { get; init; } } diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisDefinition.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisDefinition.cs index d67f362a0..940eff9a2 100644 --- a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisDefinition.cs +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisDefinition.cs @@ -7,9 +7,9 @@ public class BatchSynthesisDefinition { /// - /// The synthesis text type, `SSML` or `PlainText`. 
+ /// The synthesis text type, SSMl or plain text. /// - public string InputKind { get; set; } + public string TextType { get; set; } /// /// The text inputs. @@ -20,7 +20,7 @@ public class BatchSynthesisDefinition /// The synthesis config. /// Required and only effective when text type is plain text. /// - public SynthesisConfig SynthesisConfig { get; set; } + public BatchSynthesisConfig SynthesisConfig { get; set; } /// /// The custom voice map of voice name and deployment ID . @@ -33,11 +33,3 @@ public class BatchSynthesisDefinition /// public BatchSynthesisPropertiesDefinition Properties { get; set; } } - -public class BatchSynthesisInputDefinition -{ - /// - /// The input text string, it could be SSML or plain text. - /// - public string Content { get; set; } -} diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisInputDefinition.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisInputDefinition.cs new file mode 100644 index 000000000..d63155b1b --- /dev/null +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisInputDefinition.cs @@ -0,0 +1,12 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. +// + +public class BatchSynthesisInputDefinition +{ + /// + /// The input text string, it could be SSML or plain text. + /// + public string Text { get; set; } +} diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisOutputs.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisOutputs.cs new file mode 100644 index 000000000..45663225e --- /dev/null +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisOutputs.cs @@ -0,0 +1,19 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. +// + +public class BatchSynthesisOutputs +{ + /// + /// The sas url of the result file. + /// It would be relative path when "destinationContainerUrl" is specified. + /// + public string Result { get; set; } + + /// + /// The sas url of the summary file. + /// It would be relative path when "destinationContainerUrl" is specified. + /// + public string Summary { get; set; } +} diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisProperties.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisProperties.cs index aebf0d43d..bfa0e9d06 100644 --- a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisProperties.cs +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisProperties.cs @@ -3,12 +3,14 @@ // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // +using Newtonsoft.Json; + public sealed class BatchSynthesisProperties : BatchSynthesisPropertiesDefinition { /// /// The value of generated audio size in bytes. /// - public long? SizeInBytes { get; set; } + public long? AudioSize { get; set; } /// /// The number of succeeded audios. /// @@ -20,31 +22,24 @@ public sealed class BatchSynthesisProperties : BatchSynthesisPropertiesDefinitio public int? FailedAudioCount { get; set; } /// - /// The total duration of generated audios. + /// The total audio duration in ticks. /// - public long? DurationInMilliseconds { get; set; } - + public long? 
DurationInTicks { get; set; } /// - /// The details of billable characters by voice type in the input file. + /// The duration of the transcription. The duration is encoded as ISO 8601 duration + /// ("PnYnMnDTnHnMnS", see https://en.wikipedia.org/wiki/ISO_8601#Durations). /// - public IDictionary BillingDetails { get; set; } + [JsonConverter(typeof(TimeSpanConverter))] + public TimeSpan Duration { get; set; } /// - /// The details of the error when the synthesis job is failed. - /// - public SynthesisError Error { get; set; } -} - -public class SynthesisError -{ - /// - /// The code of this error. + /// The details of billable characters by voice type in the input file. /// - public string Code { get; set; } + public BatchSynthesisBillingDetails BillingDetails { get; set; } /// - /// The message for this error. + /// The details of the error in case the entity is in a failed state. /// - public string Message { get; set; } + public EntityError Error { get; set; } } diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisPropertiesDefinition.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisPropertiesDefinition.cs index 5c6224f01..38f184cf0 100644 --- a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisPropertiesDefinition.cs +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/BatchSynthesisPropertiesDefinition.cs @@ -3,6 +3,8 @@ // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // +using Newtonsoft.Json; + public class BatchSynthesisPropertiesDefinition { /// @@ -13,38 +15,35 @@ public class BatchSynthesisPropertiesDefinition /// /// The value indicating whether synthesis result is concatenated. /// - public bool? ConcatenateResult { get; set; } + public bool ConcatenateResult { get; set; } /// /// The requested destination container. It is an URL with Service adhoc SAS to a writeable container in Azure Blob storage. /// public Uri DestinationContainerUrl { get; set; } - /// - /// The destination path of the output in the destination container. - /// - public string DestinationPath { get; set; } - /// /// The value indicating whether synthesis result is decopressed in target container. /// It only take effect when "destinationContainerUrl" is specified or "BYOS" is configured. /// - public bool? DecompressOutputFiles { get; set; } + public bool DecompressOutputFiles { get; set; } /// /// The value indicating whether the word boundary output will be generated. /// - public bool? WordBoundaryEnabled { get; set; } + public bool WordBoundaryEnabled { get; set; } /// /// The value indicating whether the sentence boundary output will be generated. /// - public bool? SentenceBoundaryEnabled { get; set; } + public bool SentenceBoundaryEnabled { get; set; } /// /// How long the batch task will be kept in the system after it has completed. Once the /// batch task reaches the time to live after completion (succeeded or failed) it will be automatically - /// deleted. The longest supported duration is 744 hours, the default vaule is 744. + /// deleted. The longest supported duration is 31 days, the default vaule is 31 days. + /// The duration is encoded as ISO 8601 duration ("PnYnMnDTnHnMnS", see https://en.wikipedia.org/wiki/ISO_8601#Durations). /// - public int? TimeToLiveInHours { get; set; } + [JsonConverter(typeof(TimeSpanConverter))] + public TimeSpan? 
TimeToLive { get; set; } } diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/EntityError.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/EntityError.cs new file mode 100644 index 000000000..6c38bef4c --- /dev/null +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/EntityError.cs @@ -0,0 +1,17 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. +// + +public class EntityError +{ + /// + /// The code of this error. + /// + public string Code { get; set; } + + /// + /// The message for this error. + /// + public string Message { get; set; } +} diff --git a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/PaginatedResults.cs b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/PaginatedResults.cs index b3637e483..360e9c1a8 100644 --- a/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/PaginatedResults.cs +++ b/samples/batch-synthesis/csharp/BatchSynthesisSample/dto/PaginatedResults.cs @@ -10,10 +10,11 @@ public class PaginatedResults /// /// The a list of entities limited by either the passed query parameters 'skip' and 'top' or their default values. /// - public IEnumerable Value { get; set; } + public IEnumerable? Values { get; set; } /// /// The a link to the next set of paginated results if there are more entities available; otherwise null. /// - public Uri NextLink { get; set; } + [JsonPropertyName("@nextLink")] + public Uri? NextLink { get; set; } } diff --git a/samples/batch-synthesis/java/.gitignore b/samples/batch-synthesis/java/.gitignore deleted file mode 100644 index 5ff6309b7..000000000 --- a/samples/batch-synthesis/java/.gitignore +++ /dev/null @@ -1,38 +0,0 @@ -target/ -!.mvn/wrapper/maven-wrapper.jar -!**/src/main/**/target/ -!**/src/test/**/target/ - -### IntelliJ IDEA ### -.idea/modules.xml -.idea/jarRepositories.xml -.idea/compiler.xml -.idea/libraries/ -*.iws -*.iml -*.ipr - -### Eclipse ### -.apt_generated -.classpath -.factorypath -.project -.settings -.springBeans -.sts4-cache - -### NetBeans ### -/nbproject/private/ -/nbbuild/ -/dist/ -/nbdist/ -/.nb-gradle/ -build/ -!**/src/main/**/build/ -!**/src/test/**/build/ - -### VS Code ### -.vscode/ - -### Mac OS ### -.DS_Store \ No newline at end of file diff --git a/samples/batch-synthesis/java/pom.xml b/samples/batch-synthesis/java/pom.xml deleted file mode 100644 index b73ea48ee..000000000 --- a/samples/batch-synthesis/java/pom.xml +++ /dev/null @@ -1,57 +0,0 @@ - - - 4.0.0 - - com.micosoft.api - msBatchVoice - 1.0-SNAPSHOT - - - 8 - 8 - UTF-8 - - - - com.fasterxml.jackson.datatype - jackson-datatype-jsr310 - 2.9.8 - - - - org.apache.httpcomponents - httpcomponents-core - 4.4.15 - - - org.apache.httpcomponents - httpclient - 4.5.14 - - - org.apache.httpcomponents - httpcore - 4.4.16 - - - - org.apache.commons - commons-collections4 - 4.4 - - - org.apache.commons - commons-io - 1.3.2 - - - - commons-io - commons-io - 2.11.0 - - - - \ No newline at end of file diff --git a/samples/batch-synthesis/java/src/main/java/com/micosoft/api/axample/BatchSynthesis.java b/samples/batch-synthesis/java/src/main/java/com/micosoft/api/axample/BatchSynthesis.java deleted file mode 100644 index 94542b4e7..000000000 --- a/samples/batch-synthesis/java/src/main/java/com/micosoft/api/axample/BatchSynthesis.java +++ /dev/null @@ -1,122 +0,0 @@ -package com.micosoft.api.axample; - -import com.fasterxml.jackson.core.JsonProcessingException; -import 
com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.http.HttpHeaders; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.client.methods.HttpPut; -import org.apache.http.entity.StringEntity; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; -import org.apache.http.util.EntityUtils; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.logging.Level; -import java.util.logging.Logger; - -public class BatchSynthesis { - - private static final String SUBSCRIPTION_KEY = "your-key"; - private static final String REGON="eastus"; - - private static final String SPEECH_ENDPOINT = System.getenv("SPEECH_ENDPOINT") != null ? System.getenv("SPEECH_ENDPOINT") - : "https://"+REGON+".api.cognitive.microsoft.com"; - private static final String API_VERSION = "2024-04-01"; - private static final Logger logger = Logger.getLogger(BatchSynthesis.class.getName()); - - private static UUID createJobId() { - return UUID.randomUUID(); - } - - private static boolean submitSynthesis(UUID jobId, - String voiceName, - String text - ) throws IOException { - String url = String.format("%s/texttospeech/batchsyntheses/%s?api-version=%s", SPEECH_ENDPOINT, jobId, API_VERSION); - CloseableHttpClient httpClient = HttpClients.createDefault(); - - Map payload = new HashMap<>(); - payload.put("inputKind", "PlainText"); - //or ssml - //payload.put("inputKind", "SSML"); - Map voice = new HashMap<>(); - voice.put("voice", voiceName); - payload.put("synthesisConfig", voice); - payload.put("customVoices", new HashMap<>()); - Map txt = new HashMap<>(); - txt.put("content", text); - payload.put("inputs", new Object[]{txt}); - Map properties = new HashMap<>(); - properties.put("outputFormat", "audio-24khz-160kbitrate-mono-mp3"); - properties.put("wordBoundaryEnabled", true); - properties.put("sentenceBoundaryEnabled", true); - payload.put("properties", properties); - - ObjectMapper objectMapper = new ObjectMapper(); - String requestBody = objectMapper.writeValueAsString(payload); - - HttpPut request = new HttpPut(url); - request.setHeader("Ocp-Apim-Subscription-Key", SUBSCRIPTION_KEY); - request.setHeader(HttpHeaders.CONTENT_TYPE, "application/json"); - request.setEntity(new StringEntity(requestBody, StandardCharsets.UTF_8)); - - try (CloseableHttpResponse response = httpClient.execute(request)) { - int statusCode = response.getStatusLine().getStatusCode(); - if (statusCode < 400) { - String responseBody = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); - logger.log(Level.INFO, "Batch synthesis job submitted successfully"); - logger.log(Level.INFO, "Job ID: " + objectMapper.readValue(responseBody, Map.class).get("id")); - return true; - } else { - logger.log(Level.SEVERE, "Failed to submit batch synthesis job: " + response.getStatusLine().getReasonPhrase()); - return false; - } - } - } - - private static String getSynthesis(UUID jobId) throws IOException { - String url = String.format("%s/texttospeech/batchsyntheses/%s?api-version=%s", SPEECH_ENDPOINT, jobId, API_VERSION); - CloseableHttpClient httpClient = HttpClients.createDefault(); - - HttpGet request = new HttpGet(url); - request.setHeader("Ocp-Apim-Subscription-Key", SUBSCRIPTION_KEY); - - try (CloseableHttpResponse response = httpClient.execute(request)) { - int statusCode = 
response.getStatusLine().getStatusCode(); - if (statusCode < 400) { - String responseBody = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); - logger.log(Level.INFO, "Get batch synthesis job successfully"); - return (String) new ObjectMapper().readValue(responseBody, Map.class).get("status"); - } else { - logger.log(Level.SEVERE, "Failed to get batch synthesis job: " + response.getStatusLine().getReasonPhrase()); - return null; - } - } - } - - public static void main(String[] args) throws IOException, InterruptedException, JsonProcessingException { - UUID jobId = createJobId(); - String text="hello this is a test voice"; - String voiceName="en-US-AvaMultilingualNeural"; - if (submitSynthesis(jobId,voiceName,text)) { - while (true) { - String status = getSynthesis(jobId); - if ("Succeeded".equals(status)) { - logger.log(Level.INFO, "batch synthesis job succeeded"); - break; - } else if ("Failed".equals(status)) { - logger.log(Level.SEVERE, "batch synthesis job failed"); - break; - } else { - logger.log(Level.INFO, "batch synthesis job is still running, status [" + status + "]"); - Thread.sleep(5000); - } - } - } - } -} \ No newline at end of file diff --git a/samples/batch-synthesis/python/README.md b/samples/batch-synthesis/python/README.md index 7c40bd5d5..d547518bd 100644 --- a/samples/batch-synthesis/python/README.md +++ b/samples/batch-synthesis/python/README.md @@ -8,28 +8,20 @@ The sample uses the `requests` library. You can install it with the command pip install requests ``` -We recommend using a passwordless authentication provided by the `azure-identity` library. -You can install it with the command - -```sh -pip install azure-identity -``` - ## Run the sample code -The sample code itself is [synthesis.py](synthesis.py) and can be run using Python 3.8 or higher. +The sample code itself is [synthesis.py](synthesis.py) and can be run using Python 3.7 or higher. You will need to adapt the following information to run the sample: -1. Your Azure AI Speech Service. +1. Your Cognitive Services subscription key and region. Some notes: - - We recommend using a passwordless authentication provided by the `azure-identity` library. Your Microsoft Entra user account is need to be assigned with `Cognitive Services User` or `Cognitive Services Speech User` role. - - Alternatively, you can get the subscription key from the "Keys and Endpoint" tab on your Azure AI Speech resource in the Azure Portal. + - You can get the subscription key from the "Keys and Endpoint" tab on your Cognitive Services or Speech resource in the Azure Portal. - Batch synthesis is only available for paid subscriptions, free subscriptions are not supported. - - Please refer to [this page](https://learn.microsoft.com/azure/ai-services/speech-service/regions#rest-apis) for a complete list of region identifiers in the expected format. + - Please refer to [this page](https://docs.microsoft.com/azure/cognitive-services/speech-service/regions#rest-apis) for a complete list of region identifiers in the expected format. 2. (Optional:) The relationship between custom voice names and deployment ID, if you want to use custom voices. 3. (Optional:) The URI of a writable Azure blob container, if you want to store the audio files in your own Azure storage. -You can use a development environment like VS Code to edit, debug, and execute the sample. +You can use a development environment like PyCharm to edit, debug, and execute the sample. 
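For context on the REST calls made in the `synthesis.py` hunk that follows, here is a minimal sketch of submitting a plain-text batch synthesis job with a subscription key and polling it to completion; the endpoint path, payload fields, and `outputs.result` name follow this sample's code and DTOs, while the input text and polling interval are illustrative assumptions.

```python
# Minimal sketch (not the sample itself): submit a plain-text batch synthesis
# job via the 3.1-preview1 endpoint, then poll until it reaches a terminal
# state, mirroring the flow in this sample's synthesis.py.
import json
import os
import time

import requests

SUBSCRIPTION_KEY = os.environ["SPEECH_KEY"]      # assumed environment variable
SERVICE_REGION = os.environ["SPEECH_REGION"]     # assumed environment variable
SERVICE_HOST = "customvoice.api.speech.microsoft.com"
BASE_URL = (f"https://{SERVICE_REGION}.{SERVICE_HOST}"
            "/api/texttospeech/3.1-preview1/batchsynthesis")
HEADERS = {"Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY}

payload = {
    "displayName": "Simple synthesis",
    "description": "Simple synthesis description",
    "textType": "PlainText",
    "synthesisConfig": {"voice": "en-US-JennyNeural"},
    "inputs": [{"text": "The quick brown fox jumps over the lazy dog."}],  # placeholder text
}

# Submit the job; the service responds with a JSON body containing its ID.
response = requests.post(
    BASE_URL, data=json.dumps(payload),
    headers={**HEADERS, "Content-Type": "application/json"})
response.raise_for_status()
job_id = response.json()["id"]

# Poll the job until it succeeds or fails (10 s interval is an arbitrary choice).
while True:
    job = requests.get(f"{BASE_URL}/{job_id}", headers=HEADERS).json()
    if job["status"] in ("Succeeded", "Failed"):
        break
    time.sleep(10)

if job["status"] == "Succeeded":
    # Field names follow the sample's outputs DTO (result / summary URLs).
    print("Result:", job.get("outputs", {}).get("result"))
else:
    print("Batch synthesis job failed")
```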
diff --git a/samples/batch-synthesis/python/synthesis.py b/samples/batch-synthesis/python/synthesis.py index c14321927..851eea2f5 100644 --- a/samples/batch-synthesis/python/synthesis.py +++ b/samples/batch-synthesis/python/synthesis.py @@ -9,70 +9,44 @@ import os import sys import time -import uuid from pathlib import Path -from azure.identity import DefaultAzureCredential import requests -logging.basicConfig(stream=sys.stdout, level=logging.INFO, # set to logging.DEBUG for verbose output +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format="[%(asctime)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p %Z") logger = logging.getLogger(__name__) -# The endpoint (and key) could be gotten from the Keys and Endpoint page in the Speech service resource. -# The endpoint would be like: https://.api.cognitive.microsoft.com or https://.cognitiveservices.azure.com -# If you want to use passwordless authentication, custom domain is required. -SPEECH_ENDPOINT = os.environ.get('SPEECH_ENDPOINT') -# We recommend to use passwordless authentication with Azure Identity here; meanwhile, you can also use a subscription key instead -PASSWORDLESS_AUTHENTICATION = True -if not SPEECH_ENDPOINT: - if PASSWORDLESS_AUTHENTICATION: - logger.error('SPEECH_ENDPOINT is required for passwordless authentication') - sys.exit(1) - SERVICE_REGION = os.environ.get('SPEECH_REGION') - SPEECH_ENDPOINT = f'https://{SERVICE_REGION}.api.cognitive.microsoft.com' -if not PASSWORDLESS_AUTHENTICATION: - SUBSCRIPTION_KEY = os.environ.get('SPEECH_KEY') - - -API_VERSION = "2024-04-01" - -def _create_job_id(): - # the job ID must be unique in current speech resource - # you can use a GUID or a self-increasing number - return uuid.uuid4() - - -def _authenticate(): - if PASSWORDLESS_AUTHENTICATION: - # Refer to https://learn.microsoft.com/python/api/overview/azure/identity-readme?view=azure-python#defaultazurecredential - # for more information about Azure Identity - # For example, your app can authenticate using your Azure CLI sign-in credentials with when developing locally. - # Your app can then use a managed identity once it has been deployed to Azure. No code changes are required for this transition. - - # When developing locally, make sure that the user account that is accessing batch avatar synthesis has the right permission. - # You'll need Cognitive Services User or Cognitive Services Speech User role to submit batch avatar synthesis jobs. - credential = DefaultAzureCredential() - token = credential.get_token('https://cognitiveservices.azure.com/.default') - return {'Authorization': f'Bearer {token.token}'} - else: - return {'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY} +# Your Speech resource key and region +# This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION" + +SUBSCRIPTION_KEY = os.environ.get('SPEECH_KEY') +SERVICE_REGION = os.environ.get('SPEECH_REGION') + +NAME = "Simple synthesis" +DESCRIPTION = "Simple synthesis description" + +# The service host suffix. 
+# For azure.cn the host suffix is "customvoice.api.speech.azure.cn" +SERVICE_HOST = "customvoice.api.speech.microsoft.com" -def submit_synthesis(job_id: str) -> bool: - url = f'{SPEECH_ENDPOINT}/texttospeech/batchsyntheses/{job_id}?api-version={API_VERSION}' +def submit_synthesis(): + url = f'https://{SERVICE_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis' header = { + 'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY, 'Content-Type': 'application/json' } - header.update(_authenticate()) with open(Path(__file__).absolute().parent.parent / 'Gatsby-chapter1.txt', 'r') as f: text = f.read() payload = { - "inputKind": "PlainText", # or SSML + 'displayName': NAME, + 'description': DESCRIPTION, + "textType": "PlainText", 'synthesisConfig': { - "voice": "en-US-AvaMultilingualNeural", + "voice": "en-US-JennyNeural", }, # Replace with your custom voice name and deployment ID if you want to use custom voice. # Multiple voices are supported, the mixture of custom voices and platform voices is allowed. @@ -82,7 +56,7 @@ def submit_synthesis(job_id: str) -> bool: }, "inputs": [ { - "content": text + "text": text }, ], "properties": { @@ -91,19 +65,20 @@ def submit_synthesis(job_id: str) -> bool: }, } - response = requests.put(url, json.dumps(payload), headers=header) + response = requests.post(url, json.dumps(payload), headers=header) if response.status_code < 400: logger.info('Batch synthesis job submitted successfully') logger.info(f'Job ID: {response.json()["id"]}') - return True + return response.json()["id"] else: - logger.error(f'Failed to submit batch synthesis job: [{response.status_code}], {response.text}') - return False + logger.error(f'Failed to submit batch synthesis job: {response.text}') -def get_synthesis(job_id: str): - url = f'{SPEECH_ENDPOINT}/texttospeech/batchsyntheses/{job_id}?api-version={API_VERSION}' - header = _authenticate() +def get_synthesis(job_id): + url = f'https://{SERVICE_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis/{job_id}' + header = { + 'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY + } response = requests.get(url, headers=header) if response.status_code < 400: logger.info('Get batch synthesis job successfully') @@ -113,10 +88,12 @@ def get_synthesis(job_id: str): logger.error(f'Failed to get batch synthesis job: {response.text}') -def list_synthesis_jobs(skip: int = 0, max_page_size: int = 100): +def list_synthesis_jobs(skip: int = 0, top: int = 100): """List all batch synthesis jobs in the subscription""" - url = f'{SPEECH_ENDPOINT}/texttospeech/batchsyntheses?api-version={API_VERSION}&skip={skip}&maxpagesize={max_page_size}' - header = _authenticate() + url = f'https://{SERVICE_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis?skip={skip}&top={top}' + header = { + 'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY + } response = requests.get(url, headers=header) if response.status_code < 400: logger.info(f'List batch synthesis jobs successfully, got {len(response.json()["values"])} jobs') @@ -126,8 +103,8 @@ def list_synthesis_jobs(skip: int = 0, max_page_size: int = 100): if __name__ == '__main__': - job_id = _create_job_id() - if submit_synthesis(job_id): + job_id = submit_synthesis() + if job_id is not None: while True: status = get_synthesis(job_id) if status == 'Succeeded': diff --git a/samples/cpp/embedded-speech/Makefile b/samples/cpp/embedded-speech/Makefile index 0a99ebdb6..a7693b8bd 100644 --- a/samples/cpp/embedded-speech/Makefile +++ b/samples/cpp/embedded-speech/Makefile @@ -10,7 +10,6 @@ # # 
Update SPEECHSDK_ROOT below, and check and adjust TARGET_PLATFORM if necessary # (see $SPEECHSDK_ROOT/runtimes). -# Note that RHEL/CentOS 7 is not supported with embedded speech. # SPEECHSDK_ROOT:=/change/to/point/to/extracted/SpeechSDK TARGET_PLATFORM:=linux-x64 diff --git a/samples/cpp/embedded-speech/README.md b/samples/cpp/embedded-speech/README.md index 3ca031e7d..b4760d7fc 100644 --- a/samples/cpp/embedded-speech/README.md +++ b/samples/cpp/embedded-speech/README.md @@ -18,7 +18,7 @@ See the [platform requirements for installing the Speech SDK](https://learn.micr Requirements specific to embedded speech samples are as follows. * Supported operating systems and architectures: * Windows - `x64`, `ARM64`. - * Linux - `x64`, `ARM64`. Note that embedded speech is not supported on RHEL/CentOS 7. + * Linux - `x64`, `ARM64`. * macOS - `x64`, `ARM64`. * If using Visual Studio (Windows): * [Microsoft Visual Studio 2022 or newer](https://www.visualstudio.com/). @@ -54,12 +54,10 @@ To tailor the sample to your configuration, there are two options: * Alternatively set corresponding environment variables (shown in parentheses in the list) before running the sample application. See details in [how to run the sample](#run-the-sample). Sample settings: -1. `SpeechRecognitionLocale` (`SPEECH_RECOGNITION_LOCALE`) - * Speech recognition locale in BCP-47 format, case-sensitive. If not set, en-US will be assumed. - * Setting `EmbeddedSpeechRecognitionModelName` overrides this for embedded speech. -1. `SpeechSynthesisLocale` (`SPEECH_SYNTHESIS_LOCALE`) - * Speech synthesis locale in BCP-47 format, case-sensitive. If not set, en-US will be assumed. - * Setting `EmbeddedSpeechSynthesisVoiceName` overrides this for embedded speech. +1. `EmbeddedSpeechModelLicense` (`EMBEDDED_SPEECH_MODEL_LICENSE`) + * Embedded speech model license (text). + * This applies to embedded speech recognition, synthesis and translation. + * It is presumed that all the customer's embedded speech models use the same license. 1. `EmbeddedSpeechRecognitionModelPath` (`EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH`) * Path to the local embedded speech recognition model(s) on the device file system. This may be a single model folder or a top-level folder for several models. @@ -69,10 +67,8 @@ Sample settings: not inside an archive, and they must be readable by the application process. The model internal subfolder structure must be intact i.e. as originally delivered. 1. `EmbeddedSpeechRecognitionModelName` (`EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME`) - * Name of the embedded speech recognition model to be used for recognition. If set, this overrides `SpeechRecognitionLocale` for embedded. + * Name of the embedded speech recognition model to be used for recognition. * The model name can be short (see https://aka.ms/speech/sr-languages, e.g. `en-US`) or full (e.g. `Microsoft Speech Recognizer en-US FP Model V8`). -1. `EmbeddedSpeechRecognitionModelKey` (`EMBEDDED_SPEECH_RECOGNITION_MODEL_KEY`) - * Decryption key of the (encrypted) embedded speech recognition model. 1. `EmbeddedSpeechSynthesisVoicePath` (`EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH`) * Path to the local embedded speech synthesis voice(s) on the device file system. This may be a single voice folder or a top-level folder for several voices. @@ -82,10 +78,8 @@ Sample settings: not inside an archive, and they must be readable by the application process. The voice internal subfolder structure must be intact i.e. as originally delivered. 1. 
`EmbeddedSpeechSynthesisVoiceName` (`EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME`) - * Name of the embedded speech synthesis voice to be used for synthesis. If set, this overrides `SpeechSynthesisLocale` for embedded. + * Name of the embedded speech synthesis voice to be used for synthesis. * The voice name can be short (see https://aka.ms/speech/tts-languages, e.g. `en-US-JennyNeural`) or full (e.g. `Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)`). -1. `EmbeddedSpeechSynthesisVoiceKey` (`EMBEDDED_SPEECH_SYNTHESIS_VOICE_KEY`) - * Decryption key of the (encrypted) embedded speech synthesis voice. 1. `EmbeddedSpeechTranslationModelPath` (`EMBEDDED_SPEECH_TRANSLATION_MODEL_PATH`) * Path to the local embedded speech translation model(s) on the device file system. This may be a single model folder or a top-level folder for several models. @@ -97,12 +91,14 @@ Sample settings: 1. `EmbeddedSpeechTranslationModelName` (`EMBEDDED_SPEECH_TRANSLATION_MODEL_NAME`) * Name of the embedded speech translation model to be used for translation. * The full model name must be given (e.g. `Microsoft Speech Translator Many-to-English Model V2`). -1. `EmbeddedSpeechTranslationModelKey` (`EMBEDDED_SPEECH_TRANSLATION_MODEL_KEY`) - * Decryption key of the (encrypted) embedded speech translation model. 1. `CloudSpeechSubscriptionKey` (`CLOUD_SPEECH_SUBSCRIPTION_KEY`) * Cloud speech service subscription key. This is needed with hybrid speech configuration. If not set, only embedded speech will be used. 1. `CloudSpeechServiceRegion` (`CLOUD_SPEECH_SERVICE_REGION`) * Cloud speech service region. This is needed with hybrid speech configuration. If not set, only embedded speech will be used. +1. `CloudSpeechRecognitionLanguage` (`CLOUD_SPEECH_RECOGNITION_LANGUAGE`) + * Cloud speech recognition language in BCP-47 format, case-sensitive. This is needed with hybrid speech configuration. If not set, en-US will be assumed. +1. `CloudSpeechSynthesisLanguage` (`CLOUD_SPEECH_SYNTHESIS_LANGUAGE`) + * Cloud speech synthesis language in BCP-47 format, case-sensitive. This is needed with hybrid speech configuration. If not set, en-US will be assumed. ### Visual Studio (Windows) diff --git a/samples/cpp/embedded-speech/samples/packages.config b/samples/cpp/embedded-speech/samples/packages.config index d2c06ae14..e3fd96589 100644 --- a/samples/cpp/embedded-speech/samples/packages.config +++ b/samples/cpp/embedded-speech/samples/packages.config @@ -1,9 +1,9 @@ - - - - - + + + + + \ No newline at end of file diff --git a/samples/cpp/embedded-speech/samples/samples.vcxproj b/samples/cpp/embedded-speech/samples/samples.vcxproj index 95bf7ffa4..b65112f56 100644 --- a/samples/cpp/embedded-speech/samples/samples.vcxproj +++ b/samples/cpp/embedded-speech/samples/samples.vcxproj @@ -77,11 +77,11 @@ - - - - - + + + + + @@ -241,11 +241,11 @@ This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. - - - - - + + + + + \ No newline at end of file diff --git a/samples/cpp/embedded-speech/samples/settings.cpp b/samples/cpp/embedded-speech/samples/settings.cpp index 96efd22d9..ee1d101bd 100644 --- a/samples/cpp/embedded-speech/samples/settings.cpp +++ b/samples/cpp/embedded-speech/samples/settings.cpp @@ -21,13 +21,10 @@ using namespace Microsoft::CognitiveServices::Speech; // START OF CONFIGURABLE SETTINGS -// Locale to be used in speech recognition, cloud and embedded. 
In BCP-47 format, case-sensitive. -// If EmbeddedSpeechRecognitionModelName is changed from the default, it will override this for embedded. -const string SpeechRecognitionLocale = "en-US"; // or set SPEECH_RECOGNITION_LOCALE - -// Locale to be used in speech synthesis (text-to-speech), cloud and embedded. In BCP-47 format, case-sensitive. -// If EmbeddedSpeechSynthesisVoiceName is changed from the default, it will override this for embedded. -const string SpeechSynthesisLocale = "en-US"; // or set SPEECH_SYNTHESIS_LOCALE +// Embedded speech model license (text). +// This applies to embedded speech recognition, synthesis and translation. +// It is presumed that all the customer's embedded speech models use the same license. +const string EmbeddedSpeechModelLicense = "YourEmbeddedSpeechModelLicense"; // or set EMBEDDED_SPEECH_MODEL_LICENSE // Path to the local embedded speech recognition model(s) on the device file system. // This may be a single model folder or a top-level folder for several models. @@ -38,14 +35,9 @@ const string SpeechSynthesisLocale = "en-US"; // or set SPEECH_SYNTHESIS_LOCALE const string EmbeddedSpeechRecognitionModelPath = "YourEmbeddedSpeechRecognitionModelPath"; // or set EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH // Name of the embedded speech recognition model to be used for recognition. -// If changed from the default, this will override SpeechRecognitionLocale. // For example: "en-US" or "Microsoft Speech Recognizer en-US FP Model V8" const string EmbeddedSpeechRecognitionModelName = "YourEmbeddedSpeechRecognitionModelName"; // or set EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME -// Decryption key of the (encrypted) embedded speech recognition model. -// WARNING: The key may be visible in the program binary if hard-coded as a plain string. -const string EmbeddedSpeechRecognitionModelKey = "YourEmbeddedSpeechRecognitionModelKey"; // or set EMBEDDED_SPEECH_RECOGNITION_MODEL_KEY - // Path to the local embedded speech synthesis voice(s) on the device file system. // This may be a single voice folder or a top-level folder for several voices. // Use an absolute path or a path relative to the application working folder. @@ -55,14 +47,9 @@ const string EmbeddedSpeechRecognitionModelKey = "YourEmbeddedSpeechRecognitionM const string EmbeddedSpeechSynthesisVoicePath = "YourEmbeddedSpeechSynthesisVoicePath"; // or set EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH // Name of the embedded speech synthesis voice to be used for synthesis. -// If changed from the default, this will override SpeechSynthesisLocale. // For example: "en-US-JennyNeural" or "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)" const string EmbeddedSpeechSynthesisVoiceName = "YourEmbeddedSpeechSynthesisVoiceName"; // or set EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME -// Decryption key of the (encrypted) embedded speech synthesis voice. -// WARNING: The key may be visible in the program binary if hard-coded as a plain string. -const string EmbeddedSpeechSynthesisVoiceKey = "YourEmbeddedSpeechSynthesisVoiceKey"; // or set EMBEDDED_SPEECH_SYNTHESIS_VOICE_KEY - // Path to the local embedded speech translation model(s) on the device file system. // This may be a single model folder or a top-level folder for several models. // Use an absolute path or a path relative to the application working folder. 
@@ -75,14 +62,13 @@ const string EmbeddedSpeechTranslationModelPath = "YourEmbeddedSpeechTranslation // For example: "Microsoft Speech Translator Many-to-English Model V2" const string EmbeddedSpeechTranslationModelName = "YourEmbeddedSpeechTranslationModelName"; // or set EMBEDDED_SPEECH_TRANSLATION_MODEL_NAME -// Decryption key of the (encrypted) embedded speech translation model. -// WARNING: The key may be visible in the program binary if hard-coded as a plain string. -const string EmbeddedSpeechTranslationModelKey = "YourEmbeddedSpeechTranslationModelKey"; // or set EMBEDDED_SPEECH_TRANSLATION_MODEL_KEY - -// Cloud speech service subscription information. -// This is needed with hybrid (cloud & embedded) speech configuration. +// Cloud speech service subscription and language settings. +// These are needed with hybrid (cloud & embedded) speech configuration. +// The language must be specified in BCP-47 format, case-sensitive. const string CloudSpeechSubscriptionKey = "YourCloudSpeechSubscriptionKey"; // or set CLOUD_SPEECH_SUBSCRIPTION_KEY const string CloudSpeechServiceRegion = "YourCloudSpeechServiceRegion"; // or set CLOUD_SPEECH_SERVICE_REGION +const string CloudSpeechRecognitionLanguage = "en-US"; // or set CLOUD_SPEECH_RECOGNITION_LANGUAGE +const string CloudSpeechSynthesisLanguage = "en-US"; // or set CLOUD_SPEECH_SYNTHESIS_LANGUAGE // END OF CONFIGURABLE SETTINGS @@ -113,17 +99,18 @@ const string GetSetting(const char* environmentVariableName, const string& defau // These are set in VerifySettings() after some basic verification. +string SpeechModelLicense; +string SpeechRecognitionModelPath; string SpeechRecognitionModelName; -string SpeechRecognitionModelKey; +string SpeechSynthesisVoicePath; string SpeechSynthesisVoiceName; -string SpeechSynthesisVoiceKey; +string SpeechTranslationModelPath; string SpeechTranslationModelName; -string SpeechTranslationModelKey; // Utility functions for main menu. bool HasSpeechRecognitionModel() { - if (SpeechRecognitionModelName.empty()) + if (SpeechRecognitionModelPath.empty() || SpeechRecognitionModelName.empty()) { cerr << "## ERROR: No speech recognition model specified.\n"; return false; @@ -133,7 +120,7 @@ bool HasSpeechRecognitionModel() bool HasSpeechSynthesisVoice() { - if (SpeechSynthesisVoiceName.empty()) + if (SpeechSynthesisVoicePath.empty() || SpeechSynthesisVoiceName.empty()) { cerr << "## ERROR: No speech synthesis voice specified.\n"; return false; @@ -143,7 +130,7 @@ bool HasSpeechSynthesisVoice() bool HasSpeechTranslationModel() { - if (SpeechTranslationModelName.empty()) + if (SpeechTranslationModelPath.empty() || SpeechTranslationModelName.empty()) { cerr << "## ERROR: No speech translation model specified.\n"; return false; @@ -157,20 +144,17 @@ shared_ptr CreateEmbeddedSpeechConfig() vector paths; // Add paths for offline data. 
- auto recognitionModelPath = GetSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH", EmbeddedSpeechRecognitionModelPath); - if (!recognitionModelPath.empty() && recognitionModelPath.compare("YourEmbeddedSpeechRecognitionModelPath") != 0) + if (!SpeechRecognitionModelPath.empty()) { - paths.push_back(recognitionModelPath); + paths.push_back(SpeechRecognitionModelPath); } - auto synthesisVoicePath = GetSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH", EmbeddedSpeechSynthesisVoicePath); - if (!synthesisVoicePath.empty() && synthesisVoicePath.compare("YourEmbeddedSpeechSynthesisVoicePath") != 0) + if (!SpeechSynthesisVoicePath.empty()) { - paths.push_back(synthesisVoicePath); + paths.push_back(SpeechSynthesisVoicePath); } - auto translationModelPath = GetSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_PATH", EmbeddedSpeechTranslationModelPath); - if (!translationModelPath.empty() && translationModelPath.compare("YourEmbeddedSpeechTranslationModelPath") != 0) + if (!SpeechTranslationModelPath.empty()) { - paths.push_back(translationModelPath); + paths.push_back(SpeechTranslationModelPath); } if (paths.size() == 0) @@ -194,13 +178,13 @@ shared_ptr CreateEmbeddedSpeechConfig() if (!SpeechRecognitionModelName.empty()) { // Mandatory configuration for embedded speech (and intent) recognition. - config->SetSpeechRecognitionModel(SpeechRecognitionModelName, SpeechRecognitionModelKey); + config->SetSpeechRecognitionModel(SpeechRecognitionModelName, SpeechModelLicense); } if (!SpeechSynthesisVoiceName.empty()) { // Mandatory configuration for embedded speech synthesis. - config->SetSpeechSynthesisVoice(SpeechSynthesisVoiceName, SpeechSynthesisVoiceKey); + config->SetSpeechSynthesisVoice(SpeechSynthesisVoiceName, SpeechModelLicense); if (SpeechSynthesisVoiceName.find("Neural") != string::npos) { // Embedded neural voices only support 24kHz sample rate. @@ -211,7 +195,7 @@ shared_ptr CreateEmbeddedSpeechConfig() if (!SpeechTranslationModelName.empty()) { // Mandatory configuration for embedded speech translation. - config->SetSpeechTranslationModel(SpeechTranslationModelName, SpeechTranslationModelKey); + config->SetSpeechTranslationModel(SpeechTranslationModelName, SpeechModelLicense); } // Disable profanity masking. @@ -235,8 +219,8 @@ shared_ptr CreateHybridSpeechConfig() // Also see // https://github.com/Azure-Samples/cognitive-services-speech-sdk/tree/master/samples/cpp/windows/console // for complete Speech SDK samples using cloud speech services. 
- cloudSpeechConfig->SetSpeechRecognitionLanguage(GetSetting("SPEECH_RECOGNITION_LOCALE", SpeechRecognitionLocale)); - cloudSpeechConfig->SetSpeechSynthesisLanguage(GetSetting("SPEECH_SYNTHESIS_LOCALE", SpeechSynthesisLocale)); + cloudSpeechConfig->SetSpeechRecognitionLanguage(GetSetting("CLOUD_SPEECH_RECOGNITION_LANGUAGE", CloudSpeechRecognitionLanguage)); + cloudSpeechConfig->SetSpeechSynthesisLanguage(GetSetting("CLOUD_SPEECH_SYNTHESIS_LANGUAGE", CloudSpeechSynthesisLanguage)); auto embeddedSpeechConfig = CreateEmbeddedSpeechConfig(); @@ -261,93 +245,71 @@ bool VerifySettings() cout << "## WARNING: Cannot get the current working directory, errno=" << errno << endl; } - auto recognitionModelPath = GetSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH", EmbeddedSpeechRecognitionModelPath); - if (recognitionModelPath.empty() || recognitionModelPath.compare("YourEmbeddedSpeechRecognitionModelPath") == 0) + SpeechModelLicense = GetSetting("EMBEDDED_SPEECH_MODEL_LICENSE", EmbeddedSpeechModelLicense); + if (SpeechModelLicense.empty() || SpeechModelLicense.compare("YourEmbeddedSpeechModelLicense") == 0) { - recognitionModelPath.clear(); + cerr << "## ERROR: The embedded speech model license is not set.\n"; + return false; } - auto synthesisVoicePath = GetSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH", EmbeddedSpeechSynthesisVoicePath); - if (synthesisVoicePath.empty() || synthesisVoicePath.compare("YourEmbeddedSpeechSynthesisVoicePath") == 0) + SpeechRecognitionModelPath = GetSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH", EmbeddedSpeechRecognitionModelPath); + if (SpeechRecognitionModelPath.compare("YourEmbeddedSpeechRecognitionModelPath") == 0) { - synthesisVoicePath.clear(); + SpeechRecognitionModelPath.clear(); } - - auto translationModelPath = GetSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_PATH", EmbeddedSpeechTranslationModelPath); - if (translationModelPath.empty() || translationModelPath.compare("YourEmbeddedSpeechTranslationModelPath") == 0) + SpeechRecognitionModelName = GetSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME", EmbeddedSpeechRecognitionModelName); + if (SpeechRecognitionModelName.compare("YourEmbeddedSpeechRecognitionModelName") == 0) { - translationModelPath.clear(); + SpeechRecognitionModelName.clear(); } - // Find an embedded speech recognition model based on the name or locale. 
- if (!recognitionModelPath.empty()) + SpeechSynthesisVoicePath = GetSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH", EmbeddedSpeechSynthesisVoicePath); + if (SpeechSynthesisVoicePath.compare("YourEmbeddedSpeechSynthesisVoicePath") == 0) + { + SpeechSynthesisVoicePath.clear(); + } + SpeechSynthesisVoiceName = GetSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME", EmbeddedSpeechSynthesisVoiceName); + if (SpeechSynthesisVoiceName.compare("YourEmbeddedSpeechSynthesisVoiceName") == 0) { - auto modelName = GetSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME", EmbeddedSpeechRecognitionModelName); - auto modelLocale = GetSetting("SPEECH_RECOGNITION_LOCALE", SpeechRecognitionLocale); + SpeechSynthesisVoiceName.clear(); + } - if (modelName.empty() || modelName.compare("YourEmbeddedSpeechRecognitionModelName") == 0) - { - modelName.clear(); // no name given -> search by locale - } + SpeechTranslationModelPath = GetSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_PATH", EmbeddedSpeechTranslationModelPath); + if (SpeechTranslationModelPath.compare("YourEmbeddedSpeechTranslationModelPath") == 0) + { + SpeechTranslationModelPath.clear(); + } + SpeechTranslationModelName = GetSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_NAME", EmbeddedSpeechTranslationModelName); + if (SpeechTranslationModelName.compare("YourEmbeddedSpeechTranslationModelName") == 0) + { + SpeechTranslationModelName.clear(); + } - auto config = EmbeddedSpeechConfig::FromPath(recognitionModelPath); + // Find an embedded speech recognition model based on the name. + if (!SpeechRecognitionModelPath.empty() && !SpeechRecognitionModelName.empty()) + { + auto config = EmbeddedSpeechConfig::FromPath(SpeechRecognitionModelPath); auto models = config->GetSpeechRecognitionModels(); auto result = find_if(models.begin(), models.end(), [&](shared_ptr model) { - if (modelName.empty()) - { - return model->Locales[0].compare(modelLocale) == 0; - } - else - { - return model->Name.compare(modelName) == 0 || model->Locales[0].compare(modelName) == 0; - } + return model->Name.compare(SpeechRecognitionModelName) == 0 || model->Locales[0].compare(SpeechRecognitionModelName) == 0; }); - if (result != models.end()) - { - SpeechRecognitionModelName = (*result)->Name; - } - - if (SpeechRecognitionModelName.empty()) - { - cout << "## WARNING: Cannot locate an embedded speech recognition model by "; - if (modelName.empty()) - { - cout << "locale \"" << modelLocale << "\". "; - } - else - { - cout << "name \"" << modelName << "\". "; - } - cout << "Current recognition model search path: " << recognitionModelPath << endl; - } - else + if (result == models.end()) { - SpeechRecognitionModelKey = GetSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_KEY", EmbeddedSpeechRecognitionModelKey); - if (SpeechRecognitionModelKey.empty() || SpeechRecognitionModelKey.compare("YourEmbeddedSpeechRecognitionModelKey") == 0) - { - SpeechRecognitionModelKey.clear(); - cout << "## WARNING: The key for \"" << SpeechRecognitionModelName << "\" is not set.\n"; - } + cout << "## WARNING: Cannot locate an embedded speech recognition model \"" << SpeechRecognitionModelName << "\"\n"; } } - // Find an embedded speech synthesis voice based on the name or locale. - if (!synthesisVoicePath.empty()) + // Find an embedded speech synthesis voice based on the name. 
+ if (!SpeechSynthesisVoicePath.empty() && !SpeechSynthesisVoiceName.empty()) { - auto voiceName = GetSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME", EmbeddedSpeechSynthesisVoiceName); - auto voiceLocale = GetSetting("SPEECH_SYNTHESIS_LOCALE", SpeechSynthesisLocale); - - if (voiceName.empty() || voiceName.compare("YourEmbeddedSpeechSynthesisVoiceName") == 0) - { - voiceName.clear(); // no name given -> search by locale - } - - auto config = EmbeddedSpeechConfig::FromPath(synthesisVoicePath); + auto config = EmbeddedSpeechConfig::FromPath(SpeechSynthesisVoicePath); auto synthesizer = SpeechSynthesizer::FromConfig(config, nullptr); + + bool found = false; const auto voicesList = synthesizer->GetVoicesAsync("").get(); if (voicesList->Reason == ResultReason::VoicesListRetrieved) @@ -356,118 +318,48 @@ bool VerifySettings() auto result = find_if(voices.begin(), voices.end(), [&](shared_ptr voice) { - if (voiceName.empty()) - { - return voice->Locale.compare(voiceLocale) == 0; - } - else - { - return voice->Name.compare(voiceName) == 0 || voice->ShortName.compare(voiceName) == 0; - } + return voice->Name.compare(SpeechSynthesisVoiceName) == 0 || voice->ShortName.compare(SpeechSynthesisVoiceName) == 0; }); if (result != voices.end()) { - SpeechSynthesisVoiceName = (*result)->Name; + found = true; } } - if (SpeechSynthesisVoiceName.empty()) - { - cout << "## WARNING: Cannot locate an embedded speech synthesis voice by "; - if (voiceName.empty()) - { - cout << "locale \"" << voiceLocale << "\". "; - } - else - { - cout << "name \"" << voiceName << "\". "; - } - cout << "Current synthesis voice search path: " << synthesisVoicePath << endl; - } - else + if (!found) { - SpeechSynthesisVoiceKey = GetSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_KEY", EmbeddedSpeechSynthesisVoiceKey); - if (SpeechSynthesisVoiceKey.empty() || SpeechSynthesisVoiceKey.compare("YourEmbeddedSpeechSynthesisVoiceKey") == 0) - { - SpeechSynthesisVoiceKey.clear(); - cout << "## WARNING: The key for \"" << SpeechSynthesisVoiceName << "\" is not set.\n"; - } + cout << "## WARNING: Cannot locate an embedded speech synthesis voice \"" << SpeechSynthesisVoiceName << "\"\n"; } } // Find an embedded speech translation model based on the name. - if (!translationModelPath.empty()) + if (!SpeechTranslationModelPath.empty() && !SpeechTranslationModelName.empty()) { - auto modelName = GetSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_NAME", EmbeddedSpeechTranslationModelName); - - auto config = EmbeddedSpeechConfig::FromPath(translationModelPath); + auto config = EmbeddedSpeechConfig::FromPath(SpeechTranslationModelPath); auto models = config->GetSpeechTranslationModels(); auto result = find_if(models.begin(), models.end(), [&](shared_ptr model) { - return model->Name.compare(modelName) == 0; + return model->Name.compare(SpeechTranslationModelName) == 0; }); - if (result != models.end()) - { - SpeechTranslationModelName = (*result)->Name; - } - - if (SpeechTranslationModelName.empty()) + if (result == models.end()) { - cout << "## WARNING: Cannot locate an embedded speech translation model by "; - cout << "name \"" << modelName << "\". 
"; - cout << "Current translation model search path: " << translationModelPath << endl; - } - else - { - SpeechTranslationModelKey = GetSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_KEY", EmbeddedSpeechTranslationModelKey); - if (SpeechTranslationModelKey.empty() || SpeechTranslationModelKey.compare("YourEmbeddedSpeechTranslationModelKey") == 0) - { - SpeechTranslationModelKey.clear(); - cout << "## WARNING: The key for \"" << SpeechTranslationModelName << "\" is not set.\n"; - } + cout << "## WARNING: Cannot locate an embedded speech translation model \"" << SpeechTranslationModelName << "\"\n"; } } - auto maskValue = [](const string& value) - { - // Mask the string value, leave only the last 3 chars visible - string masked = value; - int visibleLength = masked.length() > 3 ? 3 : 0; - masked.replace(masked.begin(), masked.end() - visibleLength, masked.length() - visibleLength, '*'); - return masked; - }; - - cout << "Embedded speech recognition\n model search path: " << (recognitionModelPath.empty() ? "(not set)" : recognitionModelPath) << endl; - if (!recognitionModelPath.empty()) - { - cout << " model name: " << (SpeechRecognitionModelName.empty() ? "(not found)" : SpeechRecognitionModelName) << endl; - if (!SpeechRecognitionModelName.empty()) - { - cout << " model key: " << (SpeechRecognitionModelKey.empty() ? "(not set)" : maskValue(SpeechRecognitionModelKey)) << endl; - } - } - cout << "Embedded speech synthesis\n voice search path: " << (synthesisVoicePath.empty() ? "(not set)" : synthesisVoicePath) << endl; - if (!synthesisVoicePath.empty()) - { - cout << " voice name: " << (SpeechSynthesisVoiceName.empty() ? "(not found)" : SpeechSynthesisVoiceName) << endl; - if (!SpeechSynthesisVoiceName.empty()) - { - cout << " voice key: " << (SpeechSynthesisVoiceKey.empty() ? "(not set)" : maskValue(SpeechSynthesisVoiceKey)) << endl; - } - } - cout << "Embedded speech translation\n model search path: " << (translationModelPath.empty() ? "(not set)" : translationModelPath) << endl; - if (!translationModelPath.empty()) - { - cout << " model name: " << (SpeechTranslationModelName.empty() ? "(not found)" : SpeechTranslationModelName) << endl; - if (!SpeechTranslationModelName.empty()) - { - cout << " model key: " << (SpeechTranslationModelKey.empty() ? "(not set)" : maskValue(SpeechTranslationModelKey)) << endl; - } - } + cout << "Embedded speech recognition\n"; + cout << " model search path: " << (SpeechRecognitionModelPath.empty() ? "(not set)" : SpeechRecognitionModelPath) << endl; + cout << " model name: " << (SpeechRecognitionModelName.empty() ? "(not set)" : SpeechRecognitionModelName) << endl; + cout << "Embedded speech synthesis\n"; + cout << " voice search path: " << (SpeechSynthesisVoicePath.empty() ? "(not set)" : SpeechSynthesisVoicePath) << endl; + cout << " voice name: " << (SpeechSynthesisVoiceName.empty() ? "(not set)" : SpeechSynthesisVoiceName) << endl; + cout << "Embedded speech translation\n"; + cout << " model search path: " << (SpeechTranslationModelPath.empty() ? "(not set)" : SpeechTranslationModelPath) << endl; + cout << " model name: " << (SpeechTranslationModelName.empty() ? 
"(not set)" : SpeechTranslationModelName) << endl; return true; } diff --git a/samples/cpp/linux/compressed-audio-input/README.md b/samples/cpp/linux/compressed-audio-input/README.md index 8c968e115..eed9e7556 100644 --- a/samples/cpp/linux/compressed-audio-input/README.md +++ b/samples/cpp/linux/compressed-audio-input/README.md @@ -20,17 +20,6 @@ The compressed audio input stream should be either in MP3 or Opus format. sudo apt-get install libgstreamer1.0-0 gstreamer1.0-plugins-base gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly ``` -* On RHEL or CentOS, install these packages to build and run this sample: - - ```sh - sudo yum update - sudo yum groupinstall "Development tools" - sudo yum install alsa-lib openssl wget - sudo yum install gstreamer1 gstreamer1-plugins-base gstreamer1-plugins-good gstreamer1-plugins-ugly-free gstreamer1-plugins-bad-free - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - ## Build the sample * [Download the sample code to your development PC.](/README.md#get-the-samples) diff --git a/samples/cpp/windows/console/samples/Makefile b/samples/cpp/windows/console/samples/Makefile index 6c7dfc7b6..3cecd732b 100644 --- a/samples/cpp/windows/console/samples/Makefile +++ b/samples/cpp/windows/console/samples/Makefile @@ -22,7 +22,6 @@ SPEECHSDK_ROOT:=/change/to/point/to/extracted/SpeechSDK # - Linux x86 (32-bit), replace "x64" below with "x86". # - Linux ARM64 (64-bit), replace "x64" below with "arm64". # - Linux ARM32 (32-bit), replace "x64" below with "arm32". -# - CentOS 7 x64 (64-bit), replace "x64" below with "centos7-x64". TARGET_PLATFORM:=x64 CHECK_FOR_SPEECHSDK := $(shell test -f $(SPEECHSDK_ROOT)/lib/$(TARGET_PLATFORM)/libMicrosoft.CognitiveServices.Speech.core.so && echo Success) diff --git a/samples/cpp/windows/console/samples/packages.config b/samples/cpp/windows/console/samples/packages.config index 6f95a5152..01bc6219d 100644 --- a/samples/cpp/windows/console/samples/packages.config +++ b/samples/cpp/windows/console/samples/packages.config @@ -1,6 +1,6 @@ - - + + \ No newline at end of file diff --git a/samples/cpp/windows/console/samples/samples.vcxproj b/samples/cpp/windows/console/samples/samples.vcxproj index 828c7afd5..daf7b9274 100644 --- a/samples/cpp/windows/console/samples/samples.vcxproj +++ b/samples/cpp/windows/console/samples/samples.vcxproj @@ -56,8 +56,8 @@ - - + + @@ -215,8 +215,8 @@ This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. - - + + \ No newline at end of file diff --git a/samples/cpp/windows/console/samples/speech_synthesis_samples.cpp b/samples/cpp/windows/console/samples/speech_synthesis_samples.cpp index 9189da676..03b08f245 100644 --- a/samples/cpp/windows/console/samples/speech_synthesis_samples.cpp +++ b/samples/cpp/windows/console/samples/speech_synthesis_samples.cpp @@ -114,11 +114,11 @@ void SpeechSynthesisWithVoice() auto config = SpeechConfig::FromSubscription("YourSubscriptionKey", "YourServiceRegion"); // Sets the voice name. - // e.g. "en-US-AndrewMultilingualNeural". + // e.g. "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)". 
// The full list of supported voices can be found here: // https://aka.ms/csspeech/voicenames // And, you can try GetVoicesAsync method to get all available voices (see SpeechSynthesisGetAvailableVoices() sample below). - auto voice = "en-US-AndrewMultilingualNeural"; + auto voice = "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)"; config->SetSpeechSynthesisVoiceName(voice); // Creates a speech synthesizer for the specified voice, using the default speaker as audio output. @@ -803,7 +803,7 @@ void SpeechSynthesisBookmarkEvent() }; // Bookmark tag is needed in the SSML, e.g. - const auto ssml = " one. two. three. four."; + const auto ssml = " one. two. three. four."; cout << "Press Enter to start synthesizing." << std::endl; std::string text; diff --git a/samples/csharp/dotnet-windows/console/samples/packages.config b/samples/csharp/dotnet-windows/console/samples/packages.config index c66333844..8e4086e88 100644 --- a/samples/csharp/dotnet-windows/console/samples/packages.config +++ b/samples/csharp/dotnet-windows/console/samples/packages.config @@ -1,5 +1,5 @@ - - + + \ No newline at end of file diff --git a/samples/csharp/dotnet-windows/console/samples/samples.csproj b/samples/csharp/dotnet-windows/console/samples/samples.csproj index 959508bc5..9fb755d93 100644 --- a/samples/csharp/dotnet-windows/console/samples/samples.csproj +++ b/samples/csharp/dotnet-windows/console/samples/samples.csproj @@ -144,9 +144,9 @@ 1.7.1 - - - + + + \ No newline at end of file diff --git a/samples/csharp/dotnet-windows/speechtotext-naudio/speechtotext-naudio/packages.config b/samples/csharp/dotnet-windows/speechtotext-naudio/speechtotext-naudio/packages.config index 5f6bbb71c..b5f521b19 100644 --- a/samples/csharp/dotnet-windows/speechtotext-naudio/speechtotext-naudio/packages.config +++ b/samples/csharp/dotnet-windows/speechtotext-naudio/speechtotext-naudio/packages.config @@ -1,6 +1,6 @@ - + diff --git a/samples/csharp/dotnet-windows/speechtotext-naudio/speechtotext-naudio/speechtotext-naudio.csproj b/samples/csharp/dotnet-windows/speechtotext-naudio/speechtotext-naudio/speechtotext-naudio.csproj index b85bc0e65..bbc4cd687 100644 --- a/samples/csharp/dotnet-windows/speechtotext-naudio/speechtotext-naudio/speechtotext-naudio.csproj +++ b/samples/csharp/dotnet-windows/speechtotext-naudio/speechtotext-naudio/speechtotext-naudio.csproj @@ -55,8 +55,8 @@ true - - ..\packages\Microsoft.CognitiveServices.Speech.1.38.0\lib\net462\Microsoft.CognitiveServices.Speech.csharp.dll + + ..\packages\Microsoft.CognitiveServices.Speech.1.40.0\lib\net462\Microsoft.CognitiveServices.Speech.csharp.dll ..\packages\Microsoft.Win32.Registry.4.7.0\lib\net461\Microsoft.Win32.Registry.dll @@ -106,11 +106,11 @@ - + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. 
- + \ No newline at end of file diff --git a/samples/csharp/dotnetcore/console/README.md b/samples/csharp/dotnetcore/console/README.md index 5d410cbe9..50c6cdf00 100644 --- a/samples/csharp/dotnetcore/console/README.md +++ b/samples/csharp/dotnetcore/console/README.md @@ -22,15 +22,6 @@ It runs under .NET 6.0 or later on Windows or Linux (see the list of [supported sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib dotnet-sdk-6.0 openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - ## Build the sample * **By building this sample you will download the Microsoft Cognitive Services Speech SDK. By downloading you acknowledge its license, see [Speech SDK license agreement](https://aka.ms/csspeech/license).** diff --git a/samples/csharp/dotnetcore/console/samples/samples.csproj b/samples/csharp/dotnetcore/console/samples/samples.csproj index a631fe34c..65498edd2 100644 --- a/samples/csharp/dotnetcore/console/samples/samples.csproj +++ b/samples/csharp/dotnetcore/console/samples/samples.csproj @@ -53,9 +53,9 @@ - - - + + + diff --git a/samples/csharp/dotnetcore/embedded-speech/README.md b/samples/csharp/dotnetcore/embedded-speech/README.md index 00d8a8e09..c3d275e8e 100644 --- a/samples/csharp/dotnetcore/embedded-speech/README.md +++ b/samples/csharp/dotnetcore/embedded-speech/README.md @@ -18,7 +18,7 @@ See the [platform requirements for installing the Speech SDK](https://learn.micr Requirements specific to embedded speech samples are as follows. * Supported operating systems and architectures: * Windows - `x64`, `ARM64`. - * Linux - `x64`, `ARM64`. Note that embedded speech is not supported on RHEL/CentOS 7. + * Linux - `x64`, `ARM64`. * macOS - `x64`, `ARM64`. * If using Visual Studio (Windows, macOS): * [Microsoft Visual Studio 2022 or newer](https://www.visualstudio.com/) with **.NET 6.0**. @@ -54,12 +54,10 @@ To tailor the sample to your configuration, there are two options: * Alternatively set corresponding environment variables (shown in parentheses in the list) before running the sample application. See details in [how to run the sample](#run-the-sample). Sample settings: -1. `SpeechRecognitionLocale` (`SPEECH_RECOGNITION_LOCALE`) - * Speech recognition locale in BCP-47 format, case-sensitive. If not set, en-US will be assumed. - * Setting `EmbeddedSpeechRecognitionModelName` overrides this for embedded speech. -1. `SpeechSynthesisLocale` (`SPEECH_SYNTHESIS_LOCALE`) - * Speech synthesis locale in BCP-47 format, case-sensitive. If not set, en-US will be assumed. - * Setting `EmbeddedSpeechSynthesisVoiceName` overrides this for embedded speech. +1. `EmbeddedSpeechModelLicense` (`EMBEDDED_SPEECH_MODEL_LICENSE`) + * Embedded speech model license (text). + * This applies to embedded speech recognition, synthesis and translation. + * It is presumed that all the customer's embedded speech models use the same license. 1. `EmbeddedSpeechRecognitionModelPath` (`EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH`) * Path to the local embedded speech recognition model(s) on the device file system. This may be a single model folder or a top-level folder for several models. @@ -69,10 +67,8 @@ Sample settings: not inside an archive, and they must be readable by the application process. The model internal subfolder structure must be intact i.e. 
as originally delivered. 1. `EmbeddedSpeechRecognitionModelName` (`EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME`) - * Name of the embedded speech recognition model to be used for recognition. If set, this overrides `SpeechRecognitionLocale` for embedded. + * Name of the embedded speech recognition model to be used for recognition. * The model name can be short (see https://aka.ms/speech/sr-languages, e.g. `en-US`) or full (e.g. `Microsoft Speech Recognizer en-US FP Model V8`). -1. `EmbeddedSpeechRecognitionModelKey` (`EMBEDDED_SPEECH_RECOGNITION_MODEL_KEY`) - * Decryption key of the (encrypted) embedded speech recognition model. 1. `EmbeddedSpeechSynthesisVoicePath` (`EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH`) * Path to the local embedded speech synthesis voice(s) on the device file system. This may be a single voice folder or a top-level folder for several voices. @@ -82,10 +78,8 @@ Sample settings: not inside an archive, and they must be readable by the application process. The voice internal subfolder structure must be intact i.e. as originally delivered. 1. `EmbeddedSpeechSynthesisVoiceName` (`EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME`) - * Name of the embedded speech synthesis voice to be used for synthesis. If set, this overrides `SpeechSynthesisLocale` for embedded. + * Name of the embedded speech synthesis voice to be used for synthesis. * The voice name can be short (see https://aka.ms/speech/tts-languages, e.g. `en-US-JennyNeural`) or full (e.g. `Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)`). -1. `EmbeddedSpeechSynthesisVoiceKey` (`EMBEDDED_SPEECH_SYNTHESIS_VOICE_KEY`) - * Decryption key of the (encrypted) embedded speech synthesis voice. 1. `EmbeddedSpeechTranslationModelPath` (`EMBEDDED_SPEECH_TRANSLATION_MODEL_PATH`) * Path to the local embedded speech translation model(s) on the device file system. This may be a single model folder or a top-level folder for several models. @@ -97,12 +91,14 @@ Sample settings: 1. `EmbeddedSpeechTranslationModelName` (`EMBEDDED_SPEECH_TRANSLATION_MODEL_NAME`) * Name of the embedded speech translation model to be used for translation. * The full model name must be given (e.g. `Microsoft Speech Translator Many-to-English Model V2`). -1. `EmbeddedSpeechTranslationModelKey` (`EMBEDDED_SPEECH_TRANSLATION_MODEL_KEY`) - * Decryption key of the (encrypted) embedded speech translation model. 1. `CloudSpeechSubscriptionKey` (`CLOUD_SPEECH_SUBSCRIPTION_KEY`) * Cloud speech service subscription key. This is needed with hybrid speech configuration. If not set, only embedded speech will be used. 1. `CloudSpeechServiceRegion` (`CLOUD_SPEECH_SERVICE_REGION`) * Cloud speech service region. This is needed with hybrid speech configuration. If not set, only embedded speech will be used. +1. `CloudSpeechRecognitionLanguage` (`CLOUD_SPEECH_RECOGNITION_LANGUAGE`) + * Cloud speech recognition language in BCP-47 format, case-sensitive. This is needed with hybrid speech configuration. If not set, en-US will be assumed. +1. `CloudSpeechSynthesisLanguage` (`CLOUD_SPEECH_SYNTHESIS_LANGUAGE`) + * Cloud speech synthesis language in BCP-47 format, case-sensitive. This is needed with hybrid speech configuration. If not set, en-US will be assumed. 
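> For orientation, here is a minimal C# sketch of how the settings listed above feed into an embedded speech configuration, using the same SDK calls that `Settings.cs` uses below. All paths, names, and the license value are placeholders read from the environment variables above, and error handling is omitted.

```csharp
// Minimal sketch only: build an embedded speech config from the sample settings above
// and run one fully on-device recognition. All values are placeholders.
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

class EmbeddedSpeechSketch
{
    static async Task Main()
    {
        var modelPath = Environment.GetEnvironmentVariable("EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH");
        var modelName = Environment.GetEnvironmentVariable("EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME");
        var voicePath = Environment.GetEnvironmentVariable("EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH");
        var voiceName = Environment.GetEnvironmentVariable("EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME");
        var license   = Environment.GetEnvironmentVariable("EMBEDDED_SPEECH_MODEL_LICENSE");

        // Point the config at the on-device model and voice folders, then select a model
        // and a voice by name; the same license text covers recognition and synthesis.
        var config = EmbeddedSpeechConfig.FromPaths(new List<string> { modelPath, voicePath });
        config.SetSpeechRecognitionModel(modelName, license);
        config.SetSpeechSynthesisVoice(voiceName, license);

        // One-shot recognition from the default microphone, entirely on-device.
        using var audio = AudioConfig.FromDefaultMicrophoneInput();
        using var recognizer = new SpeechRecognizer(config, audio);
        var result = await recognizer.RecognizeOnceAsync();
        Console.WriteLine($"Recognized: {result.Text}");
    }
}
```

The hybrid configuration in the sample simply pairs a cloud `SpeechConfig` (built from `CloudSpeechSubscriptionKey`, `CloudSpeechServiceRegion`, and the `CloudSpeech*Language` settings) with an embedded config like the one above.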
### Visual Studio diff --git a/samples/csharp/dotnetcore/embedded-speech/samples/Settings.cs b/samples/csharp/dotnetcore/embedded-speech/samples/Settings.cs index a6e760322..d027691f2 100644 --- a/samples/csharp/dotnetcore/embedded-speech/samples/Settings.cs +++ b/samples/csharp/dotnetcore/embedded-speech/samples/Settings.cs @@ -17,13 +17,10 @@ public class Settings { // START OF CONFIGURABLE SETTINGS - // Locale to be used in speech recognition, cloud and embedded. In BCP-47 format, case-sensitive. - // If EmbeddedSpeechRecognitionModelName is changed from the default, it will override this for embedded. - private static readonly string SpeechRecognitionLocale = "en-US"; // or set SPEECH_RECOGNITION_LOCALE - - // Locale to be used in speech synthesis (text-to-speech), cloud and embedded. In BCP-47 format, case-sensitive. - // If EmbeddedSpeechSynthesisVoiceName is changed from the default, it will override this for embedded. - private static readonly string SpeechSynthesisLocale = "en-US"; // or set SPEECH_SYNTHESIS_LOCALE + // Embedded speech model license (text). + // This applies to embedded speech recognition, synthesis and translation. + // It is presumed that all the customer's embedded speech models use the same license. + private static readonly string EmbeddedSpeechModelLicense = "YourEmbeddedSpeechModelLicense"; // or set EMBEDDED_SPEECH_MODEL_LICENSE // Path to the local embedded speech recognition model(s) on the device file system. // This may be a single model folder or a top-level folder for several models. @@ -34,14 +31,9 @@ public class Settings private static readonly string EmbeddedSpeechRecognitionModelPath = @"YourEmbeddedSpeechRecognitionModelPath"; // or set EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH // Name of the embedded speech recognition model to be used for recognition. - // If changed from the default, this will override SpeechRecognitionLocale. // For example: "en-US" or "Microsoft Speech Recognizer en-US FP Model V8" private static readonly string EmbeddedSpeechRecognitionModelName = "YourEmbeddedSpeechRecognitionModelName"; // or set EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME - // Decryption key of the (encrypted) embedded speech recognition model. - // WARNING: The key may be visible in the program binary if hard-coded as a plain string. - private static readonly string EmbeddedSpeechRecognitionModelKey = "YourEmbeddedSpeechRecognitionModelKey"; // or set EMBEDDED_SPEECH_RECOGNITION_MODEL_KEY - // Path to the local embedded speech synthesis voice(s) on the device file system. // This may be a single voice folder or a top-level folder for several voices. // Use an absolute path or a path relative to the application working folder. @@ -51,14 +43,9 @@ public class Settings private static readonly string EmbeddedSpeechSynthesisVoicePath = @"YourEmbeddedSpeechSynthesisVoicePath"; // or set EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH // Name of the embedded speech synthesis voice to be used for synthesis. - // If changed from the default, this will override SpeechSynthesisLocale. // For example: "en-US-JennyNeural" or "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)" private static readonly string EmbeddedSpeechSynthesisVoiceName = "YourEmbeddedSpeechSynthesisVoiceName"; // or set EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME - // Decryption key of the (encrypted) embedded speech synthesis voice. - // WARNING: The key may be visible in the program binary if hard-coded as a plain string. 
- private static readonly string EmbeddedSpeechSynthesisVoiceKey = "YourEmbeddedSpeechSynthesisVoiceKey"; // or set EMBEDDED_SPEECH_SYNTHESIS_VOICE_KEY - // Path to the local embedded speech translation model(s) on the device file system. // This may be a single model folder or a top-level folder for several models. // Use an absolute path or a path relative to the application working folder. @@ -71,14 +58,13 @@ public class Settings // For example: "Microsoft Speech Translator Many-to-English Model V2" private static readonly string EmbeddedSpeechTranslationModelName = "YourEmbeddedSpeechTranslationModelName"; // or set EMBEDDED_SPEECH_TRANSLATION_MODEL_NAME - // Decryption key of the (encrypted) embedded speech translation model. - // WARNING: The key may be visible in the program binary if hard-coded as a plain string. - private static readonly string EmbeddedSpeechTranslationModelKey = "YourEmbeddedSpeechTranslationModelKey"; // or set EMBEDDED_SPEECH_TRANSLATION_MODEL_KEY - - // Cloud speech service subscription information. - // This is needed with hybrid (cloud & embedded) speech configuration. + // Cloud speech service subscription and language settings. + // These are needed with hybrid (cloud & embedded) speech configuration. + // The language must be specified in BCP-47 format, case-sensitive. private static readonly string CloudSpeechSubscriptionKey = "YourCloudSpeechSubscriptionKey"; // or set CLOUD_SPEECH_SUBSCRIPTION_KEY private static readonly string CloudSpeechServiceRegion = "YourCloudSpeechServiceRegion"; // or set CLOUD_SPEECH_SERVICE_REGION + private static readonly string CloudSpeechRecognitionLanguage = "en-US"; // or set CLOUD_SPEECH_RECOGNITION_LANGUAGE + private static readonly string CloudSpeechSynthesisLanguage = "en-US"; // or set CLOUD_SPEECH_SYNTHESIS_LANGUAGE // END OF CONFIGURABLE SETTINGS @@ -108,17 +94,18 @@ private static string GetSetting(string environmentVariableName, string defaultV // These are set in VerifySettingsAsync() after some basic verification. + private static string SpeechModelLicense; + private static string SpeechRecognitionModelPath; private static string SpeechRecognitionModelName; - private static string SpeechRecognitionModelKey; + private static string SpeechSynthesisVoicePath; private static string SpeechSynthesisVoiceName; - private static string SpeechSynthesisVoiceKey; + private static string SpeechTranslationModelPath; private static string SpeechTranslationModelName; - private static string SpeechTranslationModelKey; // Utility functions for main menu. 
public static bool HasSpeechRecognitionModel() { - if (string.IsNullOrEmpty(SpeechRecognitionModelName)) + if (string.IsNullOrEmpty(SpeechRecognitionModelPath) || string.IsNullOrEmpty(SpeechRecognitionModelName)) { Console.Error.WriteLine("## ERROR: No speech recognition model specified."); return false; @@ -128,7 +115,7 @@ public static bool HasSpeechRecognitionModel() public static bool HasSpeechSynthesisVoice() { - if (string.IsNullOrEmpty(SpeechSynthesisVoiceName)) + if (string.IsNullOrEmpty(SpeechSynthesisVoicePath) || string.IsNullOrEmpty(SpeechSynthesisVoiceName)) { Console.Error.WriteLine("## ERROR: No speech synthesis voice specified."); return false; @@ -138,7 +125,7 @@ public static bool HasSpeechSynthesisVoice() public static bool HasSpeechTranslationModel() { - if (string.IsNullOrEmpty(SpeechTranslationModelName)) + if (string.IsNullOrEmpty(SpeechTranslationModelPath) || string.IsNullOrEmpty(SpeechTranslationModelName)) { Console.Error.WriteLine("## ERROR: No speech translation model specified."); return false; @@ -152,20 +139,17 @@ public static EmbeddedSpeechConfig CreateEmbeddedSpeechConfig() List paths = new List(); // Add paths for offline data. - var recognitionModelPath = GetSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH", EmbeddedSpeechRecognitionModelPath); - if (!string.IsNullOrEmpty(recognitionModelPath) && !recognitionModelPath.Equals("YourEmbeddedSpeechRecognitionModelPath")) + if (!string.IsNullOrEmpty(SpeechRecognitionModelPath)) { - paths.Add(recognitionModelPath); + paths.Add(SpeechRecognitionModelPath); } - var synthesisVoicePath = GetSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH", EmbeddedSpeechSynthesisVoicePath); - if (!string.IsNullOrEmpty(synthesisVoicePath) && !synthesisVoicePath.Equals("YourEmbeddedSpeechSynthesisVoicePath")) + if (!string.IsNullOrEmpty(SpeechSynthesisVoicePath)) { - paths.Add(synthesisVoicePath); + paths.Add(SpeechSynthesisVoicePath); } - var translationModelPath = GetSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_PATH", EmbeddedSpeechTranslationModelPath); - if (!string.IsNullOrEmpty(translationModelPath) && !translationModelPath.Equals("YourEmbeddedSpeechTranslationModelPath")) + if (!string.IsNullOrEmpty(SpeechTranslationModelPath)) { - paths.Add(translationModelPath); + paths.Add(SpeechTranslationModelPath); } if (paths.Count == 0) @@ -188,14 +172,14 @@ public static EmbeddedSpeechConfig CreateEmbeddedSpeechConfig() if (!string.IsNullOrEmpty(SpeechRecognitionModelName)) { - // Mandatory configuration for embedded speech recognition. - config.SetSpeechRecognitionModel(SpeechRecognitionModelName, SpeechRecognitionModelKey); + // Mandatory configuration for embedded speech (and intent) recognition. + config.SetSpeechRecognitionModel(SpeechRecognitionModelName, SpeechModelLicense); } if (!string.IsNullOrEmpty(SpeechSynthesisVoiceName)) { // Mandatory configuration for embedded speech synthesis. - config.SetSpeechSynthesisVoice(SpeechSynthesisVoiceName, SpeechSynthesisVoiceKey); + config.SetSpeechSynthesisVoice(SpeechSynthesisVoiceName, SpeechModelLicense); if (SpeechSynthesisVoiceName.Contains("Neural")) { // Embedded neural voices only support 24kHz sample rate. @@ -206,7 +190,7 @@ public static EmbeddedSpeechConfig CreateEmbeddedSpeechConfig() if (!string.IsNullOrEmpty(SpeechTranslationModelName)) { // Mandatory configuration for embedded speech translation. 
- config.SetSpeechTranslationModel(SpeechTranslationModelName, SpeechTranslationModelKey); + config.SetSpeechTranslationModel(SpeechTranslationModelName, SpeechModelLicense); } // Disable profanity masking. @@ -230,8 +214,8 @@ public static HybridSpeechConfig CreateHybridSpeechConfig() // Also see // https://github.com/Azure-Samples/cognitive-services-speech-sdk/tree/master/samples/csharp/dotnetcore/console // for complete Speech SDK samples using cloud speech services. - cloudSpeechConfig.SpeechRecognitionLanguage = GetSetting("SPEECH_RECOGNITION_LOCALE", SpeechRecognitionLocale); - cloudSpeechConfig.SpeechSynthesisLanguage = GetSetting("SPEECH_SYNTHESIS_LOCALE", SpeechSynthesisLocale); + cloudSpeechConfig.SpeechRecognitionLanguage = GetSetting("CLOUD_SPEECH_RECOGNITION_LANGUAGE", CloudSpeechRecognitionLanguage); + cloudSpeechConfig.SpeechSynthesisLanguage = GetSetting("CLOUD_SPEECH_SYNTHESIS_LANGUAGE", CloudSpeechSynthesisLanguage); var embeddedSpeechConfig = CreateEmbeddedSpeechConfig(); @@ -247,200 +231,111 @@ public static async Task VerifySettingsAsync() string cwd = System.IO.Directory.GetCurrentDirectory(); Console.WriteLine($"Current working directory: {cwd}"); - var recognitionModelPath = GetSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH", EmbeddedSpeechRecognitionModelPath); - if (string.IsNullOrEmpty(recognitionModelPath) || recognitionModelPath.Equals("YourEmbeddedSpeechRecognitionModelPath")) + SpeechModelLicense = GetSetting("EMBEDDED_SPEECH_MODEL_LICENSE", EmbeddedSpeechModelLicense); + if (string.IsNullOrEmpty(SpeechModelLicense) || SpeechModelLicense.Equals("YourEmbeddedSpeechModelLicense")) { - recognitionModelPath = null; + Console.Error.WriteLine("## ERROR: The embedded speech model license is not set."); + return false; } - var synthesisVoicePath = GetSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH", EmbeddedSpeechSynthesisVoicePath); - if (string.IsNullOrEmpty(synthesisVoicePath) || synthesisVoicePath.Equals("YourEmbeddedSpeechSynthesisVoicePath")) + SpeechRecognitionModelPath = GetSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH", EmbeddedSpeechRecognitionModelPath); + if (SpeechRecognitionModelPath.Equals("YourEmbeddedSpeechRecognitionModelPath")) { - synthesisVoicePath = null; + SpeechRecognitionModelPath = null; } - - var translationModelPath = GetSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_PATH", EmbeddedSpeechTranslationModelPath); - if (string.IsNullOrEmpty(translationModelPath) || translationModelPath.Equals("YourEmbeddedSpeechTranslationModelPath")) + SpeechRecognitionModelName = GetSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME", EmbeddedSpeechRecognitionModelName); + if (SpeechRecognitionModelName.Equals("YourEmbeddedSpeechRecognitionModelName")) { - translationModelPath = null; + SpeechRecognitionModelName = null; } - // Find an embedded speech recognition model based on the name or locale. 
- - SpeechRecognitionModelName = null; - - if (recognitionModelPath != null) + SpeechSynthesisVoicePath = GetSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH", EmbeddedSpeechSynthesisVoicePath); + if (SpeechSynthesisVoicePath.Equals("YourEmbeddedSpeechSynthesisVoicePath")) { - var modelName = GetSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME", EmbeddedSpeechRecognitionModelName); - var modelLocale = GetSetting("SPEECH_RECOGNITION_LOCALE", SpeechRecognitionLocale); + SpeechSynthesisVoicePath = null; + } + SpeechSynthesisVoiceName = GetSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME", EmbeddedSpeechSynthesisVoiceName); + if (SpeechSynthesisVoiceName.Equals("YourEmbeddedSpeechSynthesisVoiceName")) + { + SpeechSynthesisVoiceName = null; + } - if (string.IsNullOrEmpty(modelName) || modelName.Equals("YourEmbeddedSpeechRecognitionModelName")) - { - modelName = null; // no name given -> search by locale - } + SpeechTranslationModelPath = GetSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_PATH", EmbeddedSpeechTranslationModelPath); + if (SpeechTranslationModelPath.Equals("YourEmbeddedSpeechTranslationModelPath")) + { + SpeechTranslationModelPath = null; + } + SpeechTranslationModelName = GetSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_NAME", EmbeddedSpeechTranslationModelName); + if (SpeechTranslationModelName.Equals("YourEmbeddedSpeechTranslationModelName")) + { + SpeechTranslationModelName = null; + } - var config = EmbeddedSpeechConfig.FromPath(recognitionModelPath); + // Find an embedded speech recognition model based on the name. + if (!string.IsNullOrEmpty(SpeechRecognitionModelPath) && !string.IsNullOrEmpty(SpeechRecognitionModelName)) + { + var config = EmbeddedSpeechConfig.FromPath(SpeechRecognitionModelPath); var models = config.GetSpeechRecognitionModels(); var result = models.FirstOrDefault(model => - (modelName == null && model.Locales[0].Equals(modelLocale)) || - (modelName != null && (model.Name.Equals(modelName) || model.Locales[0].Equals(modelName)))); + model.Name.Equals(SpeechRecognitionModelName) || model.Locales[0].Equals(SpeechRecognitionModelName)); - if (result != null) + if (result == null) { - SpeechRecognitionModelName = result.Name; - } - - if (string.IsNullOrEmpty(SpeechRecognitionModelName)) - { - Console.Write("## WARNING: Cannot locate an embedded speech recognition model by "); - if (modelName == null) - { - Console.Write($"locale \"{modelLocale}\". "); - } - else - { - Console.Write($"name \"{modelName}\". "); - } - Console.WriteLine($"Current recognition model search path: {recognitionModelPath}"); - } - else - { - SpeechRecognitionModelKey = GetSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_KEY", EmbeddedSpeechRecognitionModelKey); - if (string.IsNullOrEmpty(SpeechRecognitionModelKey) || SpeechRecognitionModelKey.Equals("YourEmbeddedSpeechRecognitionModelKey")) - { - SpeechRecognitionModelKey = null; - Console.WriteLine($"## WARNING: The key for \"{SpeechRecognitionModelName}\" is not set."); - } + Console.WriteLine("## WARNING: Cannot locate an embedded speech recognition model \"{SpeechRecognitionModelName}\""); } } - // Find an embedded speech synthesis voice based on the name or locale. - - SpeechSynthesisVoiceName = null; - - if (synthesisVoicePath != null) + // Find an embedded speech synthesis voice based on the name. 
+ if (!string.IsNullOrEmpty(SpeechSynthesisVoicePath) && !string.IsNullOrEmpty(SpeechSynthesisVoiceName)) { - var voiceName = GetSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME", EmbeddedSpeechSynthesisVoiceName); - var voiceLocale = GetSetting("SPEECH_SYNTHESIS_LOCALE", SpeechSynthesisLocale); - - if (string.IsNullOrEmpty(voiceName) || voiceName.Equals("YourEmbeddedSpeechSynthesisVoiceName")) - { - voiceName = null; // no name given -> search by locale - } - - var config = EmbeddedSpeechConfig.FromPath(synthesisVoicePath); - + var config = EmbeddedSpeechConfig.FromPath(SpeechSynthesisVoicePath); using var synthesizer = new SpeechSynthesizer(config, null); + + bool found = false; using var voicesList = await synthesizer.GetVoicesAsync(""); if (voicesList.Reason == ResultReason.VoicesListRetrieved) { var result = voicesList.Voices.FirstOrDefault(voice => - (voiceName == null && voice.Locale.Equals(voiceLocale)) || - (voiceName != null && (voice.Name.Equals(voiceName) || voice.ShortName.Equals(voiceName)))); + voice.Name.Equals(SpeechSynthesisVoiceName) || voice.ShortName.Equals(SpeechSynthesisVoiceName)); if (result != null) { - SpeechSynthesisVoiceName = result.Name; + found = true; } } - if (string.IsNullOrEmpty(SpeechSynthesisVoiceName)) + if (!found) { - Console.Write("## WARNING: Cannot locate an embedded speech synthesis voice by "); - if (voiceName == null) - { - Console.Write($"locale \"{voiceLocale}\". "); - } - else - { - Console.Write($"name \"{voiceName}\". "); - } - Console.WriteLine($"Current synthesis voice search path: {synthesisVoicePath}"); - } - else - { - SpeechSynthesisVoiceKey = GetSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_KEY", EmbeddedSpeechSynthesisVoiceKey); - if (string.IsNullOrEmpty(SpeechSynthesisVoiceKey) || SpeechSynthesisVoiceKey.Equals("YourEmbeddedSpeechSynthesisVoiceKey")) - { - SpeechSynthesisVoiceKey = null; - Console.WriteLine($"## WARNING: The key for \"{SpeechSynthesisVoiceName}\" is not set."); - } + Console.WriteLine("## WARNING: Cannot locate an embedded speech synthesis voice \"{SpeechSynthesisVoiceName}\""); } } // Find an embedded speech translation model based on the name. - - SpeechTranslationModelName = null; - - if (translationModelPath != null) + if (!string.IsNullOrEmpty(SpeechTranslationModelPath) && !string.IsNullOrEmpty(SpeechTranslationModelName)) { - var modelName = GetSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_NAME", EmbeddedSpeechTranslationModelName); - - var config = EmbeddedSpeechConfig.FromPath(translationModelPath); + var config = EmbeddedSpeechConfig.FromPath(SpeechTranslationModelPath); var models = config.GetSpeechTranslationModels(); var result = models.FirstOrDefault(model => - (modelName != null && model.Name.Equals(modelName))); - - if (result != null) - { - SpeechTranslationModelName = result.Name; - } + model.Name.Equals(SpeechTranslationModelName)); - if (string.IsNullOrEmpty(SpeechTranslationModelName)) + if (result == null) { - Console.Write("## WARNING: Cannot locate an embedded speech translation model by "); - Console.Write($"name \"{modelName}\". 
"); - Console.WriteLine($"Current translation model search path: {translationModelPath}"); - } - else - { - SpeechTranslationModelKey = GetSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_KEY", EmbeddedSpeechTranslationModelKey); - if (string.IsNullOrEmpty(SpeechTranslationModelKey) || SpeechTranslationModelKey.Equals("YourEmbeddedSpeechTranslationModelKey")) - { - SpeechTranslationModelKey = null; - Console.WriteLine($"## WARNING: The key for \"{SpeechTranslationModelName}\" is not set."); - } + Console.WriteLine("## WARNING: Cannot locate an embedded speech translation model \"{SpeechTranslationModelName}\". "); } } - Func maskValue = (string value) => - { - // Mask the string value, leave only the last 3 chars visible - int visibleLength = value.Length > 3 ? 3 : 0; - string masked = new string('*', value.Length - visibleLength) + value.Substring(value.Length - visibleLength); - return masked; - }; - - Console.WriteLine($"Embedded speech recognition\n model search path: {(recognitionModelPath == null ? "(not set)" : recognitionModelPath)}"); - if (recognitionModelPath != null) - { - Console.WriteLine($" model name: {(string.IsNullOrEmpty(SpeechRecognitionModelName) ? "(not found)" : SpeechRecognitionModelName)}"); - if (!string.IsNullOrEmpty(SpeechRecognitionModelName)) - { - Console.WriteLine($" model key: {(string.IsNullOrEmpty(SpeechRecognitionModelKey) ? "(not set)" : maskValue(SpeechRecognitionModelKey))}"); - } - } - Console.WriteLine($"Embedded speech synthesis\n voice search path: {(synthesisVoicePath == null ? "(not set)" : synthesisVoicePath)}"); - if (synthesisVoicePath != null) - { - Console.WriteLine($" voice name: {(string.IsNullOrEmpty(SpeechSynthesisVoiceName) ? "(not found)" : SpeechSynthesisVoiceName)}"); - if (!string.IsNullOrEmpty(SpeechSynthesisVoiceName)) - { - Console.WriteLine($" voice key: {(string.IsNullOrEmpty(SpeechSynthesisVoiceKey) ? "(not set)" : maskValue(SpeechSynthesisVoiceKey))}"); - } - } - Console.WriteLine($"Embedded speech translation\n model search path: {(translationModelPath == null ? "(not set)" : translationModelPath)}"); - if (translationModelPath != null) - { - Console.WriteLine($" model name: {(string.IsNullOrEmpty(SpeechTranslationModelName) ? "(not found)" : SpeechTranslationModelName)}"); - if (!string.IsNullOrEmpty(SpeechTranslationModelName)) - { - Console.WriteLine($" model key: {(string.IsNullOrEmpty(SpeechTranslationModelKey) ? "(not set)" : maskValue(SpeechTranslationModelKey))}"); - } - } + Console.WriteLine($"Embedded speech recognition"); + Console.WriteLine($" model search path: {(string.IsNullOrEmpty(SpeechRecognitionModelPath) ? "(not set)" : SpeechRecognitionModelPath)}"); + Console.WriteLine($" model name: {(string.IsNullOrEmpty(SpeechRecognitionModelName) ? "(not set)" : SpeechRecognitionModelName)}"); + Console.WriteLine($"Embedded speech synthesis"); + Console.WriteLine($" voice search path: {(string.IsNullOrEmpty(SpeechSynthesisVoicePath) ? "(not set)" : SpeechSynthesisVoicePath)}"); + Console.WriteLine($" voice name: {(string.IsNullOrEmpty(SpeechSynthesisVoiceName) ? "(not set)" : SpeechSynthesisVoiceName)}"); + Console.WriteLine($"Embedded speech translation"); + Console.WriteLine($" model search path: {(string.IsNullOrEmpty(SpeechTranslationModelPath) ? "(not set)" : SpeechTranslationModelPath)}"); + Console.WriteLine($" model name: {(string.IsNullOrEmpty(SpeechTranslationModelName) ? 
"(not set)" : SpeechTranslationModelName)}"); return true; } diff --git a/samples/csharp/dotnetcore/embedded-speech/samples/samples.csproj b/samples/csharp/dotnetcore/embedded-speech/samples/samples.csproj index 9f48d761f..0285039f4 100644 --- a/samples/csharp/dotnetcore/embedded-speech/samples/samples.csproj +++ b/samples/csharp/dotnetcore/embedded-speech/samples/samples.csproj @@ -24,11 +24,11 @@ - - - - - + + + + + diff --git a/samples/csharp/maui/embedded-speech/README.md b/samples/csharp/maui/embedded-speech/README.md index b0a7401a3..68e7056fd 100644 --- a/samples/csharp/maui/embedded-speech/README.md +++ b/samples/csharp/maui/embedded-speech/README.md @@ -32,8 +32,8 @@ It is recommended to try them out before this MAUI specific sample. * The model internal subfolder structure must be intact i.e. as originally delivered. * Start Microsoft Visual Studio 2022 and select **File** \> **Open** \> **Project/Solution**. * Find and select the solution file in this sample folder. -* Edit the `MainPage.xaml.cs` source file under the main project and update the settings marked as configurable (model name, key, and list of files). - If either recognition or synthesis is not needed, leave the corresponding default values unchanged. +* Edit the `MainPage.xaml.cs` source file under the main project and update the settings marked as configurable (model name, license, and list of files). + If either recognition or synthesis is not needed, leave the corresponding name string empty. ## Build and run the sample for Windows * Press Ctrl+Shift+B, or select **Build** \> **Build Solution**. diff --git a/samples/csharp/maui/embedded-speech/embedded-speech/MainPage.xaml.cs b/samples/csharp/maui/embedded-speech/embedded-speech/MainPage.xaml.cs index 1cb80c5d1..3ba01c7e5 100644 --- a/samples/csharp/maui/embedded-speech/embedded-speech/MainPage.xaml.cs +++ b/samples/csharp/maui/embedded-speech/embedded-speech/MainPage.xaml.cs @@ -13,10 +13,9 @@ public partial class MainPage : ContentPage /********************************** * START OF CONFIGURABLE SETTINGS * **********************************/ - private static readonly string recognitionModelName = ""; // e.g. "en-US" or "Microsoft Speech Recognizer en-US FP Model V8.1" - private static readonly string recognitionModelKey = ""; // model decryption key - private static readonly string synthesisVoiceName = ""; // e.g. "en-US-AriaNeural" or "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)" - private static readonly string synthesisVoiceKey = ""; // voice decryption key + private static readonly string embeddedSpeechLicense = ""; // embedded speech model license (text) + private static readonly string recognitionModelName = ""; // e.g. "en-US" or "Microsoft Speech Recognizer en-US FP Model V8.1" + private static readonly string synthesisVoiceName = ""; // e.g. "en-US-AriaNeural" or "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)" // Embedded speech recognition models and synthesis voices must reside // as normal individual files on the device filesystem and they need to @@ -161,8 +160,8 @@ private void OnInitObjectsButtonClicked(object sender, EventArgs e) var config = EmbeddedSpeechConfig.FromPath(modelRootPath); // Selects embedded speech models to use. 
- config.SetSpeechRecognitionModel(recognitionModelName, recognitionModelKey); - config.SetSpeechSynthesisVoice(synthesisVoiceName, synthesisVoiceKey); + config.SetSpeechRecognitionModel(recognitionModelName, embeddedSpeechLicense); + config.SetSpeechSynthesisVoice(synthesisVoiceName, embeddedSpeechLicense); if (synthesisVoiceName.Contains("Neural")) { @@ -178,7 +177,7 @@ private void OnInitObjectsButtonClicked(object sender, EventArgs e) // With embedded speech, this can take a moment due to loading // of the model. To avoid unnecessary delays when recognition is // started, create the recognizer well in advance. - if (!string.IsNullOrEmpty(recognitionModelName) && !string.IsNullOrEmpty(recognitionModelKey)) + if (!string.IsNullOrEmpty(recognitionModelName) && !string.IsNullOrEmpty(embeddedSpeechLicense)) { recognizer = new SpeechRecognizer(config); sb.Append(" recognizer"); @@ -186,7 +185,7 @@ private void OnInitObjectsButtonClicked(object sender, EventArgs e) // Creates a speech synthesizer instance using the device default // speaker for audio output. - if (!string.IsNullOrEmpty(synthesisVoiceName) && !string.IsNullOrEmpty(synthesisVoiceKey)) + if (!string.IsNullOrEmpty(synthesisVoiceName) && !string.IsNullOrEmpty(embeddedSpeechLicense)) { synthesizer = new SpeechSynthesizer(config); if (recognizer != null) diff --git a/samples/csharp/maui/embedded-speech/embedded-speech/embedded-speech.csproj b/samples/csharp/maui/embedded-speech/embedded-speech/embedded-speech.csproj index 6a9142a73..633fb47fb 100644 --- a/samples/csharp/maui/embedded-speech/embedded-speech/embedded-speech.csproj +++ b/samples/csharp/maui/embedded-speech/embedded-speech/embedded-speech.csproj @@ -55,11 +55,11 @@ - - - - - + + + + + diff --git a/samples/csharp/maui/speech-to-text/speech-to-text/speech-to-text.csproj b/samples/csharp/maui/speech-to-text/speech-to-text/speech-to-text.csproj index 3c4f8c1da..0352c9a91 100644 --- a/samples/csharp/maui/speech-to-text/speech-to-text/speech-to-text.csproj +++ b/samples/csharp/maui/speech-to-text/speech-to-text/speech-to-text.csproj @@ -60,7 +60,7 @@ - + diff --git a/samples/csharp/sharedcontent/console/speech_synthesis_samples.cs b/samples/csharp/sharedcontent/console/speech_synthesis_samples.cs index ac61fb174..8229446a9 100644 --- a/samples/csharp/sharedcontent/console/speech_synthesis_samples.cs +++ b/samples/csharp/sharedcontent/console/speech_synthesis_samples.cs @@ -115,11 +115,11 @@ public static async Task SynthesisWithVoiceAsync() var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion"); // Sets the voice name. - // e.g. "en-US-AndrewMultilingualNeural". + // e.g. "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)". // The full list of supported voices can be found here: // https://aka.ms/csspeech/voicenames // And, you can try GetVoicesAsync method to get all available voices (see SynthesisGetAvailableVoicesAsync() sample below). - var voice = "en-US-AndrewMultilingualNeural"; + var voice = "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)"; config.SpeechSynthesisVoiceName = voice; // Creates a speech synthesizer for the specified voice, using the default speaker as audio output. @@ -803,7 +803,7 @@ public static async Task SynthesisBookmarkEventAsync() Console.ReadLine(); // Bookmark tag is needed in the SSML, e.g. - var ssml = " one. two. three. four."; + var ssml = " one. two. three. 
four."; using (var result = await synthesizer.SpeakSsmlAsync(ssml)) { @@ -971,7 +971,7 @@ public static async Task SynthesizeOnceUseCustomVoiceToSpeakerAsyncSwitchPlatfor || details.ErrorCode == CancellationErrorCode.ServiceTimeout || details.ErrorDetails.Contains("Error code: 1007")) { - // Synthesize using a standard platform voice, e.g. en-US-AvaMultilingualNeural + // Synthesize using a standard platform voice, e.g. en-US-JennyNeural synthesisResult = await SynthesizeOnceAsyncInternal("YourSubscriptionKey", "YourServiceRegion", null, "YourPlatformVoiceName"); } } diff --git a/samples/csharp/sharedcontent/console/speech_synthesis_server_scenario_sample.cs b/samples/csharp/sharedcontent/console/speech_synthesis_server_scenario_sample.cs index 9b78c654e..6c3b61a77 100644 --- a/samples/csharp/sharedcontent/console/speech_synthesis_server_scenario_sample.cs +++ b/samples/csharp/sharedcontent/console/speech_synthesis_server_scenario_sample.cs @@ -256,7 +256,7 @@ public class SpeechSynthesisServerScenarioSample public static void SpeechSynthesizeWithPool() { SynthesisServer server = new SynthesisServer(subscriptionKey, region, - "en-US-AvaMultilingualNeural", SpeechSynthesisOutputFormat.Audio24Khz48KBitRateMonoMp3, concurrency); + "en-US-JennyNeural", SpeechSynthesisOutputFormat.Audio24Khz48KBitRateMonoMp3, concurrency); for (var turn = 0; turn < 3; turn++) { diff --git a/samples/csharp/tts-text-stream/README.md b/samples/csharp/tts-text-stream/README.md deleted file mode 100644 index 3b1411613..000000000 --- a/samples/csharp/tts-text-stream/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# Examples to synthesis with input text stream - -The input text stream API is designed to generate audio from text that is being streamed or generated in chunks. A typical scenario is to speak text generated from GPT-like models. Compared to non-text stream APIs, the text stream API significantly reduces TTS latency. - -| | Non text stream | Text Stream | -| ---------- | -------- | ----------- | -| Input Type | Whole GPT response | Each GPT output chunk | -| Latency | High: Time of full GPT response + Time of TTS | Low: Time of few GPT chunks + Time of TTS | - -### Available samples: - -| Language | Directory | Description | -| ---------- | -------- | ----------- | -| C# | [csharp](console) | synthesis with text stream API, the text stream generated by AOAI GPT chat model | - -## API overview -### Create text stream request -To use the text stream API, you have to use the websocket V2 endpoint. -```wss://{region}.tts.speech.microsoft.com/cognitiveservices/websocket/v2``` - -### Set global properties -Since the input of text stream API is parital text. SSML, which is based on XML, is not supported. And thus properties that set in SSML should be set in a new way. - -For now we only support set voice name and output format. - -### Create input text stream -Please specify SpeechSynthesisRequestInputType.TextStream when creating the request. - -### Send text to stream -For each text that generated from GPT, call `request.InputStream.Write(text);` to send text to the stream. - -### Close text stream -When GPT finished the output, call `request.InputStream.Close();` to close the stream. - diff --git a/samples/csharp/tts-text-stream/console/Program.cs b/samples/csharp/tts-text-stream/console/Program.cs deleted file mode 100644 index c4f05616a..000000000 --- a/samples/csharp/tts-text-stream/console/Program.cs +++ /dev/null @@ -1,182 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. 
-// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -namespace Demo -{ - using Azure; - using Azure.AI.OpenAI; - using Microsoft.CognitiveServices.Speech; - using System; - using System.Collections.Generic; - using System.Linq; - using System.Text; - using System.Threading.Tasks; - using static System.Net.Mime.MediaTypeNames; - - internal class Program - { - private static object consoleLock = new(); - private static OpenAIClient? aoaiClient; - private static SpeechSynthesizer? speechSynthesizer; - private static MemoryStream audioData; - - public static async Task Main() - { - // setup AOAI client - aoaiClient = new OpenAIClient( - new Uri(Environment.GetEnvironmentVariable("AZURE_OPENAI_API_ENDPOINT")), - new AzureKeyCredential(Environment.GetEnvironmentVariable("AZURE_OPENAI_API_KEY"))); - - // setup speech synthesizer - // IMPORTANT: MUST use the websocket v2 endpoint - var ttsEndpoint = $"wss://{Environment.GetEnvironmentVariable("AZURE_TTS_REGION")}.tts.speech.microsoft.com/cognitiveservices/websocket/v2"; - var speechConfig = SpeechConfig.FromEndpoint( - new Uri(ttsEndpoint), - Environment.GetEnvironmentVariable("AZURE_TTS_API_KEY")); - - // set output format - speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm); - - // set a voice name - speechConfig.SetProperty(PropertyId.SpeechServiceConnection_SynthVoice, "en-US-AvaMultilingualNeural"); - - // set timeout value to bigger ones to avoid sdk cancel the request when GPT latency too high - speechConfig.SetProperty("SpeechSynthesis_FrameTimeoutInterval", "10000"); - speechConfig.SetProperty("SpeechSynthesis_RtfTimeoutThreshold", "10"); - - speechSynthesizer = new SpeechSynthesizer(speechConfig); - speechSynthesizer.Synthesizing += SpeechSynthesizer_Synthesizing; - - // create request with TextStream input type - using var request = new SpeechSynthesisRequest(SpeechSynthesisRequestInputType.TextStream); - - var ttsTask = speechSynthesizer.SpeakAsync(request); - - audioData = new MemoryStream(); - - string query = "tell me a joke in 100 words"; - - // Get GPT output stream - using var gptTokenStream = await aoaiClient.GetChatCompletionsStreamingAsync( - new ChatCompletionsOptions() - { - Messages = - { - new ChatRequestSystemMessage(@"You are an AI assistant that helps people find information."), - new ChatRequestUserMessage(query) - }, - Temperature = (float)0.7, - MaxTokens = 800, - - - NucleusSamplingFactor = (float)0.95, - FrequencyPenalty = 0, - PresencePenalty = 0, - DeploymentName = "gpt-4-turbo" - }); - - await foreach (var message in gptTokenStream.EnumerateValues()) - { - var text = message.ContentUpdate; - if (string.IsNullOrEmpty(text)) - { - continue; - } - - lock (consoleLock) - { - Console.ForegroundColor = ConsoleColor.DarkBlue; - Console.Write($"{text}"); - Console.ResetColor(); - } - - // send the gpt token to tts input stream - request.InputStream.Write(text); - } - - // close tts input stream when GPT finished - request.InputStream.Close(); - lock (consoleLock) - { - Console.ForegroundColor = ConsoleColor.DarkBlue; - Console.Write($"[GPT END]"); - Console.ResetColor(); - } - - // wait all tts audio bytes return - var result = await ttsTask; - lock (consoleLock) - { - Console.ForegroundColor = ConsoleColor.Yellow; - Console.Write($"[TTS END]"); - Console.ResetColor(); - } - - var totalSampleCount = audioData.Length * 8 / 16; - WriteWavHeader(audioData, false, 1, 16, 24000, (int)totalSampleCount, 0); - 
File.WriteAllBytes("streaming.wav", audioData.ToArray()); - audioData.Close(); - } - - private static void SpeechSynthesizer_Synthesizing(object? sender, SpeechSynthesisEventArgs e) - { - lock (consoleLock) - { - Console.ForegroundColor = ConsoleColor.Yellow; - Console.Write($"[audio]"); - Console.ResetColor(); - } - - audioData.Write(e.Result.AudioData, 0, e.Result.AudioData.Length); - } - - public static void WriteWavHeader(MemoryStream stream, bool isFloatingPoint, ushort channelCount, ushort bitDepth, int sampleRate, int totalSampleCount, int extraChunkSize) - { - stream.Position = 0; - - // RIFF header. - // Chunk ID. - stream.Write(System.Text.Encoding.ASCII.GetBytes("RIFF"), 0, 4); - - // Chunk size. - stream.Write(BitConverter.GetBytes(((bitDepth / 8) * totalSampleCount) + 36 + extraChunkSize), 0, 4); - - // Format. - stream.Write(System.Text.Encoding.ASCII.GetBytes("WAVE"), 0, 4); - - // Sub-chunk 1. - // Sub-chunk 1 ID. - stream.Write(System.Text.Encoding.ASCII.GetBytes("fmt "), 0, 4); - - // Sub-chunk 1 size. - stream.Write(BitConverter.GetBytes(16), 0, 4); - - // Audio format (floating point (3) or PCM (1)). Any other format indicates compression. - stream.Write(BitConverter.GetBytes((ushort)(isFloatingPoint ? 3 : 1)), 0, 2); - - // Channels. - stream.Write(BitConverter.GetBytes(channelCount), 0, 2); - - // Sample rate. - stream.Write(BitConverter.GetBytes(sampleRate), 0, 4); - - // Bytes rate. - stream.Write(BitConverter.GetBytes(sampleRate * channelCount * (bitDepth / 8)), 0, 4); - - // Block align. - stream.Write(BitConverter.GetBytes((ushort)channelCount * (bitDepth / 8)), 0, 2); - - // Bits per sample. - stream.Write(BitConverter.GetBytes(bitDepth), 0, 2); - - // Sub-chunk 2. - // Sub-chunk 2 ID. - stream.Write(System.Text.Encoding.ASCII.GetBytes("data"), 0, 4); - - // Sub-chunk 2 size. 
- stream.Write(BitConverter.GetBytes((bitDepth / 8) * totalSampleCount), 0, 4); - } - } -} diff --git a/samples/csharp/tts-text-stream/console/TtsTextStreamSample.csproj b/samples/csharp/tts-text-stream/console/TtsTextStreamSample.csproj deleted file mode 100644 index 38a56c2fb..000000000 --- a/samples/csharp/tts-text-stream/console/TtsTextStreamSample.csproj +++ /dev/null @@ -1,15 +0,0 @@ - - - - Exe - net6.0 - enable - enable - - - - - - - - diff --git a/samples/csharp/tts-text-stream/console/TtsTextStreamSample.sln b/samples/csharp/tts-text-stream/console/TtsTextStreamSample.sln deleted file mode 100644 index a541b7bd6..000000000 --- a/samples/csharp/tts-text-stream/console/TtsTextStreamSample.sln +++ /dev/null @@ -1,25 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 17 -VisualStudioVersion = 17.9.34321.82 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TtsTextStreamSample", "TtsTextStreamSample.csproj", "{0BF9C9DB-4265-499B-8EFA-A91D5AC98608}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Any CPU = Debug|Any CPU - Release|Any CPU = Release|Any CPU - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {0BF9C9DB-4265-499B-8EFA-A91D5AC98608}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {0BF9C9DB-4265-499B-8EFA-A91D5AC98608}.Debug|Any CPU.Build.0 = Debug|Any CPU - {0BF9C9DB-4265-499B-8EFA-A91D5AC98608}.Release|Any CPU.ActiveCfg = Release|Any CPU - {0BF9C9DB-4265-499B-8EFA-A91D5AC98608}.Release|Any CPU.Build.0 = Release|Any CPU - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {93FB3636-D038-47BB-A579-FFC28B468DC7} - EndGlobalSection -EndGlobal diff --git a/samples/csharp/unity/embedded-speech/Assets/Scripts/HelloWorld.cs b/samples/csharp/unity/embedded-speech/Assets/Scripts/HelloWorld.cs index a5551631a..2e7f5c36a 100644 --- a/samples/csharp/unity/embedded-speech/Assets/Scripts/HelloWorld.cs +++ b/samples/csharp/unity/embedded-speech/Assets/Scripts/HelloWorld.cs @@ -21,10 +21,9 @@ public class HelloWorld : MonoBehaviour /********************************** * START OF CONFIGURABLE SETTINGS * **********************************/ - private static readonly string recognitionModelName = ""; // e.g. "en-US" or "Microsoft Speech Recognizer en-US FP Model V8.1" - private static readonly string recognitionModelKey = ""; // model decryption key - private static readonly string synthesisVoiceName = ""; // e.g. "en-US-AriaNeural" or "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)" - private static readonly string synthesisVoiceKey = ""; // voice decryption key + private static readonly string embeddedSpeechLicense = ""; // embedded speech model license (text) + private static readonly string recognitionModelName = ""; // e.g. "en-US" or "Microsoft Speech Recognizer en-US FP Model V8.1" + private static readonly string synthesisVoiceName = ""; // e.g. "en-US-AriaNeural" or "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)" #if PLATFORM_ANDROID // Embedded speech recognition models and synthesis voices must reside @@ -180,8 +179,8 @@ public void OnInitObjectsButtonClicked() var config = EmbeddedSpeechConfig.FromPath(modelRootPath); // Selects embedded speech models to use. 
- config.SetSpeechRecognitionModel(recognitionModelName, recognitionModelKey); - config.SetSpeechSynthesisVoice(synthesisVoiceName, synthesisVoiceKey); + config.SetSpeechRecognitionModel(recognitionModelName, embeddedSpeechLicense); + config.SetSpeechSynthesisVoice(synthesisVoiceName, embeddedSpeechLicense); if (synthesisVoiceName.Contains("Neural")) { @@ -197,7 +196,7 @@ public void OnInitObjectsButtonClicked() // With embedded speech, this can take a moment due to loading // of the model. To avoid unnecessary delays when recognition is // started, create the recognizer well in advance. - if (!string.IsNullOrEmpty(recognitionModelName) && !string.IsNullOrEmpty(recognitionModelKey)) + if (!string.IsNullOrEmpty(recognitionModelName) && !string.IsNullOrEmpty(embeddedSpeechLicense)) { recognizer = new SpeechRecognizer(config); sb.Append(" recognizer"); @@ -205,7 +204,7 @@ public void OnInitObjectsButtonClicked() // Creates a speech synthesizer instance using the device default // speaker for audio output. - if (!string.IsNullOrEmpty(synthesisVoiceName) && !string.IsNullOrEmpty(synthesisVoiceKey)) + if (!string.IsNullOrEmpty(synthesisVoiceName) && !string.IsNullOrEmpty(embeddedSpeechLicense)) { synthesizer = new SpeechSynthesizer(config); if (recognizer != null) diff --git a/samples/csharp/unity/embedded-speech/README.md b/samples/csharp/unity/embedded-speech/README.md index 567ca0688..264fb7924 100644 --- a/samples/csharp/unity/embedded-speech/README.md +++ b/samples/csharp/unity/embedded-speech/README.md @@ -57,8 +57,8 @@ Requirements specific to this embedded speech sample are as follows. 1. Review and update the sample code. * In the Project window, navigate to **Assets** > **Scripts** and double-click the `HelloWorld` C# script to edit it. (This script is used as a component of the canvas object in the HelloWorld scene.) - * Update the settings marked as configurable (model/voice name and key, also the list of files if Android is the target). - If either recognition or synthesis is not needed, leave the corresponding default values unchanged. + * Update the settings marked as configurable (model/voice name and license, also the list of files if Android is the target). + If either recognition or synthesis is not needed, leave the corresponding name string empty. 1. Build and run the sample. * In the Unity Editor, use the **Play** button in the toolbar. * For a stand-alone application, use **File** > **Build Settings**. 
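Taken together, the MAUI and Unity changes above replace the per-model decryption keys with a single embedded speech model license string. Below is a minimal sketch of the post-change configuration pattern, not part of the sample itself: the class and method names are invented for illustration, and the model name, voice name, and license text are placeholders for whatever is delivered with your embedded speech models.

```csharp
// Sketch only. Substitute the model/voice names and license text that match
// the embedded speech files actually installed on the device.
using Microsoft.CognitiveServices.Speech;

public static class EmbeddedSpeechSettingsSketch
{
    // Placeholder values (not shipped with the sample).
    private static readonly string embeddedSpeechLicense = "<your embedded speech model license text>";
    private static readonly string recognitionModelName  = "Microsoft Speech Recognizer en-US FP Model V8.1";
    private static readonly string synthesisVoiceName    = ""; // empty -> synthesis is simply not configured

    public static EmbeddedSpeechConfig Create(string modelRootPath)
    {
        var config = EmbeddedSpeechConfig.FromPath(modelRootPath);

        // Both calls now take the same license string instead of separate decryption keys.
        if (!string.IsNullOrEmpty(recognitionModelName) && !string.IsNullOrEmpty(embeddedSpeechLicense))
        {
            config.SetSpeechRecognitionModel(recognitionModelName, embeddedSpeechLicense);
        }

        if (!string.IsNullOrEmpty(synthesisVoiceName) && !string.IsNullOrEmpty(embeddedSpeechLicense))
        {
            config.SetSpeechSynthesisVoice(synthesisVoiceName, embeddedSpeechLicense);
        }

        return config;
    }
}
```

Leaving `synthesisVoiceName` (or `recognitionModelName`) empty matches the updated README guidance: "If either recognition or synthesis is not needed, leave the corresponding name string empty."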
diff --git a/samples/csharp/uwp/speechtotext-uwp/speechtotext-uwp/speechtotext-uwp.csproj b/samples/csharp/uwp/speechtotext-uwp/speechtotext-uwp/speechtotext-uwp.csproj index 0b5b51253..5a2cd78fd 100644 --- a/samples/csharp/uwp/speechtotext-uwp/speechtotext-uwp/speechtotext-uwp.csproj +++ b/samples/csharp/uwp/speechtotext-uwp/speechtotext-uwp/speechtotext-uwp.csproj @@ -108,7 +108,7 @@ - 1.38.0 + 1.40.0 6.2.8 diff --git a/samples/csharp/uwp/texttospeech-uwp/texttospeech-uwp/texttospeech-uwp.csproj b/samples/csharp/uwp/texttospeech-uwp/texttospeech-uwp/texttospeech-uwp.csproj index 416c52ee7..f207706aa 100644 --- a/samples/csharp/uwp/texttospeech-uwp/texttospeech-uwp/texttospeech-uwp.csproj +++ b/samples/csharp/uwp/texttospeech-uwp/texttospeech-uwp/texttospeech-uwp.csproj @@ -107,7 +107,7 @@ - 1.38.0 + 1.40.0 6.2.8 diff --git a/samples/csharp/uwp/virtualassistant-uwp/VirtualAssistantPreview.csproj b/samples/csharp/uwp/virtualassistant-uwp/VirtualAssistantPreview.csproj index a9567f846..1f0122d4c 100644 --- a/samples/csharp/uwp/virtualassistant-uwp/VirtualAssistantPreview.csproj +++ b/samples/csharp/uwp/virtualassistant-uwp/VirtualAssistantPreview.csproj @@ -165,7 +165,7 @@ 4.3.2 - 1.38.0 + 1.40.0 6.2.8 diff --git a/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.Android/Properties/AndroidManifest.xml b/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.Android/Properties/AndroidManifest.xml index e5ecbe6ed..2fb0ca707 100644 --- a/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.Android/Properties/AndroidManifest.xml +++ b/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.Android/Properties/AndroidManifest.xml @@ -1,6 +1,6 @@ - + diff --git a/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.Android/kws-xamarin.Android.csproj b/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.Android/kws-xamarin.Android.csproj index feaf5a2d5..8baa7de8d 100644 --- a/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.Android/kws-xamarin.Android.csproj +++ b/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.Android/kws-xamarin.Android.csproj @@ -16,7 +16,7 @@ Resources Assets false - v12.0 + v10.0 true true Xamarin.Android.Net.AndroidClientHandler @@ -54,7 +54,7 @@ - 1.38.0 + 1.40.0 diff --git a/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.UWP/kws-xamarin.UWP.csproj b/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.UWP/kws-xamarin.UWP.csproj index 79740839b..9ef3e7df1 100644 --- a/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.UWP/kws-xamarin.UWP.csproj +++ b/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.UWP/kws-xamarin.UWP.csproj @@ -11,7 +11,7 @@ kws-xamarin.UWP en-US UAP - 10.0.22000.0 + 10.0.17763.0 10.0.16299.0 14 true @@ -148,7 +148,7 @@ - 1.38.0 + 1.40.0 diff --git a/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.iOS/kws-xamarin.iOS.csproj b/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.iOS/kws-xamarin.iOS.csproj index 121f6a91c..066085ccf 100644 --- a/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.iOS/kws-xamarin.iOS.csproj +++ b/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin.iOS/kws-xamarin.iOS.csproj @@ -124,7 +124,7 @@ - 1.38.0 + 1.40.0 diff --git a/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin/kws-xamarin.csproj b/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin/kws-xamarin.csproj index 0b7372d9e..cd7da4d15 100644 --- a/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin/kws-xamarin.csproj +++ 
b/samples/csharp/xamarin/kws-xamarin/kws-xamarin/kws-xamarin/kws-xamarin.csproj @@ -10,7 +10,7 @@ - + diff --git a/samples/custom-voice/README.md b/samples/custom-voice/README.md deleted file mode 100644 index 42d80b836..000000000 --- a/samples/custom-voice/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# Examples to use Custom Voice - -The Custom Voice API (Preview) is designed to create professional voice and personal voice. The functionality is exposed through a REST API and is easy to access from many programming languages. - -For a detailed explanation see the [custom neural voice documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/custom-neural-voice) and the `README.md` in the language specific subdirectories. - -REST API doc: [custom voice REST API](https://learn.microsoft.com/rest/api/aiservices/speechapi/operation-groups?view=rest-aiservices-speechapi-2024-02-01-preview). - -Available samples: - -| Language | Directory | Description | -| ---------- | -------- | ----------- | -| C# | [csharp](csharp) | C# client calling custom voice REST API through System.Net.Http | -| Python | [python](python) | Python client calling custom voice REST API | - -## Note - -1. You need a Cognitive Services subscription key to run sample code here. - - You can get the subscription key from the "Keys and Endpoint" tab on your Cognitive Services or Speech resource in the Azure Portal. - - Custom Voice is only available for paid subscriptions, free subscriptions are not supported. -2. Both professional voice and personal voice access are [limited](https://learn.microsoft.com/en-us/legal/cognitive-services/speech-service/custom-neural-voice/limited-access-custom-neural-voice?context=%2fazure%2fcognitive-services%2fspeech-service%2fcontext%2fcontext) based on eligibility and usage criteria. Please [request access](https://aka.ms/customneural) before using sample code here. -3. Personal voice is available in these regions: West Europe, East US, and South East Asia. 
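The deleted README above describes the Custom Voice REST API only in prose. For reference, here is a minimal sketch of the kind of request the removed `CustomVoiceClient` (deleted further down in this diff) wrapped: listing projects with an `Ocp-Apim-Subscription-Key` header and the `2023-12-01-preview` API version. The class and method names are invented for illustration, and the region and key values are placeholders.

```csharp
// Sketch only: one paged GET against the Custom Voice REST API.
using System;
using System.Net.Http;
using System.Threading.Tasks;

public static class CustomVoiceRestSketch
{
    public static async Task<string> ListProjectsAsync(string region, string subscriptionKey)
    {
        using var client = new HttpClient();
        client.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", subscriptionKey);

        // Same base URI and api-version as the deleted CustomVoiceClient below.
        var uri = $"https://{region}.api.cognitive.microsoft.com/customvoice/projects?api-version=2023-12-01-preview";
        using var response = await client.GetAsync(uri);
        response.EnsureSuccessStatusCode();

        // The response is a JSON page; the deleted client follows its "nextLink"
        // field to enumerate all projects.
        return await response.Content.ReadAsStringAsync();
    }
}
```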
diff --git a/samples/custom-voice/csharp/CustomVoiceSample.sln b/samples/custom-voice/csharp/CustomVoiceSample.sln deleted file mode 100644 index 68ecacf71..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample.sln +++ /dev/null @@ -1,25 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 17 -VisualStudioVersion = 17.8.34601.278 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CustomVoiceSample", "CustomVoiceSample\CustomVoiceSample.csproj", "{9BACF1B3-A122-43E8-8484-9689EB0934D2}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Any CPU = Debug|Any CPU - Release|Any CPU = Release|Any CPU - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {9BACF1B3-A122-43E8-8484-9689EB0934D2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {9BACF1B3-A122-43E8-8484-9689EB0934D2}.Debug|Any CPU.Build.0 = Debug|Any CPU - {9BACF1B3-A122-43E8-8484-9689EB0934D2}.Release|Any CPU.ActiveCfg = Release|Any CPU - {9BACF1B3-A122-43E8-8484-9689EB0934D2}.Release|Any CPU.Build.0 = Release|Any CPU - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {54274AE0-DEA5-4638-A157-6A30B5008CF4} - EndGlobalSection -EndGlobal diff --git a/samples/custom-voice/csharp/CustomVoiceSample/CustomVoiceClient.cs b/samples/custom-voice/csharp/CustomVoiceSample/CustomVoiceClient.cs deleted file mode 100644 index c67ee7595..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/CustomVoiceClient.cs +++ /dev/null @@ -1,632 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
-// - -using System.Net.Http.Formatting; -using System.Net.Http.Headers; -using Newtonsoft.Json; -using Newtonsoft.Json.Converters; - -public class CustomVoiceClient -{ - private const string OcpApimSubscriptionKey = "Ocp-Apim-Subscription-Key"; - private const string ApiVersion = "api-version=2023-12-01-preview"; - - private readonly string baseUri; - - private readonly HttpClient client; - - public static JsonSerializerSettings ReaderSettings { get; } = new JsonSerializerSettings - { - ConstructorHandling = ConstructorHandling.AllowNonPublicDefaultConstructor, - Converters = new List { new StringEnumConverter() { AllowIntegerValues = true } }, - Formatting = Formatting.Indented - }; - - public static JsonSerializerSettings WriterSettings { get; } = new JsonSerializerSettings - { - ConstructorHandling = ConstructorHandling.AllowNonPublicDefaultConstructor, - Converters = new List { new StringEnumConverter() { AllowIntegerValues = false } }, - DateFormatString = "yyyy-MM-ddTHH\\:mm\\:ss.fffZ", - NullValueHandling = NullValueHandling.Ignore, - Formatting = Formatting.Indented, - ReferenceLoopHandling = ReferenceLoopHandling.Ignore - }; - - public CustomVoiceClient(string region, string key) - { - this.baseUri = $"https://{region}.api.cognitive.microsoft.com/customvoice"; - - this.client = new HttpClient(); - client.DefaultRequestHeaders.Add(OcpApimSubscriptionKey, key); - } - - - #region Project Operations - - public async Task> GetAllProjectsAsync() - { - var projects = new List(); - var uri = new Uri($"{this.baseUri}/projects?{ApiVersion}"); - do - { - var response = await this.client.GetAsync(uri).ConfigureAwait(false); - - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var pagedProjects = await response.Content.ReadAsAsync>().ConfigureAwait(false); - projects.AddRange(pagedProjects.Value); - uri = pagedProjects.NextLink; - } - while (uri != null); - - return projects; - } - - public async Task GetProjectAsync(string projectId) - { - var uri = new Uri($"{this.baseUri}/projects/{projectId}?{ApiVersion}"); - var response = await this.client.GetAsync(uri).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - return await response.Content.ReadAsAsync().ConfigureAwait(false); - } - - public async Task CreateProjectAsync( - string projectId, - ProjectKind projectKind, - string description) - { - var uri = new Uri($"{this.baseUri}/projects/{projectId}?{ApiVersion}"); - - var projectDefinition = new Project - { - Description = description, - Kind = projectKind, - }; - - var content = new StringContent(JsonConvert.SerializeObject(projectDefinition, WriterSettings)); - content.Headers.ContentType = JsonMediaTypeFormatter.DefaultMediaType; - var response = await this.client.PutAsync(uri, content).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var project = await response.Content.ReadAsAsync().ConfigureAwait(false); - - return project; - } - - public async Task DeleteProjectAsync(string projectId) - { - var uri = new Uri($"{this.baseUri}/projects/{projectId}?{ApiVersion}"); - var response = await this.client.DeleteAsync(uri).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - } - - #endregion - - #region Consent Operations - - public async Task> GetAllConsentsAsync() - { - var consents = new List(); - var uri = new Uri($"{this.baseUri}/consents?{ApiVersion}"); - do - { - var response = 
await this.client.GetAsync(uri).ConfigureAwait(false); - - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var pagedConsents = await response.Content.ReadAsAsync>().ConfigureAwait(false); - consents.AddRange(pagedConsents.Value); - uri = pagedConsents.NextLink; - } - while (uri != null); - - return consents; - } - - public async Task GetConsentAsync(string consentId) - { - var uri = new Uri($"{this.baseUri}/consents/{consentId}?{ApiVersion}"); - var response = await this.client.GetAsync(uri).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var consent = await response.Content.ReadAsAsync().ConfigureAwait(false); - - return consent; - } - - public async Task CreateConsentAsync( - string consentId, - string projectId, - string voiceTalentName, - string companyName, - string locale, - Uri audioUrl) - { - var uri = new Uri($"{this.baseUri}/consents/{consentId}?{ApiVersion}"); - - var consentDefinition = new Consent - { - ProjectId = projectId, - VoiceTalentName = voiceTalentName, - CompanyName = companyName, - Locale = locale, - AudioUrl = audioUrl - }; - - var content = new StringContent(JsonConvert.SerializeObject(consentDefinition, WriterSettings)); - content.Headers.ContentType = JsonMediaTypeFormatter.DefaultMediaType; - var response = await this.client.PutAsync(uri, content).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var consent = await response.Content.ReadAsAsync().ConfigureAwait(false); - - // Wait for consent ready. It takes 2-3 seconds. - while (consent.Status != Status.Succeeded && consent.Status != Status.Failed) - { - await Task.Delay(1000).ConfigureAwait(false); - consent = await this.GetConsentAsync(consentId).ConfigureAwait(false); - } - - return consent; - } - - public async Task UploadConsentAsync( - string consentId, - string projectId, - string voiceTalentName, - string companyName, - string locale, - string audioFilePath) - { - var uri = new Uri($"{this.baseUri}/consents/{consentId}?{ApiVersion}"); - - var audioDataConent = new StreamContent(File.OpenRead(audioFilePath)); - audioDataConent.Headers.ContentType = new MediaTypeHeaderValue("audio/wav"); - - var multipartContent = new MultipartFormDataContent - { - { new StringContent(projectId), "projectId" }, - { new StringContent(voiceTalentName), "voiceTalentName" }, - { new StringContent(companyName), "companyName" }, - { new StringContent(locale), "locale" }, - { audioDataConent, "audioData", Path.GetFileName(audioFilePath) } - }; - - var response = await this.client.PostAsync(uri, multipartContent).ConfigureAwait(false); - - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var consent = await response.Content.ReadAsAsync().ConfigureAwait(false); - - // Wait for consent ready. It takes 2-3 seconds. 
- while (consent.Status != Status.Succeeded && consent.Status != Status.Failed) - { - await Task.Delay(1000).ConfigureAwait(false); - consent = await this.GetConsentAsync(consentId).ConfigureAwait(false); - } - - return consent; - } - - public async Task DeleteConsentAsync(string consentId) - { - var uri = new Uri($"{this.baseUri}/consents/{consentId}?{ApiVersion}"); - var response = await this.client.DeleteAsync(uri).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - } - - #endregion - - #region TrainingSet Operations - - public async Task> GetAllTrainingSetsAsync() - { - var trainingSets = new List(); - var uri = new Uri($"{this.baseUri}/trainingsets?{ApiVersion}"); - do - { - var response = await this.client.GetAsync(uri).ConfigureAwait(false); - - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var pagedTrainingSets = await response.Content.ReadAsAsync>().ConfigureAwait(false); - trainingSets.AddRange(pagedTrainingSets.Value); - uri = pagedTrainingSets.NextLink; - } - while (uri != null); - - return trainingSets; - } - - public async Task GetTrainingSetAsync(string trainingSetId) - { - var uri = new Uri($"{this.baseUri}/trainingsets/{trainingSetId}?{ApiVersion}"); - var response = await this.client.GetAsync(uri).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var trainingSet = await response.Content.ReadAsAsync().ConfigureAwait(false); - - return trainingSet; - } - - public async Task CreateTrainingSetAsync( - string trainingSetId, - string projectId, - string description, - string locale) - { - var uri = new Uri($"{this.baseUri}/trainingsets/{trainingSetId}?{ApiVersion}"); - - var trainingSetDefinition = new TrainingSet - { - ProjectId = projectId, - Description = description, - Locale = locale - }; - - var content = new StringContent(JsonConvert.SerializeObject(trainingSetDefinition, WriterSettings)); - content.Headers.ContentType = JsonMediaTypeFormatter.DefaultMediaType; - var response = await this.client.PutAsync(uri, content).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var trainingSet = await response.Content.ReadAsAsync().ConfigureAwait(false); - - return trainingSet; - } - - public async Task DeleteTrainingSetAsync(string trainingSetId) - { - var uri = new Uri($"{this.baseUri}/trainingsets/{trainingSetId}?{ApiVersion}"); - var response = await this.client.DeleteAsync(uri).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - } - - public async Task UploadDataThroughAzureBlobAsync( - string trainingSetId, - DatasetKind kind, - AzureBlobContentSource audios, - AzureBlobContentSource scripts) - { - var uri = new Uri($"{this.baseUri}/trainingsets/{trainingSetId}:upload?{ApiVersion}"); - - var datasetDefinition = new Dataset - { - Kind = kind, - Audios = audios, - Scripts = scripts - }; - - var content = new StringContent(JsonConvert.SerializeObject(datasetDefinition, WriterSettings)); - content.Headers.ContentType = JsonMediaTypeFormatter.DefaultMediaType; - var response = await this.client.PostAsync(uri, content).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - } - - #endregion - - #region Model Operations - - public async Task> GetAllModelsAsync() - { - var models = new List(); - var uri = new Uri($"{this.baseUri}/models?{ApiVersion}"); - do - { 
- var response = await this.client.GetAsync(uri).ConfigureAwait(false); - - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var pagedModels = await response.Content.ReadAsAsync>().ConfigureAwait(false); - models.AddRange(pagedModels.Value); - uri = pagedModels.NextLink; - } - while (uri != null); - - return models; - } - - public async Task GetModelAsync(string modelId) - { - var uri = new Uri($"{this.baseUri}/models/{modelId}?{ApiVersion}"); - var response = await this.client.GetAsync(uri).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var model = await response.Content.ReadAsAsync().ConfigureAwait(false); - - return model; - } - - public async Task CreateModelAsync( - string modelId, - string projectId, - string description, - string consentId, - string trainingSetId, - string voiceName, - RecipeKind recipeKind, - string locale, - ModelProperties properties) - { - if (recipeKind == RecipeKind.Default && !string.IsNullOrEmpty(locale)) - { - throw new ArgumentException("Do not need 'locale' parameter for Default recipe."); - } - - if (recipeKind == RecipeKind.CrossLingual && string.IsNullOrEmpty(locale)) - { - throw new ArgumentException("Need 'locale' parameter to specify the locale of voice model for CrossLingual recipe."); - } - - if (recipeKind == RecipeKind.MultiStyle && properties == null) - { - throw new ArgumentException("Need 'properties' parameter to specify style for MultiStyle recipe."); - } - - var uri = new Uri($"{this.baseUri}/models/{modelId}?{ApiVersion}"); - - var modelDefinition = new Model - { - ProjectId = projectId, - Description = description, - ConsentId = consentId, - TrainingSetId = trainingSetId, - VoiceName = voiceName, - Recipe = new Recipe { Kind = recipeKind }, - Locale = locale, - Properties = properties - }; - - var content = new StringContent(JsonConvert.SerializeObject(modelDefinition, WriterSettings)); - content.Headers.ContentType = JsonMediaTypeFormatter.DefaultMediaType; - var response = await this.client.PutAsync(uri, content).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var model = await response.Content.ReadAsAsync().ConfigureAwait(false); - - return model; - } - - public async Task DeleteModelAsync(string modelId) - { - var uri = new Uri($"{this.baseUri}/models/{modelId}?{ApiVersion}"); - var response = await this.client.DeleteAsync(uri).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - } - - #endregion - - #region PersonalVoice Operations - - public async Task> GetAllPersonalVoicesAsync() - { - var personalVoices = new List(); - var uri = new Uri($"{this.baseUri}/personalvoices?{ApiVersion}"); - do - { - var response = await this.client.GetAsync(uri).ConfigureAwait(false); - - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var pagedPersonalVoices = await response.Content.ReadAsAsync>().ConfigureAwait(false); - personalVoices.AddRange(pagedPersonalVoices.Value); - uri = pagedPersonalVoices.NextLink; - } - while (uri != null); - - return personalVoices; - } - - public async Task GetPersonalVoiceAsync(string personalVoiceId) - { - var uri = new Uri($"{this.baseUri}/personalvoices/{personalVoiceId}?{ApiVersion}"); - var response = await this.client.GetAsync(uri).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var personalVoice 
= await response.Content.ReadAsAsync().ConfigureAwait(false); - - return personalVoice; - } - - public async Task CreatePersonalVoiceAsync( - string personalVoiceId, - string projectId, - string description, - string consentId, - string audiosFolder) - { - if (!Directory.Exists(audiosFolder)) - { - throw new ArgumentException($"Can't find '{audiosFolder}'."); - } - - var uri = new Uri($"{this.baseUri}/personalvoices/{personalVoiceId}?{ApiVersion}"); - - var multipartContent = new MultipartFormDataContent - { - { new StringContent(projectId), "projectId" }, - { new StringContent(consentId), "consentId" }, - { new StringContent(description), "description" } - }; - - foreach (var file in Directory.EnumerateFiles(audiosFolder)) - { - multipartContent.Add(new StreamContent(File.OpenRead(file)), "audioData", Path.GetFileName(file)); - } - - var response = await this.client.PostAsync(uri, multipartContent).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var personalVoice = await response.Content.ReadAsAsync().ConfigureAwait(false); - - // Wait for consent ready. It takes 2-3 seconds. - while (personalVoice.Status != Status.Succeeded && personalVoice.Status != Status.Failed) - { - await Task.Delay(1000).ConfigureAwait(false); - personalVoice = await this.GetPersonalVoiceAsync(personalVoiceId).ConfigureAwait(false); - } - - return personalVoice; - } - - public async Task DeletePersonalVoiceAsync(string personalVoiceId) - { - var uri = new Uri($"{this.baseUri}/personalvoices/{personalVoiceId}?{ApiVersion}"); - var response = await this.client.DeleteAsync(uri).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - } - - #endregion - - #region Endpoint Operations - - public async Task> GetAllEndpointsAsync() - { - var endpoints = new List(); - var uri = new Uri($"{this.baseUri}/endpoints?{ApiVersion}"); - do - { - var response = await this.client.GetAsync(uri).ConfigureAwait(false); - - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var pagedEndpoints = await response.Content.ReadAsAsync>().ConfigureAwait(false); - endpoints.AddRange(pagedEndpoints.Value); - uri = pagedEndpoints.NextLink; - } - while (uri != null); - - return endpoints; - } - - public async Task GetEndpointAsync(Guid endpointId) - { - var uri = new Uri($"{this.baseUri}/endpoints/{endpointId}?{ApiVersion}"); - var response = await this.client.GetAsync(uri).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var endpoint = await response.Content.ReadAsAsync().ConfigureAwait(false); - - return endpoint; - } - - public async Task CreateEndpointAsync( - Guid endpointId, - string projectId, - string description, - string modelId) - { - var uri = new Uri($"{this.baseUri}/endpoints/{endpointId}?{ApiVersion}"); - - var endpointDefinition = new Endpoint - { - ProjectId = projectId, - Description = description, - ModelId = modelId - }; - - var content = new StringContent(JsonConvert.SerializeObject(endpointDefinition, WriterSettings)); - content.Headers.ContentType = JsonMediaTypeFormatter.DefaultMediaType; - var response = await this.client.PutAsync(uri, content).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - - var endpoint = await response.Content.ReadAsAsync().ConfigureAwait(false); - - return endpoint; - } - - public async Task DeleteEndpointAsync(Guid endpointId) - { - 
var uri = new Uri($"{this.baseUri}/endpoints/{endpointId}?{ApiVersion}"); - var response = await this.client.DeleteAsync(uri).ConfigureAwait(false); - if (!response.IsSuccessStatusCode) - { - await HandleErrorResponse(response); - } - } - - #endregion - - private static async Task HandleErrorResponse(HttpResponseMessage response) - { - var content = await response.Content.ReadAsStringAsync().ConfigureAwait(false); - throw new HttpRequestException($"Request failed with status code {response.StatusCode}. {content}"); - } -} \ No newline at end of file diff --git a/samples/custom-voice/csharp/CustomVoiceSample/CustomVoiceSample.csproj b/samples/custom-voice/csharp/CustomVoiceSample/CustomVoiceSample.csproj deleted file mode 100644 index 87642918e..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/CustomVoiceSample.csproj +++ /dev/null @@ -1,21 +0,0 @@ - - - Exe - net6.0 - enable - - - - - - - - - - PreserveNewest - - - PreserveNewest - - - diff --git a/samples/custom-voice/csharp/CustomVoiceSample/PersonalVoiceSample.cs b/samples/custom-voice/csharp/CustomVoiceSample/PersonalVoiceSample.cs deleted file mode 100644 index 093b2dada..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/PersonalVoiceSample.cs +++ /dev/null @@ -1,100 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -using Microsoft.CognitiveServices.Speech; -using Microsoft.CognitiveServices.Speech.Audio; - -public class PersonalVoiceSample -{ - // Update your key and region here. - private const string subscriptionKey = "YourSubscriptionKey"; - private const string region = "YourServiceRegion. E.g., 'eastus'"; // # eastus, westeurope, southeastasia - - public static async Task PersonalVoiceTestAsync() - { - var client = new CustomVoiceClient(region, subscriptionKey); - - var projectId = "personal-voice-project-1"; - var consentId = "personal-voice-consent-1"; - var personalVoiceId = "personal-voice-1"; - - try - { - Console.WriteLine("Personal voice test starts."); - - // Step 1: Create a project - var project = await client.CreateProjectAsync(projectId, ProjectKind.PersonalVoice, "Test project for personal voice"); - Console.WriteLine($"Project created. project id: {project.Id}"); - - // Step 2: Create a consent - var consent = await client.UploadConsentAsync( - consentId, - projectId, - "Sample Voice Actor", - "Contoso", - "en-US", - "TestData/VoiceTalentVerbalStatement.wav"); - Console.WriteLine($"Consent created. 
consent id: {consent.Id}"); - - // Step 3: Create a personal voice - var personalVoice = await client.CreatePersonalVoiceAsync( - personalVoiceId, - projectId, - "personal voice create test", - consentId, - "TestData/PersonalVoice").ConfigureAwait(false); - - // Step 4: speak test - var outputFilePath = "personalvoice_output.wav"; - await PersonalVoiceSpeechSynthesisToWaveFileAsync("This is a personal voice test!", outputFilePath, personalVoice.SpeakerProfileId); - } - catch (Exception ex) - { - Console.WriteLine($"Error: {ex.Message}"); - } - finally - { - // Step 5: clean up - Console.WriteLine("Uncomment below and Clean up resources if you don't need them."); - - // await client.DeleteConsentAsync(consentId).ConfigureAwait(false); - // await client.DeletePersonalVoiceAsync(personalVoiceId).ConfigureAwait(false); - // await client.DeleteProjectAsync(projectId).ConfigureAwait(false); - } - } - - private static async Task PersonalVoiceSpeechSynthesisToWaveFileAsync(string text, string outputFilePath, Guid speakerProfileId) - { - var speechConfig = SpeechConfig.FromSubscription(subscriptionKey, region); - using var audioConfig = AudioConfig.FromWavFileOutput(outputFilePath); - using var synthesizer = new SpeechSynthesizer(speechConfig, audioConfig); - - var ssml = $"" + - "" + - $"" + - "" + - $" {text} " + - "" + - " "; - - using var result = await synthesizer.SpeakSsmlAsync(ssml).ConfigureAwait(false); - - if (result.Reason == ResultReason.SynthesizingAudioCompleted) - { - Console.WriteLine($"Speech synthesis succeeded. The audio was saved to {outputFilePath}"); - } - else if (result.Reason == ResultReason.Canceled) - { - var cancellation = SpeechSynthesisCancellationDetails.FromResult(result); - Console.WriteLine($"Speech synthesis canceled: Reason={cancellation.Reason}"); - if (cancellation.Reason == CancellationReason.Error) - { - Console.WriteLine($"Result id: {result.ResultId}"); - Console.WriteLine($"Error details: {cancellation.ErrorDetails}"); - } - } - } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/ProfessionalVoiceSample.cs b/samples/custom-voice/csharp/CustomVoiceSample/ProfessionalVoiceSample.cs deleted file mode 100644 index dd1e800ac..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/ProfessionalVoiceSample.cs +++ /dev/null @@ -1,344 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -using Azure.Storage; -using Azure.Storage.Blobs; -using Azure.Storage.Sas; -using Microsoft.CognitiveServices.Speech; -using Microsoft.CognitiveServices.Speech.Audio; - -public class ProfessionalVoiceSample -{ - // Update your key and region here. - private const string subscriptionKey = "YourSubscriptionKey"; - private const string region = "YourServiceRegion. E.g., 'eastus'"; - - private const string projectId = "professional-voice-project-1"; - private const string consentId = "professional-voice-consent-1"; - private const string trainingSetId = "professional-voice-training-set-1"; - private const string modelId = "professional-voice-model-1"; - private static Guid endpointId = Guid.NewGuid(); - - // Below are parameters for blob storage. You need to replace them with your own blob storage account information. - private const string blobAccountName = "YourBlobAccountName. 
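The personal-voice speak test above synthesizes through the Speech SDK with an SSML payload that references the `speakerProfileId` returned for the personal voice. A hedged sketch of that call using the Python Speech SDK (`azure-cognitiveservices-speech`), which the Python samples later in this change already depend on; the base voice name `DragonLatestNeural` and the `mstts:ttsembedding` element are assumptions and may differ from the service's current contract:

```python
import azure.cognitiveservices.speech as speechsdk

SUBSCRIPTION_KEY = "YourSubscriptionKey"   # placeholders, as in the samples
REGION = "eastus"


def personal_voice_to_wave_file(text: str, output_file_path: str, speaker_profile_id: str):
    speech_config = speechsdk.SpeechConfig(subscription=SUBSCRIPTION_KEY, region=REGION)
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file_path)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

    # 'DragonLatestNeural' is assumed here as the personal-voice base model name.
    ssml = (
        "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' "
        "xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'>"
        "<voice name='DragonLatestNeural'>"
        f"<mstts:ttsembedding speakerProfileId='{speaker_profile_id}'>{text}</mstts:ttsembedding>"
        "</voice></speak>"
    )

    result = synthesizer.speak_ssml_async(ssml).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"Speech synthesis succeeded. The audio was saved to {output_file_path}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        details = result.cancellation_details
        print(f"Speech synthesis canceled: Reason={details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")
    return result
```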
E.g., 'voicetest'."; - private const string blobAccountKey = "YourBlobAccountKey"; - private const string containerName = "YourContainerName."; - private const string consentblobPrefix = "YourBlobPrefix. E.g., consents"; - private const string trainingDataBlobPrefix = "YourTrainingDataBlobPrefix. E.g., professionalvoice/trainingset"; - - private static CustomVoiceClient client = new CustomVoiceClient(region, subscriptionKey); - - public static async Task ProfessionalVoiceTestAsync() - { - try - { - // Step 1: Create a project - var project = await CreateProjectAsync().ConfigureAwait(false); - - // Step 2: Create a consent - var consent = await CreateConsentAsync().ConfigureAwait(false); - - // Step 3: Create a training set and upload training data - var trainingSet = await UploadTrainingSetAsync().ConfigureAwait(false); - - // Step 4: Create a model - var model = await CreateModelAsync().ConfigureAwait(false); - - // Step 5: deploy model - var endpoint = await DeployModel().ConfigureAwait(false); - - // Step 6: speak test - await ProfessionalVoiceSpeechSynthesisToWaveFileAsync(model.VoiceName, endpoint.Id); - } - catch (Exception ex) - { - Console.WriteLine($"Error: {ex.Message}"); - } - finally - { - // Step 5: clean up - Console.WriteLine("Uncomment below and Clean up resources if you don't need them."); - - // await client.DeleteEndpointAsync(endpointId).ConfigureAwait(false); - // await client.DeleteConsentAsync(consentId).ConfigureAwait(false); - // await client.DeleteModelAsync(modelId).ConfigureAwait(false); - // await client.DeleteTrainingSetAsync(trainingSetId).ConfigureAwait(false); - // await client.DeleteProjectAsync(projectId).ConfigureAwait(false); - } - } - - private static async Task CreateProjectAsync() - { - var project = await client.CreateProjectAsync( - projectId, - ProjectKind.ProfessionalVoice, - "Test project for professional voice").ConfigureAwait(false); - Console.WriteLine($"Project created. project id: {project.Id}"); - - return project; - } - - private static async Task CreateConsentAsync() - { - // Step 1: Upload a consent audio file to blob storage - var audioUrl = await UploadSingleFileIntoBlobAsync( - "TestData/VoiceTalentVerbalStatement.wav", - blobAccountName, - blobAccountKey, - containerName, - consentblobPrefix).ConfigureAwait(false); - - // Step 2: Create a consent - var consent = await client.CreateConsentAsync( - consentId, - projectId, - "Sample Voice Actor", - "Contoso", - "en-US", - audioUrl).ConfigureAwait(false); - Console.WriteLine($"Consent created. consent id: {consent.Id}"); - - return consent; - } - - private static async Task UploadTrainingSetAsync() - { - // You can find sample script and audio file here. - // https://github.com/Azure-Samples/Cognitive-Speech-TTS/blob/master/CustomVoice/Sample%20Data/Individual%20utterances%20%2B%20matching%20script/SampleScript.txt - // https://github.com/Azure-Samples/Cognitive-Speech-TTS/blob/master/CustomVoice/Sample%20Data/Individual%20utterances%20%2B%20matching%20script/SampleAudios.zip - //Pleae unzip audio file. Put both audio and script file in foler below. - var localTrainingDataFolder = "YourLocalTrainingDataFolder. E.g., C:\\trainingset"; - - // Step 3.1: Create a training set - var trainingSet = await client.CreateTrainingSetAsync( - trainingSetId, - projectId, - "Test training set", - "en-US").ConfigureAwait(false); - Console.WriteLine($"Training set created. 
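For the professional-voice path the consent audio is not sent as multipart form data; it is uploaded to blob storage first and the consent is created from a SAS URL. A rough sketch of that request with `requests`; the `audioUrl` property name follows the deleted `Consent` DTO, and the JSON PUT form (rather than the multipart POST used by the personal-voice path) is an assumption modeled on the client's other PUT-style creates:

```python
import requests

SUBSCRIPTION_KEY = "YourSubscriptionKey"   # placeholders, as in the samples
REGION = "eastus"
API_VERSION = "api-version=2024-02-01-preview"
BASE_URL = f"https://{REGION}.api.cognitive.microsoft.com/customvoice/"


def create_consent_from_blob(consent_id: str, project_id: str, audio_sas_url: str):
    """Create a consent whose verbal-statement audio already lives in blob storage."""
    url = f"{BASE_URL}consents/{consent_id}?{API_VERSION}"
    headers = {"Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY}
    body = {
        "projectId": project_id,
        "voiceTalentName": "Sample Voice Actor",
        "companyName": "Contoso",
        "locale": "en-US",
        "audioUrl": audio_sas_url,   # property name taken from the Consent DTO; payload shape is an assumption
    }
    response = requests.put(url, json=body, headers=headers)
    response.raise_for_status()
    return response.json()
```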
training set id: {trainingSet.Id}"); - - // Step 3.2: Upload training data to blob storage - var trainingDataUrl = await UploadFolderIntoBlobAsync( - localTrainingDataFolder, - blobAccountName, - blobAccountKey, - containerName, - trainingDataBlobPrefix).ConfigureAwait(false); - - // Step 3.3: Add training data to the training set - var audios = new AzureBlobContentSource() - { - ContainerUrl = trainingDataUrl, - Prefix = trainingDataBlobPrefix, - Extensions = new List { ".wav" } - }; - - var scripts = new AzureBlobContentSource() - { - ContainerUrl = trainingDataUrl, - Prefix = trainingDataBlobPrefix, - Extensions = new List { ".txt" } - }; - - await client.UploadDataThroughAzureBlobAsync(trainingSetId, DatasetKind.AudioAndScript, audios, scripts).ConfigureAwait(false); - - Console.Write("Uploading data into training set. It takes around 5 minutes to 1 hour depend on data size."); - - do - { - await Task.Delay(1000 * 10).ConfigureAwait(false); - trainingSet = await client.GetTrainingSetAsync(trainingSetId).ConfigureAwait(false); - Console.Write("."); - } - while (trainingSet.Status != Status.Succeeded && trainingSet.Status != Status.Failed); - - if (trainingSet.Status == Status.Failed) - { - throw new InvalidOperationException("Training set upload failed."); - } - - return trainingSet; - } - - private static async Task CreateModelAsync() - { - var model = await client.CreateModelAsync( - modelId, - projectId, - "Test model", - consentId, - trainingSetId, - "SampleVoiceNeural", - RecipeKind.Default, - locale: null, - properties: null).ConfigureAwait(false); - - Console.WriteLine(); - Console.WriteLine($"Model created. model id: {model.Id}"); - - Console.Write("Model is training in server. It takes around 24 hours."); - while (model.Status != Status.Succeeded && model.Status != Status.Failed) - { - Console.Write("."); - await Task.Delay(1000 * 300).ConfigureAwait(false); - model = await client.GetModelAsync(modelId).ConfigureAwait(false); - } - - if (model.Status == Status.Failed) - { - throw new InvalidOperationException("Model training failed."); - } - - return model; - } - - private static async Task DeployModel() - { - var endpoint = await client.CreateEndpointAsync( - endpointId, - projectId, - "Test endpoint", - modelId).ConfigureAwait(false); - - Console.WriteLine(); - Console.WriteLine($"Start deploying model. endpoint id: {endpoint.Id}"); - Console.Write("Deploying model. 
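Training-set ingestion, model training, and endpoint deployment all follow the same long-running pattern in this sample: create the resource, then poll it until its status reaches `Succeeded` or `Failed`. A generic sketch of that polling loop with `requests`, using the status values defined by the sample's `Status` enum:

```python
import time

import requests

SUBSCRIPTION_KEY = "YourSubscriptionKey"   # placeholders, as in the samples
REGION = "eastus"
API_VERSION = "api-version=2024-02-01-preview"
BASE_URL = f"https://{REGION}.api.cognitive.microsoft.com/customvoice/"
HEADERS = {"Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY}


def wait_until_terminal(resource_path: str, poll_seconds: int = 10):
    """Poll a resource such as 'trainingsets/my-set' or 'models/my-model'
    until its status is 'Succeeded' or 'Failed', then return the resource."""
    url = f"{BASE_URL}{resource_path}?{API_VERSION}"
    while True:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        resource = response.json()
        if resource.get("status") in ("Succeeded", "Failed"):
            return resource
        # Model training can take around 24 hours; use a much longer interval there.
        time.sleep(poll_seconds)
```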
It takes around 1 to 5 minutes."); - while (endpoint.Status != Status.Succeeded && endpoint.Status != Status.Failed) - { - Console.Write("."); - await Task.Delay(1000 * 10).ConfigureAwait(false); - endpoint = await client.GetEndpointAsync(endpointId).ConfigureAwait(false); - } - - if (endpoint.Status == Status.Failed) - { - throw new InvalidOperationException("Model deployment failed."); - } - - return endpoint; - } - - // blobPrefix is the folder name in the blob container - private static async Task UploadSingleFileIntoBlobAsync( - string localFilePath, - string blobAccountName, - string blobAccountKey, - string containerName, - string blobPrefix) - { - // Put your storage account name and key here - var storageSharedKeyCredential = new StorageSharedKeyCredential(blobAccountName, blobAccountKey); - var blobServiceClient = new BlobServiceClient(new Uri($"https://{blobAccountName}.blob.core.windows.net"), storageSharedKeyCredential); - var containerClient = blobServiceClient.GetBlobContainerClient(containerName); - var blobName = $"{blobPrefix}/{Path.GetFileName(localFilePath)}"; - var blobClient = containerClient.GetBlobClient(blobName); - using var fileStream = File.OpenRead(localFilePath); - await blobClient.UploadAsync(fileStream, true).ConfigureAwait(false); - - // Generate a SAS token for the blob - var sasUri = GenerateBlobSasTokenUrl(blobAccountName, blobAccountKey, containerName, blobName); - return sasUri; - } - - // Upload local folder to blob container - private static async Task UploadFolderIntoBlobAsync( - string localFolderPath, - string blobAccountName, - string blobAccountKey, - string containerName, - string blobPrefix) - { - var storageSharedKeyCredential = new StorageSharedKeyCredential(blobAccountName, blobAccountKey); - var blobServiceClient = new BlobServiceClient(new Uri($"https://{blobAccountName}.blob.core.windows.net"), storageSharedKeyCredential); - var containerClient = blobServiceClient.GetBlobContainerClient(containerName); - - if (!Directory.Exists(localFolderPath)) - { - throw new ArgumentException($"Local folder {localFolderPath} does not exist."); - } - - var directory = new DirectoryInfo(localFolderPath); - foreach (var file in directory.GetFiles()) - { - var blobName = $"{blobPrefix}/{file.Name}"; - var blobClient = containerClient.GetBlobClient(blobName); - using var fileStream = File.OpenRead(file.FullName); - await blobClient.UploadAsync(fileStream, true).ConfigureAwait(false); - } - - // Generate a SAS token for the blob - var sasUri = GenerateContainerSasTokenUrl(blobAccountName, blobAccountKey, containerName); - return sasUri; - } - - private static Uri GenerateBlobSasTokenUrl(string accountName, string accountKey, string containerName, string blobPrefix) - { - var staorageSharedKeyCredential = new StorageSharedKeyCredential(accountName, accountKey); - var blobServiceClient = new BlobServiceClient(new Uri($"https://{accountName}.blob.core.windows.net"), staorageSharedKeyCredential); - var containerClient = blobServiceClient.GetBlobContainerClient(containerName); - var sasBuilder = new BlobSasBuilder - { - BlobContainerName = containerName, - BlobName = blobPrefix, - Resource = "b", - StartsOn = DateTimeOffset.UtcNow, - ExpiresOn = DateTimeOffset.UtcNow.AddHours(3) - }; - sasBuilder.SetPermissions(BlobSasPermissions.Read); - var sasToken = sasBuilder.ToSasQueryParameters(staorageSharedKeyCredential); - var sasUri = new Uri($"{containerClient.Uri}/{blobPrefix}?{sasToken}"); - return sasUri; - } - - private static Uri 
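The training data is handed to the service through a container SAS URL with read and list permissions, valid for a few hours. A rough Python equivalent of the upload-and-SAS helpers above, assuming the `azure-storage-blob` package (the deleted sample itself uses the .NET `Azure.Storage.Blobs` client):

```python
import os
from datetime import datetime, timedelta

from azure.storage.blob import (
    BlobServiceClient,
    ContainerSasPermissions,
    generate_container_sas,
)


def upload_folder_and_get_container_sas(local_folder, account_name, account_key, container_name, blob_prefix):
    """Upload every file in local_folder under blob_prefix and return a
    read+list container SAS URL that the Custom Voice service can read from."""
    service = BlobServiceClient(
        account_url=f"https://{account_name}.blob.core.windows.net",
        credential=account_key,
    )
    container = service.get_container_client(container_name)
    for name in os.listdir(local_folder):
        path = os.path.join(local_folder, name)
        if os.path.isfile(path):
            with open(path, "rb") as data:
                container.upload_blob(name=f"{blob_prefix}/{name}", data=data, overwrite=True)

    sas = generate_container_sas(
        account_name=account_name,
        container_name=container_name,
        account_key=account_key,
        permission=ContainerSasPermissions(read=True, list=True),
        expiry=datetime.utcnow() + timedelta(hours=3),   # 3-hour validity, as in the C# sample
    )
    return f"{container.url}?{sas}"
```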
GenerateContainerSasTokenUrl(string accountName, string accountKey, string containerName) - { - var staorageSharedKeyCredential = new StorageSharedKeyCredential(accountName, accountKey); - var blobServiceClient = new BlobServiceClient(new Uri($"https://{accountName}.blob.core.windows.net"), staorageSharedKeyCredential); - var containerClient = blobServiceClient.GetBlobContainerClient(containerName); - var sasBuilder = new BlobSasBuilder - { - BlobContainerName = containerName, - Resource = "c", - StartsOn = DateTimeOffset.UtcNow, - ExpiresOn = DateTimeOffset.UtcNow.AddHours(3) - }; - sasBuilder.SetPermissions(BlobContainerSasPermissions.Read | BlobContainerSasPermissions.List); - var sasToken = sasBuilder.ToSasQueryParameters(staorageSharedKeyCredential); - var sasUri = new Uri($"{containerClient.Uri}?{sasToken}"); - return sasUri; - } - - private static async Task ProfessionalVoiceSpeechSynthesisToWaveFileAsync( - string voiceName, - Guid endpointId) - { - var text = "This is a professional voice test!"; - var outputFilePath = "professionalvoice_output.mp3"; - - var speechConfig = SpeechConfig.FromSubscription(subscriptionKey, region); - speechConfig.EndpointId = endpointId.ToString(); - speechConfig.SpeechSynthesisVoiceName = voiceName; - speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Audio24Khz160KBitRateMonoMp3); - using var audioConfig = AudioConfig.FromWavFileOutput(outputFilePath); - using var synthesizer = new SpeechSynthesizer(speechConfig, audioConfig); - - using var result = await synthesizer.SpeakTextAsync(text).ConfigureAwait(false); - - if (result.Reason == ResultReason.SynthesizingAudioCompleted) - { - Console.WriteLine($"Speech synthesis succeeded. The audio was saved to {outputFilePath}"); - } - else if (result.Reason == ResultReason.Canceled) - { - var cancellation = SpeechSynthesisCancellationDetails.FromResult(result); - Console.WriteLine($"Speech synthesis canceled: Reason={cancellation.Reason}"); - if (cancellation.Reason == CancellationReason.Error) - { - Console.WriteLine($"Result id: {result.ResultId}"); - Console.WriteLine($"Error details: {cancellation.ErrorDetails}"); - } - } - } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/Program.cs b/samples/custom-voice/csharp/CustomVoiceSample/Program.cs deleted file mode 100644 index 7376daf39..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/Program.cs +++ /dev/null @@ -1,15 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -public class Program -{ - public static void Main(string[] args) - { - // Uncomment to run Professional Voice Sample - // ProfessionalVoiceSample.ProfessionalVoiceTestAsync().Wait(); - - PersonalVoiceSample.PersonalVoiceTestAsync().Wait(); - } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/TestData/License-MustRead.md b/samples/custom-voice/csharp/CustomVoiceSample/TestData/License-MustRead.md deleted file mode 100644 index e254c06dd..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/TestData/License-MustRead.md +++ /dev/null @@ -1,3 +0,0 @@ -# License - -This sample test data ("Dataset") owned by Microsoft includes a sample audio and a sample voice talent verbal statement. The Dataset can only be used as a reference for your training data collection and format checking when you start to use Microsoft Azure Cognitive Services Speech Studio and Custom Neural Voice ("Services"). 
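Once the model is deployed, synthesis targets the custom endpoint by setting the endpoint id and the custom voice name on the speech config. A sketch of the same call with the Python Speech SDK, which the Python samples below already depend on; the key, region, and voice name are placeholders:

```python
import azure.cognitiveservices.speech as speechsdk

SUBSCRIPTION_KEY = "YourSubscriptionKey"   # placeholders, as in the samples
REGION = "eastus"


def professional_voice_to_file(text: str, voice_name: str, endpoint_id: str, output_file_path: str):
    speech_config = speechsdk.SpeechConfig(subscription=SUBSCRIPTION_KEY, region=REGION)
    speech_config.endpoint_id = endpoint_id                   # the deployed custom endpoint
    speech_config.speech_synthesis_voice_name = voice_name    # e.g. 'SampleVoiceNeural'
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio24Khz160KBitRateMonoMp3)

    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file_path)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

    result = synthesizer.speak_text_async(text).get()
    if result.reason == speechsdk.ResultReason.Canceled:
        details = result.cancellation_details
        print(f"Speech synthesis canceled: Reason={details.reason}, Error={details.error_details}")
    return result
```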
To test the voice training process of the Services, you can upload this Dataset to your account at Microsoft Azure Cognitive Services Speech Studio and create a sample voice model. You may be charged for using the Services when you create the sample voice model using this Dataset. You are not permitted to use the Dataset in any other ways or for any other purposes that have not been stipulated hereby. diff --git a/samples/custom-voice/csharp/CustomVoiceSample/TestData/PersonalVoice/sample.wav b/samples/custom-voice/csharp/CustomVoiceSample/TestData/PersonalVoice/sample.wav deleted file mode 100644 index a084e878b..000000000 Binary files a/samples/custom-voice/csharp/CustomVoiceSample/TestData/PersonalVoice/sample.wav and /dev/null differ diff --git a/samples/custom-voice/csharp/CustomVoiceSample/TestData/VoiceTalentVerbalStatement.wav b/samples/custom-voice/csharp/CustomVoiceSample/TestData/VoiceTalentVerbalStatement.wav deleted file mode 100644 index d03ccd4cc..000000000 Binary files a/samples/custom-voice/csharp/CustomVoiceSample/TestData/VoiceTalentVerbalStatement.wav and /dev/null differ diff --git a/samples/custom-voice/csharp/CustomVoiceSample/dto/AzureBlobContentSource.cs b/samples/custom-voice/csharp/CustomVoiceSample/dto/AzureBlobContentSource.cs deleted file mode 100644 index ffc62a51b..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/dto/AzureBlobContentSource.cs +++ /dev/null @@ -1,13 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -public class AzureBlobContentSource -{ - public Uri ContainerUrl { get; set; } - - public string Prefix { get; set; } - - public IEnumerable Extensions { get; set; } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/dto/Consent.cs b/samples/custom-voice/csharp/CustomVoiceSample/dto/Consent.cs deleted file mode 100644 index d18ff29c1..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/dto/Consent.cs +++ /dev/null @@ -1,27 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -public class Consent -{ - public string Id { get; set; } - - public string Description { get; set; } - - public string VoiceTalentName { get; set; } - - public string CompanyName { get; set; } - - public string Locale { get; set; } - - public string ProjectId { get; set; } - - public Uri AudioUrl { get; set; } - - public DateTime CreatedDateTime { get; set; } - - public DateTime LastActionDateTime { get; set; } - - public Status? Status { get; set; } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/dto/Dataset.cs b/samples/custom-voice/csharp/CustomVoiceSample/dto/Dataset.cs deleted file mode 100644 index fd87cf9ff..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/dto/Dataset.cs +++ /dev/null @@ -1,20 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
-// - -public enum DatasetKind -{ - AudioAndScript = 1, - LongAudio = 2, - AudioOnly = 3 -} - -public class Dataset -{ - public DatasetKind Kind { get; set; } - - public AzureBlobContentSource Audios { get; set; } - - public AzureBlobContentSource Scripts { get; set; } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/dto/Endpoint.cs b/samples/custom-voice/csharp/CustomVoiceSample/dto/Endpoint.cs deleted file mode 100644 index 11a4aec2d..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/dto/Endpoint.cs +++ /dev/null @@ -1,21 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -public class Endpoint -{ - public Guid Id { get; set; } - - public string Description { get; set; } - - public string ModelId { get; set; } - - public string ProjectId { get; set; } - - public DateTime CreatedDateTime { get; set; } - - public DateTime LastActionDateTime { get; set; } - - public Status? Status { get; set; } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/dto/Model.cs b/samples/custom-voice/csharp/CustomVoiceSample/dto/Model.cs deleted file mode 100644 index 949f67d10..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/dto/Model.cs +++ /dev/null @@ -1,33 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -public class Model -{ - public string Id { get; set; } - - public string VoiceName { get; set; } - - public string Description { get; set; } - - public Recipe Recipe { get; set; } - - public string Locale { get; set; } - - public string TrainingSetId { get; set; } - - public string ProjectId { get; set; } - - public string ConsentId { get; set; } - - public ModelProperties Properties { get; set; } - - public string EngineVersion { get; set; } - - public DateTime CreatedDateTime { get; set; } - - public DateTime LastActionDateTime { get; set; } - - public Status? Status { get; set; } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/dto/ModelProperties.cs b/samples/custom-voice/csharp/CustomVoiceSample/dto/ModelProperties.cs deleted file mode 100644 index c83e96d0d..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/dto/ModelProperties.cs +++ /dev/null @@ -1,13 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -public class ModelProperties -{ - public string FailureReason { get; set; } - - public IEnumerable PresetStyles { get; set; } - - public IReadOnlyDictionary StyleTrainingSetIds { get; set; } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/dto/PaginatedResources.cs b/samples/custom-voice/csharp/CustomVoiceSample/dto/PaginatedResources.cs deleted file mode 100644 index 8f311d5b3..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/dto/PaginatedResources.cs +++ /dev/null @@ -1,11 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
-// - -public class PaginatedResources -{ - public IEnumerable Value { get; set; } - - public Uri NextLink { get; set; } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/dto/PersonalVoice.cs b/samples/custom-voice/csharp/CustomVoiceSample/dto/PersonalVoice.cs deleted file mode 100644 index f15a76fe5..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/dto/PersonalVoice.cs +++ /dev/null @@ -1,27 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -public class PersonalVoice -{ - public string Id { get; set; } - - public string Description { get; set; } - - public string ConsentId { get; set; } - - public AzureBlobContentSource Audios { get; set; } - - public string ProjectId { get; set; } - - public IReadOnlyDictionary Properties { get; set; } - - public DateTime CreatedDateTime { get; set; } - - public DateTime LastActionDateTime { get; set; } - - public Status? Status { get; set; } - - public Guid SpeakerProfileId { get; set; } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/dto/Project.cs b/samples/custom-voice/csharp/CustomVoiceSample/dto/Project.cs deleted file mode 100644 index 4c4bb6bb1..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/dto/Project.cs +++ /dev/null @@ -1,26 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -public enum ProjectKind -{ - ProfessionalVoice, - PersonalVoice -} - -public class Project -{ - public string Id { get; set; } - - public string Description { get; set; } - - public ProjectKind Kind { get; set; } - - /// - /// The time-stamp when the object was created. - /// The time stamp is encoded as ISO 8601 date and time format - /// ("YYYY-MM-DDThh:mm:ssZ", see https://en.wikipedia.org/wiki/ISO_8601#Combined_date_and_time_representations). - /// - public DateTime CreatedDateTime { get; set; } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/dto/Recipe.cs b/samples/custom-voice/csharp/CustomVoiceSample/dto/Recipe.cs deleted file mode 100644 index 2fb887d0d..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/dto/Recipe.cs +++ /dev/null @@ -1,20 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -public enum RecipeKind -{ - Default = 1, - CrossLingual = 2, - MultiStyle = 3 -} - -public class Recipe -{ - public string Version { get; set; } - - public RecipeKind Kind { get; set; } - - public string Description { get; set; } -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/dto/Status.cs b/samples/custom-voice/csharp/CustomVoiceSample/dto/Status.cs deleted file mode 100644 index b51299312..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/dto/Status.cs +++ /dev/null @@ -1,14 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
-// - -public enum Status -{ - NotStarted = 1, - Running = 2, - Succeeded = 3, - Failed = 4, - Disabling = 5, - Disabled = 6 -} diff --git a/samples/custom-voice/csharp/CustomVoiceSample/dto/TrainingSet.cs b/samples/custom-voice/csharp/CustomVoiceSample/dto/TrainingSet.cs deleted file mode 100644 index 310e7f2cc..000000000 --- a/samples/custom-voice/csharp/CustomVoiceSample/dto/TrainingSet.cs +++ /dev/null @@ -1,21 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -public class TrainingSet -{ - public string Id { get; set; } - - public string Description { get; set; } - - public string Locale { get; set; } - - public string ProjectId { get; set; } - - public DateTime CreatedDateTime { get; set; } - - public DateTime LastActionDateTime { get; set; } - - public Status? Status { get; set; } -} diff --git a/samples/custom-voice/python/README.md b/samples/custom-voice/python/README.md deleted file mode 100644 index 4f94cf5b5..000000000 --- a/samples/custom-voice/python/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# How to use the Speech Services Custom Voice API from Python - -## Install dependencies - -The sample uses the `requests` and `Cognitive Services Speech SDK` library. You can install it with the command - -```sh -pip install requests -pip install azure-cognitiveservices-speech -``` - -## Run the sample code - -The professional voice sample code is [professional_voice_sample.py](professional_voice_sample.py). The personal voice sample code is [personal_voice_sample.py](personal_voice_sample.py). They can be run using Python 3.7 or higher. - diff --git a/samples/custom-voice/python/TestData/License-MustRead.md b/samples/custom-voice/python/TestData/License-MustRead.md deleted file mode 100644 index 5ad52b187..000000000 --- a/samples/custom-voice/python/TestData/License-MustRead.md +++ /dev/null @@ -1,3 +0,0 @@ -# License - -This sample training set ("Dataset") owned by Microsoft includes a recording script, audios and a sample voice talent verbal statement. The Dataset can only be used as a reference for your training data collection and format checking when you start to use Microsoft Azure Cognitive Services Speech Studio and Custom Neural Voice ("Services"). To test the voice training process of the Services, you can upload this Dataset to your account at Microsoft Azure Cognitive Services Speech Studio and create a sample voice model. You may be charged for using the Services when you create the sample voice model using this Dataset. You are not permitted to use the Dataset in any other ways or for any other purposes that have not been stipulated hereby. 
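The Python `customvoice` helper package removed below wraps these REST calls behind a small object model. A short usage sketch, assuming the package folder sits next to the script as the Python README describes and that a key and region are available:

```python
import logging

import customvoice

# Placeholders, as in the samples.
config = customvoice.Config("YourSubscriptionKey", "eastus", logging.getLogger())

# List every project in the Speech resource and inspect its kind.
for project in customvoice.Project.list(config):
    print(project.id, project.kind)
```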
diff --git a/samples/custom-voice/python/TestData/VoiceTalentVerbalStatement.wav b/samples/custom-voice/python/TestData/VoiceTalentVerbalStatement.wav deleted file mode 100644 index d03ccd4cc..000000000 Binary files a/samples/custom-voice/python/TestData/VoiceTalentVerbalStatement.wav and /dev/null differ diff --git a/samples/custom-voice/python/TestData/voice/CNVSample192.wav b/samples/custom-voice/python/TestData/voice/CNVSample192.wav deleted file mode 100644 index c2456a308..000000000 Binary files a/samples/custom-voice/python/TestData/voice/CNVSample192.wav and /dev/null differ diff --git a/samples/custom-voice/python/customvoice/__init__.py b/samples/custom-voice/python/customvoice/__init__.py deleted file mode 100644 index 0563dba66..000000000 --- a/samples/custom-voice/python/customvoice/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. - -from .config import * -from .customvoice_object import * -from .status_object import * -from .project import * -from .consent import * -from .training_set import * -from .model import * -from .endpoint import * -from .personal_voice import * \ No newline at end of file diff --git a/samples/custom-voice/python/customvoice/config.py b/samples/custom-voice/python/customvoice/config.py deleted file mode 100644 index d80d1b6c4..000000000 --- a/samples/custom-voice/python/customvoice/config.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. - -import logging - -class Config: - api_version = 'api-version=2024-02-01-preview' - - def __init__(self, key: str, region: str, logger: logging.Logger = None): - self.key = key - self.region = region - self.url_prefix = 'https://' + region + '.api.cognitive.microsoft.com/customvoice/' - if logger is None: - self.logger = logging.getLogger() - else: - self.logger = logger diff --git a/samples/custom-voice/python/customvoice/consent.py b/samples/custom-voice/python/customvoice/consent.py deleted file mode 100644 index 78b638b56..000000000 --- a/samples/custom-voice/python/customvoice/consent.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
- -import json -import requests -import logging -from time import sleep - -from .helper import * -from .config import Config -from .customvoice_object import CustomVoiceObject -from .status_object import * - - -class Consent(StatusObject): - def __init__(self, json_dict: dict): - super().__init__(json_dict) - if 'voiceTalentName' not in json_dict: - raise ValueError("could not find 'voiceTalentName' in json_dict") - self.voice_talent_name = json_dict['voiceTalentName'] - if 'companyName' not in json_dict: - raise ValueError("could not find 'companyName' in json_dict") - self.company_name = json_dict['companyName'] - if 'locale' not in json_dict: - raise ValueError("could not find 'locale' in json_dict") - self.locale = json_dict['locale'] - if 'projectId' not in json_dict: - raise ValueError("could not find 'projectId' in json_dict") - self.project_id = json_dict['projectId'] - - # get all consents in project - # when project_id is None, get all consents in current speech account - @staticmethod - def list(config: Config, project_id: str = None): - config.logger.debug('Consent.list') - consents = [] - api_url = config.url_prefix + 'consents' + '?' + config.api_version - if project_id is not None and len(project_id) > 0: - api_url += "&filter=projectId eq '%s'" % project_id - headers = {'Ocp-Apim-Subscription-Key':config.key} - while api_url is not None and len(api_url) > 0: - response = requests.get(api_url, headers=headers) - raise_exception_when_reqeust_failed('GET', api_url, response, config.logger) - response_dict= response.json() - for json_dict in response_dict['value']: - consent = Consent(json_dict) - consents.append(consent) - if 'nextLink' in response_dict: - api_url = response_dict['nextLink'] - else: - api_url = None - return consents - - @staticmethod - def get(config: Config, consent_id: str): - config.logger.debug('Consent.get consent_id = %s' % consent_id) - if consent_id is None or len(consent_id) == 0: - raise ValueError("'consent_id' is None or empty") - api_url = config.url_prefix + 'consents/' + consent_id + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - response = requests.get(api_url, headers=headers) - raise_exception_when_reqeust_failed('GET', api_url, response, config.logger) - consent = Consent(response.json()) - return consent - - @staticmethod - def create(config: Config, project_id: str, consent_id: str, voice_talent_name: str, company_name: str, - audio_file_path: str, locale: str, description: str = None): - config.logger.debug('Consent.create consent_id = %s' % consent_id) - if project_id is None or len(project_id) == 0: - raise ValueError("'project_id' is None or empty") - if consent_id is None or len(consent_id) == 0: - raise ValueError("'consent_id' is None or empty") - if voice_talent_name is None or len(voice_talent_name) == 0: - raise ValueError("'voice_talent_name' is None or empty") - if company_name is None or len(company_name) == 0: - raise ValueError("'company_name' is None or empty") - if audio_file_path is None or len(audio_file_path) == 0: - raise ValueError("'audio_file_path' is None or empty") - if locale is None or len(locale) == 0: - raise ValueError("'locale' is None or empty") - if not os.path.exists(audio_file_path): - raise ValueError("can't find file 'audio_file_path' = %s" % audio_file_path) - audio_file_name = os.path.basename(audio_file_path) - - headers = { 'Ocp-Apim-Subscription-Key': config.key } - api_url = config.url_prefix + 'consents/' + consent_id + '?' 
+ config.api_version - request_dict = { - 'projectId': project_id, - 'voiceTalentName': voice_talent_name, - 'companyName': company_name, - 'locale': locale, - 'description': description - } - file=('audiodata', (audio_file_name, open(audio_file_path, 'rb'), 'audio/wav')) - response = requests.post(api_url, data=request_dict, headers=headers, files=[file]) - raise_exception_when_reqeust_failed('POST', api_url, response, config.logger) - consent = Consent(response.json()) - consent_id = consent.id - - # Wait for consent ready. It takes 2-3 seconds. - while (consent.status != Status.Succeeded and consent.status != Status.Failed): - sleep(1) - consent = Consent.get(config, consent_id) - if consent.status == Status.Succeeded: - config.logger.debug('Consent.create succeeded consent_id = %s' % consent_id) - elif consent.status == Status.Failed: - config.logger.warning('Consent.create failed consent_id = %s' % consent_id) - return consent - - @staticmethod - def delete(config: Config, consent_id: str): - config.logger.debug('Consent.delete consent_id = %s' % consent_id) - if consent_id is None or len(consent_id) == 0: - raise ValueError("'consent_id' is None or empty") - api_url = config.url_prefix + 'consents/' + consent_id + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - response = requests.delete(api_url, headers=headers) - raise_exception_when_reqeust_failed('DELETE', api_url, response, config.logger) diff --git a/samples/custom-voice/python/customvoice/customvoice_object.py b/samples/custom-voice/python/customvoice/customvoice_object.py deleted file mode 100644 index fb04009f9..000000000 --- a/samples/custom-voice/python/customvoice/customvoice_object.py +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. - -import json - -class CustomVoiceObject(object): - def __init__(self, json_dict: dict): - if json_dict is None: - raise TypeError - if 'id' not in json_dict: - raise ValueError("could not find 'id' in json_dict") - self.id = json_dict['id'] - if 'displayName' in json_dict: - self.display_name = json_dict['displayName'] - else: - self.display_name = '' - if 'description' in json_dict: - self.description = json_dict['description'] - else: - self.description = '' - if 'createdDateTime' in json_dict: - self.created_date_time = json_dict['createdDateTime'] - else: - self.created_date_time = '' diff --git a/samples/custom-voice/python/customvoice/endpoint.py b/samples/custom-voice/python/customvoice/endpoint.py deleted file mode 100644 index 64d862e90..000000000 --- a/samples/custom-voice/python/customvoice/endpoint.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
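Taken together, `Consent.create` posts the verbal-statement audio as multipart form data and then blocks until the consent leaves the running state. A usage sketch with the placeholder ids and test data used by these samples:

```python
import customvoice

config = customvoice.Config("YourSubscriptionKey", "eastus")

consent = customvoice.Consent.create(
    config,
    project_id="personal-voice-project-1",
    consent_id="personal-voice-consent-1",
    voice_talent_name="Sample Voice Actor",
    company_name="Contoso",
    audio_file_path="TestData/VoiceTalentVerbalStatement.wav",
    locale="en-US",
)
print(consent.id, consent.status)
```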
- -import json -import requests -import uuid -import logging - -from .helper import * -from .config import Config -from .customvoice_object import CustomVoiceObject -from .status_object import StatusObject - - -class Endpoint(StatusObject): - def __init__(self, json_dict: dict): - super().__init__(json_dict) - if 'projectId' not in json_dict: - raise ValueError("could not find 'projectId' in json_dict") - self.project_id = json_dict['projectId'] - if 'modelId' not in json_dict: - raise ValueError("could not find 'modelId' in json_dict") - self.model_id = json_dict['modelId'] - - - # get all endpoints in project - # when project_id is None, get all endpoints in current speech account - @staticmethod - def list(config: Config, project_id: str = None): - config.logger.debug('Endpoint.list') - endpoints = [] - api_url = config.url_prefix + 'endpoints' + '?' + config.api_version - if project_id is not None and len(project_id) > 0: - api_url += "&filter=projectId eq '%s'" % project_id - headers = {'Ocp-Apim-Subscription-Key':config.key} - while api_url is not None and len(api_url) > 0: - response = requests.get(api_url, headers=headers) - raise_exception_when_reqeust_failed('GET', api_url, response, config.logger) - response_dict= response.json() - for json_dict in response_dict['value']: - endpoint = Endpoint(json_dict) - endpoints.append(endpoint) - if 'nextLink' in response_dict: - api_url = response_dict['nextLink'] - else: - api_url = None - return endpoints - - - @staticmethod - def get(config: Config, endpoint_id: str): - config.logger.debug('Endpoint.get endpoint_id = %s' % endpoint_id) - if endpoint_id is None or len(endpoint_id) == 0: - raise ValueError("'endpoint_id' is None or empty") - try: - uuid_obj = uuid.UUID(endpoint_id) - except ValueError: - raise ValueError("'endpoint_id' should be UUID") - api_url = config.url_prefix + 'endpoints/' + endpoint_id + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - response = requests.get(api_url, headers=headers) - raise_exception_when_reqeust_failed('GET', api_url, response, config.logger) - endpoint = Endpoint(response.json()) - return endpoint - - - @staticmethod - def create(config: Config, project_id: str, endpoint_id: str, model_id: str, description: str = None): - config.logger.debug('Endpoint.create endpoint_id = %s' % endpoint_id) - if project_id is None or len(project_id) == 0: - raise ValueError("'project_id' is None or empty") - if endpoint_id is None or len(endpoint_id) == 0: - raise ValueError("'endpoint_id' is None or empty") - try: - uuid_obj = uuid.UUID(endpoint_id) - except ValueError: - raise ValueError("'endpoint_id' should be UUID") - if model_id is None or len(model_id) == 0: - raise ValueError("'consent_id' is None or empty") - - api_url = config.url_prefix + 'endpoints/' + endpoint_id + '?' 
+ config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - request_dict = { - 'description': description, - 'projectId': project_id, - 'modelId': model_id - } - response = requests.put(api_url, json=request_dict, headers=headers) - raise_exception_when_reqeust_failed('PUT', api_url, response, config.logger) - endpoint = Endpoint(response.json()) - return endpoint - - - @staticmethod - def delete(config: Config, endpoint_id: str): - config.logger.debug('Endpoint.delete endpoint_id = %s' % endpoint_id) - if endpoint_id is None or len(endpoint_id) == 0: - raise ValueError("'endpoint_id' is None or empty") - try: - uuid_obj = uuid.UUID(endpoint_id) - except ValueError: - raise ValueError("'endpoint_id' should be UUID") - api_url = config.url_prefix + 'endpoints/' + endpoint_id + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - response = requests.delete(api_url, headers=headers) - raise_exception_when_reqeust_failed('DELETE', api_url, response, config.logger) diff --git a/samples/custom-voice/python/customvoice/helper.py b/samples/custom-voice/python/customvoice/helper.py deleted file mode 100644 index 5c9628d9d..000000000 --- a/samples/custom-voice/python/customvoice/helper.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. - -import os -import json -import requests -import logging - -def raise_exception_when_reqeust_failed(method: str, api_url: str, response: requests.Response, logger: logging.Logger): - if response is None: - return - if response.status_code >= 400: - message = ('Service return error' + os.linesep + - 'Request URL: %s ' % method + api_url + os.linesep + - 'status code: %s' % response.status_code + os.linesep + - 'response:' + os.linesep + - json.dumps(response.json(), indent = 4)) - logger.error(message) - raise Exception(message) diff --git a/samples/custom-voice/python/customvoice/model.py b/samples/custom-voice/python/customvoice/model.py deleted file mode 100644 index a3be3f55a..000000000 --- a/samples/custom-voice/python/customvoice/model.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. - -from enum import Enum -import json -import requests -import logging -from typing import NamedTuple - -from .helper import * -from .config import Config -from .customvoice_object import CustomVoiceObject -from .status_object import StatusObject - - -class RecipeKind(Enum): - Default = 1 - CrossLingual = 2 - MultiStyle = 3 - # Lite = 4 - - -class Model(StatusObject): - def __init__(self, json_dict: dict): - super().__init__(json_dict) - if 'locale' not in json_dict: - raise ValueError("could not find 'locale' in json_dict") - self.locale = json_dict['locale'] - if 'projectId' not in json_dict: - raise ValueError("could not find 'projectId' in json_dict") - self.project_id = json_dict['projectId'] - if 'voiceName' not in json_dict: - raise ValueError("could not find 'voiceName' in json_dict") - self.voice_name = json_dict['voiceName'] - # Seems no consentId property in some model. Maybe because it's a copied model. So, didn't treat consentId as required. 
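`Endpoint.create` deploys a trained model under a caller-chosen UUID via a JSON PUT, and deployment completes asynchronously. A usage sketch that creates an endpoint and polls it to a terminal status, using the module's own `Status` enum:

```python
import uuid
from time import sleep

import customvoice

config = customvoice.Config("YourSubscriptionKey", "eastus")

# The endpoint id is chosen by the caller and must be a UUID.
endpoint_id = str(uuid.uuid4())
endpoint = customvoice.Endpoint.create(
    config,
    project_id="professional-voice-project-1",
    endpoint_id=endpoint_id,
    model_id="professional-voice-model-1",
)

# Deployment typically takes a few minutes; poll until it succeeds or fails.
while endpoint.status not in (customvoice.Status.Succeeded, customvoice.Status.Failed):
    sleep(10)
    endpoint = customvoice.Endpoint.get(config, endpoint_id)
print(endpoint.status)
```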
- if 'consentId' in json_dict: - self.consent_id = json_dict['consentId'] - else: - self.consent_id = '' - # Seems no trainingSetId property in CNV Lite model. So, didn't treat trainingSetId as required. - if 'trainingSetId' in json_dict: - self.training_set_id = json_dict['trainingSetId'] - else: - self.training_set_id = '' - # Seems no raise property in CNV Lite model. So, didn't treat raise as required. - if 'recipe' in json_dict: - recipe = json_dict['recipe'] - if 'kind' not in recipe: - raise ValueError("could not find 'kind' in json_dict['recipe']") - self.recipe_kind = recipe['kind'] - if 'version' in recipe: - self.recipe_version = recipe['version'] - if 'properties' in json_dict: - properties = json_dict['properties'] - if 'failureReason' in properties: - self.failure_reason = properties['failureReason'] - - - # get all models in project - # when project_id is None, get all models in current speech account - @staticmethod - def list(config: Config, project_id: str = None): - config.logger.debug('Model.list') - models = [] - api_url = config.url_prefix + 'models' + '?' + config.api_version - if project_id is not None and len(project_id) > 0: - api_url += "&filter=projectId eq '%s'" % project_id - headers = {'Ocp-Apim-Subscription-Key':config.key} - while api_url is not None and len(api_url) > 0: - response = requests.get(api_url, headers=headers) - raise_exception_when_reqeust_failed('GET', api_url, response, config.logger) - response_dict= response.json() - for json_dict in response_dict['value']: - model = Model(json_dict) - models.append(model) - if 'nextLink' in response_dict: - api_url = response_dict['nextLink'] - else: - api_url = None - return models - - - @staticmethod - def get(config: Config, model_id: str): - config.logger.debug('Model.get model_id = %s' % model_id) - if model_id is None or len(model_id) == 0: - raise ValueError("'model_id' is None or empty") - api_url = config.url_prefix + 'models/' + model_id + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - response = requests.get(api_url, headers=headers) - raise_exception_when_reqeust_failed('GET', api_url, response, config.logger) - model = Model(response.json()) - return model - - # Use locale parameter to specific voice model target locale for CrossLingual voice. - # Use properties parameter to specific styles for MultiStyle voice. - @staticmethod - def create(config: Config, project_id: str, model_id: str, voice_name:str, recipe_kind: str, consent_id: str, training_set_id: str, - description: str = None, locale: str = None, properties: dict = None): - config.logger.debug('Model.create model_id = %s' % model_id) - if project_id is None or len(project_id) == 0: - raise ValueError("'project_id' is None or empty") - if model_id is None or len(model_id) == 0: - raise ValueError("'model_id' is None or empty") - if voice_name is None or len(voice_name) == 0: - raise ValueError("'voice_name' is None or empty") - if not voice_name.endswith('Neural'): - raise ValueError("'voice_name' should endwith 'Neural'") - if recipe_kind is None or len(recipe_kind) == 0: - raise ValueError("'recipe_kind' is None or empty") - if consent_id is None or len(consent_id) == 0: - raise ValueError("'consent_id' is None or empty") - if training_set_id is None or len(training_set_id) == 0: - raise ValueError("'training_set_id' is None or empty") - if recipe_kind == RecipeKind.Default.name and locale is not None and len(locale) > 0: - raise ValueError("Needn't provide 'locale' for Default recipe. 
The locale of voice model will be the same as training set.") - if recipe_kind == RecipeKind.CrossLingual.name and (locale is None or len(locale) == 0): - raise ValueError("Need 'locale' parameter to specify the locale of voice model for CrossLingual recipe.") - if recipe_kind == RecipeKind.MultiStyle.name and properties is None: - raise ValueError("Need 'properties' parameter to specify style for MultiStyle recipe.") - - api_url = config.url_prefix + 'models/' + model_id + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - request_dict = { - 'voiceName': voice_name, - 'description': description, - 'recipe': {'kind': recipe_kind}, - 'projectId': project_id, - 'consentId': consent_id, - 'trainingSetId': training_set_id - } - if locale is not None and len(locale) > 0: - request_dict['locale'] = locale - if properties is not None: - request_dict['properties'] = properties - response = requests.put(api_url, json=request_dict, headers=headers) - raise_exception_when_reqeust_failed('PUT', api_url, response, config.logger) - model = Model(response.json()) - return model - - - @staticmethod - def delete(config: Config, model_id: str): - config.logger.debug('Model.delete model_id = %s' % model_id) - if model_id is None or len(model_id) == 0: - raise ValueError("'model_id' is None or empty") - api_url = config.url_prefix + 'models/' + model_id + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - response = requests.delete(api_url, headers=headers) - raise_exception_when_reqeust_failed('DELETE', api_url, response, config.logger) diff --git a/samples/custom-voice/python/customvoice/personal_voice.py b/samples/custom-voice/python/customvoice/personal_voice.py deleted file mode 100644 index a7c2ee9c6..000000000 --- a/samples/custom-voice/python/customvoice/personal_voice.py +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. - -import json -import requests -import logging -from time import sleep - -from .helper import * -from .config import Config -from .customvoice_object import CustomVoiceObject -from .status_object import * - - -class PersonalVoice(StatusObject): - def __init__(self, json_dict: dict): - super().__init__(json_dict) - if 'projectId' not in json_dict: - raise ValueError("could not find 'projectId' in json_dict") - self.project_id = json_dict['projectId'] - if 'consentId' not in json_dict: - raise ValueError("could not find 'consentId' in json_dict") - self.consent_id = json_dict['consentId'] - if 'speakerProfileId' not in json_dict: - raise ValueError("could not find 'speakerProfileId' in json_dict") - self.speaker_profile_id = json_dict['speakerProfileId'] - - - # get all personal voice in project - # when project_id is None, get all personal voices in current speech account - @staticmethod - def list(config: Config, project_id: str = None): - config.logger.debug('PersonalVoice.list') - personal_voice_list = [] - api_url = config.url_prefix + 'personalvoices' + '?' 
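`Model.create` enforces the recipe rules spelled out in its validation: the Default recipe inherits the training set's locale, CrossLingual requires an explicit target locale, and MultiStyle requires a `properties` dict describing the styles. A minimal Default-recipe usage sketch with placeholder ids:

```python
import customvoice

config = customvoice.Config("YourSubscriptionKey", "eastus")

model = customvoice.Model.create(
    config,
    project_id="professional-voice-project-1",
    model_id="professional-voice-model-1",
    voice_name="SampleVoiceNeural",                      # must end with 'Neural'
    recipe_kind=customvoice.RecipeKind.Default.name,
    consent_id="professional-voice-consent-1",
    training_set_id="professional-voice-training-set-1",
)
print(model.id, model.status)    # training typically takes around 24 hours
```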
+ config.api_version - if project_id is not None and len(project_id) > 0: - api_url += "&filter=projectId eq '%s'" % project_id - headers = {'Ocp-Apim-Subscription-Key':config.key} - while api_url is not None and len(api_url) > 0: - response = requests.get(api_url, headers=headers) - raise_exception_when_reqeust_failed('GET', api_url, response, config.logger) - response_dict= response.json() - for json_dict in response_dict['value']: - speaker_profile = PersonalVoice(json_dict) - personal_voice_list.append(speaker_profile) - if 'nextLink' in response_dict: - api_url = response_dict['nextLink'] - else: - api_url = None - return personal_voice_list - - - @staticmethod - def get(config: Config, personal_voice_id: str): - config.logger.debug('PersonalVoice.get personal_voice_id = %s' % personal_voice_id) - if personal_voice_id is None or len(personal_voice_id) == 0: - raise ValueError("'personal_voice_id' is None or empty") - api_url = config.url_prefix + 'personalvoices/' + personal_voice_id + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - response = requests.get(api_url, headers=headers) - raise_exception_when_reqeust_failed('GET', api_url, response, config.logger) - personal_voice = PersonalVoice(response.json()) - return personal_voice - - - @staticmethod - def create(config: Config, project_id: str, personal_voice_id: str, consent_id: str, audio_folder: str, description = None): - config.logger.debug('PersonalVoice.create personal_voice_id = %s' % personal_voice_id) - if project_id is None or len(project_id) == 0: - raise ValueError("'project_id' is None or empty") - if personal_voice_id is None or len(personal_voice_id) == 0: - raise ValueError("'personal_voice_id' is None or empty") - if consent_id is None or len(consent_id) == 0: - raise ValueError("'consent_id' is None or empty") - if audio_folder is None or len(audio_folder) == 0: - raise ValueError("'audio_folder' is None or empty") - if not os.path.isdir(audio_folder): - raise ValueError("Can't find 'audio_folder' %s" % audio_folder) - - api_url = config.url_prefix + 'personalvoices/' + personal_voice_id + '?' + config.api_version - request_dict = { - 'description': description, - 'projectId': project_id, - 'consentId': consent_id, - } - - files = [] - for file_name in os.listdir(audio_folder): - file_path = os.path.join(audio_folder, file_name) - file=('audiodata', (file_name, open(file_path, 'rb'), 'audio/wav')) - files.append(file) - - headers = { 'Ocp-Apim-Subscription-Key': config.key } - response = requests.post(api_url, data=request_dict, headers=headers, files=files) - raise_exception_when_reqeust_failed('POST', api_url, response, config.logger) - personal_voice = PersonalVoice(response.json()) - - # Wait for personal voice ready. It takes 1 second. 
- while (personal_voice.status != Status.Succeeded and personal_voice.status != Status.Failed): - sleep(1) - personal_voice = PersonalVoice.get(config, personal_voice_id) - if personal_voice.status == Status.Succeeded: - config.logger.debug('PersonalVoice.create succeeded personal_voice_id = %s' % personal_voice_id) - elif personal_voice.status == Status.Failed: - config.logger.debug('PersonalVoice.create failed personal_voice_id = %s' % personal_voice_id) - return personal_voice - - - @staticmethod - def delete(config: Config, personal_voice_id: str): - config.logger.debug('PersonalVoice.delete personal_voice_id = %s' % personal_voice_id) - if personal_voice_id is None or len(personal_voice_id) == 0: - raise ValueError("'speaker_profile_id' is None or empty") - api_url = config.url_prefix + 'personalvoices/' + personal_voice_id + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - response = requests.delete(api_url, headers=headers) - raise_exception_when_reqeust_failed('DELETE', api_url, response, config.logger) diff --git a/samples/custom-voice/python/customvoice/project.py b/samples/custom-voice/python/customvoice/project.py deleted file mode 100644 index ee461eae3..000000000 --- a/samples/custom-voice/python/customvoice/project.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. - -from enum import Enum -import json -import requests -import logging - -from .helper import * -from .config import Config -from .customvoice_object import CustomVoiceObject - -class ProjectKind(Enum): - ProfessionalVoice = 1 - PersonalVoice = 2 - - -class Project(CustomVoiceObject): - - def __init__(self, json_dict: dict): - super().__init__(json_dict) - if 'kind' not in json_dict: - raise ValueError("could not find 'kind' in json_dict") - self.kind = ProjectKind[json_dict['kind']] - - # get all projects in current speech account - @staticmethod - def list(config: Config): - config.logger.debug('Project.list') - projects = [] - api_url = config.url_prefix + 'projects' + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - while api_url is not None and len(api_url) > 0: - response = requests.get(api_url, headers=headers) - raise_exception_when_reqeust_failed('GET', api_url, response, config.logger) - response_dict= response.json() - for json_dict in response_dict['value']: - project = Project(json_dict) - projects.append(project) - if 'nextLink' in response_dict: - api_url = response_dict['nextLink'] - else: - api_url = None - return projects - - @staticmethod - def get(config: Config, project_id: str): - config.logger.debug('Project.get project_id = %s' % project_id) - if project_id is None or len(project_id) == 0: - raise ValueError("'project_id' is None or empty") - api_url = config.url_prefix + 'projects/' + project_id + '?' 
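`PersonalVoice.create` posts every audio file in the given folder as multipart form data against an existing consent and returns once the speaker profile is ready. A usage sketch with the sample's placeholder ids and test data:

```python
import customvoice

config = customvoice.Config("YourSubscriptionKey", "eastus")

personal_voice = customvoice.PersonalVoice.create(
    config,
    project_id="personal-voice-project-1",
    personal_voice_id="personal-voice-1",
    consent_id="personal-voice-consent-1",
    audio_folder="TestData/PersonalVoice",
)
# speaker_profile_id is the value the SSML 'speakerProfileId' attribute refers to.
print(personal_voice.speaker_profile_id, personal_voice.status)
```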
+ config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - response = requests.get(api_url, headers=headers) - raise_exception_when_reqeust_failed('GET', api_url, response, config.logger) - project = Project(response.json()) - return project - - @staticmethod - def create(config: Config, project_id: str, kind: ProjectKind, description = None): - config.logger.debug('Project.create project_id = %s' % project_id) - if project_id is None or len(project_id) == 0: - raise ValueError("'project_id' is None or empty") - api_url = config.url_prefix + 'projects/' + project_id + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - request_dict = {'description': description, 'kind': kind.name} - response = requests.put(api_url, json=request_dict, headers=headers) - raise_exception_when_reqeust_failed('PUT', api_url, response, config.logger) - project = Project(response.json()) - return project - - @staticmethod - def delete(config: Config, project_id: str, forceDelete = False): - config.logger.debug('Project.delete project_id = %s' % project_id) - if project_id is None or len(project_id) == 0: - raise ValueError("'project_id' is None or empty") - api_url = config.url_prefix + 'projects/' + project_id + '?' + config.api_version - if forceDelete: - api_url += '&forceDelete=true' - headers = {'Ocp-Apim-Subscription-Key':config.key} - response = requests.delete(api_url, headers=headers) - raise_exception_when_reqeust_failed('DELETE', api_url, response, config.logger) diff --git a/samples/custom-voice/python/customvoice/status_object.py b/samples/custom-voice/python/customvoice/status_object.py deleted file mode 100644 index 2c912351b..000000000 --- a/samples/custom-voice/python/customvoice/status_object.py +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. - -from enum import Enum -from .customvoice_object import CustomVoiceObject - -class Status(Enum): - NotStarted = 1 - Running = 2 - Succeeded = 3 - Failed = 4 - Disabling = 5 - Disabled = 6 - -class StatusObject(CustomVoiceObject): - def __init__(self, json_dict: dict): - super().__init__(json_dict) - if 'status' not in json_dict: - raise ValueError("could not find 'status' in json_dict") - self.status = Status[json_dict['status']] - if 'lastActionDateTime' in json_dict: - self.last_action_date_time = json_dict['lastActionDateTime'] - else: - self.last_action_date_time = '' diff --git a/samples/custom-voice/python/customvoice/training_set.py b/samples/custom-voice/python/customvoice/training_set.py deleted file mode 100644 index 7b953c14c..000000000 --- a/samples/custom-voice/python/customvoice/training_set.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
- -import json -import requests -import logging -import os -from typing import NamedTuple - -from .helper import * -from .config import Config -from .customvoice_object import CustomVoiceObject -from .status_object import * - - -class DatasetKind(Enum): - AudioAndScript = 1 - LongAudio = 2 - AudioOnly = 3 - - -class AzureBlobContentSource(NamedTuple): - containerUrl: str - prefix: str - extensions: list[str] - - -class TrainingSet(StatusObject): - def __init__(self, json_dict: dict): - super().__init__(json_dict) - # Seems no locale property in CNV Lite training set. So, didn't treat locale as required. - if 'locale' in json_dict: - self.locale = json_dict['locale'] - else: - self.locale = '' - if 'projectId' not in json_dict: - raise ValueError("could not find 'projectId' in json_dict") - self.project_id = json_dict['projectId'] - if 'properties' in json_dict: - properties = json_dict['properties'] - if 'utteranceCount' in properties: - self.utteranceCount = int(properties['utteranceCount']) - - - # get all training sets in project - # when project_id is None, get all training sets in current speech account - @staticmethod - def list(config: Config, project_id: str = None): - config.logger.debug('TrainingSet.list') - training_sets = [] - api_url = config.url_prefix + 'trainingsets' + '?' + config.api_version - if project_id is not None and len(project_id) > 0: - api_url += "&filter=projectId eq '%s'" % project_id - headers = {'Ocp-Apim-Subscription-Key':config.key} - while api_url is not None and len(api_url) > 0: - response = requests.get(api_url, headers=headers) - raise_exception_when_reqeust_failed('GET', api_url, response, config.logger) - response_dict= response.json() - for json_dict in response_dict['value']: - training_set = TrainingSet(json_dict) - training_sets.append(training_set) - if 'nextLink' in response_dict: - api_url = response_dict['nextLink'] - else: - api_url = None - return training_sets - - - @staticmethod - def get(config: Config, training_set_id: str): - config.logger.debug('TrainingSet.get training_set_id = %s' % training_set_id) - if training_set_id is None or len(training_set_id) == 0: - raise ValueError("'training_set_id' is None or empty") - api_url = config.url_prefix + 'trainingsets/' + training_set_id + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - response = requests.get(api_url, headers=headers) - raise_exception_when_reqeust_failed('GET', api_url, response, config.logger) - training_set = TrainingSet(response.json()) - return training_set - - - @staticmethod - def create(config: Config, project_id: str, training_set_id: str, locale: str, description: str = None): - config.logger.debug('TrainingSet.create training_set_id = %s' % training_set_id) - if project_id is None or len(project_id) == 0: - raise ValueError("'project_id' is None or empty") - if training_set_id is None or len(training_set_id) == 0: - raise ValueError("'training_set_id' is None or empty") - if locale is None or len(locale) == 0: - raise ValueError("'locale' is None or empty") - - headers = { 'Ocp-Apim-Subscription-Key': config.key } - api_url = config.url_prefix + 'trainingsets/' + training_set_id + '?' 
+ config.api_version - request_dict = { - 'description': description, - 'locale': locale, - 'projectId': project_id, - } - response = requests.put(api_url, json=request_dict, headers=headers) - raise_exception_when_reqeust_failed('PUT', api_url, response, config.logger) - training_set = TrainingSet(response.json()) - return training_set - - - @staticmethod - def delete(config: Config, training_set_id: str): - config.logger.debug('TrainingSet.delete training_set_id = %s' % training_set_id) - if training_set_id is None or len(training_set_id) == 0: - raise ValueError("'training_set_id' is None or empty") - api_url = config.url_prefix + 'trainingsets/' + training_set_id + '?' + config.api_version - headers = {'Ocp-Apim-Subscription-Key':config.key} - response = requests.delete(api_url, headers=headers) - raise_exception_when_reqeust_failed('DELETE', api_url, response, config.logger) - - - @staticmethod - def upload_data(config: Config, training_set_id: str, kind: DatasetKind, audios: AzureBlobContentSource, scripts: AzureBlobContentSource): - config.logger.debug('TrainingSet.upload_data training_set_id = %s' % training_set_id) - if training_set_id is None or len(training_set_id) == 0: - raise ValueError("'training_set_id' is None or empty") - if audios is None: - raise ValueError("'audios' is None") - if (kind == DatasetKind.AudioAndScript or kind == DatasetKind.LongAudio) and scripts is None: - raise ValueError("'scripts' is None") - - headers = {'Ocp-Apim-Subscription-Key':config.key} - api_url = config.url_prefix + 'trainingsets/' + training_set_id + ':upload' + '?' + config.api_version - request_dict = { - 'kind': kind.name, - 'audios': { - 'containerUrl': audios.containerUrl, - 'prefix': audios.prefix, - 'extensions': audios.extensions - } - } - if scripts is not None: - request_dict['scripts'] = { - 'containerUrl': scripts.containerUrl, - 'prefix': scripts.prefix, - 'extensions': scripts.extensions - } - response = requests.post(api_url, json=request_dict, headers=headers) - raise_exception_when_reqeust_failed('POST', api_url, response, config.logger) diff --git a/samples/custom-voice/python/personal_voice_sample.py b/samples/custom-voice/python/personal_voice_sample.py deleted file mode 100644 index b9b359511..000000000 --- a/samples/custom-voice/python/personal_voice_sample.py +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. - -import json -import requests -from time import sleep -import os -import logging -try: - import customvoice -except ImportError: - print('Please copy the folder https://github.com/Azure-Samples/cognitive-services-speech-sdk/tree/master/samples/custom-voice/python/customvoice and keep the same folder structure as on GitHub.' ) - quit() -import azure.cognitiveservices.speech as speechsdk - - -def create_personal_voice(project_id: str, - consent_id: str, consent_file_path: str, voice_talent_name: str, company_name: str, - personal_voice_id: str, audio_folder: str): - # create project - project = customvoice.Project.create(config, project_id, customvoice.ProjectKind.PersonalVoice) - print('Project created. project id: %s' % project.id) - - # upload consent - consent = customvoice.Consent.create(config, project_id, consent_id, voice_talent_name, company_name, consent_file_path, 'en-us') - if consent.status == customvoice.Status.Failed: - print('Create consent failed. 
consent id: %s' % consent.id) - raise Exception - elif consent.status == customvoice.Status.Succeeded: - print('Create consent succeeded. consent id: %s' % consent.id) - - # create personal voice - personal_voice = customvoice.PersonalVoice.create(config, project_id, personal_voice_id, consent_id, audio_folder) - if personal_voice.status == customvoice.Status.Failed: - print('Create personal voice failed. personal voice id: %s' % personal_voice.id) - raise Exception - elif personal_voice.status == customvoice.Status.Succeeded: - print('Create personal voice succeeded. personal voice id: %s, speaker profile id: %s' % (personal_voice.id, personal_voice.speaker_profile_id)) - return personal_voice.speaker_profile_id - - -def speech_synthesis_to_wave_file(text: str, output_file_path: str, speaker_profile_id: str): - # Creates an instance of a speech config with specified subscription key and service region. - speech_config = speechsdk.SpeechConfig(subscription=config.key, region=config.region) - speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm) - file_config = speechsdk.audio.AudioOutputConfig(filename=output_file_path) - speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=file_config) - - - # Use PhoenixLatestNeural if you want word boundary events. We will support events on DragonLatestNeural in the future. - ssml = "<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts'>" \ "<voice name='DragonLatestNeural'>" \ "<mstts:ttsembedding speakerProfileId='%s'>" \ " %s " \ "</mstts:ttsembedding>" \ "</voice>" \ "</speak> " % (speaker_profile_id, text) - - def word_boundary(evt): - print(f"Word Boundary: Text='{evt.text}', Audio offset={evt.audio_offset / 10000}ms, Duration={evt.duration / 10000}ms") - - speech_synthesizer.synthesis_word_boundary.connect(word_boundary) - result = speech_synthesizer.speak_ssml_async(ssml).get() - - # Check result - if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: - print("Speech synthesized for text [{}], and the audio was saved to [{}]".format(text, output_file_path)) - print("result id: {}".format(result.result_id)) - elif result.reason == speechsdk.ResultReason.Canceled: - cancellation_details = result.cancellation_details - print("Speech synthesis canceled: {}".format(cancellation_details.reason)) - if cancellation_details.reason == speechsdk.CancellationReason.Error: - print("Error details: {}".format(cancellation_details.error_details)) - print("result id: {}".format(result.result_id)) - - -def clean_up(project_id: str, consent_id: str, personal_voice_id: str): - customvoice.PersonalVoice.delete(config, personal_voice_id) - customvoice.Consent.delete(config, consent_id) - customvoice.Project.delete(config, project_id) - - -region = 'eastus' # eastus, westeurope, southeastasia -key = 'your speech key here' - - -logging.basicConfig(filename="customvoice.log", - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - filemode='w') -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) - -config = customvoice.Config(key, region, logger) - - -project_id = 'personal-voice-project-1' -consent_id = 'personal-voice-consent-1' -personal_voice_id = 'personal-voice-1' - -try: - # step 1: create personal voice - # Need a consent file and audio files to create a personal voice. - # This is the consent file template. - # I [voice talent name] am aware that recordings of my voice will be used by [company name] to create and use a synthetic version of my voice. 
- # You can find a sample consent file here - # https://github.com/Azure-Samples/Cognitive-Speech-TTS/blob/master/CustomVoice/Sample%20Data/Individual%20utterances%20%2B%20matching%20script/VoiceTalentVerbalStatement.wav - consent_file_path = r'TestData\\VoiceTalentVerbalStatement.wav' - voice_talent_name = 'Sample Voice Actor' - company_name = 'Contoso' - - # Need an audio file of 5 - 90 seconds. - # You can find a sample audio file here. - # https://github.com/Azure-Samples/Cognitive-Speech-TTS/blob/master/CustomVoice/Sample%20Data/Individual%20utterances%20%2B%20matching%20script/SampleAudios.zip - audio_folder = r'TestData\\voice\\' - speaker_profile_id = create_personal_voice(project_id, - consent_id, consent_file_path, voice_talent_name, company_name, - personal_voice_id, audio_folder) - - # step 2: synthesize wave - text = 'This is zero shot voice. Test 2.' - output_wave_file_path = 'output_sdk.wav' - speech_synthesis_to_wave_file(text, output_wave_file_path, speaker_profile_id) -except Exception as e: - print(e) -finally: - # Optional step 3: clean up, if you don't need this voice to synthesize more content. - clean_up(project_id, consent_id, personal_voice_id) - diff --git a/samples/custom-voice/python/professional_voice_sample.py b/samples/custom-voice/python/professional_voice_sample.py deleted file mode 100644 index 03054096f..000000000 --- a/samples/custom-voice/python/professional_voice_sample.py +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. See LICENSE.md file in the project root for full license information. - -import json -from time import sleep -import uuid -import os -import datetime -import logging -try: - import customvoice -except ImportError: - print('Please copy the folder https://github.com/Azure-Samples/cognitive-services-speech-sdk/tree/master/samples/custom-voice/python/customvoice and keep the same folder structure as on GitHub.' ) - quit() - - -def create_project(): - project = customvoice.Project.create(config, project_id, customvoice.ProjectKind.ProfessionalVoice) - print('Project created. project id: %s' % project.id) - - -def upload_consent(): - # Custom Voice training needs a voice consent file following the template below. - # I [voice talent name] am aware that recordings of my voice will be used by [company name] to create and use a synthetic version of my voice. - # You can find a sample consent file here - # https://github.com/Azure-Samples/Cognitive-Speech-TTS/blob/master/CustomVoice/Sample%20Data/Individual%20utterances%20%2B%20matching%20script/VoiceTalentVerbalStatement.wav - consent_file_path = r'D:\CNV_API\VoiceTalentVerbalStatement.wav' - consent = customvoice.Consent.create(config, project_id, consent_id, - voice_talent_name = 'Sample Voice Actor', - company_name = 'Contoso', - audio_file_path = consent_file_path, - locale = 'en-us') - print('Consent created. consent id: %s' % consent.id) - - -# Upload wave and script files in local_folder to Azure blob under https://blob_url/container_name/blob_prefix/ -# If container_name doesn't exist, a new container will be created. 
-def upload_training_set_to_blob(blob_url: str, blob_key: str, container_name: str, blob_prefix: str, local_folder: str): - message = 'Uploading data from ' + local_folder + ' to https://' + blob_url + '/' + container_name + '/' + blob_prefix - print(message) - - from azure.storage.blob import BlobServiceClient, generate_container_sas, ContainerSasPermissions - from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError - blob_service_client = BlobServiceClient(account_url=blob_url, credential=blob_key) - container_client = blob_service_client.get_container_client(container_name) - try: - properties = container_client.get_container_properties() - except ResourceNotFoundError: - container_client.create_container() - properties = container_client.get_container_properties() - - for file_name in os.listdir(local_folder): - file_path = os.path.join(local_folder, file_name) - print('uploading ' + file_name) - with open(file_path, "rb") as data: - container_client.upload_blob(name=blob_prefix+file_name, data=data) - - sas_token = generate_container_sas( - container_client.account_name, - container_client.container_name, - account_key=container_client.credential.account_key, - permission = ContainerSasPermissions(read = True, list = True), - expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1) - ) - return sas_token - -def upload_training_set(): - # Create training set - training_set = customvoice.TrainingSet.create(config, project_id, training_set_id, locale = 'en-US') - - # Upload wave and script files from local_folder to Azure blob. - blob_url = 'contoso.blob.core.windows.net' - blob_key = '' - container_name = 'voicedata' - blob_prefix = str(uuid.uuid4()) + '/' - print('blob_prefix: %s' % blob_prefix) - # You can find a sample script and audio file here. - # https://github.com/Azure-Samples/Cognitive-Speech-TTS/blob/master/CustomVoice/Sample%20Data/Individual%20utterances%20%2B%20matching%20script/SampleScript.txt - # https://github.com/Azure-Samples/Cognitive-Speech-TTS/blob/master/CustomVoice/Sample%20Data/Individual%20utterances%20%2B%20matching%20script/SampleAudios.zip - # Please unzip the audio file. Put both the audio and script files in the folder below. - local_folder = r'D:\CNV_API\SampleAudios' - sas_token = upload_training_set_to_blob(blob_url, blob_key, container_name, blob_prefix, local_folder) - print('sas_token: %s' % sas_token) - - # Upload data to training set - containerUrlWithSAS = "https://" + blob_url + '/' + container_name + '?' + sas_token - print('containerUrl: %s' % containerUrlWithSAS) - audios = customvoice.AzureBlobContentSource(containerUrlWithSAS, blob_prefix, ['.wav']) - scripts = customvoice.AzureBlobContentSource(containerUrlWithSAS, blob_prefix, ['.txt']) - customvoice.TrainingSet.upload_data(config, training_set_id, kind = customvoice.DatasetKind.AudioAndScript, audios = audios, scripts = scripts) - - # Wait for training set ready - print('Training set is processing in server. 
It takes around 5 minutes to 1 hour depend on data size.') - training_set = customvoice.TrainingSet.get(config, training_set_id) - while training_set.status != customvoice.Status.Succeeded and training_set.status != customvoice.Status.Failed: - print('.', end='', flush=True) - sleep(10) - training_set = customvoice.TrainingSet.get(config, training_set_id) - if training_set.status == customvoice.Status.Failed: - print('Training set failed') - raise Exception - elif training_set.status == customvoice.Status.Succeeded: - print('Training set succeeded') - - -def train_model(): - model = customvoice.Model.create(config, project_id, model_id, - voice_name='SampleVoiceNeural', - recipe_kind = customvoice.RecipeKind.Default.name, - consent_id=consent_id, - training_set_id=training_set_id) - print('Started model training. model id: %s' % model.id) - - # Wait for model ready - print('Model is training in server. It takes around 24 hours.') - while model.status != customvoice.Status.Succeeded and model.status != customvoice.Status.Failed: - print('.', end='', flush=True) - sleep(300) - model = customvoice.Model.get(config, model_id) - if model.status == customvoice.Status.Failed: - print('Model training failed. Failure reason: %s' % model.failure_reason) - raise Exception - elif model.status == customvoice.Status.Succeeded: - print('Model training succeeded') - - -def deploy_model(): - endpoint = customvoice.Endpoint.create(config, project_id, endpoint_id, model_id) - print('Start deploying model . endpoint id: %s' % endpoint.id) - - # Wait for model deploy - print('Deploying model. It takes around 1 to 5 minutes.') - while endpoint.status != customvoice.Status.Succeeded and endpoint.status != customvoice.Status.Failed: - print('.', end='', flush=True) - sleep(10) - endpoint = customvoice.Endpoint.get(config, endpoint_id) - if endpoint.status == customvoice.Status.Failed: - print('Model deploy failed') - raise Exception - elif endpoint.status == customvoice.Status.Succeeded: - print('Model deploy succeeded') - - -region = 'eastus' # eastus, westeurope, southeastasia -key = '' - -logging.basicConfig(filename="customvoice.log", - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - filemode='w') -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) - -config = customvoice.Config(key, region, logger) - - -project_id = 'professional-voice-project-1' -consent_id = 'professional-voice-consent-1' -training_set_id = 'professional-voice-trainingset-1' -model_id = 'professional-voice-model-1' -endpoint_id = str(uuid.uuid4()) - -# step 1: creat project -create_project() - -# step 2: upload consent file -upload_consent() - -# step 3: upload training set -upload_training_set() - -# step 4: train model -train_model() - -# step 5: deploy model -deploy_model() - -# step 6: synthesis with endpoint -# You can find sample code here -# https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/d806061d8ab00e9729128839bf5051c4871ab35f/samples/python/console/speech_synthesis_sample.py#LL119C1-L119C43 diff --git a/samples/ingestion/ingestion-client/.devcontainer/devcontainer.json b/samples/ingestion/ingestion-client/.devcontainer/devcontainer.json deleted file mode 100644 index b434ec530..000000000 --- a/samples/ingestion/ingestion-client/.devcontainer/devcontainer.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "name": "Ingestion Client", - "image": "mcr.microsoft.com/devcontainers/dotnet:1-8.0", - "features": { - "ghcr.io/devcontainers/features/azure-cli:1": {}, - 
"ghcr.io/devcontainers/features/node:1": {}, - "ghcr.io/jlaundry/devcontainer-features/azure-functions-core-tools:1": {}, - "ghcr.io/devcontainers-contrib/features/pre-commit:2": {}, - "ghcr.io/rchaganti/vsc-devcontainer-features/azurebicep:1.0.5": {} - }, - "postCreateCommand": "pre-commit install", - "customizations": { - "vscode": { - "extensions": [ - "github.vscode-pull-request-github", - "ms-dotnettools.csdevkit", - "ms-vscode.azure-account", - "ms-azuretools.vscode-azurefunctions", - "editorconfig.editorconfig", - "ms-azuretools.vscode-bicep", - "ms-dotnettools.vscode-dotnet-runtime" - ], - "settings": { - "omnisharp.enableRoslynAnalyzers": true, - "omnisharp.enableEditorConfigSupport": true - } - } - } -} diff --git a/samples/ingestion/ingestion-client/.github/dependabot.yml b/samples/ingestion/ingestion-client/.github/dependabot.yml deleted file mode 100644 index f33a02cd1..000000000 --- a/samples/ingestion/ingestion-client/.github/dependabot.yml +++ /dev/null @@ -1,12 +0,0 @@ -# To get started with Dependabot version updates, you'll need to specify which -# package ecosystems to update and where the package manifests are located. -# Please see the documentation for more information: -# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates -# https://containers.dev/guide/dependabot - -version: 2 -updates: - - package-ecosystem: "devcontainers" - directory: "/" - schedule: - interval: weekly diff --git a/samples/ingestion/ingestion-client/.gitignore b/samples/ingestion/ingestion-client/.gitignore deleted file mode 100644 index 1927772bc..000000000 --- a/samples/ingestion/ingestion-client/.gitignore +++ /dev/null @@ -1 +0,0 @@ -local.settings.json \ No newline at end of file diff --git a/samples/ingestion/ingestion-client/BatchIngestionClient.sln b/samples/ingestion/ingestion-client/BatchIngestionClient.sln index b89e9a2fd..059548a33 100644 --- a/samples/ingestion/ingestion-client/BatchIngestionClient.sln +++ b/samples/ingestion/ingestion-client/BatchIngestionClient.sln @@ -14,12 +14,6 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "StartTranscriptionByTimer", EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DatabaseMigrator", "DatabaseMigrator\DatabaseMigrator.csproj", "{5BD38646-D3F3-481B-909E-353750AC5384}" EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{7BC59BB6-7DD9-4D72-8AA0-91F774D6E45A}" - ProjectSection(SolutionItems) = preProject - Setup\ArmTemplateBatch.json = Setup\ArmTemplateBatch.json - Setup\ArmTemplateRealtime.json = Setup\ArmTemplateRealtime.json - EndProjectSection -EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU diff --git a/samples/ingestion/ingestion-client/Connector/BatchClient.cs b/samples/ingestion/ingestion-client/Connector/BatchClient.cs index c5791a3e6..88a91b378 100644 --- a/samples/ingestion/ingestion-client/Connector/BatchClient.cs +++ b/samples/ingestion/ingestion-client/Connector/BatchClient.cs @@ -21,7 +21,7 @@ public static class BatchClient { private const string TranscriptionsBasePath = "speechtotext/v3.0/Transcriptions/"; - private const int MaxNumberOfRetries = 10; + private const int MaxNumberOfRetries = 3; private static readonly TimeSpan PostTimeout = TimeSpan.FromMinutes(1); @@ -34,7 +34,7 @@ public static class BatchClient private static readonly AsyncRetryPolicy RetryPolicy = Policy .Handle(e => e is HttpStatusCodeException || e is HttpRequestException) - 
.WaitAndRetryAsync(MaxNumberOfRetries, retryAttempt => TimeSpan.FromSeconds(5)); + .WaitAndRetryAsync(MaxNumberOfRetries, retryAttempt => TimeSpan.FromSeconds(2)); public static Task GetTranscriptionReportFileFromSasAsync(string sasUri) { diff --git a/samples/ingestion/ingestion-client/Connector/Enums/TranscriptionAnalyticsJobStatus.cs b/samples/ingestion/ingestion-client/Connector/Enums/TranscriptionAnalyticsJobStatus.cs deleted file mode 100644 index 9e8fc659c..000000000 --- a/samples/ingestion/ingestion-client/Connector/Enums/TranscriptionAnalyticsJobStatus.cs +++ /dev/null @@ -1,14 +0,0 @@ -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -namespace Connector.Enums -{ - public enum TranscriptionAnalyticsJobStatus - { - NotSubmitted, - Running, - Completed - } -} diff --git a/samples/ingestion/ingestion-client/Connector/Serializable/TextAnalytics/TextAnalyticsRequest.cs b/samples/ingestion/ingestion-client/Connector/Serializable/TextAnalytics/TextAnalyticsRequest.cs new file mode 100644 index 000000000..5870f3919 --- /dev/null +++ b/samples/ingestion/ingestion-client/Connector/Serializable/TextAnalytics/TextAnalyticsRequest.cs @@ -0,0 +1,23 @@ +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. +// + +namespace Connector.Serializable +{ + public class TextAnalyticsRequest + { + public TextAnalyticsRequest(string language, string id, string text) + { + this.Language = language; + this.Id = id; + this.Text = text; + } + + public string Language { get; private set; } + + public string Id { get; private set; } + + public string Text { get; private set; } + } +} diff --git a/samples/ingestion/ingestion-client/Connector/Serializable/TextAnalytics/TextAnalyticsRequestsChunk.cs b/samples/ingestion/ingestion-client/Connector/Serializable/TextAnalytics/TextAnalyticsRequestsChunk.cs new file mode 100644 index 000000000..8f6fda501 --- /dev/null +++ b/samples/ingestion/ingestion-client/Connector/Serializable/TextAnalytics/TextAnalyticsRequestsChunk.cs @@ -0,0 +1,19 @@ +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. +// + +namespace Connector.Serializable +{ + using System.Collections.Generic; + + public class TextAnalyticsRequestsChunk + { + public TextAnalyticsRequestsChunk(IEnumerable documents) + { + this.Documents = documents; + } + + public IEnumerable Documents { get; private set; } + } +} diff --git a/samples/ingestion/ingestion-client/Connector/Serializable/TranscriptionStartedMessage/TextAnalyticsRequests.cs b/samples/ingestion/ingestion-client/Connector/Serializable/TranscriptionStartedMessage/TextAnalyticsRequests.cs index 1f2dcb839..fe0cfb021 100644 --- a/samples/ingestion/ingestion-client/Connector/Serializable/TranscriptionStartedMessage/TextAnalyticsRequests.cs +++ b/samples/ingestion/ingestion-client/Connector/Serializable/TranscriptionStartedMessage/TextAnalyticsRequests.cs @@ -14,15 +14,15 @@ public TextAnalyticsRequests( IEnumerable audioLevelRequests, IEnumerable conversationRequests) { - this.UtteranceLevelRequests = utteranceLevelRequests ?? new List(); - this.AudioLevelRequests = audioLevelRequests ?? new List(); - this.ConversationRequests = conversationRequests ?? 
new List(); + this.UtteranceLevelRequests = utteranceLevelRequests; + this.AudioLevelRequests = audioLevelRequests; + this.ConversationRequests = conversationRequests; } - public IEnumerable UtteranceLevelRequests { get; set; } + public IEnumerable UtteranceLevelRequests { get; } - public IEnumerable AudioLevelRequests { get; set; } + public IEnumerable AudioLevelRequests { get; } - public IEnumerable ConversationRequests { get; set; } + public IEnumerable ConversationRequests { get; } } } diff --git a/samples/ingestion/ingestion-client/Connector/StorageConnector.cs b/samples/ingestion/ingestion-client/Connector/StorageConnector.cs index 7f6bc679d..e410da79e 100644 --- a/samples/ingestion/ingestion-client/Connector/StorageConnector.cs +++ b/samples/ingestion/ingestion-client/Connector/StorageConnector.cs @@ -121,7 +121,7 @@ public async Task MoveFileAsync(string inputContainerName, string inputFileName, if (!keepSource) { - await inputBlockBlobClient.DeleteIfExistsAsync().ConfigureAwait(false); + await inputBlockBlobClient.DeleteAsync().ConfigureAwait(false); } return; @@ -131,7 +131,7 @@ public async Task MoveFileAsync(string inputContainerName, string inputFileName, if (!keepSource) { - await inputBlockBlobClient.DeleteIfExistsAsync().ConfigureAwait(false); + await inputBlockBlobClient.DeleteAsync().ConfigureAwait(false); } } } diff --git a/samples/ingestion/ingestion-client/FetchTranscription/FetchTranscription.cs b/samples/ingestion/ingestion-client/FetchTranscription/FetchTranscription.cs index b48112db2..b86ed6364 100644 --- a/samples/ingestion/ingestion-client/FetchTranscription/FetchTranscription.cs +++ b/samples/ingestion/ingestion-client/FetchTranscription/FetchTranscription.cs @@ -3,7 +3,7 @@ // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // -namespace FetchTranscription +namespace FetchTranscriptionFunction { using System; using System.Threading.Tasks; diff --git a/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionAnalytics/Language/AnalyzeConversationsProvider.cs b/samples/ingestion/ingestion-client/FetchTranscription/Language/AnalyzeConversationsProvider.cs similarity index 88% rename from samples/ingestion/ingestion-client/FetchTranscription/TranscriptionAnalytics/Language/AnalyzeConversationsProvider.cs rename to samples/ingestion/ingestion-client/FetchTranscription/Language/AnalyzeConversationsProvider.cs index bef69644a..091315d0b 100644 --- a/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionAnalytics/Language/AnalyzeConversationsProvider.cs +++ b/samples/ingestion/ingestion-client/FetchTranscription/Language/AnalyzeConversationsProvider.cs @@ -3,7 +3,7 @@ // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // -namespace FetchTranscription +namespace Language { using System; using System.Collections.Generic; @@ -17,11 +17,11 @@ namespace FetchTranscription using Connector; using Connector.Constants; - using Connector.Enums; using Connector.Serializable.Language.Conversations; - using Connector.Serializable.TextAnalytics; using Connector.Serializable.TranscriptionStartedServiceBusMessage; + using FetchTranscriptionFunction; + using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; @@ -32,7 +32,7 @@ namespace FetchTranscription /// /// Analyze Conversations async client. 
/// - public class AnalyzeConversationsProvider : ITranscriptionAnalyticsProvider + public class AnalyzeConversationsProvider { private const string DefaultInferenceSource = "lexical"; private static readonly TimeSpan RequestTimeout = TimeSpan.FromMinutes(3); @@ -44,7 +44,8 @@ public class AnalyzeConversationsProvider : ITranscriptionAnalyticsProvider public AnalyzeConversationsProvider(string locale, string subscriptionKey, string endpoint, ILogger log, IOptions appConfig) { - this.conversationAnalysisClient = new ConversationAnalysisClient(new Uri(endpoint), new AzureKeyCredential(subscriptionKey)); + this.conversationAnalysisClient = new ConversationAnalysisClient(new Uri($"https://{region}.api.cognitive.microsoft.com"), new AzureKeyCredential(subscriptionKey)); + this.locale = locale; this.log = log; this.appConfig = appConfig?.Value; @@ -207,7 +208,7 @@ private static IEnumerable GetAllErrorsFromResults((IEnumerable /// Instance of the speech transcript. /// An enumerable of the jobs IDs and errors if any. - private async Task<(IEnumerable jobIds, IEnumerable errors)> SubmitAnalyzeConversationsRequestAsync(SpeechTranscript speechTranscript) + public async Task<(IEnumerable jobIds, IEnumerable errors)> SubmitAnalyzeConversationsRequestAsync(SpeechTranscript speechTranscript) { speechTranscript = speechTranscript ?? throw new ArgumentNullException(nameof(speechTranscript)); var data = new List(); @@ -222,7 +223,7 @@ private static IEnumerable GetAllErrorsFromResults((IEnumerable /// Enumerable of conversational jobIds. /// Enumerable of results of conversation PII redaction and errors encountered if any. - private async Task<(AnalyzeConversationPiiResults piiResults, AnalyzeConversationSummarizationResults summarizationResults, IEnumerable errors)> GetConversationsOperationsResult(IEnumerable jobIds) + public async Task<(AnalyzeConversationPiiResults piiResults, AnalyzeConversationSummarizationResults summarizationResults, IEnumerable errors)> GetConversationsOperationsResult(IEnumerable jobIds) { var errors = new List(); if (!jobIds.Any()) @@ -230,10 +231,10 @@ private static IEnumerable GetAllErrorsFromResults((IEnumerable await this.GetConversationsOperationResults(jobId).ConfigureAwait(false)); var results = await Task.WhenAll(tasks).ConfigureAwait(false); - var resultsErrors = GetAllErrorsFromResults(results); + var resultsErrors = results.SelectMany(result => result.piiResults).SelectMany(s => s.Errors).Concat(results.SelectMany(result => result.summarizationResults).SelectMany(s => s.Errors)); if (resultsErrors.Any()) { errors.AddRange(resultsErrors.Select(s => $"Error thrown for conversation : {s.Id}")); @@ -290,13 +291,50 @@ private static IEnumerable GetAllErrorsFromResults((IEnumerable + /// Checks for all conversational analytics requests that were marked as running if they have completed and sets a new state accordingly. + /// + /// Enumerable for audioFiles. + /// True if all requests completed, else false. 
+ public async Task ConversationalRequestsCompleted(IEnumerable audioFileInfos) + { + if (!(IsConversationalPiiEnabled() || IsConversationalSummarizationEnabled()) || !audioFileInfos.Where(audioFileInfo => audioFileInfo.TextAnalyticsRequests.ConversationRequests != null).Any()) + { + return true; + } + + var conversationRequests = audioFileInfos.SelectMany(audioFileInfo => audioFileInfo.TextAnalyticsRequests.ConversationRequests).Where(text => text.Status == TextAnalyticsRequestStatus.Running); + + var runningJobsCount = 0; + + foreach (var textAnalyticsJob in conversationRequests) + { + var response = await this.conversationAnalysisClient.GetAnalyzeConversationJobStatusAsync(Guid.Parse(textAnalyticsJob.Id)).ConfigureAwait(false); + + if (response.IsError) + { + continue; + } + + var analysisResult = JsonConvert.DeserializeObject(response.Content.ToString()); + + if (analysisResult.Tasks.InProgress != 0) + { + // some jobs are still running. + runningJobsCount++; + } + } + + return runningJobsCount == 0; + } + /// /// Gets the (audio-level) results from text analytics, adds the results to the speech transcript. /// /// The conversation analysis job Ids. /// The speech transcript object. /// The errors, if any. - private async Task> AddConversationalEntitiesAsync( + public async Task> AddConversationalEntitiesAsync( IEnumerable conversationJobIds, SpeechTranscript speechTranscript) { @@ -530,7 +568,7 @@ private void PreparePiiRequest(SpeechTranscript speechTranscript, List piiResults, IEnumerable summarizationResults, IEnumerable errors)> GetConversationsOperationResults(string jobId) { - var piiResults = new List(); - var summarizationResults = new List(); var errors = new List(); try { @@ -574,14 +610,14 @@ private void PreparePiiRequest(SpeechTranscript speechTranscript, List item.Kind == AnalyzeConversationsTaskResultKind.conversationalPIIResults && (item as ConversationPiiItem)?.Results != null) - .Select(s => ((ConversationPiiItem)s).Results)); - summarizationResults.AddRange(analysisResult.Tasks - .Items - .Where(item => item.Kind == AnalyzeConversationsTaskResultKind.conversationalSummarizationResults && (item as ConversationSummarizationItem)?.Results != null) - .Select(s => ((ConversationSummarizationItem)s).Results)); + var piiResults = analysisResult.Tasks + .Items.Where(item => item.Kind == AnalyzeConversationsTaskResultKind.conversationalPIIResults) + .Select(s => s as ConversationPiiItem) + .Select(s => s.Results); + var summarizationResults = analysisResult.Tasks + .Items.Where(item => item.Kind == AnalyzeConversationsTaskResultKind.conversationalSummarizationResults) + .Select(s => s as ConversationSummarizationItem) + .Select(s => s.Results); return (piiResults, summarizationResults, errors); } } @@ -596,7 +632,7 @@ private void PreparePiiRequest(SpeechTranscript speechTranscript, List -``` - -Note: Replace `` with the actual name of your function app that you can get from the Azure Portal. It will look like `FetchTranscriptionFunction-20240531T092901Z`. - -2. In the local.settings.json file generated by the previous step, replace the value of the `AzureSpeechServicesKey` with the actual key for your Azure Speech Service instance. You can get this from the Azure portal. (If you're using the SQL database or the Text Analytics, do the same for these keys as well) - -3. Navigate to the FetchTranscription function running on your Azure via the portal, and click on Stop. 
You need to do this so that you don't have two instances of the FetchTranscription function running and listening to the same events when you start the function from your local machine in the next step. - -4. Run the following command to start the local function (this will apply your local code changes): - -``` -func start -``` \ No newline at end of file diff --git a/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionAnalytics/TextAnalytics/TextAnalyticsProvider.cs b/samples/ingestion/ingestion-client/FetchTranscription/TextAnalytics/TextAnalyticsProvider.cs similarity index 94% rename from samples/ingestion/ingestion-client/FetchTranscription/TranscriptionAnalytics/TextAnalytics/TextAnalyticsProvider.cs rename to samples/ingestion/ingestion-client/FetchTranscription/TextAnalytics/TextAnalyticsProvider.cs index 0857fee03..dab5a9b3a 100644 --- a/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionAnalytics/TextAnalytics/TextAnalyticsProvider.cs +++ b/samples/ingestion/ingestion-client/FetchTranscription/TextAnalytics/TextAnalyticsProvider.cs @@ -3,7 +3,7 @@ // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // -namespace FetchTranscription +namespace TextAnalytics { using System; using System.Collections.Generic; @@ -18,12 +18,35 @@ namespace FetchTranscription using Connector.Enums; using Connector.Serializable.TranscriptionStartedServiceBusMessage; + using FetchTranscriptionFunction; + + using Language; + using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using static Connector.Serializable.TranscriptionStartedServiceBusMessage.TextAnalyticsRequest; - public class TextAnalyticsProvider : ITranscriptionAnalyticsProvider + /// + /// The text analytics provide. + /// + /// General overview of text analytics request processing: + /// + /// For a succeded transcription, check if transcription has text analytics job info. + /// if true: + /// Check if text analytics job terminated. + /// if true: + /// Add text analytics results to transcript, write transcript to storage. + /// if false: + /// Re-enqueue job, check again after X minutes. + /// if false: + /// Check if text analytics is requested + /// if true: + /// Add text analytics job info to transcription. Re-enqueue job, check again after X minutes. + /// if false: + /// Write transcript to storage. 
+ /// + public class TextAnalyticsProvider { private const int MaxRecordsPerRequest = 25; @@ -39,7 +62,7 @@ public class TextAnalyticsProvider : ITranscriptionAnalyticsProvider public TextAnalyticsProvider(string locale, string subscriptionKey, string endpoint, ILogger log, IOptions appConfig) { - this.textAnalyticsClient = new TextAnalyticsClient(new Uri(endpoint), new AzureKeyCredential(subscriptionKey)); + this.textAnalyticsClient = new TextAnalyticsClient(new Uri($"https://{region}.api.cognitive.microsoft.com"), new AzureKeyCredential(subscriptionKey)); this.locale = locale; this.log = log; this.appConfig = appConfig?.Value; @@ -56,12 +79,7 @@ public async Task GetTranscriptionAnalyticsJobS { if (!this.IsTextAnalyticsRequested()) { - return TranscriptionAnalyticsJobStatus.Completed; - } - - if (!audioFileInfos.Where(audioFileInfo => audioFileInfo.TextAnalyticsRequests != null).Any()) - { - return TranscriptionAnalyticsJobStatus.NotSubmitted; + return true; } var runningTextAnalyticsRequests = new List(); @@ -78,7 +96,8 @@ public async Task GetTranscriptionAnalyticsJobS .SelectMany(audioFileInfo => audioFileInfo.TextAnalyticsRequests.UtteranceLevelRequests) .Where(text => text.Status == TextAnalyticsRequestStatus.Running)); - var status = TranscriptionAnalyticsJobStatus.Completed; + var textAnalyticsRequestCompleted = true; + foreach (var textAnalyticsJob in runningTextAnalyticsRequests) { var operation = new AnalyzeActionsOperation(textAnalyticsJob.Id, this.textAnalyticsClient); @@ -93,8 +112,7 @@ public async Task GetTranscriptionAnalyticsJobS } else { - // if one or more jobs are still running, report status as running: - status = TranscriptionAnalyticsJobStatus.Running; + textAnalyticsRequestCompleted = false; } } @@ -210,7 +228,7 @@ public async Task> AddTranscriptionAnalyticsResultsToTranscr /// The speech transcript object. /// The sentiment analysis setting. /// The job ids and errors, if any were found. - private async Task<(IEnumerable jobIds, IEnumerable errors)> SubmitUtteranceLevelRequests( + public async Task<(IEnumerable jobIds, IEnumerable errors)> SubmitUtteranceLevelRequests( SpeechTranscript speechTranscript, SentimentAnalysisSetting sentimentAnalysisSetting) { @@ -240,7 +258,7 @@ public async Task> AddTranscriptionAnalyticsResultsToTranscr /// The sentiment analysis setting. /// The PII redaction setting. /// The job ids and errors, if any were found. - private async Task<(IEnumerable jobIds, IEnumerable errors)> SubmitAudioLevelRequests( + public async Task<(IEnumerable jobIds, IEnumerable errors)> SubmitAudioLevelRequests( SpeechTranscript speechTranscript, SentimentAnalysisSetting sentimentAnalysisSetting, PiiRedactionSetting piiRedactionSetting) @@ -290,7 +308,7 @@ public async Task> AddTranscriptionAnalyticsResultsToTranscr /// The text analytics job ids. /// The speech transcript object. /// The errors, if any. - private async Task> AddUtteranceLevelEntitiesAsync( + public async Task> AddUtteranceLevelEntitiesAsync( IEnumerable jobIds, SpeechTranscript speechTranscript) { @@ -336,7 +354,7 @@ private async Task> AddUtteranceLevelEntitiesAsync( /// The text analytics job ids. /// The speech transcript object. /// The errors, if any. 
- private async Task> AddAudioLevelEntitiesAsync( + public async Task> AddAudioLevelEntitiesAsync( IEnumerable jobIds, SpeechTranscript speechTranscript) { diff --git a/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionAnalytics/ITranscriptionAnalyticsProvider.cs b/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionAnalytics/ITranscriptionAnalyticsProvider.cs deleted file mode 100644 index 3eeb6bfd6..000000000 --- a/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionAnalytics/ITranscriptionAnalyticsProvider.cs +++ /dev/null @@ -1,38 +0,0 @@ -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// - -namespace FetchTranscription -{ - using System.Collections.Generic; - using System.Threading.Tasks; - - using Connector; - using Connector.Enums; - using Connector.Serializable.TranscriptionStartedServiceBusMessage; - - public interface ITranscriptionAnalyticsProvider - { - /// - /// Gets the status of the transcription analytics jobs that are monitored by the provider - /// - /// The audio file infos with transcription analytics jobs info - /// The overall status of all jobs monitored by the provider - Task GetTranscriptionAnalyticsJobStatusAsync(IEnumerable audioFileInfos); - - /// - /// Submits transcription analytics jobs based on the transcript in speechtranscript and sets the job ids in the corresponding audio file infos. - /// - /// The mapping from audio file info to transcript - /// The errors if any. - Task> SubmitTranscriptionAnalyticsJobsAsync(Dictionary speechTranscriptMappings); - - /// - /// Fetches the transcription analytics results and adds them to the corresponding speech transcript - /// - /// The mapping from audio file info to transcript - /// The errors if any. - Task> AddTranscriptionAnalyticsResultsToTranscriptsAsync(Dictionary speechTranscriptMappings); - } -} diff --git a/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionAnalytics/TranscriptionAnalyticsOrchestrator.cs b/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionAnalytics/TranscriptionAnalyticsOrchestrator.cs deleted file mode 100644 index 78842df0d..000000000 --- a/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionAnalytics/TranscriptionAnalyticsOrchestrator.cs +++ /dev/null @@ -1,109 +0,0 @@ -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
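The interface removed above boils down to a three-step contract that the orchestrator in the next file fans out across its providers: report the status of previously submitted analytics jobs, submit new jobs, and merge finished results back into the transcripts. As a language-neutral illustration of that contract only, here is a rough Python sketch; the class, method, and enum names loosely mirror the C# ones but are otherwise hypothetical and not part of this repository.

```python
from abc import ABC, abstractmethod
from enum import Enum


class JobStatus(Enum):
    NOT_SUBMITTED = 1   # no analytics jobs recorded yet for these audio files
    RUNNING = 2         # at least one job is still in progress
    COMPLETED = 3       # every job has finished


class TranscriptionAnalyticsProvider(ABC):
    """Rough sketch of the provider contract used by the transcription analytics orchestrator."""

    @abstractmethod
    def get_job_status(self, audio_file_infos) -> JobStatus:
        """Return the overall status of the jobs this provider is tracking."""

    @abstractmethod
    def submit_jobs(self, transcript_by_audio_file) -> list:
        """Submit analytics jobs for each transcript, record the job ids, and return any errors."""

    @abstractmethod
    def add_results_to_transcripts(self, transcript_by_audio_file) -> list:
        """Fetch completed results, merge them into the transcripts, and return any errors."""
```

The orchestrator's merge rule then follows directly: if any provider reports not-submitted, the overall status is not-submitted; otherwise, if any reports running, the overall status is running; only when every provider has completed are the results written back.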
-// - -namespace FetchTranscription -{ - using System; - using System.Collections.Generic; - using System.Threading.Tasks; - - using Connector; - using Connector.Enums; - using Connector.Serializable.TranscriptionStartedServiceBusMessage; - - using Microsoft.Extensions.Logging; - using Microsoft.Extensions.Options; - - public sealed class TranscriptionAnalyticsOrchestrator - { - private readonly List providers; - - private readonly AppConfig appConfig; - - public TranscriptionAnalyticsOrchestrator( - string locale, - ILogger logger, - IOptions appConfig) - { - this.appConfig = appConfig?.Value; - this.providers = new List(); - - if (!string.IsNullOrEmpty(this.appConfig.TextAnalyticsKey) && !string.IsNullOrEmpty(this.appConfig.TextAnalyticsEndpoint)) - { - this.providers.Add(new TextAnalyticsProvider(locale, this.appConfig.TextAnalyticsKey, this.appConfig.TextAnalyticsEndpoint, logger, Options.Create(this.appConfig))); - this.providers.Add(new AnalyzeConversationsProvider(locale, this.appConfig.TextAnalyticsKey, this.appConfig.TextAnalyticsEndpoint, logger, Options.Create(this.appConfig))); - } - } - - /// - /// Gets the merged status of all transcription analytics jobs. - /// - /// The transcription started service bus message. - /// The merged job status. - public async Task GetTranscriptionAnalyticsJobsStatusAsync(TranscriptionStartedMessage transcriptionStartedMessage) - { - _ = transcriptionStartedMessage ?? throw new ArgumentNullException(nameof(transcriptionStartedMessage)); - - foreach (var provider in this.providers) - { - var providerStatus = await provider.GetTranscriptionAnalyticsJobStatusAsync(transcriptionStartedMessage.AudioFileInfos).ConfigureAwait(false); - - // if any is not submitted, we can safely return here since we submit all requests at the same time - therefore all other providers should not have any running requests. - if (providerStatus == TranscriptionAnalyticsJobStatus.NotSubmitted) - { - return TranscriptionAnalyticsJobStatus.NotSubmitted; - } - - // if any is running, we set the status to running and fetch it again after some time. - if (providerStatus == TranscriptionAnalyticsJobStatus.Running) - { - return TranscriptionAnalyticsJobStatus.Running; - } - } - - return TranscriptionAnalyticsJobStatus.Completed; - } - - /// - /// Submit transcription analytics jobs and adds their IDs to the audio file infos, so that they can get fetched the next time the transcription job status is polled. - /// - /// The mapping from audio file infos to speech transcripts. - /// The errors if any. - public async Task> SubmitTranscriptionAnalyticsJobsAndAddToAudioFileInfos(Dictionary speechTranscriptMappings) - { - _ = speechTranscriptMappings ?? throw new ArgumentNullException(nameof(speechTranscriptMappings)); - - var errors = new List(); - - foreach (var provider in this.providers) - { - var providerErros = await provider.SubmitTranscriptionAnalyticsJobsAsync(speechTranscriptMappings).ConfigureAwait(false); - errors.AddRange(providerErros); - } - - return errors; - } - - /// - /// Adds the result of all transcription analytics jobs to the corresponding speech transcript. - /// - /// The mapping from audio file infos to speech transcripts. - /// The errors if any. - public async Task> AddTranscriptionAnalyticsResultsToTranscripts(Dictionary speechTranscriptMappings) - { - _ = speechTranscriptMappings ?? 
throw new ArgumentNullException(nameof(speechTranscriptMappings)); - - var errors = new List(); - - foreach (var provider in this.providers) - { - var providerErros = await provider.AddTranscriptionAnalyticsResultsToTranscriptsAsync(speechTranscriptMappings).ConfigureAwait(false); - errors.AddRange(providerErros); - } - - return errors; - } - } -} diff --git a/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionProcessor.cs b/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionProcessor.cs index faa4c7f10..c6aa28af6 100644 --- a/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionProcessor.cs +++ b/samples/ingestion/ingestion-client/FetchTranscription/TranscriptionProcessor.cs @@ -3,7 +3,7 @@ // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // -namespace FetchTranscription +namespace FetchTranscriptionFunction { using System; using System.Collections.Generic; @@ -20,12 +20,16 @@ namespace FetchTranscription using Connector.Enums; using Connector.Serializable.TranscriptionStartedServiceBusMessage; + using Language; + using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.Azure; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Newtonsoft.Json; + using TextAnalytics; + using static Connector.Serializable.TranscriptionStartedServiceBusMessage.TextAnalyticsRequest; public class TranscriptionProcessor { @@ -393,21 +397,112 @@ private async Task ProcessSucceededTranscriptionAsync(string transcriptionLocati } } - if (transcriptionAnalyticsJobStatus == TranscriptionAnalyticsJobStatus.Completed) + if (textAnalyticsProvider != null && + (FetchTranscriptionEnvironmentVariables.SentimentAnalysisSetting != SentimentAnalysisSetting.None + || FetchTranscriptionEnvironmentVariables.PiiRedactionSetting != PiiRedactionSetting.None + || AnalyzeConversationsProvider.IsConversationalPiiEnabled() + || AnalyzeConversationsProvider.IsConversationalSummarizationEnabled())) { - var errors = await transcriptionAnalyticsOrchestrator.AddTranscriptionAnalyticsResultsToTranscripts(speechTranscriptMappings).ConfigureAwait(false); - - foreach (var error in errors) + // If we already got text analytics requests in the transcript (containsTextAnalyticsRequest), add the results to the transcript. + // Otherwise, submit new text analytics requests. 
+ if (containsTextAnalyticsRequest) { - generalErrorsStringBuilder.AppendLine(error); + foreach (var speechTranscriptMapping in speechTranscriptMappings) + { + var speechTranscript = speechTranscriptMapping.Value; + var audioFileInfo = speechTranscriptMapping.Key; + var fileName = audioFileInfo.FileName; + if (FetchTranscriptionEnvironmentVariables.PiiRedactionSetting != PiiRedactionSetting.None) + { + speechTranscript.RecognizedPhrases.ToList().ForEach(phrase => + { + if (phrase.NBest != null && phrase.NBest.Any()) + { + var firstNBest = phrase.NBest.First(); + phrase.NBest = new[] { firstNBest }; + } + }); + } + + var textAnalyticsErrors = new List(); + + if (audioFileInfo.TextAnalyticsRequests.AudioLevelRequests?.Any() == true) + { + var audioLevelErrors = await textAnalyticsProvider.AddAudioLevelEntitiesAsync(audioFileInfo.TextAnalyticsRequests.AudioLevelRequests.Select(request => request.Id), speechTranscript).ConfigureAwait(false); + textAnalyticsErrors.AddRange(audioLevelErrors); + } + + if (audioFileInfo.TextAnalyticsRequests.UtteranceLevelRequests?.Any() == true) + { + var utteranceLevelErrors = await textAnalyticsProvider.AddUtteranceLevelEntitiesAsync(audioFileInfo.TextAnalyticsRequests.UtteranceLevelRequests.Select(request => request.Id), speechTranscript).ConfigureAwait(false); + textAnalyticsErrors.AddRange(utteranceLevelErrors); + } + + if (audioFileInfo.TextAnalyticsRequests.ConversationRequests?.Any() == true) + { + var conversationalAnalyticsErrors = await conversationsAnalysisProvider.AddConversationalEntitiesAsync(audioFileInfo.TextAnalyticsRequests.ConversationRequests.Select(request => request.Id), speechTranscript).ConfigureAwait(false); + textAnalyticsErrors.AddRange(conversationalAnalyticsErrors); + } + + if (textAnalyticsErrors.Any()) + { + var distinctErrors = textAnalyticsErrors.Distinct(); + var errorMessage = $"File {(string.IsNullOrEmpty(fileName) ? 
"unknown" : fileName)}:\n{string.Join('\n', distinctErrors)}"; + + generalErrorsStringBuilder.AppendLine(errorMessage); + } + } } - } - else if (transcriptionAnalyticsJobStatus == TranscriptionAnalyticsJobStatus.NotSubmitted) - { - var errors = await transcriptionAnalyticsOrchestrator.SubmitTranscriptionAnalyticsJobsAndAddToAudioFileInfos(speechTranscriptMappings).ConfigureAwait(false); - foreach (var error in errors) + else { - generalErrorsStringBuilder.AppendLine(error); + foreach (var speechTranscriptMapping in speechTranscriptMappings) + { + var speechTranscript = speechTranscriptMapping.Value; + var audioFileInfo = speechTranscriptMapping.Key; + + var fileName = audioFileInfo.FileName; + + if (speechTranscript.RecognizedPhrases != null && speechTranscript.RecognizedPhrases.All(phrase => phrase.RecognitionStatus.Equals("Success", StringComparison.Ordinal))) + { + var textAnalyticsErrors = new List(); + + (var utteranceLevelJobIds, var utteranceLevelErrors) = await textAnalyticsProvider.SubmitUtteranceLevelRequests( + speechTranscript, + FetchTranscriptionEnvironmentVariables.SentimentAnalysisSetting).ConfigureAwait(false); + + var utteranceLevelRequests = utteranceLevelJobIds?.Select(jobId => new TextAnalyticsRequest(jobId, TextAnalyticsRequestStatus.Running)); + textAnalyticsErrors.AddRange(utteranceLevelErrors); + + (var audioLevelJobIds, var audioLevelErrors) = await textAnalyticsProvider.SubmitAudioLevelRequests( + speechTranscript, + FetchTranscriptionEnvironmentVariables.SentimentAnalysisSetting, + FetchTranscriptionEnvironmentVariables.PiiRedactionSetting).ConfigureAwait(false); + + var audioLevelRequests = audioLevelJobIds?.Select(jobId => new TextAnalyticsRequest(jobId, TextAnalyticsRequestStatus.Running)); + textAnalyticsErrors.AddRange(audioLevelErrors); + + (var conversationJobIds, var conversationErrors) = await conversationsAnalysisProvider.SubmitAnalyzeConversationsRequestAsync(speechTranscript).ConfigureAwait(false); + + var conversationalRequests = conversationJobIds?.Select(jobId => new TextAnalyticsRequest(jobId, TextAnalyticsRequestStatus.Running)); + textAnalyticsErrors.AddRange(conversationErrors); + + audioFileInfo.TextAnalyticsRequests = new TextAnalyticsRequests(utteranceLevelRequests, audioLevelRequests, conversationalRequests); + + if (textAnalyticsErrors.Any()) + { + var distinctErrors = textAnalyticsErrors.Distinct(); + var errorMessage = $"File {(string.IsNullOrEmpty(fileName) ? "unknown" : fileName)}:\n{string.Join('\n', distinctErrors)}"; + + generalErrorsStringBuilder.AppendLine(errorMessage); + } + } + } + + log.LogInformation($"Added text analytics requests to service bus message - re-queueing message."); + + // Poll for first time with TA request after 1 minute + await ServiceBusUtilities.SendServiceBusMessageAsync(FetchServiceBusSender, serviceBusMessage.CreateMessageString(), log, TimeSpan.FromMinutes(1)).ConfigureAwait(false); + return; } var textAnalyticsSubmitErrors = generalErrorsStringBuilder.ToString(); diff --git a/samples/ingestion/ingestion-client/StartTranscriptionByServiceBus/README.md b/samples/ingestion/ingestion-client/StartTranscriptionByServiceBus/README.md deleted file mode 100644 index aca51650e..000000000 --- a/samples/ingestion/ingestion-client/StartTranscriptionByServiceBus/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# Local development instructions - -Pre-requisites: -1. 
Please follow the instructions on [the main guide](./../infra/guide.md#ingestion-client-setup-instructions) to deploy the Ingestion Client and associated ecosystem to Azure. -2. Reopen the project within a [devcontainer](https://containers.dev/overview). (The devcontainer settings at the root of the project have the tools needed to develop and run this function locally) - -To run the StartTranscriptionByServiceBus function locally, do the following: - -1. Run the following command to fetch your Azure Function app settings and save them to local.settings.json: - -``` -func azure functionapp fetch-app-settings -``` - -Note: Replace `` with the actual name of your function app that you can get from the Azure Portal. It will look like `StartTranscriptionFunction-20240531T092901Z`. - -2. In the local.settings.json file generated by the previous step, replace the value of the `AzureSpeechServicesKey` with the actual key for your Azure Speech Service instance. You can get this from the Azure portal. - -3. Navigate to the StartTranscription function running on your Azure via the portal, and click on Stop. You need to do this so that you don't have two instances of the StartTranscription function running and listening to the same events when you start the function from your local machine in the next step. - -4. Run the following command to start the local function (this will apply your local code changes): - -``` -func start -``` \ No newline at end of file diff --git a/samples/ingestion/ingestion-client/StartTranscriptionByTimer/README.md b/samples/ingestion/ingestion-client/StartTranscriptionByTimer/README.md deleted file mode 100644 index da5773518..000000000 --- a/samples/ingestion/ingestion-client/StartTranscriptionByTimer/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# Local development instructions - -Pre-requisites: -1. Please follow the instructions on [the main guide](./../infra/guide.md#ingestion-client-setup-instructions) to deploy the Ingestion Client and associated ecosystem to Azure. -2. Reopen the project within a [devcontainer](https://containers.dev/overview). (The devcontainer settings at the root of the project have the tools needed to develop and run this function locally) - -To run the StartTranscriptionByTimer function locally, do the following: - -1. Run the following command to fetch your Azure Function app settings and save them to local.settings.json: - -``` -func azure functionapp fetch-app-settings -``` - -Note: Replace `` with the actual name of your function app that you can get from the Azure Portal. It will look like `StartTranscriptionFunction-20240531T092901Z`. - -2. In the local.settings.json file generated by the previous step, replace the value of the `AzureSpeechServicesKey` with the actual key for your Azure Speech Service instance. You can get this from the Azure portal. - -3. Navigate to the StartTranscription function running on your Azure via the portal, and click on Stop. You need to do this so that you don't have two instances of the StartTranscription function running and listening to the same events when you start the function from your local machine in the next step. - -4. 
Run the following command to start the local function (this will apply your local code changes): - -``` -func start -``` \ No newline at end of file diff --git a/samples/ingestion/ingestion-client/Tests/EndToEndTests.cs b/samples/ingestion/ingestion-client/Tests/EndToEndTests.cs index 0ea366690..555d94261 100644 --- a/samples/ingestion/ingestion-client/Tests/EndToEndTests.cs +++ b/samples/ingestion/ingestion-client/Tests/EndToEndTests.cs @@ -14,7 +14,7 @@ namespace Tests using Connector.Serializable.Language.Conversations; using Connector.Serializable.TranscriptionStartedServiceBusMessage; - using FetchTranscription; + using Language; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; @@ -69,9 +69,11 @@ public async Task AnalyzeConversationTestAsync() var errors = await provider.SubmitTranscriptionAnalyticsJobsAsync(speechTranscriptMapping).ConfigureAwait(false); Console.WriteLine("Submit"); - Assert.AreEqual(0, errors.Count()); + Console.WriteLine(JsonConvert.SerializeObject(jobIds)); + Assert.AreEqual(0, jobIds.errors.Count()); + var req = jobIds.jobIds.Select(jobId => new AudioFileInfo(default, default, new TextAnalyticsRequests(default, default, new[] { new TextAnalyticsRequest(jobId, TextAnalyticsRequest.TextAnalyticsRequestStatus.Running) }))); - while ((await provider.GetTranscriptionAnalyticsJobStatusAsync(speechTranscriptMapping.Keys).ConfigureAwait(false)) == Connector.Enums.TranscriptionAnalyticsJobStatus.Running) + while (!await provider.ConversationalRequestsCompleted(req).ConfigureAwait(false)) { await Task.Delay(TimeSpan.FromSeconds(10)).ConfigureAwait(false); Console.WriteLine($"[{DateTime.Now}]jobs are running..."); @@ -79,7 +81,7 @@ public async Task AnalyzeConversationTestAsync() Console.WriteLine($"[{DateTime.Now}]jobs done."); - var err = await provider.AddTranscriptionAnalyticsResultsToTranscriptsAsync(speechTranscriptMapping); + var err = await provider.AddConversationalEntitiesAsync(jobIds.jobIds, transcription); Console.WriteLine($"annotation result: {JsonConvert.SerializeObject(transcription)}"); Assert.AreEqual(0, err.Count()); Assert.AreEqual(4, transcription.ConversationAnalyticsResults.AnalyzeConversationSummarizationResults.Conversations.First().Summaries.Count()); diff --git a/samples/ingestion/ingestion-client/infra/guide.md b/samples/ingestion/ingestion-client/infra/guide.md index 8f2c467ea..4ad4595c3 100644 --- a/samples/ingestion/ingestion-client/infra/guide.md +++ b/samples/ingestion/ingestion-client/infra/guide.md @@ -14,7 +14,7 @@ The following diagram shows the structure of this tool as defined by the ARM tem ![Architecture](./images/architecture.png) -When a file lands in a storage container, the Grid event indicates the completed upload of a file. The file is filtered and pushed to a Service bus topic. Code in Azure Functions triggered by a timer picks up the event and creates a transcription request using the Azure Speech services batch pipeline. When the transcription request is complete, an event is placed in another queue in the same service bus resource. A different Azure Function triggered by the completion event starts monitoring transcription completion status. When transcription completes, the Azure Function copies the transcript into the same container where the audio file was obtained. +When a file lands in a storage container, the Grid event indicates the completed upload of a file. The file is filtered and pushed to a Service bus topic. 
Code in Azure Functions triggered by a timer picks up the event and creates a transcription request using the Azure Speech services batch pipeline. When the transcription request is complete, an event is placed in another queue in the same service bus resource. A different Azure Function triggered by the completion event starts monitoring transcription completion status. When transcription completes, the Azure Function copies the transcript into the same container where the audio file was obtained. The rest of the features are applied on demand. By deploying additional resources through the ARM template, you can choose to apply analytics on the transcript, produce reports or redact. @@ -57,7 +57,17 @@ To deploy the required infrastructure, click the button below: [![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure-Samples%2Fcognitive-services-speech-sdk%2Fmaster%2Fsamples%2Fingestion%2Fingestion-client%2Finfra%2Fmain.json) -This will result in the screen below on your browser. You will need to fill in the form provided. It is +4. Load the template by clicking **Load file**. Alternatively, +you could copy/paste the template in the editor. + +![Load template](./images/image007.png) + +5. Once the template text is loaded you will be able to read and edit the template. Do +**NOT** attempt any edits at this stage. You need to save the template you loaded, so click the **Save** button. + +![Save template](./images/image009.png) + +Saving the template will result in the screen below. You will need to fill in the form provided. It is important that all the information is correct. Let us look at the form and go through each field. ![form template](./images/image011.png) diff --git a/samples/java/android/SpeechSynthesis/app/build.gradle b/samples/java/android/SpeechSynthesis/app/build.gradle index c8973010f..1dbf23377 100644 --- a/samples/java/android/SpeechSynthesis/app/build.gradle +++ b/samples/java/android/SpeechSynthesis/app/build.gradle @@ -28,7 +28,7 @@ android { dependencies { // Speech SDK - implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.38.0' + implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.40.0' implementation 'androidx.appcompat:appcompat:1.6.1' implementation 'com.google.android.material:material:1.8.0' diff --git a/samples/java/android/SpeechSynthesis/app/src/main/java/com/microsoft/cognitiveservices/speech/samples/speechsynthesis/MainActivity.java b/samples/java/android/SpeechSynthesis/app/src/main/java/com/microsoft/cognitiveservices/speech/samples/speechsynthesis/MainActivity.java index 993e106d7..d314fa636 100644 --- a/samples/java/android/SpeechSynthesis/app/src/main/java/com/microsoft/cognitiveservices/speech/samples/speechsynthesis/MainActivity.java +++ b/samples/java/android/SpeechSynthesis/app/src/main/java/com/microsoft/cognitiveservices/speech/samples/speechsynthesis/MainActivity.java @@ -125,7 +125,7 @@ public void onCreateSynthesizerButtonClicked(View v) { // Use 24k Hz format for higher quality. speechConfig.setSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm); // Set voice name.
- speechConfig.setSpeechSynthesisVoiceName("en-US-AvaMultilingualNeural"); + speechConfig.setSpeechSynthesisVoiceName("en-US-JennyNeural"); synthesizer = new SpeechSynthesizer(speechConfig, null); connection = Connection.fromSpeechSynthesizer(synthesizer); diff --git a/samples/java/android/avatar/app/build.gradle b/samples/java/android/avatar/app/build.gradle index 477318b20..279b9bb72 100644 --- a/samples/java/android/avatar/app/build.gradle +++ b/samples/java/android/avatar/app/build.gradle @@ -28,7 +28,7 @@ android { dependencies { // Speech SDK - implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.38.0' + implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.40.0' implementation 'androidx.appcompat:appcompat:1.6.1' implementation 'com.google.android.material:material:1.8.0' diff --git a/samples/java/android/compressed-input/app/build.gradle b/samples/java/android/compressed-input/app/build.gradle index 8ffa93aea..6f4a250b3 100644 --- a/samples/java/android/compressed-input/app/build.gradle +++ b/samples/java/android/compressed-input/app/build.gradle @@ -25,7 +25,7 @@ android { dependencies { // Speech SDK - implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.38.0' + implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.40.0' implementation 'androidx.appcompat:appcompat:1.3.1' implementation 'com.google.android.material:material:1.4.0' diff --git a/samples/java/android/embedded-speech/README.md b/samples/java/android/embedded-speech/README.md index f09e86b1a..1c5721ebc 100644 --- a/samples/java/android/embedded-speech/README.md +++ b/samples/java/android/embedded-speech/README.md @@ -38,20 +38,16 @@ To build: * Files belonging to a specific model must be present as normal individual files in a model folder, not in a package, and they must be readable by the application process. The model internal subfolder structure must be intact i.e. as originally delivered. + * `EmbeddedSpeechModelLicense` + * License text. It is presumed that all the customer's embedded speech models use the same license. * `EmbeddedSpeechRecognitionModelName` * Name of the embedded speech recognition model to be used for recognition. If recognition is not needed, leave the default value unchanged. * The model name can be short (see https://aka.ms/speech/sr-languages, e.g. `en-US`) or full (e.g. `Microsoft Speech Recognizer en-US FP Model V8.1`). - * `EmbeddedSpeechRecognitionModelKey` - * Decryption key of the (encrypted) embedded speech recognition model. - If recognition is not needed, leave the default value unchanged. * `EmbeddedSpeechSynthesisVoiceName` * Name of the embedded speech synthesis voice to be used for synthesis. If synthesis is not needed, leave the default value unchanged. * The voice name can be short (see https://aka.ms/speech/tts-languages, e.g. `en-US-AriaNeural`) or full (e.g. `Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)`). - * `EmbeddedSpeechSynthesisVoiceKey` - * Decryption key of the (encrypted) embedded speech synthesis voice. - If synthesis is not needed, leave the default value unchanged. 1. Press **Ctrl+F9** or select **Build** \> **Make Project**. Note: If the build is successful but Android Studio shows references to Speech SDK symbols in red and displays "*Cannot resolve symbol ...*", delete `.gradle` and `.idea` folders, then rebuild. 
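For orientation only, here is a minimal sketch (not part of the patched sample code) of how the model/voice name and license settings described above are typically passed to an `EmbeddedSpeechConfig`, matching the `setSpeechRecognitionModel(name, license)` / `setSpeechSynthesisVoice(name, license)` calls used elsewhere in this change. The folder path, model name, voice name, and license string below are placeholder assumptions; replace them with your own values.

```
import com.microsoft.cognitiveservices.speech.embedded.EmbeddedSpeechConfig;

public class EmbeddedSpeechConfigSketch {
    public static void main(String[] args) throws Exception {
        // Folder on the device that contains the embedded model/voice data (placeholder path).
        EmbeddedSpeechConfig config = EmbeddedSpeechConfig.fromPath("/data/local/tmp/models");

        // One license text is presumed to cover all of the customer's embedded models.
        String license = "YourEmbeddedSpeechModelLicense";

        // Short or full names work, as noted above (placeholder names shown here).
        config.setSpeechRecognitionModel("en-US", license);
        config.setSpeechSynthesisVoice("en-US-AriaNeural", license);
    }
}
```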
@@ -67,7 +63,7 @@ Note: If the build is successful but Android Studio shows references to Speech S This access method will not work if the application specifies a target API level 30 (Android 11) or higher. See https://developer.android.com/about/versions/11/privacy/storage for information on alternatives. 1. Use the buttons in the app as follows. - * *Initialize objects* : Initializes the recognizer and/or synthesizer based on model/voice configuration (name, key) in `MainActivity`. + * *Initialize objects* : Initializes the recognizer and/or synthesizer based on model/voice configuration (name, license) in `MainActivity`. * This can take a moment due to loading of model data, so it is best done as a separate stage in advance before starting recognition or synthesis. * *Recognize speech* : Listens to the device default microphone for input and transcribes recognized speech to text in the app window. This returns one result - run it again to recognize more. * *Synthesize speech* : Reads input from the text entry above this button, and synthesizes speech to the device default speaker. @@ -81,7 +77,7 @@ Do **not** add [client-sdk](https://mvnrepository.com/artifact/com.microsoft.cog **Note:** Make sure that `@aar` suffix is used when the dependency is specified in `build.gradle`. For example, ``` dependencies { - implementation 'com.microsoft.cognitiveservices.speech:client-sdk-embedded:1.35.0@aar' + implementation 'com.microsoft.cognitiveservices.speech:client-sdk-embedded:1.40.0@aar' ... ``` diff --git a/samples/java/android/embedded-speech/app/build.gradle b/samples/java/android/embedded-speech/app/build.gradle index c6ca83846..f5744362e 100644 --- a/samples/java/android/embedded-speech/app/build.gradle +++ b/samples/java/android/embedded-speech/app/build.gradle @@ -25,7 +25,7 @@ dependencies { implementation fileTree(include: ['*.jar'], dir: 'libs') // Speech SDK - implementation 'com.microsoft.cognitiveservices.speech:client-sdk-embedded:1.38.0@aar' + implementation 'com.microsoft.cognitiveservices.speech:client-sdk-embedded:1.40.0@aar' implementation 'androidx.appcompat:appcompat:1.3.1' implementation 'androidx.constraintlayout:constraintlayout:2.1.0' diff --git a/samples/java/android/embedded-speech/app/src/main/java/com/microsoft/cognitiveservices/speech/samples/embedded/MainActivity.java b/samples/java/android/embedded-speech/app/src/main/java/com/microsoft/cognitiveservices/speech/samples/embedded/MainActivity.java index 60b7ab2ed..ead1c35ff 100644 --- a/samples/java/android/embedded-speech/app/src/main/java/com/microsoft/cognitiveservices/speech/samples/embedded/MainActivity.java +++ b/samples/java/android/embedded-speech/app/src/main/java/com/microsoft/cognitiveservices/speech/samples/embedded/MainActivity.java @@ -28,12 +28,11 @@ public class MainActivity extends AppCompatActivity * START OF CONFIGURABLE SETTINGS * **********************************/ - // Enter the names and keys of your embedded speech recognition model and synthesis voice. - // If either recognition or synthesis is not needed, leave the corresponding default values unchanged. + // Enter the names of your embedded speech recognition model and synthesis voice, and the license (text). + // If either recognition or synthesis is not needed, leave the corresponding name string empty. + private static final String EmbeddedSpeechModelLicense = ""; // license text (presumed to be the same for all the customer's models) private static final String EmbeddedSpeechRecognitionModelName = ""; // e.g.
"en-US" or "Microsoft Speech Recognizer en-US FP Model V8.1" - private static final String EmbeddedSpeechRecognitionModelKey = ""; // model decryption key private static final String EmbeddedSpeechSynthesisVoiceName = ""; // e.g. "en-US-AriaNeural" or "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)" - private static final String EmbeddedSpeechSynthesisVoiceKey = ""; // voice decryption key // Embedded speech recognition models and synthesis voices must reside // as normal individual files in model/voice specific folders on the @@ -100,10 +99,10 @@ public void onInitObjectsButtonClicked(View v) StringBuilder sb = new StringBuilder("Initialized"); - if (!EmbeddedSpeechRecognitionModelName.isEmpty() && !EmbeddedSpeechRecognitionModelKey.isEmpty()) + if (!EmbeddedSpeechRecognitionModelName.isEmpty() && !EmbeddedSpeechModelLicense.isEmpty()) { // Selects the embedded speech recognition model to use. - speechConfig.setSpeechRecognitionModel(EmbeddedSpeechRecognitionModelName, EmbeddedSpeechRecognitionModelKey); + speechConfig.setSpeechRecognitionModel(EmbeddedSpeechRecognitionModelName, EmbeddedSpeechModelLicense); // Creates a speech recognizer instance using the device default // microphone for audio input. @@ -114,10 +113,10 @@ public void onInitObjectsButtonClicked(View v) sb.append(" recognizer"); } - if (!EmbeddedSpeechSynthesisVoiceName.isEmpty() && !EmbeddedSpeechSynthesisVoiceKey.isEmpty()) + if (!EmbeddedSpeechSynthesisVoiceName.isEmpty() && !EmbeddedSpeechModelLicense.isEmpty()) { // Selects the embedded speech synthesis voice to use. - speechConfig.setSpeechSynthesisVoice(EmbeddedSpeechSynthesisVoiceName, EmbeddedSpeechSynthesisVoiceKey); + speechConfig.setSpeechSynthesisVoice(EmbeddedSpeechSynthesisVoiceName, EmbeddedSpeechModelLicense); if (EmbeddedSpeechSynthesisVoiceName.contains("Neural")) { diff --git a/samples/java/android/sdkdemo/app/build.gradle b/samples/java/android/sdkdemo/app/build.gradle index 1f6454be0..b419af94b 100644 --- a/samples/java/android/sdkdemo/app/build.gradle +++ b/samples/java/android/sdkdemo/app/build.gradle @@ -25,7 +25,7 @@ android { dependencies { // Speech SDK - implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.38.0' + implementation 'com.microsoft.cognitiveservices.speech:client-sdk:1.40.0' // Diff lib for pronunciation assessment implementation "io.github.java-diff-utils:java-diff-utils:4.11" diff --git a/samples/java/jre/console/README.md b/samples/java/jre/console/README.md index f17769fa5..f74fb46d1 100644 --- a/samples/java/jre/console/README.md +++ b/samples/java/jre/console/README.md @@ -18,15 +18,6 @@ This sample demonstrates various forms of speech recognition, intent recognition sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - - ```sh - sudo yum update - sudo yum install alsa-lib java-1.8.0-openjdk-devel openssl - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - 1. This sample has not been verified with Eclipse on ARM platforms. 
## Build the sample diff --git a/samples/java/jre/console/pom.xml b/samples/java/jre/console/pom.xml index 2c663476f..32a95466c 100644 --- a/samples/java/jre/console/pom.xml +++ b/samples/java/jre/console/pom.xml @@ -56,7 +56,7 @@ com.microsoft.cognitiveservices.speech client-sdk - 1.38.0 + 1.40.0 jakarta.json diff --git a/samples/java/jre/console/src/com/microsoft/cognitiveservices/speech/samples/console/SpeechSynthesisSamples.java b/samples/java/jre/console/src/com/microsoft/cognitiveservices/speech/samples/console/SpeechSynthesisSamples.java index 9237dc812..8a58c0134 100644 --- a/samples/java/jre/console/src/com/microsoft/cognitiveservices/speech/samples/console/SpeechSynthesisSamples.java +++ b/samples/java/jre/console/src/com/microsoft/cognitiveservices/speech/samples/console/SpeechSynthesisSamples.java @@ -126,11 +126,11 @@ public static void synthesisWithVoiceAsync() throws InterruptedException, Execut SpeechConfig config = SpeechConfig.fromSubscription("YourSubscriptionKey", "YourServiceRegion"); // Sets the voice name. - // e.g. "en-US-AndrewMultilingualNeural". + // e.g. "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)". // The full list of supported voices can be found here: // https://aka.ms/csspeech/voicenames // And, you can try getVoicesAsync method to get all available voices (see synthesisGetAvailableVoicesAsync() sample below). - String voice = "en-US-AndrewMultilingualNeural"; + String voice = "en-US-JennyNeural"; config.setSpeechSynthesisVoiceName(voice); // Creates a speech synthesizer using the default speaker as audio output. @@ -864,7 +864,7 @@ public static void synthesisBookmarkEventAsync() throws InterruptedException, Ex new Scanner(System.in).nextLine(); // Bookmark tag is needed in the SSML, e.g. - String ssml = " one. two. three. four."; + String ssml = " one. two. three. four."; SpeechSynthesisResult result = synthesizer.SpeakSsmlAsync(ssml).get(); diff --git a/samples/java/jre/console/src/com/microsoft/cognitiveservices/speech/samples/console/SpeechSynthesisScenarioSamples.java b/samples/java/jre/console/src/com/microsoft/cognitiveservices/speech/samples/console/SpeechSynthesisScenarioSamples.java index acc47fb8f..64026c6b4 100644 --- a/samples/java/jre/console/src/com/microsoft/cognitiveservices/speech/samples/console/SpeechSynthesisScenarioSamples.java +++ b/samples/java/jre/console/src/com/microsoft/cognitiveservices/speech/samples/console/SpeechSynthesisScenarioSamples.java @@ -90,7 +90,7 @@ public SpeechSynthesisService() { /** * A thread-safe method to synthesize content * @param content The text to synthesize - * @param voice The voice name, e.g. en-US-AvaMultilingualNeural + * @param voice The voice name, e.g. en-US-JennyNeural * @return The first byte latency and processing time, in millisecond. */ public long[] synthesis(String content, String voice) { @@ -149,7 +149,7 @@ public static void synthesisServerScenarioAsync() throws InterruptedException { System.out.printf("Turn: %d%n", finalTurn); IntStream.range(0, 64).parallel().forEach(i -> { - long[] latency = service.synthesis(String.format("today is a nice day. %d%d", finalTurn, i), "en-US-AvaMultilingualNeural"); + long[] latency = service.synthesis(String.format("today is a nice day. 
%d%d", finalTurn, i), "en-US-JennyNeural"); if (finalTurn > 0) { latencies.add(latency[0]); processingTimes.add(latency[1]); diff --git a/samples/java/jre/embedded-speech/README.md b/samples/java/jre/embedded-speech/README.md index 32289f54b..ce62e2cc4 100644 --- a/samples/java/jre/embedded-speech/README.md +++ b/samples/java/jre/embedded-speech/README.md @@ -19,7 +19,7 @@ See the [platform requirements for installing the Speech SDK](https://learn.micr Requirements specific to embedded speech samples are as follows. * Supported operating systems and architectures: * Windows - `x64`. - * Linux - `x64`, `ARM64`. Note that embedded speech is not supported on RHEL/CentOS 7. + * Linux - `x64`, `ARM64`. * macOS - `x64`, `ARM64`. * Java 8 or newer JDK. Check with `java -version` on the command line. * **Note:** Make sure that the Java installation is native to the system architecture (e.g. macOS `ARM64`) and not running through emulation. @@ -57,12 +57,10 @@ To tailor the sample to your configuration, there are two options: * Alternatively set corresponding environment variables (shown in parentheses in the list) before running the sample application. See details in [how to run the sample](#run-the-sample). Sample settings: -1. `SpeechRecognitionLocale` (`SPEECH_RECOGNITION_LOCALE`) - * Speech recognition locale in BCP-47 format, case-sensitive. If not set, en-US will be assumed. - * Setting `EmbeddedSpeechRecognitionModelName` overrides this for embedded speech. -1. `SpeechSynthesisLocale` (`SPEECH_SYNTHESIS_LOCALE`) - * Speech synthesis locale in BCP-47 format, case-sensitive. If not set, en-US will be assumed. - * Setting `EmbeddedSpeechSynthesisVoiceName` overrides this for embedded speech. +1. `EmbeddedSpeechModelLicense` (`EMBEDDED_SPEECH_MODEL_LICENSE`) + * Embedded speech model license (text). + * This applies to embedded speech recognition, synthesis and translation. + * It is presumed that all the customer's embedded speech models use the same license. 1. `EmbeddedSpeechRecognitionModelPath` (`EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH`) * Path to the local embedded speech recognition model(s) on the device file system. This may be a single model folder or a top-level folder for several models. @@ -72,10 +70,8 @@ Sample settings: not inside an archive, and they must be readable by the application process. The model internal subfolder structure must be intact i.e. as originally delivered. 1. `EmbeddedSpeechRecognitionModelName` (`EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME`) - * Name of the embedded speech recognition model to be used for recognition. If set, this overrides `SpeechRecognitionLocale` for embedded. + * Name of the embedded speech recognition model to be used for recognition. * The model name can be short (see https://aka.ms/speech/sr-languages, e.g. `en-US`) or full (e.g. `Microsoft Speech Recognizer en-US FP Model V8`). -1. `EmbeddedSpeechRecognitionModelKey` (`EMBEDDED_SPEECH_RECOGNITION_MODEL_KEY`) - * Decryption key of the (encrypted) embedded speech recognition model. 1. `EmbeddedSpeechSynthesisVoicePath` (`EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH`) * Path to the local embedded speech synthesis voice(s) on the device file system. This may be a single voice folder or a top-level folder for several voices. @@ -85,10 +81,8 @@ Sample settings: not inside an archive, and they must be readable by the application process. The voice internal subfolder structure must be intact i.e. as originally delivered. 1. 
`EmbeddedSpeechSynthesisVoiceName` (`EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME`) - * Name of the embedded speech synthesis voice to be used for synthesis. If set, this overrides `SpeechSynthesisLocale` for embedded. + * Name of the embedded speech synthesis voice to be used for synthesis. * The voice name can be short (see https://aka.ms/speech/tts-languages, e.g. `en-US-JennyNeural`) or full (e.g. `Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)`). -1. `EmbeddedSpeechSynthesisVoiceKey` (`EMBEDDED_SPEECH_SYNTHESIS_VOICE_KEY`) - * Decryption key of the (encrypted) embedded speech synthesis voice. 1. `EmbeddedSpeechTranslationModelPath` (`EMBEDDED_SPEECH_TRANSLATION_MODEL_PATH`) * Path to the local embedded speech translation model(s) on the device file system. This may be a single model folder or a top-level folder for several models. @@ -100,12 +94,14 @@ Sample settings: 1. `EmbeddedSpeechTranslationModelName` (`EMBEDDED_SPEECH_TRANSLATION_MODEL_NAME`) * Name of the embedded speech translation model to be used for translation. * The full model name must be given (e.g. `Microsoft Speech Translator Many-to-English Model V2`). -1. `EmbeddedSpeechTranslationModelKey` (`EMBEDDED_SPEECH_TRANSLATION_MODEL_KEY`) - * Decryption key of the (encrypted) embedded speech translation model. 1. `CloudSpeechSubscriptionKey` (`CLOUD_SPEECH_SUBSCRIPTION_KEY`) * Cloud speech service subscription key. This is needed with hybrid speech configuration. If not set, only embedded speech will be used. 1. `CloudSpeechServiceRegion` (`CLOUD_SPEECH_SERVICE_REGION`) * Cloud speech service region. This is needed with hybrid speech configuration. If not set, only embedded speech will be used. +1. `CloudSpeechRecognitionLanguage` (`CLOUD_SPEECH_RECOGNITION_LANGUAGE`) + * Cloud speech recognition language in BCP-47 format, case-sensitive. This is needed with hybrid speech configuration. If not set, en-US will be assumed. +1. `CloudSpeechSynthesisLanguage` (`CLOUD_SPEECH_SYNTHESIS_LANGUAGE`) + * Cloud speech synthesis language in BCP-47 format, case-sensitive. This is needed with hybrid speech configuration. If not set, en-US will be assumed. ### Eclipse diff --git a/samples/java/jre/embedded-speech/pom.xml b/samples/java/jre/embedded-speech/pom.xml index 2f37c1c0f..9ed68c713 100644 --- a/samples/java/jre/embedded-speech/pom.xml +++ b/samples/java/jre/embedded-speech/pom.xml @@ -56,7 +56,7 @@ com.microsoft.cognitiveservices.speech client-sdk-embedded - 1.38.0 + 1.40.0 org.json diff --git a/samples/java/jre/embedded-speech/src/com/microsoft/cognitiveservices/speech/samples/embedded/Settings.java b/samples/java/jre/embedded-speech/src/com/microsoft/cognitiveservices/speech/samples/embedded/Settings.java index 9bbf58b1e..d12abee18 100644 --- a/samples/java/jre/embedded-speech/src/com/microsoft/cognitiveservices/speech/samples/embedded/Settings.java +++ b/samples/java/jre/embedded-speech/src/com/microsoft/cognitiveservices/speech/samples/embedded/Settings.java @@ -15,13 +15,10 @@ public class Settings { // START OF CONFIGURABLE SETTINGS - // Locale to be used in speech recognition, cloud and embedded. In BCP-47 format, case-sensitive. - // If EmbeddedSpeechRecognitionModelName is changed from the default, it will override this for embedded. - private static final String SpeechRecognitionLocale = "en-US"; // or set SPEECH_RECOGNITION_LOCALE - - // Locale to be used in speech synthesis (text-to-speech), cloud and embedded. In BCP-47 format, case-sensitive. 
- // If EmbeddedSpeechSynthesisVoiceName is changed from the default, it will override this for embedded. - private static final String SpeechSynthesisLocale = "en-US"; // or set SPEECH_SYNTHESIS_LOCALE + // Embedded speech model license (text). + // This applies to embedded speech recognition, synthesis and translation. + // It is presumed that all the customer's embedded speech models use the same license. + private static final String EmbeddedSpeechModelLicense = "YourEmbeddedSpeechModelLicense"; // or set EMBEDDED_SPEECH_MODEL_LICENSE // Path to the local embedded speech recognition model(s) on the device file system. // This may be a single model folder or a top-level folder for several models. @@ -32,14 +29,9 @@ public class Settings private static final String EmbeddedSpeechRecognitionModelPath = "YourEmbeddedSpeechRecognitionModelPath"; // or set EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH // Name of the embedded speech recognition model to be used for recognition. - // If changed from the default, this will override SpeechRecognitionLocale. // For example: "en-US" or "Microsoft Speech Recognizer en-US FP Model V8" private static final String EmbeddedSpeechRecognitionModelName = "YourEmbeddedSpeechRecognitionModelName"; // or set EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME - // Decryption key of the (encrypted) embedded speech recognition model. - // WARNING: The key may be visible in the program binary if hard-coded as a plain string. - private static final String EmbeddedSpeechRecognitionModelKey = "YourEmbeddedSpeechRecognitionModelKey"; // or set EMBEDDED_SPEECH_RECOGNITION_MODEL_KEY - // Path to the local embedded speech synthesis voice(s) on the device file system. // This may be a single voice folder or a top-level folder for several voices. // Use an absolute path or a path relative to the application working folder. @@ -49,14 +41,9 @@ public class Settings private static final String EmbeddedSpeechSynthesisVoicePath = "YourEmbeddedSpeechSynthesisVoicePath"; // or set EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH // Name of the embedded speech synthesis voice to be used for synthesis. - // If changed from the default, this will override SpeechSynthesisLocale. // For example: "en-US-JennyNeural" or "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)" private static final String EmbeddedSpeechSynthesisVoiceName = "YourEmbeddedSpeechSynthesisVoiceName"; // or set EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME - // Decryption key of the (encrypted) embedded speech synthesis voice. - // WARNING: The key may be visible in the program binary if hard-coded as a plain string. - private static final String EmbeddedSpeechSynthesisVoiceKey = "YourEmbeddedSpeechSynthesisVoiceKey"; // or set EMBEDDED_SPEECH_SYNTHESIS_VOICE_KEY - // Path to the local embedded speech translation model(s) on the device file system. // This may be a single model folder or a top-level folder for several models. // Use an absolute path or a path relative to the application working folder. @@ -69,14 +56,13 @@ public class Settings // For example: "Microsoft Speech Translator Many-to-English Model V2" private static final String EmbeddedSpeechTranslationModelName = "YourEmbeddedSpeechTranslationModelName"; // or set EMBEDDED_SPEECH_TRANSLATION_MODEL_NAME - // Decryption key of the (encrypted) embedded speech translation model. - // WARNING: The key may be visible in the program binary if hard-coded as a plain string. 
- private static final String EmbeddedSpeechTranslationModelKey = "YourEmbeddedSpeechTranslationModelKey"; // or set EMBEDDED_SPEECH_TRANSLATION_MODEL_KEY - - // Cloud speech service subscription information. - // This is needed with hybrid (cloud & embedded) speech configuration. + // Cloud speech service subscription and language settings. + // These are needed with hybrid (cloud & embedded) speech configuration. + // The language must be specified in BCP-47 format, case-sensitive. private static final String CloudSpeechSubscriptionKey = "YourCloudSpeechSubscriptionKey"; // or set CLOUD_SPEECH_SUBSCRIPTION_KEY private static final String CloudSpeechServiceRegion = "YourCloudSpeechServiceRegion"; // or set CLOUD_SPEECH_SERVICE_REGION + private static final String CloudSpeechRecognitionLanguage = "en-US"; // or set CLOUD_SPEECH_RECOGNITION_LANGUAGE + private static final String CloudSpeechSynthesisLanguage = "en-US"; // or set CLOUD_SPEECH_SYNTHESIS_LANGUAGE // END OF CONFIGURABLE SETTINGS @@ -106,17 +92,18 @@ private static String getSetting(String environmentVariableName, String defaultV // These are set in verifySettings() after some basic verification. + private static String SpeechModelLicense; + private static String SpeechRecognitionModelPath; private static String SpeechRecognitionModelName; - private static String SpeechRecognitionModelKey; + private static String SpeechSynthesisVoicePath; private static String SpeechSynthesisVoiceName; - private static String SpeechSynthesisVoiceKey; + private static String SpeechTranslationModelPath; private static String SpeechTranslationModelName; - private static String SpeechTranslationModelKey; // Utility functions for main menu. public static boolean hasSpeechRecognitionModel() { - if (SpeechRecognitionModelName.isEmpty()) + if (SpeechRecognitionModelPath.isEmpty() || SpeechRecognitionModelName.isEmpty()) { System.err.println("## ERROR: No speech recognition model specified."); return false; @@ -126,7 +113,7 @@ public static boolean hasSpeechRecognitionModel() public static boolean hasSpeechSynthesisVoice() { - if (SpeechSynthesisVoiceName.isEmpty()) + if (SpeechSynthesisVoicePath.isEmpty() || SpeechSynthesisVoiceName.isEmpty()) { System.err.println("## ERROR: No speech synthesis voice specified."); return false; @@ -136,7 +123,7 @@ public static boolean hasSpeechSynthesisVoice() public static boolean hasSpeechTranslationModel() { - if (SpeechTranslationModelName.isEmpty()) + if (SpeechTranslationModelPath.isEmpty() || SpeechTranslationModelName.isEmpty()) { System.err.println("## ERROR: No speech translation model specified."); return false; @@ -150,20 +137,17 @@ public static EmbeddedSpeechConfig createEmbeddedSpeechConfig() throws Interrupt List paths = new ArrayList<>(); // Add paths for offline data. 
- String recognitionModelPath = getSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH", EmbeddedSpeechRecognitionModelPath); - if (!recognitionModelPath.isEmpty() && !recognitionModelPath.equals("YourEmbeddedSpeechRecognitionModelPath")) + if (!SpeechRecognitionModelPath.isEmpty()) { - paths.add(recognitionModelPath); + paths.add(SpeechRecognitionModelPath); } - String synthesisVoicePath = getSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH", EmbeddedSpeechSynthesisVoicePath); - if (!synthesisVoicePath.isEmpty() && !synthesisVoicePath.equals("YourEmbeddedSpeechSynthesisVoicePath")) + if (!SpeechSynthesisVoicePath.isEmpty()) { - paths.add(synthesisVoicePath); + paths.add(SpeechSynthesisVoicePath); } - String translationModelPath = getSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_PATH", EmbeddedSpeechTranslationModelPath); - if (!translationModelPath.isEmpty() && !translationModelPath.equals("YourEmbeddedSpeechTranslationModelPath")) + if (!SpeechTranslationModelPath.isEmpty()) { - paths.add(translationModelPath); + paths.add(SpeechTranslationModelPath); } if (paths.size() == 0) @@ -186,14 +170,14 @@ public static EmbeddedSpeechConfig createEmbeddedSpeechConfig() throws Interrupt if (!SpeechRecognitionModelName.isEmpty()) { - // Mandatory configuration for embedded speech recognition. - config.setSpeechRecognitionModel(SpeechRecognitionModelName, SpeechRecognitionModelKey); + // Mandatory configuration for embedded speech (and intent) recognition. + config.setSpeechRecognitionModel(SpeechRecognitionModelName, SpeechModelLicense); } if (!SpeechSynthesisVoiceName.isEmpty()) { // Mandatory configuration for embedded speech synthesis. - config.setSpeechSynthesisVoice(SpeechSynthesisVoiceName, SpeechSynthesisVoiceKey); + config.setSpeechSynthesisVoice(SpeechSynthesisVoiceName, SpeechModelLicense); if (SpeechSynthesisVoiceName.contains("Neural")) { // Embedded neural voices only support 24kHz sample rate. @@ -204,7 +188,7 @@ public static EmbeddedSpeechConfig createEmbeddedSpeechConfig() throws Interrupt if (!SpeechTranslationModelName.isEmpty()) { // Mandatory configuration for embedded speech translation. - config.setSpeechTranslationModel(SpeechTranslationModelName, SpeechTranslationModelKey); + config.setSpeechTranslationModel(SpeechTranslationModelName, SpeechModelLicense); } // Disable profanity masking. @@ -228,8 +212,8 @@ public static HybridSpeechConfig createHybridSpeechConfig() throws InterruptedEx // Also see // https://github.com/Azure-Samples/cognitive-services-speech-sdk/tree/master/samples/java/jre/console // for complete Speech SDK samples using cloud speech services. 
- cloudSpeechConfig.setSpeechRecognitionLanguage(getSetting("SPEECH_RECOGNITION_LOCALE", SpeechRecognitionLocale)); - cloudSpeechConfig.setSpeechSynthesisLanguage(getSetting("SPEECH_SYNTHESIS_LOCALE", SpeechSynthesisLocale)); + cloudSpeechConfig.setSpeechRecognitionLanguage(getSetting("CLOUD_SPEECH_RECOGNITION_LANGUAGE", CloudSpeechRecognitionLanguage)); + cloudSpeechConfig.setSpeechSynthesisLanguage(getSetting("CLOUD_SPEECH_SYNTHESIS_LANGUAGE", CloudSpeechSynthesisLanguage)); EmbeddedSpeechConfig embeddedSpeechConfig = createEmbeddedSpeechConfig(); @@ -245,109 +229,86 @@ public static boolean verifySettings() throws InterruptedException, ExecutionExc String cwd = System.getProperty("user.dir"); System.out.println("Current working directory: " + cwd); - String recognitionModelPath = getSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH", EmbeddedSpeechRecognitionModelPath); - if (recognitionModelPath.isEmpty() || recognitionModelPath.equals("YourEmbeddedSpeechRecognitionModelPath")) + SpeechModelLicense = getSetting("EMBEDDED_SPEECH_MODEL_LICENSE", EmbeddedSpeechModelLicense); + if (SpeechModelLicense.isEmpty() || SpeechModelLicense.equals("YourEmbeddedSpeechModelLicense")) { - recognitionModelPath = ""; + System.err.println("## ERROR: The embedded speech model license is not set."); + return false; } - String synthesisVoicePath = getSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH", EmbeddedSpeechSynthesisVoicePath); - if (synthesisVoicePath.isEmpty() || synthesisVoicePath.equals("YourEmbeddedSpeechSynthesisVoicePath")) + SpeechRecognitionModelPath = getSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_PATH", EmbeddedSpeechRecognitionModelPath); + if (SpeechRecognitionModelPath.equals("YourEmbeddedSpeechRecognitionModelPath")) { - synthesisVoicePath = ""; + SpeechRecognitionModelPath = ""; } - - String translationModelPath = getSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_PATH", EmbeddedSpeechTranslationModelPath); - if (translationModelPath.isEmpty() || translationModelPath.equals("YourEmbeddedSpeechTranslationModelPath")) + SpeechRecognitionModelName = getSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME", EmbeddedSpeechRecognitionModelName); + if (SpeechRecognitionModelName.equals("YourEmbeddedSpeechRecognitionModelName")) { - translationModelPath = ""; + SpeechRecognitionModelName = ""; } - // Find an embedded speech recognition model based on the name or locale. 
- - SpeechRecognitionModelName = ""; - - if (!recognitionModelPath.isEmpty()) + SpeechSynthesisVoicePath = getSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_PATH", EmbeddedSpeechSynthesisVoicePath); + if (SpeechSynthesisVoicePath.equals("YourEmbeddedSpeechSynthesisVoicePath")) { - String modelName = getSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_NAME", EmbeddedSpeechRecognitionModelName); - String modelLocale = getSetting("SPEECH_RECOGNITION_LOCALE", SpeechRecognitionLocale); + SpeechSynthesisVoicePath = ""; + } + SpeechSynthesisVoiceName = getSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME", EmbeddedSpeechSynthesisVoiceName); + if (SpeechSynthesisVoiceName.equals("YourEmbeddedSpeechSynthesisVoiceName")) + { + SpeechSynthesisVoiceName = ""; + } - if (modelName.isEmpty() || modelName.equals("YourEmbeddedSpeechRecognitionModelName")) - { - modelName = ""; // no name given -> search by locale - } + SpeechTranslationModelPath = getSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_PATH", EmbeddedSpeechTranslationModelPath); + if (SpeechTranslationModelPath.equals("YourEmbeddedSpeechTranslationModelPath")) + { + SpeechTranslationModelPath = ""; + } + SpeechTranslationModelName = getSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_NAME", EmbeddedSpeechTranslationModelName); + if (SpeechTranslationModelName.equals("YourEmbeddedSpeechTranslationModelName")) + { + SpeechTranslationModelName = ""; + } - EmbeddedSpeechConfig config = EmbeddedSpeechConfig.fromPath(recognitionModelPath); + // Find an embedded speech recognition model based on the name. + if (!SpeechRecognitionModelPath.isEmpty() && !SpeechRecognitionModelName.isEmpty()) + { + EmbeddedSpeechConfig config = EmbeddedSpeechConfig.fromPath(SpeechRecognitionModelPath); List models = config.getSpeechRecognitionModels(); - final String name = modelName; + final String name = SpeechRecognitionModelName; SpeechRecognitionModel result = models.stream() .filter(model -> - (name.isEmpty() && model.getLocales().get(0).equals(modelLocale)) || - (!name.isEmpty() && (model.getName().equals(name) || model.getLocales().get(0).equals(name)))) + model.getName().equals(name) || model.getLocales().get(0).equals(name)) .findAny() .orElse(null); - if (result != null) + if (result == null) { - SpeechRecognitionModelName = result.getName(); - } - - if (SpeechRecognitionModelName.isEmpty()) - { - System.out.print("## WARNING: Cannot locate an embedded speech recognition model by "); - if (modelName.isEmpty()) - { - System.out.print("locale \"" + modelLocale + "\". "); - } - else - { - System.out.print("name \"" + modelName + "\". "); - } - System.out.println("Current recognition model search path: " + recognitionModelPath); - } - else - { - SpeechRecognitionModelKey = getSetting("EMBEDDED_SPEECH_RECOGNITION_MODEL_KEY", EmbeddedSpeechRecognitionModelKey); - if (SpeechRecognitionModelKey.isEmpty() || SpeechRecognitionModelKey.equals("YourEmbeddedSpeechRecognitionModelKey")) - { - SpeechRecognitionModelKey = ""; - System.out.println("## WARNING: The key for \"" + SpeechRecognitionModelName + "\" is not set."); - } + System.out.println("## WARNING: Cannot locate an embedded speech recognition model \"" + SpeechRecognitionModelName + "\""); } } - // Find an embedded speech synthesis voice based on the name or locale. - - SpeechSynthesisVoiceName = ""; - - if (!synthesisVoicePath.isEmpty()) + // Find an embedded speech synthesis voice based on the name. 
+ if (!SpeechSynthesisVoicePath.isEmpty() && !SpeechSynthesisVoiceName.isEmpty()) { - String voiceName = getSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_NAME", EmbeddedSpeechSynthesisVoiceName); - String voiceLocale = getSetting("SPEECH_SYNTHESIS_LOCALE", SpeechSynthesisLocale); - - if (voiceName.isEmpty() || voiceName.equals("YourEmbeddedSpeechSynthesisVoiceName")) - { - voiceName = ""; // no name given -> search by locale - } - - EmbeddedSpeechConfig config = EmbeddedSpeechConfig.fromPath(synthesisVoicePath); + EmbeddedSpeechConfig config = EmbeddedSpeechConfig.fromPath(SpeechSynthesisVoicePath); SpeechSynthesizer synthesizer = new SpeechSynthesizer(config, null); + + boolean found = false; SynthesisVoicesResult voicesList = synthesizer.getVoicesAsync("").get(); if (voicesList.getReason() == ResultReason.VoicesListRetrieved) { - final String name = voiceName; + final String name = SpeechSynthesisVoiceName; VoiceInfo result = voicesList.getVoices().stream() .filter(voice -> - (name.isEmpty() && voice.getLocale().equals(voiceLocale)) || - (!name.isEmpty() && (voice.getName().equals(name) || voice.getShortName().equals(name)))) + voice.getName().equals(name) || voice.getShortName().equals(name)) .findAny() .orElse(null); if (result != null) { - SpeechSynthesisVoiceName = result.getName(); + found = true; } } @@ -355,105 +316,40 @@ public static boolean verifySettings() throws InterruptedException, ExecutionExc synthesizer.close(); config.close(); - if (SpeechSynthesisVoiceName.isEmpty()) + if (!found) { - System.out.print("## WARNING: Cannot locate an embedded speech synthesis voice by "); - if (voiceName.isEmpty()) - { - System.out.print("locale \"" + voiceLocale + "\". "); - } - else - { - System.out.println("name \"" + voiceName + "\". "); - } - System.out.println("Current synthesis voice search path: " + synthesisVoicePath); - } - else - { - SpeechSynthesisVoiceKey = getSetting("EMBEDDED_SPEECH_SYNTHESIS_VOICE_KEY", EmbeddedSpeechSynthesisVoiceKey); - if (SpeechSynthesisVoiceKey.isEmpty() || SpeechSynthesisVoiceKey.equals("YourEmbeddedSpeechSynthesisVoiceKey")) - { - SpeechSynthesisVoiceKey = ""; - System.out.println("## WARNING: The key for \"" + SpeechSynthesisVoiceName + "\" is not set."); - } + System.out.println("## WARNING: Cannot locate an embedded speech synthesis voice \"" + SpeechSynthesisVoiceName + "\""); } } // Find an embedded speech translation model based on the name. - - SpeechTranslationModelName = ""; - - if (!translationModelPath.isEmpty()) + if (!SpeechTranslationModelPath.isEmpty() && !SpeechTranslationModelName.isEmpty()) { - String modelName = getSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_NAME", EmbeddedSpeechTranslationModelName); - - EmbeddedSpeechConfig config = EmbeddedSpeechConfig.fromPath(translationModelPath); + EmbeddedSpeechConfig config = EmbeddedSpeechConfig.fromPath(SpeechTranslationModelPath); List models = config.getSpeechTranslationModels(); - final String name = modelName; + final String name = SpeechTranslationModelName; SpeechTranslationModel result = models.stream() .filter(model -> model.getName().equals(name)) .findAny() .orElse(null); - if (result != null) - { - SpeechTranslationModelName = result.getName(); - } - - if (SpeechTranslationModelName.isEmpty()) - { - System.out.print("## WARNING: Cannot locate an embedded speech translation model by "); - System.out.print("name \"" + modelName + "\". 
"); - System.out.println("Current translation model search path: " + translationModelPath); - } - else + if (result == null) { - SpeechTranslationModelKey = getSetting("EMBEDDED_SPEECH_TRANSLATION_MODEL_KEY", EmbeddedSpeechTranslationModelKey); - if (SpeechTranslationModelKey.isEmpty() || SpeechTranslationModelKey.equals("YourEmbeddedSpeechTranslationModelKey")) - { - SpeechTranslationModelKey = ""; - System.out.println("## WARNING: The key for \"" + SpeechTranslationModelName + "\" is not set."); - } + System.out.println("## WARNING: Cannot locate an embedded speech translation model \"" + SpeechTranslationModelName + "\""); } } - System.out.println("Embedded speech recognition\n model search path: " + (recognitionModelPath.isEmpty() ? "(not set)" : recognitionModelPath)); - if (!recognitionModelPath.isEmpty()) - { - System.out.println(" model name: " + (SpeechRecognitionModelName.isEmpty() ? "(not found)" : SpeechRecognitionModelName)); - if (!SpeechRecognitionModelName.isEmpty()) - { - System.out.println(" model key: " + (SpeechRecognitionModelKey.isEmpty() ? "(not set)" : maskValue(SpeechRecognitionModelKey))); - } - } - System.out.println("Embedded speech synthesis\n voice search path: " + (synthesisVoicePath.isEmpty() ? "(not set)" : synthesisVoicePath)); - if (!synthesisVoicePath.isEmpty()) - { - System.out.println(" voice name: " + (SpeechSynthesisVoiceName.isEmpty() ? "(not found)" : SpeechSynthesisVoiceName)); - if (!SpeechSynthesisVoiceName.isEmpty()) - { - System.out.println(" voice key: " + (SpeechSynthesisVoiceKey.isEmpty() ? "(not set)" : maskValue(SpeechSynthesisVoiceKey))); - } - } - System.out.println("Embedded speech translation\n model search path: " + (translationModelPath.isEmpty() ? "(not set)" : translationModelPath)); - if (!translationModelPath.isEmpty()) - { - System.out.println(" model name: " + (SpeechTranslationModelName.isEmpty() ? "(not found)" : SpeechTranslationModelName)); - if (!SpeechTranslationModelName.isEmpty()) - { - System.out.println(" model key: " + (SpeechTranslationModelKey.isEmpty() ? "(not set)" : maskValue(SpeechTranslationModelKey))); - } - } + System.out.println("Embedded speech recognition"); + System.out.println(" model search path: " + (SpeechRecognitionModelPath.isEmpty() ? "(not set)" : SpeechRecognitionModelPath)); + System.out.println(" model name: " + (SpeechRecognitionModelName.isEmpty() ? "(not set)" : SpeechRecognitionModelName)); + System.out.println("Embedded speech synthesis"); + System.out.println(" voice search path: " + (SpeechSynthesisVoicePath.isEmpty() ? "(not set)" : SpeechSynthesisVoicePath)); + System.out.println(" voice name: " + (SpeechSynthesisVoiceName.isEmpty() ? "(not set)" : SpeechSynthesisVoiceName)); + System.out.println("Embedded speech translation"); + System.out.println(" model search path: " + (SpeechTranslationModelPath.isEmpty() ? "(not set)" : SpeechTranslationModelPath)); + System.out.println(" model name: " + (SpeechTranslationModelName.isEmpty() ? "(not set)" : SpeechTranslationModelName)); return true; } - - private static String maskValue(String value) - { - // Mask the string value, leave only the last 3 chars visible - int visibleLength = value.length() > 3 ? 
3 : 0; - String masked = new String(new char[value.length() - visibleLength]).replace('\0', '*') + value.substring(value.length() - visibleLength); - return masked; - }; } diff --git a/samples/js/browser/avatar/README.md b/samples/js/browser/avatar/README.md index 9ade1668f..9ab722482 100644 --- a/samples/js/browser/avatar/README.md +++ b/samples/js/browser/avatar/README.md @@ -15,7 +15,6 @@ This sample demonstrates the basic usage of Azure text-to-speech avatar real-tim * TTS Configuration * TTS Voice - the voice of the TTS. Here is the [available TTS voices list](https://learn.microsoft.com/azure/ai-services/speech-service/language-support?tabs=tts#supported-languages) * Custom Voice Deployment ID (Endpoint ID) - the deployment ID (also called endpoint ID) of your custom voice. If you are not using a custom voice, please leave it empty. - * Personal Voice Speaker Profile ID - the personal voice speaker profile ID of your personal voice. Please follow [here](https://learn.microsoft.com/azure/ai-services/speech-service/personal-voice-overview) to view and create personal voice. * Avatar Configuration * Avatar Character - The character of the avatar. By default it's `lisa`, and you can update this value to use a different avatar. * Avatar Style - The style of the avatar. You can update this value to use a different avatar style. This parameter is optional for custom avatar. @@ -46,33 +45,30 @@ This sample demonstrates the chat scenario, with integration of Azure speech-to- * Endpoint - the endpoint of your Azure OpenAI resource, e.g. https://your-openai-resource-name.openai.azure.com/, which can be found in the `Keys and Endpoint` section of your Azure OpenAI resource in Azure portal. * API Key - the API key of your Azure OpenAI resource, which can be found in the `Keys and Endpoint` section of your Azure OpenAI resource in Azure portal. * Deployment Name - the name of your Azure OpenAI model deployment, which can be found in the `Model deployments` section of your Azure OpenAI resource in Azure portal. - * System Prompt - you can edit this text to preset the context for the chat API. The chat API will then generate the response based on this context. - * Enable On Your Data - check this if you want to use your own data to constrain the chat. If you check this, you need to fill `Azure Cognitive Search Resource` section below. + * Enable BYOD (Bring Your Own Data) - check this if you want to use your own data to constrain the chat. If you check this, you need to fill `Azure Cognitive Search Resource` section below. * Azure Cognitive Search Resource - if you want to constrain the chat within your own data, please follow [Quickstart: Chat with Azure OpenAI models using your own data](https://learn.microsoft.com/azure/cognitive-services/openai/use-your-data-quickstart?pivots=programming-language-studio) to create your data source, and then fill below information: * Endpoint - the endpoint of your Azure Cognitive Search resource, e.g. https://your-cogsearch-resource-name.search.windows.net/, which can be found in the `Overview` section of your Azure Cognitive Search resource in Azure portal, appearing at `Essentials -> Url` field. * API Key - the API key of your Azure Cognitive Search resource, which can be found in the `Keys` section of your Azure Cognitive Search resource in Azure portal. Please make sure to use the `Admin Key` instead of `Query Key`. 
* Index Name - the name of your Azure Cognitive Search index, which can be found in the `Indexes` section of your Azure Cognitive Search resource in Azure portal. + * ICE Server + * URL - the ICE server URL for WebRTC. e.g. `turn:relay.communication.microsoft.com:3478`. You can get the ICE server from ACS ([Azure Communication Services](https://learn.microsoft.com/azure/communication-services/overview)): you need follow [Create communication resource](https://learn.microsoft.com/azure/communication-services/quickstarts/create-communication-resource?tabs=windows&pivots=platform-azp) to create ACS resource, and then follow [Getting the relay configuration](https://learn.microsoft.com/azure/communication-services/quickstarts/relay-token?pivots=programming-language-python#getting-the-relay-configuration) to get ICE server URL, ICE server username, and ICE server credential. For ICE server URL, please make sure to use prefix `turn:`, instead of `stun:`. + * IceServerUsername - the username of the ICE server, which is provided together with the ICE server URL (see above). + * IceServerCredential - the credential (password) of the ICE server, which is provided together with the ICE server URL (see above). * STT / TTS Configuration - * STT Locale(s) - the locale(s) of the STT. Here is the [available STT languages list](https://learn.microsoft.com/azure/ai-services/speech-service/language-support?tabs=stt#supported-languages). If multiple locales are specified, the STT will enable multi-language recognition, which means the STT will recognize the speech in any of the specified locales. + * STT Locale - the locale of the STT. Here is the [available STT languages list](https://learn.microsoft.com/azure/ai-services/speech-service/language-support?tabs=stt#supported-languages) * TTS Voice - the voice of the TTS. Here is the [available TTS voices list](https://learn.microsoft.com/azure/ai-services/speech-service/language-support?tabs=tts#supported-languages) * Custom Voice Deployment ID (Endpoint ID) - the deployment ID (also called endpoint ID) of your custom voice. If you are not using a custom voice, please leave it empty. - * Personal Voice Speaker Profile ID - the personal voice speaker profile ID of your personal voice. Please follow [here](https://learn.microsoft.com/azure/ai-services/speech-service/personal-voice-overview) to view and create personal voice. - * Continuous Conversation - check this if you want to enable continuous conversation. If this is checked, the STT will keep listening to your speech, with microphone always on until you click `Stop Microphone` button. If this is not checked, the microphone will automatically stop once an utterance is recognized, and you need click `Start Microphone` every time before you give a speech. The `Continuous Conversation` mode is suitable for quiet environment, while the `Non-Continuous Conversation` mode is suitable for noisy environment, which can avoid the noise being recorded while you are not speaking. * Avatar Configuration * Avatar Character - The character of the avatar. By default it's `lisa`, and you can update this value to use a different avatar. * Avatar Style - The style of the avatar. You can update this value to use a different avatar style. This parameter is optional for custom avatar. * Custom Avatar - Check this if you are using a custom avatar. - * Auto Reconnect - Check this if you want to enable auto reconnect. If this is checked, the avatar video stream is automatically reconnected once the connection is lost. 
- * Use Local Video for Idle - Check this if you want to use local video for idle part. If this is checked, the avatar video stream is replaced by local video when the avatar is idle. To use this feature, you need to prepare a local video file. Usually, you can record a video of the avatar doing idle action. [Here](https://ttspublic.blob.core.windows.net/sampledata/video/avatar/lisa-casual-sitting-idle.mp4) is a sample video for lisa-casual-sitting avatar idle status. You can download it and put it to `video/lisa-casual-sitting-idle.mp4` under the same folder of `chat.html`. -* Step 3: Click `Open Avatar Session` button to setup video connection with Azure TTS Talking Avatar service. If everything goes well, you should see a live video with an avatar being shown on the web page. +* Step 3: Click `Open Video Connection` button to setup video connection with Azure TTS Talking Avatar service. If everything goes well, you should see a live video with an avatar being shown on the web page. * Step 4: Click `Start Microphone` button to start microphone (make sure to allow the microphone access tip box popping up in the browser), and then you can start chatting with the avatar with speech. The chat history (the text of what you said, and the response text by the Azure OpenAI chat API) will be shown beside the avatar. The avatar will then speak out the response of the chat API. -# Additional Tip(s) - -* If you want to enforce the avatar to stop speaking before the avatar finishes the utterance, you can click `Stop Speaking` button. This is useful when you want to interrupt the avatar speaking. +* Step 5: If you want to clear the chat history and start a new round of chat, you can click `Clear Chat History` button. And if you want to stop the avatar service, please click `Close Video Connection` button to close the connection with avatar service. -* If you want to clear the chat history and start a new round of chat, you can click `Clear Chat History` button. And if you want to stop the avatar service, please click `Close Avatar Session` button to close the connection with avatar service. +# Additional Tip(s) -* If you want to type your query message instead of speaking, you can check the `Type Message` checkbox, and then type your query message in the text box showing up below the checkbox. +* For the chat sample, you can edit the text in `System Prompt` text box to preset the context for the chat API. The chat API will then generate the response based on this context. diff --git a/samples/js/browser/avatar/basic.html b/samples/js/browser/avatar/basic.html index cb65820f9..c9aaea187 100644 --- a/samples/js/browser/avatar/basic.html +++ b/samples/js/browser/avatar/basic.html @@ -30,11 +30,9 @@

Azure Speech Resource

TTS Configuration
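The TTS Configuration fields in this section (TTS voice, custom voice deployment ID, and personal voice speaker profile ID, as described in the README changes above) feed both the synthesizer config and the SSML that the page sends when you click `Speak`. The sample's own SSML template string does not survive in this rendering, so below is a hedged JavaScript sketch: it assumes the Speech SDK browser bundle is loaded as `SpeechSDK`, reuses the page's input IDs and the sample's `htmlEncode` helper, and reconstructs the personal voice part with the documented `mstts:ttsembedding` element.

```javascript
// Sketch only: wire the TTS Configuration inputs into the synthesizer config and SSML.
const cogSvcRegion = document.getElementById('region').value
const cogSvcSubKey = document.getElementById('subscriptionKey').value
const speechSynthesisConfig = SpeechSDK.SpeechConfig.fromSubscription(cogSvcSubKey, cogSvcRegion)
// Custom voice deployment (endpoint) ID; leave empty when no custom voice is used.
speechSynthesisConfig.endpointId = document.getElementById('customVoiceEndpointId').value

// Build SSML for the given text; htmlEncode is the sample's helper for escaping XML characters.
function buildSpokenSsml(text, ttsVoice, personalVoiceSpeakerProfileID) {
    return `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' ` +
        `xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'>` +
        `<voice name='${ttsVoice}'>` +
        `<mstts:ttsembedding speakerProfileId='${personalVoiceSpeakerProfileID}'>` +
        `${htmlEncode(text)}</mstts:ttsembedding></voice></speak>`
}
```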


Avatar Configuration
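For orientation while reading the basic.html and main.js changes in this patch, here is a hedged sketch of what happens behind the `Open Avatar Session` / `Start Session` button, pieced together from the main.js code shown further down. The character/style values and the spoken SSML are example placeholders, and in the real sample the WebRTC track handlers are registered before the avatar is started.

```javascript
// Sketch of the avatar session flow (see the assumptions in the paragraph above).
const cogSvcRegion = document.getElementById('region').value
const cogSvcSubKey = document.getElementById('subscriptionKey').value
const iceServerUrl = document.getElementById('iceServerUrl').value
const iceServerUsername = document.getElementById('iceServerUsername').value
const iceServerCredential = document.getElementById('iceServerCredential').value

const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(cogSvcSubKey, cogSvcRegion)
const avatarConfig = new SpeechSDK.AvatarConfig('lisa', 'casual-sitting')  // example character and style
avatarConfig.customized = false                                            // set to true for a custom avatar
const avatarSynthesizer = new SpeechSDK.AvatarSynthesizer(speechConfig, avatarConfig)

// WebRTC peer connection through the TURN relay configured in the ICE Server fields.
const peerConnection = new RTCPeerConnection({
    iceServers: [{ urls: [iceServerUrl], username: iceServerUsername, credential: iceServerCredential }]
})
// (The real sample also registers peerConnection.ontrack here to mount the media elements.)

avatarSynthesizer.startAvatarAsync(peerConnection).then((r) => {
    if (r.reason === SpeechSDK.ResultReason.SynthesizingAudioCompleted) {
        // Avatar video/audio now stream over the peer connection; make the avatar say something.
        const ssml = `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>` +
            `<voice name='en-US-JennyNeural'>Hello from the talking avatar sample.</voice></speak>`
        return avatarSynthesizer.speakSsmlAsync(ssml)
    }
})
```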

diff --git a/samples/js/browser/avatar/chat.html b/samples/js/browser/avatar/chat.html index 124570b78..eef582d24 100644 --- a/samples/js/browser/avatar/chat.html +++ b/samples/js/browser/avatar/chat.html @@ -35,10 +35,8 @@

Azure OpenAI Resource



- Enable On Your Data
+ Enable BYOD (Bring Your Own Data)

@@ -53,18 +51,22 @@

Azure Cognitive Search Resource

ICE Server
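The ICE Server fields here can be filled from ACS as described in the README changes above, or fetched from the Speech service's avatar relay token endpoint, which is what the main.js code being removed later in this patch does. The sketch below re-expresses that removed request with `fetch` instead of `XMLHttpRequest`; the endpoint path, header, and response fields are taken from that code.

```javascript
// Fetch TURN relay credentials from the Speech service (same endpoint the removed main.js code calls).
async function getRelayToken(region, subscriptionKey) {
    const response = await fetch(
        `https://${region}.tts.speech.microsoft.com/cognitiveservices/avatar/relay/token/v1`,
        { headers: { 'Ocp-Apim-Subscription-Key': subscriptionKey } }
    )
    const data = await response.json()
    // Response fields as read by the removed code: Urls, Username, Password.
    return {
        iceServerUrl: data.Urls[0],
        iceServerUsername: data.Username,
        iceServerCredential: data.Password
    }
}

// Usage: pass the values to the sample's setupWebRTC helper.
// getRelayToken('westus2', '<your-speech-key>').then(({ iceServerUrl, iceServerUsername, iceServerCredential }) =>
//     setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential))
```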


STT / TTS Configuration
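The STT settings in this section map to the speech recognizer that chat.js builds. As a reference while reading that diff, here is a hedged sketch of multi-locale continuous recognition distilled from the chat.js code later in this patch (the v2 STT websocket endpoint, continuous language identification, and `AutoDetectSourceLanguageConfig` all appear there); the locale list is only an example.

```javascript
// Sketch: continuous speech recognition across several candidate locales (values are examples).
const cogSvcRegion = document.getElementById('region').value
const cogSvcSubKey = document.getElementById('subscriptionKey').value

const speechRecognitionConfig = SpeechSDK.SpeechConfig.fromEndpoint(
    new URL(`wss://${cogSvcRegion}.stt.speech.microsoft.com/speech/universal/v2`), cogSvcSubKey)
speechRecognitionConfig.setProperty(SpeechSDK.PropertyId.SpeechServiceConnection_LanguageIdMode, 'Continuous')

const sttLocales = ['en-US', 'de-DE', 'zh-CN']  // example candidate locales
const autoDetectSourceLanguageConfig = SpeechSDK.AutoDetectSourceLanguageConfig.fromLanguages(sttLocales)
const speechRecognizer = SpeechSDK.SpeechRecognizer.FromConfig(
    speechRecognitionConfig, autoDetectSourceLanguageConfig, SpeechSDK.AudioConfig.fromDefaultMicrophoneInput())

speechRecognizer.recognized = (s, e) => {
    if (e.result.reason === SpeechSDK.ResultReason.RecognizedSpeech) {
        const userQuery = e.result.text.trim()
        if (userQuery !== '') {
            handleUserQuery(userQuery, '', '')  // chat.js helper that forwards the query to the Azure OpenAI chat API
        }
    }
}
speechRecognizer.startContinuousRecognitionAsync()
```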



- Continuous Conversation

Avatar Configuration

@@ -75,21 +77,20 @@

Avatar Configuration

Custom Avatar
- Auto Reconnect

- Use Local Video for Idle

Avatar Conversation Control Panel


Video
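The Video area above is populated by the WebRTC handlers in the JavaScript below. As orientation for reading that diff, here is a hedged sketch of the essentials of `setupWebRTC`: the track mounting and connection-state handling follow the chat.js code shown below, element IDs such as `remoteVideo` and `configuration` are the sample's own, and `avatarSynthesizer` is assumed to have been created already.

```javascript
// Sketch of the core of setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential).
function setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) {
    const peerConnection = new RTCPeerConnection({
        iceServers: [{ urls: [iceServerUrl], username: iceServerUsername, credential: iceServerCredential }]
    })

    // Mount incoming audio/video tracks into the page.
    peerConnection.ontrack = (event) => {
        const element = document.createElement(event.track.kind)  // 'audio' or 'video'
        element.srcObject = event.streams[0]
        element.autoplay = true
        if (event.track.kind === 'video') {
            element.playsInline = true
        }
        document.getElementById('remoteVideo').appendChild(element)
    }

    // Declare that we want to receive one audio and one video track from the service.
    peerConnection.addTransceiver('video', { direction: 'sendrecv' })
    peerConnection.addTransceiver('audio', { direction: 'sendrecv' })

    // Hide the configuration panel while connected; show it again if the connection drops.
    peerConnection.oniceconnectionstatechange = () => {
        console.log('WebRTC status: ' + peerConnection.iceConnectionState)
        if (peerConnection.iceConnectionState === 'connected') {
            document.getElementById('configuration').hidden = true
        }
        if (peerConnection.iceConnectionState === 'disconnected' || peerConnection.iceConnectionState === 'failed') {
            document.getElementById('configuration').hidden = false
        }
    }

    // Hand the peer connection to the avatar synthesizer to start streaming.
    avatarSynthesizer.startAvatarAsync(peerConnection)
    return peerConnection
}
```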

{ return } - const privateEndpointEnabled = document.getElementById('enablePrivateEndpoint').checked - const privateEndpoint = document.getElementById('privateEndpoint').value.slice(8) - if (privateEndpointEnabled && privateEndpoint === '') { - alert('Please fill in the Azure Speech endpoint.') - return - } - - let speechSynthesisConfig - if (privateEndpointEnabled) { - speechSynthesisConfig = SpeechSDK.SpeechConfig.fromEndpoint(new URL(`wss://${privateEndpoint}/tts/cognitiveservices/websocket/v1?enableTalkingAvatar=true`), cogSvcSubKey) - } else { - speechSynthesisConfig = SpeechSDK.SpeechConfig.fromSubscription(cogSvcSubKey, cogSvcRegion) - } + const speechSynthesisConfig = SpeechSDK.SpeechConfig.fromSubscription(cogSvcSubKey, cogSvcRegion) speechSynthesisConfig.endpointId = document.getElementById('customVoiceEndpointId').value + speechSynthesisConfig.speechSynthesisVoiceName = document.getElementById('ttsVoice').value const videoFormat = new SpeechSDK.AvatarVideoFormat() let videoCropTopLeftX = document.getElementById('videoCrop').checked ? 600 : 0 @@ -225,26 +214,17 @@ window.startSession = () => { console.log("[" + (new Date()).toISOString() + "] Event received: " + e.description + offsetMessage) } - document.getElementById('startSession').disabled = true - - const xhr = new XMLHttpRequest() - if (privateEndpointEnabled) { - xhr.open("GET", `https://${privateEndpoint}/tts/cognitiveservices/avatar/relay/token/v1`) - } else { - xhr.open("GET", `https://${cogSvcRegion}.tts.speech.microsoft.com/cognitiveservices/avatar/relay/token/v1`) + const iceServerUrl = document.getElementById('iceServerUrl').value + const iceServerUsername = document.getElementById('iceServerUsername').value + const iceServerCredential = document.getElementById('iceServerCredential').value + if (iceServerUrl === '' || iceServerUsername === '' || iceServerCredential === '') { + alert('Please fill in the ICE server URL, username and credential.') + return } - xhr.setRequestHeader("Ocp-Apim-Subscription-Key", cogSvcSubKey) - xhr.addEventListener("readystatechange", function() { - if (this.readyState === 4) { - const responseData = JSON.parse(this.responseText) - const iceServerUrl = responseData.Urls[0] - const iceServerUsername = responseData.Username - const iceServerCredential = responseData.Password - setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) - } - }) - xhr.send() - + + document.getElementById('startSession').disabled = true + + setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) } window.speak = () => { @@ -253,8 +233,7 @@ window.speak = () => { document.getElementById('audio').muted = false let spokenText = document.getElementById('spokenText').value let ttsVoice = document.getElementById('ttsVoice').value - let personalVoiceSpeakerProfileID = document.getElementById('personalVoiceSpeakerProfileID').value - let spokenSsml = `${htmlEncode(spokenText)}` + let spokenSsml = `${htmlEncode(spokenText)}` console.log("[" + (new Date()).toISOString() + "] Speak request sent.") avatarSynthesizer.speakSsmlAsync(spokenSsml).then( (result) => { @@ -302,11 +281,3 @@ window.updataTransparentBackground = () => { document.getElementById('backgroundColor').disabled = false } } - -window.updatePrivateEndpoint = () => { - if (document.getElementById('enablePrivateEndpoint').checked) { - document.getElementById('showPrivateEndpointCheckBox').hidden = false - } else { - document.getElementById('showPrivateEndpointCheckBox').hidden = true - } -} diff --git a/samples/js/browser/avatar/js/chat.js 
b/samples/js/browser/avatar/js/chat.js index 5ef414ddf..f41da5330 100644 --- a/samples/js/browser/avatar/js/chat.js +++ b/samples/js/browser/avatar/js/chat.js @@ -6,7 +6,6 @@ var speechRecognizer var avatarSynthesizer var peerConnection var messages = [] -var messageInitiated = false var dataSources = [] var sentenceLevelPunctuations = [ '.', '?', '!', ':', ';', '。', '?', '!', ':', ';' ] var enableQuickReply = false @@ -138,6 +137,14 @@ function setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) { // Fetch WebRTC video stream and mount it to an HTML video element peerConnection.ontrack = function (event) { + // Clean up existing video element if there is any + remoteVideoDiv = document.getElementById('remoteVideo') + for (var i = 0; i < remoteVideoDiv.childNodes.length; i++) { + if (remoteVideoDiv.childNodes[i].localName === event.track.kind) { + remoteVideoDiv.removeChild(remoteVideoDiv.childNodes[i]) + } + } + if (event.track.kind === 'audio') { let audioElement = document.createElement('audio') audioElement.id = 'audioPlayer' @@ -145,7 +152,7 @@ function setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) { audioElement.autoplay = true audioElement.onplaying = () => { - console.log(`WebRTC ${event.track.kind} channel connected.`) + log(`WebRTC ${event.track.kind} channel connected.`) } document.getElementById('remoteVideo').appendChild(audioElement) @@ -159,33 +166,12 @@ function setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) { videoElement.playsInline = true videoElement.onplaying = () => { - // Clean up existing video element if there is any - remoteVideoDiv = document.getElementById('remoteVideo') - for (var i = 0; i < remoteVideoDiv.childNodes.length; i++) { - if (remoteVideoDiv.childNodes[i].localName === event.track.kind) { - remoteVideoDiv.removeChild(remoteVideoDiv.childNodes[i]) - } - } - - // Append the new video element - document.getElementById('remoteVideo').appendChild(videoElement) - - console.log(`WebRTC ${event.track.kind} channel connected.`) - document.getElementById('microphone').disabled = false + log(`WebRTC ${event.track.kind} channel connected.`) + document.getElementById('startMicrophone').disabled = false document.getElementById('stopSession').disabled = false - document.getElementById('remoteVideo').style.width = '960px' - document.getElementById('chatHistory').hidden = false - document.getElementById('showTypeMessage').disabled = false - - if (document.getElementById('useLocalVideoForIdle').checked) { - document.getElementById('localVideo').hidden = true - if (lastSpeakTime === undefined) { - lastSpeakTime = new Date() - } - } - - setTimeout(() => { sessionActive = true }, 5000) // Set session active after 5 seconds } + + document.getElementById('remoteVideo').appendChild(videoElement) } } @@ -203,11 +189,13 @@ function setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) { // Make necessary update to the web page when the connection state changes peerConnection.oniceconnectionstatechange = e => { console.log("WebRTC status: " + peerConnection.iceConnectionState) - if (peerConnection.iceConnectionState === 'disconnected') { - if (document.getElementById('useLocalVideoForIdle').checked) { - document.getElementById('localVideo').hidden = false - document.getElementById('remoteVideo').style.width = '0.1px' - } + + if (peerConnection.iceConnectionState === 'connected') { + document.getElementById('configuration').hidden = true + } + + if (peerConnection.iceConnectionState === 'disconnected' || 
peerConnection.iceConnectionState === 'failed') { + document.getElementById('configuration').hidden = false } } @@ -218,16 +206,15 @@ function setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) { // start avatar, establish WebRTC connection avatarSynthesizer.startAvatarAsync(peerConnection).then((r) => { if (r.reason === SpeechSDK.ResultReason.SynthesizingAudioCompleted) { - console.log("[" + (new Date()).toISOString() + "] Avatar started. Result ID: " + r.resultId) + console.log("[" + (new Date()).toISOString() + "] Avatar started.") } else { - console.log("[" + (new Date()).toISOString() + "] Unable to start avatar. Result ID: " + r.resultId) + console.log("[" + (new Date()).toISOString() + "] Unable to start avatar.") if (r.reason === SpeechSDK.ResultReason.Canceled) { let cancellationDetails = SpeechSDK.CancellationDetails.fromResult(r) if (cancellationDetails.reason === SpeechSDK.CancellationReason.Error) { console.log(cancellationDetails.errorDetails) }; - - console.log("Unable to start avatar: " + cancellationDetails.errorDetails); + log("Unable to start avatar: " + cancellationDetails.errorDetails); } document.getElementById('startSession').disabled = false; document.getElementById('configuration').hidden = false; @@ -307,40 +294,28 @@ function speak(text, endingSilenceMs = 0) { function speakNext(text, endingSilenceMs = 0) { let ttsVoice = document.getElementById('ttsVoice').value - let personalVoiceSpeakerProfileID = document.getElementById('personalVoiceSpeakerProfileID').value - let ssml = `${htmlEncode(text)}` + let ssml = `${htmlEncode(text)}` if (endingSilenceMs > 0) { - ssml = `${htmlEncode(text)}` + ssml = `${htmlEncode(text)}` } - lastSpeakTime = new Date() isSpeaking = true - document.getElementById('stopSpeaking').disabled = false avatarSynthesizer.speakSsmlAsync(ssml).then( (result) => { if (result.reason === SpeechSDK.ResultReason.SynthesizingAudioCompleted) { - console.log(`Speech synthesized to speaker for text [ ${text} ]. Result ID: ${result.resultId}`) - lastSpeakTime = new Date() + console.log(`Speech synthesized to speaker for text [ ${text} ]`) } else { - console.log(`Error occurred while speaking the SSML. 
Result ID: ${result.resultId}`) + console.log(`Error occurred while speaking the SSML.`) } if (spokenTextQueue.length > 0) { speakNext(spokenTextQueue.shift()) } else { isSpeaking = false - document.getElementById('stopSpeaking').disabled = true } }).catch( (error) => { console.log(`Error occurred while speaking the SSML: [ ${error} ]`) - - if (spokenTextQueue.length > 0) { - speakNext(spokenTextQueue.shift()) - } else { - isSpeaking = false - document.getElementById('stopSpeaking').disabled = true - } } ) } @@ -348,11 +323,7 @@ function speakNext(text, endingSilenceMs = 0) { function stopSpeaking() { spokenTextQueue = [] avatarSynthesizer.stopSpeakingAsync().then( - () => { - isSpeaking = false - document.getElementById('stopSpeaking').disabled = true - console.log("[" + (new Date()).toISOString() + "] Stop speaking request sent.") - } + log("[" + (new Date()).toISOString() + "] Stop speaking request sent.") ).catch( (error) => { console.log("Error occurred while stopping speaking: " + error) @@ -562,69 +533,77 @@ function getQuickReply() { return quickReplies[Math.floor(Math.random() * quickReplies.length)] } -function checkHung() { - // Check whether the avatar video stream is hung, by checking whether the video time is advancing - let videoElement = document.getElementById('videoPlayer') - if (videoElement !== null && videoElement !== undefined && sessionActive) { - let videoTime = videoElement.currentTime - setTimeout(() => { - // Check whether the video time is advancing - if (videoElement.currentTime === videoTime) { - // Check whether the session is active to avoid duplicatedly triggering reconnect - if (sessionActive) { - sessionActive = false - if (document.getElementById('autoReconnectAvatar').checked) { - console.log(`[${(new Date()).toISOString()}] The video stream got disconnected, need reconnect.`) - connectAvatar() - } - } - } - }, 2000) +window.startSession = () => { + const cogSvcRegion = document.getElementById('region').value + const cogSvcSubKey = document.getElementById('subscriptionKey').value + if (cogSvcSubKey === '') { + alert('Please fill in the subscription key of your speech resource.') + return } -} -function checkLastSpeak() { - if (lastSpeakTime === undefined) { + const speechSynthesisConfig = SpeechSDK.SpeechConfig.fromSubscription(cogSvcSubKey, cogSvcRegion) + speechSynthesisConfig.endpointId = document.getElementById('customVoiceEndpointId').value + speechSynthesisConfig.speechSynthesisVoiceName = document.getElementById('ttsVoice').value + + const talkingAvatarCharacter = document.getElementById('talkingAvatarCharacter').value + const talkingAvatarStyle = document.getElementById('talkingAvatarStyle').value + const avatarConfig = new SpeechSDK.AvatarConfig(talkingAvatarCharacter, talkingAvatarStyle) + avatarConfig.customized = document.getElementById('customizedAvatar').checked + avatarSynthesizer = new SpeechSDK.AvatarSynthesizer(speechSynthesisConfig, avatarConfig) + avatarSynthesizer.avatarEventReceived = function (s, e) { + var offsetMessage = ", offset from session start: " + e.offset / 10000 + "ms." 
+ if (e.offset === 0) { + offsetMessage = "" + } + + console.log("Event received: " + e.description + offsetMessage) + } + + const speechRecognitionConfig = SpeechSDK.SpeechConfig.fromEndpoint(new URL(`wss://${cogSvcRegion}.stt.speech.microsoft.com/speech/universal/v2`), cogSvcSubKey) + speechRecognitionConfig.setProperty(SpeechSDK.PropertyId.SpeechServiceConnection_LanguageIdMode, "Continuous") + var sttLocales = document.getElementById('sttLocales').value.split(',') + var autoDetectSourceLanguageConfig = SpeechSDK.AutoDetectSourceLanguageConfig.fromLanguages(sttLocales) + speechRecognizer = SpeechSDK.SpeechRecognizer.FromConfig(speechRecognitionConfig, autoDetectSourceLanguageConfig, SpeechSDK.AudioConfig.fromDefaultMicrophoneInput()) + + const azureOpenAIEndpoint = document.getElementById('azureOpenAIEndpoint').value + const azureOpenAIApiKey = document.getElementById('azureOpenAIApiKey').value + const azureOpenAIDeploymentName = document.getElementById('azureOpenAIDeploymentName').value + if (azureOpenAIEndpoint === '' || azureOpenAIApiKey === '' || azureOpenAIDeploymentName === '') { + alert('Please fill in the Azure OpenAI endpoint, API key and deployment name.') return } - let currentTime = new Date() - if (currentTime - lastSpeakTime > 15000) { - if (document.getElementById('useLocalVideoForIdle').checked && sessionActive && !isSpeaking) { - disconnectAvatar() - document.getElementById('localVideo').hidden = false - document.getElementById('remoteVideo').style.width = '0.1px' - sessionActive = false + dataSources = [] + if (document.getElementById('enableByod').checked) { + const azureCogSearchEndpoint = document.getElementById('azureCogSearchEndpoint').value + const azureCogSearchApiKey = document.getElementById('azureCogSearchApiKey').value + const azureCogSearchIndexName = document.getElementById('azureCogSearchIndexName').value + if (azureCogSearchEndpoint === "" || azureCogSearchApiKey === "" || azureCogSearchIndexName === "") { + alert('Please fill in the Azure Cognitive Search endpoint, API key and index name.') + return + } else { + setDataSources(azureCogSearchEndpoint, azureCogSearchApiKey, azureCogSearchIndexName) } } -} -window.onload = () => { - setInterval(() => { - checkHung() - checkLastSpeak() - }, 2000) // Check session activity every 2 seconds -} + initMessages() -window.startSession = () => { - if (document.getElementById('useLocalVideoForIdle').checked) { - document.getElementById('startSession').disabled = true - document.getElementById('configuration').hidden = true - document.getElementById('microphone').disabled = false - document.getElementById('stopSession').disabled = false - document.getElementById('localVideo').hidden = false - document.getElementById('remoteVideo').style.width = '0.1px' - document.getElementById('chatHistory').hidden = false - document.getElementById('showTypeMessage').disabled = false + const iceServerUrl = document.getElementById('iceServerUrl').value + const iceServerUsername = document.getElementById('iceServerUsername').value + const iceServerCredential = document.getElementById('iceServerCredential').value + if (iceServerUrl === '' || iceServerUsername === '' || iceServerCredential === '') { + alert('Please fill in the ICE server URL, username and credential.') return } - connectAvatar() + document.getElementById('startSession').disabled = true + + setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) } window.stopSession = () => { document.getElementById('startSession').disabled = false - 
document.getElementById('microphone').disabled = true + document.getElementById('startMicrophone').disabled = true document.getElementById('stopSession').disabled = true document.getElementById('configuration').hidden = false document.getElementById('chatHistory').hidden = true @@ -644,35 +623,9 @@ window.clearChatHistory = () => { initMessages() } -window.microphone = () => { - if (document.getElementById('microphone').innerHTML === 'Stop Microphone') { - // Stop microphone - document.getElementById('microphone').disabled = true - speechRecognizer.stopContinuousRecognitionAsync( - () => { - document.getElementById('microphone').innerHTML = 'Start Microphone' - document.getElementById('microphone').disabled = false - }, (err) => { - console.log("Failed to stop continuous recognition:", err) - document.getElementById('microphone').disabled = false - }) - - return - } - - if (document.getElementById('useLocalVideoForIdle').checked) { - if (!sessionActive) { - connectAvatar() - } - - setTimeout(() => { - document.getElementById('audioPlayer').play() - }, 5000) - } else { - document.getElementById('audioPlayer').play() - } - - document.getElementById('microphone').disabled = true +window.startMicrophone = () => { + document.getElementById('startMicrophone').disabled = true + document.getElementById('audioPlayer').play() speechRecognizer.recognized = async (s, e) => { if (e.result.reason === SpeechSDK.ResultReason.RecognizedSpeech) { let userQuery = e.result.text.trim() @@ -680,35 +633,20 @@ window.microphone = () => { return } - // Auto stop microphone when a phrase is recognized, when it's not continuous conversation mode - if (!document.getElementById('continuousConversation').checked) { - document.getElementById('microphone').disabled = true - speechRecognizer.stopContinuousRecognitionAsync( - () => { - document.getElementById('microphone').innerHTML = 'Start Microphone' - document.getElementById('microphone').disabled = false - }, (err) => { - console.log("Failed to stop continuous recognition:", err) - document.getElementById('microphone').disabled = false - }) + let chatMessage = { + role: 'user', + content: userQuery } handleUserQuery(userQuery,"","") } } - speechRecognizer.startContinuousRecognitionAsync( - () => { - document.getElementById('microphone').innerHTML = 'Stop Microphone' - document.getElementById('microphone').disabled = false - }, (err) => { - console.log("Failed to start continuous recognition:", err) - document.getElementById('microphone').disabled = false - }) + speechRecognizer.startContinuousRecognitionAsync() } -window.updataEnableOyd = () => { - if (document.getElementById('enableOyd').checked) { +window.updataEnableByod = () => { + if (document.getElementById('enableByod').checked) { document.getElementById('cogSearchConfig').hidden = false } else { document.getElementById('cogSearchConfig').hidden = true @@ -769,4 +707,4 @@ window.updatePrivateEndpoint = () => { } else { document.getElementById('showPrivateEndpointCheckBox').hidden = true } -} \ No newline at end of file +} diff --git a/samples/js/browser/index.html b/samples/js/browser/index.html index f7f6400ae..d7f76b8b5 100644 --- a/samples/js/browser/index.html +++ b/samples/js/browser/index.html @@ -143,7 +143,7 @@

Javascript Browser Sample

- + diff --git a/samples/js/browser/public/index.html b/samples/js/browser/public/index.html index 0a4cd9160..1070a08ca 100644 --- a/samples/js/browser/public/index.html +++ b/samples/js/browser/public/index.html @@ -147,7 +147,7 @@

Javascript Browser Sample

- + @@ -396,11 +396,10 @@

Javascript Browser Sample

scenarioSelection.value == 'translationRecognizerContinuous' ? '' : 'none'; document.getElementById('pronunciationAssessmentReferenceTextRow').style.display = - scenarioSelection.value.includes('pronunciation') - && scenarioSelection.value != 'pronunciationAssessmentWithContentAssessment' ? '' : 'none'; + scenarioSelection.value.includes('pronunciation') ? '' : 'none'; document.getElementById('contentAssessmentTopicTextRow').style.display = - scenarioSelection.value == 'pronunciationAssessmentWithContentAssessment' ? '' : 'none'; + scenarioSelection.value.includes('content') ? '' : 'none'; } scenarioSelection.addEventListener("change", function () { @@ -610,7 +609,7 @@

Javascript Browser Sample

var pronunciationAssessmentConfig = new SpeechSDK.PronunciationAssessmentConfig(referenceText.value, SpeechSDK.PronunciationAssessmentGradingSystem.HundredMark, SpeechSDK.PronunciationAssessmentGranularity.Word, true); - pronunciationAssessmentConfig.enableProsodyAssessment = true; + pronunciationAssessmentConfig.enableProsodyAssessment() return pronunciationAssessmentConfig; } @@ -682,7 +681,7 @@

Javascript Browser Sample

var displayText = detailedResultJson['DisplayText']; phraseDiv.innerHTML += `Detailed result for "${displayText}":\r\n` + `${JSON.stringify(detailedResultJson, null, 2)}\r\n`; - } else if (result.text && result.text != ".") { + } else if (result.text) { phraseDiv.innerHTML += `${result.text}\r\n`; } @@ -702,16 +701,11 @@

Javascript Browser Sample

if (scenarioSelection.value.includes('pronunciation')) { var pronunciationAssessmentResult = SpeechSDK.PronunciationAssessmentResult.fromResult(result); - if (pronunciationAssessmentResult.detailResult.Display != '.') { - phraseDiv.innerHTML += - `[Pronunciation result] - Accuracy: ${pronunciationAssessmentResult.accuracyScore}. - Fluency: ${pronunciationAssessmentResult.fluencyScore}. - Prosody: ${pronunciationAssessmentResult.prosodyScore}. - ${referenceText.value != '' ? `Completeness: ${pronunciationAssessmentResult.completenessScore}.` : ``}\n`; - pronunciationAssessmentResults.push(pronunciationAssessmentResult); - } - + phraseDiv.innerHTML += + `[Pronunciation result] Accuracy: ${pronunciationAssessmentResult.accuracyScore}; + Fluency: ${pronunciationAssessmentResult.fluencyScore}; + Completeness: ${pronunciationAssessmentResult.completenessScore}.\n`; + pronunciationAssessmentResults.push(pronunciationAssessmentResult); } if (scenarioSelection.value == 'pronunciationAssessmentWithContentAssessment') { @@ -737,8 +731,7 @@

Javascript Browser Sample

statusDiv.innerHTML += `(sessionStopped) SessionId: ${sessionEventArgs.sessionId}\r\n`; if (scenarioSelection.value == 'pronunciationAssessmentContinuous' || - scenarioSelection.value == 'pronunciationAssessmentContinuousStream' || - scenarioSelection.value == 'pronunciationAssessmentWithContentAssessment' + scenarioSelection.value == 'pronunciationAssessmentContinuousStream' ) { calculateOverallPronunciationScore(); } @@ -822,8 +815,6 @@

Javascript Browser Sample

var sumDuration = 0; var sumAccuracy = 0; var sumFluency = 0; - var sumProsody = 0; - var countProsody = 0; for (const result of pronunciationAssessmentResults) { var duration = 0; for (const word of result.detailResult.Words) { @@ -833,14 +824,11 @@

Javascript Browser Sample

sumDuration += duration; sumAccuracy += duration * result.accuracyScore; sumFluency += duration * result.fluencyScore; - sumProsody += result.prosodyScore; - countProsody++; } // weighted accuracy and fluency scores var accuracy = sumAccuracy / sumDuration; var fluency = sumFluency / sumDuration; - var prosody = sumProsody / countProsody; var diff = new difflib.SequenceMatcher(null, referenceWords, recognizedWords); diffWordsNum = 0; @@ -853,18 +841,16 @@

Javascript Browser Sample

var completeness = (1 - diffWordsNum / referenceWords.length) * 100; phraseDiv.innerHTML += - `[Overall Pronunciation result] - Accuracy: ${accuracy}. - Fluency: ${fluency}. - Prosody: ${prosody}. - ${referenceText.value != '' ? `Completeness: ${completeness}.` : ``}\n`; + `[Overall Pronunciation result] Accuracy: ${accuracy}; + Fluency: ${fluency}; + Completeness: ${completeness}.\n`; } function getContentResult() { - phraseDiv.innerHTML += `[content assessment result] - \tvocabulary score: ${contentResults[contentResults.length-1].vocabularyScore}. - \tgrammar score: ${contentResults[contentResults.length-1].grammarScore}. - \ttopic score: ${contentResults[contentResults.length-1].topicScore}.` + phraseDiv.innerHTML = `[content assessment result] \n + \tvocabulary score: ${contentResults.vocabularyScore} '\n' + \tgrammar score: ${contentResults.grammarScore} '\n' + \ttopic score: ${contentResults.topicScore}` } diff --git a/samples/js/browser/public/synthesis.html b/samples/js/browser/public/synthesis.html index 9efc4b515..a99367a38 100644 --- a/samples/js/browser/public/synthesis.html +++ b/samples/js/browser/public/synthesis.html @@ -317,7 +317,7 @@

Speech Speech SDK not found request.onload = function() { if (request.status >= 200 && request.status < 400) { const response = this.response; - const defaultVoice = "AndrewMultilingualNeural"; + const defaultVoice = "JennyNeural"; let selectId; const data = JSON.parse(response); voiceOptions.innerHTML = ""; diff --git a/samples/js/browser/synthesis.html b/samples/js/browser/synthesis.html index 3d8b66894..91aef0eb8 100644 --- a/samples/js/browser/synthesis.html +++ b/samples/js/browser/synthesis.html @@ -313,7 +313,7 @@

Speech Speech SDK not found request.onload = function() { if (request.status >= 200 && request.status < 400) { const response = this.response; - const defaultVoice = "AndrewMultilingualNeural"; + const defaultVoice = "JennyNeural"; let selectId; const data = JSON.parse(response); voiceOptions.innerHTML = ""; diff --git a/samples/js/node/package.json b/samples/js/node/package.json index e36ea2691..bc8c0a13f 100644 --- a/samples/js/node/package.json +++ b/samples/js/node/package.json @@ -11,16 +11,15 @@ "author": "Microsoft", "license": "MIT", "dependencies": { - "diff": "^5.2.0", + "difflib": "^0.2.4", "https-proxy-agent": "^3.0.0", "lodash": "^4.17.21", "lodash.foreach": "^4.5.0", "lodash.sum": "^4.0.2", "mic-to-speech": "^1.0.1", - "microsoft-cognitiveservices-speech-sdk": "^1.38.0", + "microsoft-cognitiveservices-speech-sdk": "^1.40.0", "readline": "^1.3.0", "segment": "^0.1.3", - "uuid": "^9.0.1", "wav": "^1.0.2" } } diff --git a/samples/js/node/pronunciationAssessmentContinue.js b/samples/js/node/pronunciationAssessmentContinue.js index 89d148070..39e099dad 100644 --- a/samples/js/node/pronunciationAssessmentContinue.js +++ b/samples/js/node/pronunciationAssessmentContinue.js @@ -5,9 +5,7 @@ import * as sdk from "microsoft-cognitiveservices-speech-sdk"; import * as filePushStream from "./filePushStream.js"; import * as Segment from "segment"; -import fs from 'fs'; -import { v4 as uuidv4 } from 'uuid'; -import { diffArrays } from "diff"; +import * as difflib from "difflib"; import _ from "lodash"; // pronunciation assessment with audio streaming and continue mode @@ -22,7 +20,6 @@ export const main = (settings) => { var speechConfig = sdk.SpeechConfig.fromSubscription(settings.subscriptionKey, settings.serviceRegion); var reference_text = "What's the weather like?"; - var enableProsodyAssessment = true; // create pronunciation assessment config, set grading system, granularity and if enable miscue based on your requirement. const pronunciationAssessmentConfig = new sdk.PronunciationAssessmentConfig( reference_text, @@ -30,7 +27,7 @@ export const main = (settings) => { sdk.PronunciationAssessmentGranularity.Phoneme, true ); - pronunciationAssessmentConfig.enableProsodyAssessment = enableProsodyAssessment; + pronunciationAssessmentConfig.enableProsodyAssessment = true; // setting the recognition language to English. speechConfig.speechRecognitionLanguage = settings.language; @@ -42,17 +39,18 @@ export const main = (settings) => { const scoreNumber = { accuracyScore: 0, fluencyScore: 0, + compScore: 0, + prosodyScore: 0, }; const allWords = []; var currentText = []; var startOffset = 0; + var recognizedWords = []; var fluencyScores = []; var prosodyScores = []; var durations = []; var jo = {}; - var filePath = `${uuidv4()}.txt`; - var recognizedWordsNum = 0; - + // Before beginning speech recognition, setup the callbacks to be invoked when an event occurs. // The event recognizing signals that an intermediate recognition result is received. @@ -69,11 +67,11 @@ export const main = (settings) => { reco.recognized = function (s, e) { console.log("pronunciation assessment for: ", e.result.text); var pronunciation_result = sdk.PronunciationAssessmentResult.fromResult(e.result); - console.log(` Accuracy score: ${pronunciation_result.accuracyScore},` + - `\n pronunciation score: ${pronunciation_result.pronunciationScore},` + - `\n completeness score : ${pronunciation_result.completenessScore},` + - `\n fluency score: ${pronunciation_result.fluencyScore},` + - `${enableProsodyAssessment ? 
`\n prosody score: ${pronunciation_result.prosodyScore}` : ""}` + console.log(" Accuracy score: ", pronunciation_result.accuracyScore, '\n', + "pronunciation score: ", pronunciation_result.pronunciationScore, '\n', + "completeness score : ", pronunciation_result.completenessScore, '\n', + "fluency score: ", pronunciation_result.fluencyScore, '\n', + "prosody score: ", pronunciation_result.prosodyScore ); jo = JSON.parse(e.result.properties.getProperty(sdk.PropertyId.SpeechServiceResponse_JsonResult)); @@ -87,10 +85,7 @@ export const main = (settings) => { const nBestWords = jo.NBest[0].Words; const durationList = []; _.forEach(nBestWords, (word) => { - if (word.PronunciationAssessment.AccuracyScore < 60 && word.PronunciationAssessment.ErrorType == "None") { - word.PronunciationAssessment.ErrorType = "Mispronunciation"; - } - if (word.PronunciationAssessment.ErrorType == "None") recognizedWordsNum++; + recognizedWords.push(word); durationList.push(word.Duration); }); durations.push(_.sum(durationList)); @@ -100,99 +95,136 @@ export const main = (settings) => { } }; - function wordsToTempDict (words) { - let data = ""; - const wordMap = {}; - for (const word of words) word in wordMap ? wordMap[word]++ : wordMap[word] = 1; - for (const key in wordMap) data += key + `|0x00000000|${wordMap[key]}\n`; - fs.writeFileSync(filePath, data.trim("\n"), (_e) => {}); - } - - function removeTempDict() { - fs.unlink(filePath, (_e) => {}); - } - function calculateOverallPronunciationScore() { - let wholelyricsArray = []; + const resText = currentText.join(" "); + let wholelyricsArry = []; + let resTextArray = []; // The sample code provides only zh-CN and en-US locales if (["zh-cn"].includes(settings.language.toLowerCase())) { - wordsToTempDict(currentText); + const resTextProcessed = (resText.toLocaleLowerCase() ?? "").replace(new RegExp("[^a-zA-Z0-9\u4E00-\u9FA5']+", "g"), " "); const wholelyrics = (reference_text.toLocaleLowerCase() ?? "").replace(new RegExp("[^a-zA-Z0-9\u4E00-\u9FA5']+", "g"), " "); - const segment = new Segment.Segment(); - segment.use('DictTokenizer'); - segment.loadDict(filePath); - _.map(segment.doSegment(wholelyrics, {stripPunctuation: true}), (res) => wholelyricsArray.push(res['w'])); - removeTempDict(); + const segment = new Segment(); + segment.useDefault(); + segment.loadDict('wildcard.txt'); + _.map(segment.doSegment(wholelyrics, {stripPunctuation: true}), (res) => wholelyricsArry.push(res['w'])); + _.map(segment.doSegment(resTextProcessed, {stripPunctuation: true}), (res) => resTextArray.push(res['w'])); } else { + let resTextProcessed = (resText.toLocaleLowerCase() ?? "").replace(new RegExp("[!\"#$%&()*+,-./:;<=>?@[^_`{|}~]+", "g"), "").replace(new RegExp("]+", "g"), ""); let wholelyrics = (reference_text.toLocaleLowerCase() ?? "").replace(new RegExp("[!\"#$%&()*+,-./:;<=>?@[^_`{|}~]+", "g"), "").replace(new RegExp("]+", "g"), ""); - wholelyricsArray = wholelyrics.split(" "); + wholelyricsArry = wholelyrics.split(" "); + resTextArray = resTextProcessed.split(" "); } - const wholelyricsArrayRes = _.map( - _.filter(wholelyricsArray, (item) => !!item), + const wholelyricsArryRes = _.map( + _.filter(wholelyricsArry, (item) => !!item), (item) => item.trim() ); - + // For continuous pronunciation assessment mode, the service won't return the words with `Insertion` or `Omission` // We need to compare with the reference text after received all recognized words to get these error words. 
- var lastWords = []; - if (reference_text.length != 0) { - const diff = diffArrays(wholelyricsArrayRes, currentText); - let currentWholelyricsArrayResIndex = 0; - let currentResTextArrayIndex = 0; - for (const d of diff) { - if (d.added) { - _.map(allWords.slice(currentResTextArrayIndex, currentResTextArrayIndex + (d.count ?? 0)), (item) => { - if (item.PronunciationAssessment.ErrorType !== "Insertion") { - item.PronunciationAssessment.ErrorType = "Insertion"; + const diff = new difflib.SequenceMatcher(null, wholelyricsArryRes, resTextArray); + const lastWords = []; + for (const d of diff.getOpcodes()) { + if (d[0] == "insert" || d[0] == "replace") { + if (["zh-cn"].includes(settings.language.toLowerCase())) { + for (let j = d[3], count = 0; j < d[4]; count++) { + let len = 0; + let bfind = false; + _.map(allWords, (item, index) => { + if ( + (len == j || + (index + 1 < allWords.length && + allWords[index].Word.length > 1 && + j > len && + j < len + allWords[index + 1].Word.length)) && + !bfind + ) { + const wordNew = _.cloneDeep(allWords[index]); + if ( + allWords && + allWords.length > 0 && + allWords[index].PronunciationAssessment.ErrorType !== "Insertion" + ) { + wordNew.PronunciationAssessment.ErrorType = "Insertion"; + } + lastWords.push(wordNew); + bfind = true; + j += allWords[index].Word.length; + } + len = len + item.Word.length; + }); + } + } else { + for (let j = d[3]; j < d[4]; j++) { + if (allWords && allWords.length > 0 && allWords[j].PronunciationAssessment.ErrorType !== "Insertion") { + allWords[j].PronunciationAssessment.ErrorType = "Insertion"; } - lastWords.push(item); - currentResTextArrayIndex++; - }); + lastWords.push(allWords[j]); + } } - if (d.removed) { - if ( - currentWholelyricsArrayResIndex + (d.count ?? 0) + 1 == wholelyricsArrayRes.length && - !( - jo.RecognitionStatus == "Success" || - jo.RecognitionStatus == "Failed" - ) + } + if (d[0] == "delete" || d[0] == "replace") { + if ( + d[2] == wholelyricsArryRes.length && + !( + jo.RecognitionStatus == "Success" || + jo.RecognitionStatus == "Failed" ) - continue; - for (let i = 0; i < (d.count ?? 0); i++) { - const word = { - Word: wholelyricsArrayRes[currentWholelyricsArrayResIndex], - PronunciationAssessment: { - ErrorType: "Omission", - }, - }; - lastWords.push(word); - currentWholelyricsArrayResIndex++; - } + ) + continue; + for (let i = d[1]; i < d[2]; i++) { + const word = { + Word: wholelyricsArryRes[i], + PronunciationAssessment: { + ErrorType: "Omission", + }, + }; + lastWords.push(word); } - if (!d.added && !d.removed) { - _.map(allWords.slice(currentResTextArrayIndex, currentResTextArrayIndex + (d.count ?? 0)), (item) => { - lastWords.push(item); - currentWholelyricsArrayResIndex++; - currentResTextArrayIndex++; - }); + } + if (d[0] == "equal") { + for (let k = d[3], count = 0; k < d[4]; count++) { + if (["zh-cn"].includes(settings.language.toLowerCase())) { + let len = 0; + let bfind = false; + _.map(allWords, (item, index) => { + if (len >= k && !bfind) { + if (allWords[index].PronunciationAssessment.ErrorType !== "None") { + allWords[index].PronunciationAssessment.ErrorType = "None"; + } + lastWords.push(allWords[index]); + bfind = true; + k += allWords[index].Word.length; + } + len = len + item.Word.length; + }); + } else { + lastWords.push(allWords[k]); + k++; + } } } - } else { - lastWords = allWords; } - if (reference_text.trim() != "") { - let compScore = - reference_text.length != 0 - ? 
Number(((recognizedWordsNum / wholelyricsArrayRes.length) * 100).toFixed(0)) - : 0; + let reference_words = []; + if (["zh-cn"].includes(settings.language.toLowerCase())) { + reference_words = allWords; + }else{ + reference_words = wholelyricsArryRes; + } - if (compScore > 100) { - compScore = 100; + let recognizedWordsRes = []; + _.forEach(recognizedWords, (word) => { + if (word.PronunciationAssessment.ErrorType == "None") { + recognizedWordsRes.push(word); } - scoreNumber.compScore = compScore; + }); + + let compScore = Number(((recognizedWordsRes.length / reference_words.length) * 100).toFixed(0)); + if (compScore > 100) { + compScore = 100; } + scoreNumber.compScore = compScore; const accuracyScores = []; _.forEach(lastWords, (word) => { @@ -210,12 +242,15 @@ export const main = (settings) => { scoreNumber.fluencyScore = _.sum(sumRes) / _.sum(durations); } - enableProsodyAssessment && (scoreNumber.prosodyScore = _.sum(prosodyScores) / prosodyScores.length); + scoreNumber.prosodyScore = _.sum(prosodyScores) / prosodyScores.length; const sortScore = Object.keys(scoreNumber).sort(function (a, b) { return scoreNumber[a] - scoreNumber[b]; }); - if (reference_text.trim() != "" && enableProsodyAssessment) { + if ( + jo.RecognitionStatus == "Success" || + jo.RecognitionStatus == "Failed" + ) { scoreNumber.pronScore = Number( ( scoreNumber[sortScore["0"]] * 0.4 + @@ -224,30 +259,16 @@ export const main = (settings) => { scoreNumber[sortScore["3"]] * 0.2 ).toFixed(0) ); - } else if (reference_text.trim() != "" || enableProsodyAssessment) { - scoreNumber.pronScore = Number( - (scoreNumber[sortScore["0"]] * 0.6 + scoreNumber[sortScore["1"]] * 0.2 + scoreNumber[sortScore["2"]] * 0.2).toFixed(0) - ); } else { scoreNumber.pronScore = Number( - (scoreNumber[sortScore["0"]] * 0.6 + scoreNumber[sortScore["1"]] * 0.4).toFixed(0) + (scoreNumber.accuracyScore * 0.6 + scoreNumber.fluencyScore * 0.2 + scoreNumber.prosodyScore * 0.2).toFixed(0) ); } - console.log(` Paragraph pronunciation score: ${scoreNumber.pronScore},` + - ` accuracy score: ${scoreNumber.accuracyScore},` + - `${reference_text.trim() != "" ? ` completeness score: ${scoreNumber.compScore},` : ""}` + - ` fluency score: ${scoreNumber.fluencyScore}` + - `${enableProsodyAssessment ? 
`, prosody score: ${scoreNumber.prosodyScore}` : ""}`); + console.log(" Paragraph accuracy score: ", scoreNumber.accuracyScore, ", completeness score: ", scoreNumber.compScore, ", fluency score: ", scoreNumber.fluencyScore, ", prosody score: ", scoreNumber.prosodyScore); _.forEach(lastWords, (word, ind) => { - let wordLevelOutput = ` ${ind + 1}: word: ${word.Word}`; - if (word.PronunciationAssessment.ErrorType != "Omission" && word.PronunciationAssessment.ErrorType != "Insertion") { - wordLevelOutput += `\taccuracy score: ${word.PronunciationAssessment.AccuracyScore}\terror type: ${word.PronunciationAssessment.ErrorType};`; - } else { - wordLevelOutput += `\t\t\t\terror type: ${word.PronunciationAssessment.ErrorType};`; - } - console.log(wordLevelOutput); + console.log(" ", ind + 1, ": word: ", word.Word, "\taccuracy score: ", word.PronunciationAssessment.AccuracyScore, "\terror type: ", word.PronunciationAssessment.ErrorType, ";"); }); }; diff --git a/samples/js/node/pronunciationAssessmentGetContentResult.js b/samples/js/node/pronunciationAssessmentGetContentResult.js index 70edf7592..cea32357d 100644 --- a/samples/js/node/pronunciationAssessmentGetContentResult.js +++ b/samples/js/node/pronunciationAssessmentGetContentResult.js @@ -2,6 +2,7 @@ // Licensed under the MIT license. // pull in the required packages. +// Please install the required packages from https://github.com/microsoft/cognitive-services-speech-sdk-js import * as sdk from 'microsoft-cognitiveservices-speech-sdk'; import fs from 'fs'; import _ from 'lodash'; diff --git a/samples/js/node/synthesis.js b/samples/js/node/synthesis.js index 6cab1e13e..05378a4c3 100644 --- a/samples/js/node/synthesis.js +++ b/samples/js/node/synthesis.js @@ -15,7 +15,7 @@ export const main = (settings, filename) => { // setting the synthesis language, voice name, and output audio format. 
// see https://aka.ms/speech/tts-languages for available languages and voices speechConfig.speechSynthesisLanguage = settings.language; - speechConfig.speechSynthesisVoiceName = "en-US-AvaMultilingualNeural"; + speechConfig.speechSynthesisVoiceName = "en-US-JennyNeural"; speechConfig.speechSynthesisOutputFormat = sdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3; var rl = readline.createInterface({ diff --git a/samples/kotlin/android/continuous-reco/app/build.gradle b/samples/kotlin/android/continuous-reco/app/build.gradle index 5f2575325..0353b0c65 100644 --- a/samples/kotlin/android/continuous-reco/app/build.gradle +++ b/samples/kotlin/android/continuous-reco/app/build.gradle @@ -36,5 +36,5 @@ dependencies { implementation 'androidx.appcompat:appcompat:1.4.2' implementation 'com.google.android.material:material:1.6.1' implementation 'androidx.constraintlayout:constraintlayout:2.1.4' - implementation "com.microsoft.cognitiveservices.speech:client-sdk:1.38.0" + implementation "com.microsoft.cognitiveservices.speech:client-sdk:1.40.0" } \ No newline at end of file diff --git a/samples/kotlin/android/tts-pause-example/app/build.gradle b/samples/kotlin/android/tts-pause-example/app/build.gradle index 166faa87d..a6562a553 100644 --- a/samples/kotlin/android/tts-pause-example/app/build.gradle +++ b/samples/kotlin/android/tts-pause-example/app/build.gradle @@ -36,6 +36,6 @@ dependencies { implementation 'com.google.android.material:material:1.6.1' implementation 'androidx.constraintlayout:constraintlayout:2.1.4' - implementation "com.microsoft.cognitiveservices.speech:client-sdk:1.38.0" + implementation "com.microsoft.cognitiveservices.speech:client-sdk:1.40.0" } \ No newline at end of file diff --git a/samples/kotlin/android/tts-pause-example/app/src/main/java/com/microsoft/cognitiveservices/speech/samples/speechsynthesis/pauseexample/MainActivity.kt b/samples/kotlin/android/tts-pause-example/app/src/main/java/com/microsoft/cognitiveservices/speech/samples/speechsynthesis/pauseexample/MainActivity.kt index 56f2a250c..4af2919a5 100644 --- a/samples/kotlin/android/tts-pause-example/app/src/main/java/com/microsoft/cognitiveservices/speech/samples/speechsynthesis/pauseexample/MainActivity.kt +++ b/samples/kotlin/android/tts-pause-example/app/src/main/java/com/microsoft/cognitiveservices/speech/samples/speechsynthesis/pauseexample/MainActivity.kt @@ -109,7 +109,7 @@ class MainActivity : AppCompatActivity() { // Use 24k Hz format for higher quality. speechConfig?.setSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw24Khz16BitMonoPcm) // Set voice name. 
- speechConfig?.speechSynthesisVoiceName = "en-US-AvaMultilingualNeural" + speechConfig?.speechSynthesisVoiceName = "en-US-JennyNeural" synthesizer = SpeechSynthesizer(speechConfig, null) connection = Connection.fromSpeechSynthesizer(synthesizer) diff --git a/samples/objective-c/ios/speech-samples/speech-samples.xcodeproj/project.pbxproj b/samples/objective-c/ios/speech-samples/speech-samples.xcodeproj/project.pbxproj index ae0bbc49e..d37bc37e9 100644 --- a/samples/objective-c/ios/speech-samples/speech-samples.xcodeproj/project.pbxproj +++ b/samples/objective-c/ios/speech-samples/speech-samples.xcodeproj/project.pbxproj @@ -53,7 +53,7 @@ 3C00A32B25F8727100512312 /* AudioRecorder.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = AudioRecorder.m; sourceTree = ""; }; 3C1B8BC92679C50600706BB3 /* pronunciation-assessment.wav */ = {isa = PBXFileReference; lastKnownFileType = audio.wav; path = "pronunciation-assessment.wav"; sourceTree = ""; }; 52CF43E62AEF743E00227EF3 /* pronunciation_assessment_fall.wav */ = {isa = PBXFileReference; lastKnownFileType = audio.wav; path = pronunciation_assessment_fall.wav; sourceTree = ""; }; - 52FC64F929CACB27000C8918 /* MicrosoftCognitiveServicesSpeech.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = MicrosoftCognitiveServicesSpeech.xcframework; path = "../../../../../../../../MicrosoftCognitiveServicesSpeech-XCFramework-1.38.0/MicrosoftCognitiveServicesSpeech.xcframework"; sourceTree = ""; }; + 52FC64F929CACB27000C8918 /* MicrosoftCognitiveServicesSpeech.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = MicrosoftCognitiveServicesSpeech.xcframework; path = "../../../../../../../../MicrosoftCognitiveServicesSpeech-XCFramework-1.40.0/MicrosoftCognitiveServicesSpeech.xcframework"; sourceTree = ""; }; DC2CBA03227047EA007EB18A /* wreck-a-nice-beach.wav */ = {isa = PBXFileReference; lastKnownFileType = audio.wav; name = "wreck-a-nice-beach.wav"; path = "./wreck-a-nice-beach.wav"; sourceTree = ""; }; F3184E46214674D60096193E /* speech-samples.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "speech-samples.app"; sourceTree = BUILT_PRODUCTS_DIR; }; F3184E49214674D60096193E /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; diff --git a/samples/objective-c/ios/synthesis-samples/synthesis-samples/ViewController.m b/samples/objective-c/ios/synthesis-samples/synthesis-samples/ViewController.m index 84aa88905..56359e2c1 100644 --- a/samples/objective-c/ios/synthesis-samples/synthesis-samples/ViewController.m +++ b/samples/objective-c/ios/synthesis-samples/synthesis-samples/ViewController.m @@ -149,11 +149,11 @@ - (void)synthesisToSpeaker { // https://docs.microsoft.com/azure/cognitive-services/speech-service/language-support#text-to-speech speechConfig.speechSynthesisLanguage = @"en-GB"; // Sets the voice name - // e.g. "en-GB-RyanNeural". + // e.g. "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)". // The full list of supported voices can be found here: // https://aka.ms/csspeech/voicenames // And, you can try getVoices method to get all available voices. - speechConfig.speechSynthesisVoiceName = @"en-GB-RyanNeural"; + speechConfig.speechSynthesisVoiceName = @"Microsoft Server Speech Text to Speech Voice (en-GB, RyanNeural)"; // Sets the synthesis output format. 
// The full list of supported format can be found here: // https://docs.microsoft.com/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs @@ -452,7 +452,7 @@ - (void)synthesisEvent { }]; // To trigger a bookmark event, bookmark tags are required in the SSML, e.g. - // " one. two. three. four." + // " one. two. three. four." [synthesizerForEvents addBookmarkReachedEventHandler: ^ (SPXSpeechSynthesizer *synthesizer, SPXSpeechSynthesisBookmarkEventArgs *eventArgs) { // The unit of AudioOffset is tick (1 tick = 100 nanoseconds), divide by 10,000 to converted to milliseconds. NSLog(@"Bookmark reached. Audio offset: %fms, bookmark text: %@.", eventArgs.audioOffset/10000., eventArgs.text); diff --git a/samples/objective-c/macos/speech-keyword-recognition/helloworld/Podfile b/samples/objective-c/macos/speech-keyword-recognition/helloworld/Podfile index 774b470c4..6f5257e09 100644 --- a/samples/objective-c/macos/speech-keyword-recognition/helloworld/Podfile +++ b/samples/objective-c/macos/speech-keyword-recognition/helloworld/Podfile @@ -1,4 +1,4 @@ target 'helloworld' do platform :osx, '10.13' - pod 'MicrosoftCognitiveServicesSpeech-macOS', '~> 1.38.0' + pod 'MicrosoftCognitiveServicesSpeech-macOS', '~> 1.40.0' end diff --git a/samples/python/console/README.md b/samples/python/console/README.md index f885c07d1..0877a2451 100644 --- a/samples/python/console/README.md +++ b/samples/python/console/README.md @@ -12,14 +12,6 @@ This sample demonstrates various forms of speech recognition, intent recognition sudo apt-get install libssl-dev libasound2 ``` -* On RHEL or CentOS, run the following commands for the installation of required packages: - ```sh - sudo yum update - sudo yum install alsa-lib openssl python3 - ``` - - * See also [how to configure RHEL/CentOS 7 for Speech SDK](https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-configure-rhel-centos-7). - * On Windows you also need the [Microsoft Visual C++ Redistributable for Visual Studio 2017](https://support.microsoft.com/help/2977003/the-latest-supported-visual-c-downloads) for your platform. diff --git a/samples/python/console/long-form-text-synthesis/long_form_text_synthesis.py b/samples/python/console/long-form-text-synthesis/long_form_text_synthesis.py index 8a195f6d5..b1a55b91e 100644 --- a/samples/python/console/long-form-text-synthesis/long_form_text_synthesis.py +++ b/samples/python/console/long-form-text-synthesis/long_form_text_synthesis.py @@ -28,7 +28,7 @@ class LongTextSynthesizer: def __init__(self, subscription: str, region: str, language: str = 'english', - voice: str = 'en-US-AvaMultilingualNeural', parallel_threads: int = 8) -> None: + voice: str = 'en-US-JennyNeural', parallel_threads: int = 8) -> None: self.is_ssml = None self.subscription = subscription self.region = region diff --git a/samples/python/console/speech_synthesis_sample.py b/samples/python/console/speech_synthesis_sample.py index e1826b1a1..3bc31afe3 100644 --- a/samples/python/console/speech_synthesis_sample.py +++ b/samples/python/console/speech_synthesis_sample.py @@ -87,12 +87,12 @@ def speech_synthesis_with_voice(): # Creates an instance of a speech config with specified subscription key and service region. speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region) # Sets the synthesis voice name. - # e.g. "en-US-AndrewMultilingualNeural". + # e.g. "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)". 
# The full list of supported voices can be found here: # https://aka.ms/csspeech/voicenames # And, you can try get_voices_async method to get all available voices. # See speech_synthesis_get_available_voices() sample below. - voice = "en-US-AndrewMultilingualNeural" + voice = "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)" speech_config.speech_synthesis_voice_name = voice # Creates a speech synthesizer for the specified voice, # using the default speaker as audio output. @@ -522,7 +522,7 @@ def speech_synthesis_bookmark_event(): # Bookmark tag is needed in the SSML, e.g. ssml = "" \ - "" \ + "" \ " one. " \ " two. three. four. " diff --git a/samples/python/tts-text-stream/README.md b/samples/python/tts-text-stream/README.md deleted file mode 100644 index 37345e362..000000000 --- a/samples/python/tts-text-stream/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# Examples to synthesis with input text stream - -The input text stream API is designed to generate audio from text that is being streamed or generated in chunks. A typical scenario is to speak text generated from GPT-like models. Compared to non-text stream APIs, the text stream API significantly reduces TTS latency. - -| | Non text stream | Text Stream | -| ---------- | -------- | ----------- | -| Input Type | Whole GPT response | Each GPT output chunk | -| Latency | High: Time of full GPT response + Time of TTS | Low: Time of few GPT chunks + Time of TTS | - -### Available samples: - -| Language | Directory | Description | -| ---------- | -------- | ----------- | -| Python | [python](text_stream_sample.py) | synthesis with text stream API, the text stream generated by AOAI GPT chat model | - -## API overview -### Create text stream request -To use the text stream API, you have to use the websocket V2 endpoint. -```wss://{region}.tts.speech.microsoft.com/cognitiveservices/websocket/v2``` - -### Set global properties -Since the input of text stream API is parital text. SSML, which is based on XML, is not supported. And thus properties that set in SSML should be set in a new way. - -For now we only support set voice name and output format. - -### Create input text stream -Please specify `speechsdk.SpeechSynthesisRequestInputType.TextStream` when creating the request. - -### Send text to stream -For each text that generated from GPT, call `request.input_stream.write(text)` to send text to the stream. - -### Close text stream -When GPT finished the output, call `request.input_stream.close()` to close the stream. 
- diff --git a/samples/python/tts-text-stream/requirements.txt b/samples/python/tts-text-stream/requirements.txt deleted file mode 100644 index 81f5b4471..000000000 --- a/samples/python/tts-text-stream/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -azure-cognitiveservices-speech -openai diff --git a/samples/python/tts-text-stream/text_stream_sample.py b/samples/python/tts-text-stream/text_stream_sample.py deleted file mode 100644 index 8a17152c9..000000000 --- a/samples/python/tts-text-stream/text_stream_sample.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -from openai import AzureOpenAI -import azure.cognitiveservices.speech as speechsdk - -# setup AzureOpenAI client -gpt_client = AzureOpenAI(azure_endpoint=os.getenv("AZURE_OPENAI_API_ENDPOINT"), api_key=os.getenv("AZURE_OPENAI_API_KEY"), api_version="2024-02-01") - -# setup speech synthesizer -# IMPORTANT: MUST use the websocket v2 endpoint -speech_config = speechsdk.SpeechConfig(endpoint=f"wss://{os.getenv('AZURE_TTS_REGION')}.tts.speech.microsoft.com/cognitiveservices/websocket/v2", - subscription=os.getenv("AZURE_TTS_API_KEY")) - -speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config) - -speech_synthesizer.synthesizing.connect(lambda evt: print("[audio]", end="")) - -# set a voice name -speech_config.speech_synthesis_voice_name = "en-US-AvaMultilingualNeural" - -# set timeout value to bigger ones to avoid sdk cancel the request when GPT latency too high -properties = dict() -properties["SpeechSynthesis_FrameTimeoutInterval"]="100000000" -properties["SpeechSynthesis_RtfTimeoutThreshold"]="10" -speech_config.set_properties_by_name(properties) - -# create request with TextStream input type -tts_request = speechsdk.SpeechSynthesisRequest(input_type = speechsdk.SpeechSynthesisRequestInputType.TextStream) -tts_task = speech_synthesizer.speak_async(tts_request) - -# Get GPT output stream -completion = gpt_client.chat.completions.create( - model="gpt-4-turbo", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "tell me a joke in 100 words"} - ], - stream=True -) - -for chunk in completion: - if len(chunk.choices) > 0: - chunk_text = chunk.choices[0].delta.content - if chunk_text: - print(chunk_text, end="") - tts_request.input_stream.write(chunk_text) -print("[GPT END]", end="") - -# close tts input stream when GPT finished -tts_request.input_stream.close() - -# wait all tts audio bytes return -result = tts_task.get() -print("[TTS END]", end="") \ No newline at end of file diff --git a/samples/python/web/avatar/README.md b/samples/python/web/avatar/README.md deleted file mode 100644 index 1bd97ef72..000000000 --- a/samples/python/web/avatar/README.md +++ /dev/null @@ -1,103 +0,0 @@ -# Instructions to run Microsoft Azure TTS Talking Avatar sample code - -## Pre-requisites - -* Follow [Text to speech quickstart](https://learn.microsoft.com/azure/ai-services/speech-service/get-started-text-to-speech?pivots=programming-language-python#set-up-the-environment) to set up the environment for running Speech SDK in python. - -## Basic Sample - -This sample demonstrates the basic usage of Azure text-to-speech avatar real-time API. - -* Step 1: Open a console and navigate to the folder containing this README.md document. - * Run `pip install -r requirements.txt` to install the required packages. - * Set below environment virables: - * `SPEECH_REGION` - the region of your Azure speech resource, e.g. westus2. - * `SPEECH_KEY` - the API key of your Azure speech resource. 
- * `SPEECH_PRIVATE_ENDPOINT` - the private endpoint of your Azure speech resource. e.g. https://my-speech-service.cognitiveservices.azure.com. This is optional, and only needed when you want to use private endpoint to access Azure speech service. This is optional, which is only needed when you are using custom endpoint. - * Set below environment virables if you want to use customized ICE server: - * `ICE_SERVER_URL` - the URL of your customized ICE server. - * `ICE_SERVER_URL_REMOTE` - the URL of your customized ICE server for remote side. This is only required when the ICE address for remote side is different from local side. - * `ICE_SERVER_USERNAME` - the username of your customized ICE server. - * `ICE_SERVER_PASSWORD` - the password of your customized ICE server. - * Run `python -m flask run -h 0.0.0.0 -p 5000` to start this sample. - -* Step 2: Open a browser and navigate to `http://localhost:5000/basic` to view the web UI of this sample. - -* Step 3: Fill or select below information: - * TTS Configuration - * TTS Voice - the voice of the TTS. Here is the [available TTS voices list](https://learn.microsoft.com/azure/ai-services/speech-service/language-support?tabs=tts#supported-languages) - * Custom Voice Deployment ID (Endpoint ID) - the deployment ID (also called endpoint ID) of your custom voice. If you are not using a custom voice, please leave it empty. - * Personal Voice Speaker Profile ID - the personal voice speaker profile ID of your personal voice. Please follow [here](https://learn.microsoft.com/azure/ai-services/speech-service/personal-voice-overview) to view and create personal voice. - * Avatar Configuration - * Avatar Character - The character of the avatar. By default it's `lisa`, and you can update this value to use a different avatar. - * Avatar Style - The style of the avatar. You can update this value to use a different avatar style. This parameter is optional for custom avatar. - * Background Color - The color of the avatar background. - * Background Image (URL) - The URL of the background image. If you want to have a background image for the avatar, please fill this field. You need first upload your image to a publicly accessbile place, with a public URL. e.g. https://samples-files.com/samples/Images/jpg/1920-1080-sample.jpg - * Custom Avatar - Check this if you are using a custom avatar. - * Transparent Background - Check this if you want to use transparent background for the avatar. When this is checked, the background color of the video stream from server side is automatically set to green(#00FF00FF), and the js code on client side (check the `makeBackgroundTransparent` function in main.js) will do the real-time matting by replacing the green color with transparent color. - * Video Crop - By checking this, you can crop the video stream from server side to a smaller size. This is useful when you want to put the avatar video into a customized rectangle area. - -* Step 4: Click `Start Session` button to setup video connection with Azure TTS Talking Avatar service. If everything goes well, you should see a live video with an avatar being shown on the web page. - -* Step 5: Type some text in the `Spoken Text` text box and click `Speak` button to send the text to Azure TTS Talking Avatar service. The service will synthesize the text to talking avatar video, and send the video stream back to the browser. The browser will play the video stream. 
You should see the avatar speaking the text you typed with mouth movement, and hear the voice, which is synchronized with the mouth movement. - -* Step 6: You can either continue to type text in the `Spoken Text` text box and let the avatar speak that text by clicking the `Speak` button, or click the `Stop Session` button to stop the video connection with the Azure TTS Talking Avatar service. If you click the `Stop Session` button, you can click the `Start Session` button to start a new video connection with the Azure TTS Talking Avatar service. - -## Chat Sample - -This sample demonstrates the chat scenario, with integration of Azure speech-to-text, Azure OpenAI, and the Azure text-to-speech avatar real-time API. - -* Step 1: Open a console and navigate to the folder containing this README.md document. - * Run `pip install -r requirements.txt` to install the required packages. - * Set the below environment variables: - * `SPEECH_REGION` - the region of your Azure speech resource, e.g. westus2. - * `SPEECH_KEY` - the API key of your Azure speech resource. - * `SPEECH_PRIVATE_ENDPOINT` - the private endpoint of your Azure speech resource. e.g. https://my-speech-service.cognitiveservices.azure.com. This is optional, and only needed when you want to use a private endpoint to access the Azure speech service. For more information about private endpoints, please refer to [Enable private endpoint](https://learn.microsoft.com/azure/ai-services/speech-service/speech-services-private-link). - * `SPEECH_RESOURCE_URL` - the URL of your Azure speech resource, e.g. /subscriptions/6e83d8b7-00dd-4b0a-9e98-dab9f060418b/resourceGroups/my-resource-group/providers/Microsoft.CognitiveServices/accounts/my-speech-resource. To fetch the speech resource URL, go to your speech resource overview page on the Azure portal, click the `JSON View` link, and then copy the `Resource ID` value on the page that pops up. This is optional, and only needed when you want to use a private endpoint to access the Azure speech service. - * `USER_ASSIGNED_MANAGED_IDENTITY_CLIENT_ID` - the client ID of your user-assigned managed identity. This is optional, and only needed when you want to use a private endpoint with a user-assigned managed identity to access the Azure speech service. For more information about user-assigned managed identities, please refer to [Use a user-assigned managed identity](https://learn.microsoft.com/azure/active-directory/managed-identities-azure-resources/how-to-use-vm-token?tabs=azure-cli). - * `AZURE_OPENAI_ENDPOINT` - the endpoint of your Azure OpenAI resource, e.g. https://my-aoai.openai.azure.com/, which can be found in the `Keys and Endpoint` section of your Azure OpenAI resource in the Azure portal. - * `AZURE_OPENAI_API_KEY` - the API key of your Azure OpenAI resource, which can be found in the `Keys and Endpoint` section of your Azure OpenAI resource in the Azure portal. - * `AZURE_OPENAI_DEPLOYMENT_NAME` - the name of your Azure OpenAI model deployment, which can be found in the `Model deployments` section of your Azure OpenAI resource in the Azure portal. - * Set the below environment variables if you want to use your own data to constrain the chat: - * `COGNITIVE_SEARCH_ENDPOINT` - the endpoint of your Azure Cognitive Search resource, e.g. https://my-cognitive-search.search.windows.net/, which can be found in the `Overview` section of your Azure Cognitive Search resource in the Azure portal, appearing at the `Essentials -> Url` field.
- * `COGNITIVE_SEARCH_API_KEY` - the API key of your Azure Cognitive Search resource, which can be found in the `Keys` section of your Azure Cognitive Search resource in the Azure portal. Please make sure to use the `Admin Key` instead of the `Query Key`. - * `COGNITIVE_SEARCH_INDEX_NAME` - the name of your Azure Cognitive Search index, which can be found in the `Indexes` section of your Azure Cognitive Search resource in the Azure portal. - * Set the below environment variables if you want to use a customized ICE server: - * `ICE_SERVER_URL` - the URL of your customized ICE server. - * `ICE_SERVER_URL_REMOTE` - the URL of your customized ICE server for the remote side. This is only required when the ICE address for the remote side is different from the local side. - * `ICE_SERVER_USERNAME` - the username of your customized ICE server. - * `ICE_SERVER_PASSWORD` - the password of your customized ICE server. - * Run `python -m flask run -h 0.0.0.0 -p 5000` to start this sample. - -* Step 2: Open a browser and navigate to `http://localhost:5000/chat` to view the web UI of this sample. - -* Step 3: Fill or select the below information: - * Chat Configuration - * Azure OpenAI Deployment Name - the name of your Azure OpenAI model deployment, which can be found in the `Model deployments` section of your Azure OpenAI resource in the Azure portal. - * System Prompt - you can edit this text to preset the context for the chat API. The chat API will then generate the response based on this context. - * Enable On Your Data - check this if you want to use your own data to constrain the chat. If you check this, you need to fill in the `Azure Cognitive Search Index Name` field below. - * Azure Cognitive Search Index Name - the name of your Azure Cognitive Search index, which can be found in the `Indexes` section of your Azure Cognitive Search resource in the Azure portal. - * Speech Configuration - * STT Locale(s) - the locale(s) of the STT. Here is the [available STT languages list](https://learn.microsoft.com/azure/ai-services/speech-service/language-support?tabs=stt#supported-languages). If multiple locales are specified, the STT will enable multi-language recognition, which means the STT will recognize speech in any of the specified locales. - * TTS Voice - the voice of the TTS. Here is the [available TTS voices list](https://learn.microsoft.com/azure/ai-services/speech-service/language-support?tabs=tts#supported-languages) - * Custom Voice Deployment ID (Endpoint ID) - the deployment ID (also called endpoint ID) of your custom voice. If you are not using a custom voice, please leave it empty. - * Personal Voice Speaker Profile ID - the personal voice speaker profile ID of your personal voice. Please follow [here](https://learn.microsoft.com/azure/ai-services/speech-service/personal-voice-overview) to view and create a personal voice. - * Continuous Conversation - check this if you want to enable continuous conversation. If this is checked, the STT will keep listening to your speech, with the microphone always on, until you click the `Stop Microphone` button. If this is not checked, the microphone will automatically stop once an utterance is recognized, and you need to click `Start Microphone` every time before you speak. The `Continuous Conversation` mode is suitable for a quiet environment, while the `Non-Continuous Conversation` mode is suitable for a noisy environment, as it avoids recording noise while you are not speaking. - * Avatar Configuration - * Avatar Character - The character of the avatar.
By default it's `lisa`, and you can update this value to use a different avatar. - * Avatar Style - The style of the avatar. You can update this value to use a different avatar style. This parameter is optional for custom avatar. - * Custom Avatar - Check this if you are using a custom avatar. - * Auto Reconnect - Check this if you want to enable auto reconnect. If this is checked, the avatar video stream is automatically reconnected once the connection is lost. - * Use Local Video for Idle - Check this if you want to use local video for idle part. If this is checked, the avatar video stream is replaced by local video when the avatar is idle. To use this feature, you need to prepare a local video file. Usually, you can record a video of the avatar doing idle action. [Here](https://ttspublic.blob.core.windows.net/sampledata/video/avatar/lisa-casual-sitting-idle.mp4) is a sample video for lisa-casual-sitting avatar idle status. You can download it and put it to `video/lisa-casual-sitting-idle.mp4` under the same folder of `chat.html`. - -* Step 4: Click `Open Avatar Session` button to setup video connection with Azure TTS Talking Avatar service. If everything goes well, you should see a live video with an avatar being shown on the web page. - -* Step 5: Click `Start Microphone` button to start microphone (make sure to allow the microphone access tip box popping up in the browser), and then you can start chatting with the avatar with speech. The chat history (the text of what you said, and the response text by the Azure OpenAI chat API) will be shown beside the avatar. The avatar will then speak out the response of the chat API. - -# Additional Tip(s) - -* If you want to enforce the avatar to stop speaking before the avatar finishes the utterance, you can click `Stop Speaking` button. This is useful when you want to interrupt the avatar speaking. - -* If you want to clear the chat history and start a new round of chat, you can click `Clear Chat History` button. And if you want to stop the avatar service, please click `Close Avatar Session` button to close the connection with avatar service. - -* If you want to type your query message instead of speaking, you can check the `Type Message` checkbox, and then type your query message in the text box showing up below the checkbox. diff --git a/samples/python/web/avatar/app.py b/samples/python/web/avatar/app.py deleted file mode 100644 index 6cae32cb3..000000000 --- a/samples/python/web/avatar/app.py +++ /dev/null @@ -1,557 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. -# Licensed under the MIT license. - -import azure.cognitiveservices.speech as speechsdk -import datetime -import html -import json -import os -import pytz -import random -import re -import requests -import threading -import time -import traceback -import uuid -from flask import Flask, Response, render_template, request -from azure.identity import DefaultAzureCredential - -# Create the Flask app -app = Flask(__name__, template_folder='.') - -# Environment variables -# Speech resource (required) -speech_region = os.environ.get('SPEECH_REGION') # e.g. westus2 -speech_key = os.environ.get('SPEECH_KEY') -speech_private_endpoint = os.environ.get('SPEECH_PRIVATE_ENDPOINT') # e.g. https://my-speech-service.cognitiveservices.azure.com/ (optional) -speech_resource_url = os.environ.get('SPEECH_RESOURCE_URL') # e.g. 
/subscriptions/6e83d8b7-00dd-4b0a-9e98-dab9f060418b/resourceGroups/my-rg/providers/Microsoft.CognitiveServices/accounts/my-speech (optional, only used for private endpoint) -user_assigned_managed_identity_client_id = os.environ.get('USER_ASSIGNED_MANAGED_IDENTITY_CLIENT_ID') # e.g. the client id of user assigned managed identity accociated to your app service (optional, only used for private endpoint and user assigned managed identity) -# OpenAI resource (required for chat scenario) -azure_openai_endpoint = os.environ.get('AZURE_OPENAI_ENDPOINT') # e.g. https://my-aoai.openai.azure.com/ -azure_openai_api_key = os.environ.get('AZURE_OPENAI_API_KEY') -azure_openai_deployment_name = os.environ.get('AZURE_OPENAI_DEPLOYMENT_NAME') # e.g. my-gpt-35-turbo-deployment -# Cognitive search resource (optional, only required for 'on your data' scenario) -cognitive_search_endpoint = os.environ.get('COGNITIVE_SEARCH_ENDPOINT') # e.g. https://my-cognitive-search.search.windows.net/ -cognitive_search_api_key = os.environ.get('COGNITIVE_SEARCH_API_KEY') -cognitive_search_index_name = os.environ.get('COGNITIVE_SEARCH_INDEX_NAME') # e.g. my-search-index -# Customized ICE server (optional, only required for customized ICE server) -ice_server_url = os.environ.get('ICE_SERVER_URL') # The ICE URL, e.g. turn:x.x.x.x:3478 -ice_server_url_remote = os.environ.get('ICE_SERVER_URL_REMOTE') # The ICE URL for remote side, e.g. turn:x.x.x.x:3478. This is only required when the ICE address for remote side is different from local side. -ice_server_username = os.environ.get('ICE_SERVER_USERNAME') # The ICE username -ice_server_password = os.environ.get('ICE_SERVER_PASSWORD') # The ICE password - -# Const variables -default_tts_voice = 'en-US-JennyMultilingualV2Neural' # Default TTS voice -sentence_level_punctuations = [ '.', '?', '!', ':', ';', '。', '?', '!', ':', ';' ] # Punctuations that indicate the end of a sentence -enable_quick_reply = False # Enable quick reply for certain chat models which take longer time to respond -quick_replies = [ 'Let me take a look.', 'Let me check.', 'One moment, please.' 
] # Quick reply reponses -oyd_doc_regex = re.compile(r'\[doc(\d+)\]') # Regex to match the OYD (on-your-data) document reference - -# Global variables -client_contexts = {} # Client contexts -speech_token = None # Speech token -ice_token = None # ICE token - -# The default route, which shows the default web page (basic.html) -@app.route("/") -def index(): - return render_template("basic.html", methods=["GET"], client_id=initializeClient()) - -# The basic route, which shows the basic web page -@app.route("/basic") -def basicView(): - return render_template("basic.html", methods=["GET"], client_id=initializeClient()) - -# The chat route, which shows the chat web page -@app.route("/chat") -def chatView(): - return render_template("chat.html", methods=["GET"], client_id=initializeClient()) - -# The API route to get the speech token -@app.route("/api/getSpeechToken", methods=["GET"]) -def getSpeechToken() -> Response: - global speech_token - response = Response(speech_token, status=200) - response.headers['SpeechRegion'] = speech_region - if speech_private_endpoint: - response.headers['SpeechPrivateEndpoint'] = speech_private_endpoint - return response - -# The API route to get the ICE token -@app.route("/api/getIceToken", methods=["GET"]) -def getIceToken() -> Response: - # Apply customized ICE server if provided - if ice_server_url and ice_server_username and ice_server_password: - custom_ice_token = json.dumps({ - 'Urls': [ ice_server_url ], - 'Username': ice_server_username, - 'Password': ice_server_password - }) - return Response(custom_ice_token, status=200) - return Response(ice_token, status=200) - -# The API route to connect the TTS avatar -@app.route("/api/connectAvatar", methods=["POST"]) -def connectAvatar() -> Response: - global client_contexts - client_id = uuid.UUID(request.headers.get('ClientId')) - client_context = client_contexts[client_id] - - # Override default values with client provided values - client_context['azure_openai_deployment_name'] = request.headers.get('AoaiDeploymentName') if request.headers.get('AoaiDeploymentName') else azure_openai_deployment_name - client_context['cognitive_search_index_name'] = request.headers.get('CognitiveSearchIndexName') if request.headers.get('CognitiveSearchIndexName') else cognitive_search_index_name - client_context['tts_voice'] = request.headers.get('TtsVoice') if request.headers.get('TtsVoice') else default_tts_voice - client_context['custom_voice_endpoint_id'] = request.headers.get('CustomVoiceEndpointId') - client_context['personal_voice_speaker_profile_id'] = request.headers.get('PersonalVoiceSpeakerProfileId') - - custom_voice_endpoint_id = client_context['custom_voice_endpoint_id'] - - try: - if speech_private_endpoint: - speech_private_endpoint_wss = speech_private_endpoint.replace('https://', 'wss://') - speech_config = speechsdk.SpeechConfig(subscription=speech_key, endpoint=f'{speech_private_endpoint_wss}/tts/cognitiveservices/websocket/v1?enableTalkingAvatar=true') - else: - speech_config = speechsdk.SpeechConfig(subscription=speech_key, endpoint=f'wss://{speech_region}.tts.speech.microsoft.com/cognitiveservices/websocket/v1?enableTalkingAvatar=true') - - if custom_voice_endpoint_id: - speech_config.endpoint_id = custom_voice_endpoint_id - - client_context['speech_synthesizer'] = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None) - speech_synthesizer = client_context['speech_synthesizer'] - - ice_token_obj = json.loads(ice_token) - # Apply customized ICE server if provided - if ice_server_url 
and ice_server_username and ice_server_password: - ice_token_obj = { - 'Urls': [ ice_server_url_remote ] if ice_server_url_remote else [ ice_server_url ], - 'Username': ice_server_username, - 'Password': ice_server_password - } - local_sdp = request.data.decode('utf-8') - avatar_character = request.headers.get('AvatarCharacter') - avatar_style = request.headers.get('AvatarStyle') - background_color = '#FFFFFFFF' if request.headers.get('BackgroundColor') is None else request.headers.get('BackgroundColor') - background_image_url = request.headers.get('BackgroundImageUrl') - is_custom_avatar = request.headers.get('IsCustomAvatar') - transparent_background = 'false' if request.headers.get('TransparentBackground') is None else request.headers.get('TransparentBackground') - video_crop = 'false' if request.headers.get('VideoCrop') is None else request.headers.get('VideoCrop') - avatar_config = { - 'synthesis': { - 'video': { - 'protocol': { - 'name': "WebRTC", - 'webrtcConfig': { - 'clientDescription': local_sdp, - 'iceServers': [{ - 'urls': [ ice_token_obj['Urls'][0] ], - 'username': ice_token_obj['Username'], - 'credential': ice_token_obj['Password'] - }] - }, - }, - 'format':{ - 'crop':{ - 'topLeft':{ - 'x': 600 if video_crop.lower() == 'true' else 0, - 'y': 0 - }, - 'bottomRight':{ - 'x': 1320 if video_crop.lower() == 'true' else 1920, - 'y': 1080 - } - }, - 'bitrate': 1000000 - }, - 'talkingAvatar': { - 'customized': is_custom_avatar.lower() == 'true', - 'character': avatar_character, - 'style': avatar_style, - 'background': { - 'color': '#00FF00FF' if transparent_background.lower() == 'true' else background_color, - 'image': { - 'url': background_image_url - } - } - } - } - } - } - - connection = speechsdk.Connection.from_speech_synthesizer(speech_synthesizer) - connection.set_message_property('speech.config', 'context', json.dumps(avatar_config)) - - speech_sythesis_result = speech_synthesizer.speak_text_async('').get() - print(f'Result id for avatar connection: {speech_sythesis_result.result_id}') - if speech_sythesis_result.reason == speechsdk.ResultReason.Canceled: - cancellation_details = speech_sythesis_result.cancellation_details - print(f"Speech synthesis canceled: {cancellation_details.reason}") - if cancellation_details.reason == speechsdk.CancellationReason.Error: - print(f"Error details: {cancellation_details.error_details}") - raise Exception(cancellation_details.error_details) - turn_start_message = speech_synthesizer.properties.get_property_by_name('SpeechSDKInternal-ExtraTurnStartMessage') - remoteSdp = json.loads(turn_start_message)['webrtc']['connectionString'] - - return Response(remoteSdp, status=200) - - except Exception as e: - return Response(f"Result ID: {speech_sythesis_result.result_id}. Error message: {e}", status=400) - -# The API route to speak a given SSML -@app.route("/api/speak", methods=["POST"]) -def speak() -> Response: - client_id = uuid.UUID(request.headers.get('ClientId')) - try: - ssml = request.data.decode('utf-8') - result_id = speakSsml(ssml, client_id) - return Response(result_id, status=200) - except Exception as e: - return Response(f"Speak failed. 
Error message: {e}", status=400) - -# The API route to get the speaking status -@app.route("/api/getSpeakingStatus", methods=["GET"]) -def getSpeakingStatus() -> Response: - global client_contexts - client_id = uuid.UUID(request.headers.get('ClientId')) - is_speaking = client_contexts[client_id]['is_speaking'] - last_speak_time = client_contexts[client_id]['last_speak_time'] - speaking_status = { - 'isSpeaking': is_speaking, - 'lastSpeakTime': last_speak_time.isoformat() if last_speak_time else None - } - return Response(json.dumps(speaking_status), status=200) - -# The API route to stop avatar from speaking -@app.route("/api/stopSpeaking", methods=["POST"]) -def stopSpeaking() -> Response: - stopSpeakingInternal(uuid.UUID(request.headers.get('ClientId'))) - return Response('Speaking stopped.', status=200) - -# The API route for chat -# It receives the user query and return the chat response. -# It returns response in stream, which yields the chat response in chunks. -@app.route("/api/chat", methods=["POST"]) -def chat() -> Response: - global client_contexts - client_id = uuid.UUID(request.headers.get('ClientId')) - client_context = client_contexts[client_id] - chat_initiated = client_context['chat_initiated'] - if not chat_initiated: - initializeChatContext(request.headers.get('SystemPrompt'), client_id) - client_context['chat_initiated'] = True - user_query = request.data.decode('utf-8') - return Response(handleUserQuery(user_query, client_id), mimetype='text/plain', status=200) - -# The API route to clear the chat history -@app.route("/api/chat/clearHistory", methods=["POST"]) -def clearChatHistory() -> Response: - global client_contexts - client_id = uuid.UUID(request.headers.get('ClientId')) - client_context = client_contexts[client_id] - initializeChatContext(request.headers.get('SystemPrompt'), client_id) - client_context['chat_initiated'] = True - return Response('Chat history cleared.', status=200) - -# The API route to disconnect the TTS avatar -@app.route("/api/disconnectAvatar", methods=["POST"]) -def disconnectAvatar() -> Response: - global client_contexts - client_id = uuid.UUID(request.headers.get('ClientId')) - client_context = client_contexts[client_id] - speech_synthesizer = client_context['speech_synthesizer'] - try: - connection = speechsdk.Connection.from_speech_synthesizer(speech_synthesizer) - connection.close() - return Response('Disconnected avatar', status=200) - except: - return Response(traceback.format_exc(), status=400) - -# Initialize the client by creating a client id and an initial context -def initializeClient() -> uuid.UUID: - client_id = uuid.uuid4() - client_contexts[client_id] = { - 'azure_openai_deployment_name': azure_openai_deployment_name, # Azure OpenAI deployment name - 'cognitive_search_index_name': cognitive_search_index_name, # Cognitive search index name - 'tts_voice': default_tts_voice, # TTS voice - 'custom_voice_endpoint_id': None, # Endpoint ID (deployment ID) for custom voice - 'personal_voice_speaker_profile_id': None, # Speaker profile ID for personal voice - 'speech_synthesizer': None, # Speech synthesizer for avatar - 'speech_token': None, # Speech token for client side authentication with speech service - 'ice_token': None, # ICE token for ICE/TURN/Relay server connection - 'chat_initiated': False, # Flag to indicate if the chat context is initiated - 'messages': [], # Chat messages (history) - 'data_sources': [], # Data sources for 'on your data' scenario - 'is_speaking': False, # Flag to indicate if the avatar is speaking - 
'spoken_text_queue': [], # Queue to store the spoken text - 'speaking_thread': None, # The thread to speak the spoken text queue - 'last_speak_time': None # The last time the avatar spoke - } - return client_id - -# Refresh the ICE token which being called -def refreshIceToken() -> None: - global ice_token - if speech_private_endpoint: - ice_token = requests.get(f'{speech_private_endpoint}/tts/cognitiveservices/avatar/relay/token/v1', headers={'Ocp-Apim-Subscription-Key': speech_key}).text - else: - ice_token = requests.get(f'https://{speech_region}.tts.speech.microsoft.com/cognitiveservices/avatar/relay/token/v1', headers={'Ocp-Apim-Subscription-Key': speech_key}).text - -# Refresh the speech token every 9 minutes -def refreshSpeechToken() -> None: - global speech_token - while True: - # Refresh the speech token every 9 minutes - if speech_private_endpoint: - credential = DefaultAzureCredential(managed_identity_client_id=user_assigned_managed_identity_client_id) - token = credential.get_token('https://cognitiveservices.azure.com/.default') - speech_token = f'aad#{speech_resource_url}#{token.token}' - else: - speech_token = requests.post(f'https://{speech_region}.api.cognitive.microsoft.com/sts/v1.0/issueToken', headers={'Ocp-Apim-Subscription-Key': speech_key}).text - time.sleep(60 * 9) - -# Initialize the chat context, e.g. chat history (messages), data sources, etc. For chat scenario. -def initializeChatContext(system_prompt: str, client_id: uuid.UUID) -> None: - global client_contexts - client_context = client_contexts[client_id] - cognitive_search_index_name = client_context['cognitive_search_index_name'] - messages = client_context['messages'] - data_sources = client_context['data_sources'] - - # Initialize data sources for 'on your data' scenario - data_sources.clear() - if cognitive_search_endpoint and cognitive_search_api_key and cognitive_search_index_name: - # On-your-data scenario - data_source = { - 'type': 'AzureCognitiveSearch', - 'parameters': { - 'endpoint': cognitive_search_endpoint, - 'key': cognitive_search_api_key, - 'indexName': cognitive_search_index_name, - 'semanticConfiguration': '', - 'queryType': 'simple', - 'fieldsMapping': { - 'contentFieldsSeparator': '\n', - 'contentFields': ['content'], - 'filepathField': None, - 'titleField': 'title', - 'urlField': None - }, - 'inScope': True, - 'roleInformation': system_prompt - } - } - data_sources.append(data_source) - - # Initialize messages - messages.clear() - if len(data_sources) == 0: - system_message = { - 'role': 'system', - 'content': system_prompt - } - messages.append(system_message) - -# Handle the user query and return the assistant reply. For chat scenario. -# The function is a generator, which yields the assistant reply in chunks. -def handleUserQuery(user_query: str, client_id: uuid.UUID): - global client_contexts - client_context = client_contexts[client_id] - azure_openai_deployment_name = client_context['azure_openai_deployment_name'] - messages = client_context['messages'] - data_sources = client_context['data_sources'] - is_speaking = client_context['is_speaking'] - - chat_message = { - 'role': 'user', - 'content': user_query - } - - messages.append(chat_message) - - # Stop previous speaking if there is any - if is_speaking: - stopSpeakingInternal(client_id) - - # For 'on your data' scenario, chat API currently has long (4s+) latency - # We return some quick reply here before the chat API returns to mitigate. 
- if len(data_sources) > 0 and enable_quick_reply: - speakWithQueue(random.choice(quick_replies), 2000) - - url = f"{azure_openai_endpoint}/openai/deployments/{azure_openai_deployment_name}/chat/completions?api-version=2023-06-01-preview" - body = json.dumps({ - 'messages': messages, - 'stream': True - }) - - if len(data_sources) > 0: - url = f"{azure_openai_endpoint}/openai/deployments/{azure_openai_deployment_name}/extensions/chat/completions?api-version=2023-06-01-preview" - body = json.dumps({ - 'dataSources': data_sources, - 'messages': messages, - 'stream': True - }) - - assistant_reply = '' - tool_content = '' - spoken_sentence = '' - - response = requests.post(url, stream=True, headers={ - 'api-key': azure_openai_api_key, - 'Content-Type': 'application/json' - }, data=body) - - if not response.ok: - raise Exception(f"Chat API response status: {response.status_code} {response.reason}") - - # Iterate chunks from the response stream - iterator = response.iter_content(chunk_size=None) - for chunk in iterator: - if not chunk: - # End of stream - return - - # Process the chunk of data (value) - chunk_string = chunk.decode() - - if not chunk_string.endswith('}\n\n') and not chunk_string.endswith('[DONE]\n\n'): - # This is an incomplete chunk, read the next chunk - while not chunk_string.endswith('}\n\n') and not chunk_string.endswith('[DONE]\n\n'): - chunk_string += next(iterator).decode() - - for line in chunk_string.split('\n\n'): - try: - if line.startswith('data:') and not line.endswith('[DONE]'): - response_json = json.loads(line[5:].strip()) - response_token = None - if len(response_json['choices']) > 0: - choice = response_json['choices'][0] - if len(data_sources) == 0: - if len(choice['delta']) > 0 and 'content' in choice['delta']: - response_token = choice['delta']['content'] - elif len(choice['messages']) > 0 and 'delta' in choice['messages'][0]: - delta = choice['messages'][0]['delta'] - if 'role' in delta and delta['role'] == 'tool' and 'content' in delta: - tool_content = response_json['choices'][0]['messages'][0]['delta']['content'] - elif 'content' in delta: - response_token = response_json['choices'][0]['messages'][0]['delta']['content'] - if response_token is not None: - if oyd_doc_regex.search(response_token): - response_token = oyd_doc_regex.sub('', response_token).strip() - if response_token == '[DONE]': - response_token = None - - if response_token is not None: - # Log response_token here if need debug - yield response_token # yield response token to client as display text - assistant_reply += response_token # build up the assistant message - if response_token == '\n' or response_token == '\n\n': - speakWithQueue(spoken_sentence.strip(), 0, client_id) - spoken_sentence = '' - else: - response_token = response_token.replace('\n', '') - spoken_sentence += response_token # build up the spoken sentence - if len(response_token) == 1 or len(response_token) == 2: - for punctuation in sentence_level_punctuations: - if response_token.startswith(punctuation): - speakWithQueue(spoken_sentence.strip(), 0, client_id) - spoken_sentence = '' - break - except Exception as e: - print(f"Error occurred while parsing the response: {e}") - print(line) - - if spoken_sentence != '': - speakWithQueue(spoken_sentence.strip(), 0, client_id) - spoken_sentence = '' - - if len(data_sources) > 0: - tool_message = { - 'role': 'tool', - 'content': tool_content - } - messages.append(tool_message) - - assistant_message = { - 'role': 'assistant', - 'content': assistant_reply - } - 
messages.append(assistant_message) - -# Speak the given text. If there is already a speaking in progress, add the text to the queue. For chat scenario. -def speakWithQueue(text: str, ending_silence_ms: int, client_id: uuid.UUID) -> None: - global client_contexts - client_context = client_contexts[client_id] - spoken_text_queue = client_context['spoken_text_queue'] - is_speaking = client_context['is_speaking'] - spoken_text_queue.append(text) - if not is_speaking: - def speakThread(): - nonlocal client_context - nonlocal spoken_text_queue - nonlocal ending_silence_ms - tts_voice = client_context['tts_voice'] - personal_voice_speaker_profile_id = client_context['personal_voice_speaker_profile_id'] - client_context['is_speaking'] = True - while len(spoken_text_queue) > 0: - text = spoken_text_queue.pop(0) - speakText(text, tts_voice, personal_voice_speaker_profile_id, ending_silence_ms, client_id) - client_context['last_speak_time'] = datetime.datetime.now(pytz.UTC) - client_context['is_speaking'] = False - client_context['speaking_thread'] = threading.Thread(target=speakThread) - client_context['speaking_thread'].start() - -# Speak the given text. -def speakText(text: str, voice: str, speaker_profile_id: str, ending_silence_ms: int, client_id: uuid.UUID) -> str: - ssml = f""" - - - - {html.escape(text)} - - - """ - if ending_silence_ms > 0: - ssml = f""" - - - - {html.escape(text)} - - - - """ - return speakSsml(ssml, client_id) - -# Speak the given ssml with speech sdk -def speakSsml(ssml: str, client_id: uuid.UUID) -> str: - global client_contexts - speech_synthesizer = client_contexts[client_id]['speech_synthesizer'] - speech_sythesis_result = speech_synthesizer.speak_ssml_async(ssml).get() - if speech_sythesis_result.reason == speechsdk.ResultReason.Canceled: - cancellation_details = speech_sythesis_result.cancellation_details - print(f"Speech synthesis canceled: {cancellation_details.reason}") - if cancellation_details.reason == speechsdk.CancellationReason.Error: - print(f"Result ID: {speech_sythesis_result.result_id}. Error details: {cancellation_details.error_details}") - raise Exception(cancellation_details.error_details) - return speech_sythesis_result.result_id - -# Stop speaking internal function -def stopSpeakingInternal(client_id: uuid.UUID) -> None: - global client_contexts - client_context = client_contexts[client_id] - speech_synthesizer = client_context['speech_synthesizer'] - spoken_text_queue = client_context['spoken_text_queue'] - spoken_text_queue.clear() - try: - connection = speechsdk.Connection.from_speech_synthesizer(speech_synthesizer) - connection.send_message_async('synthesis.control', '{"action":"stop"}').get() - except: - print("Sending message through connection object is not yet supported by current Speech SDK.") - -# Start the speech token refresh thread -speechTokenRefereshThread = threading.Thread(target=refreshSpeechToken) -speechTokenRefereshThread.daemon = True -speechTokenRefereshThread.start() - -# Fetch ICE token at startup -refreshIceToken() diff --git a/samples/python/web/avatar/basic.html b/samples/python/web/avatar/basic.html deleted file mode 100644 index ab033b48a..000000000 --- a/samples/python/web/avatar/basic.html +++ /dev/null @@ -1,69 +0,0 @@ - - - - - Talking Avatar Service Demo - - - - - - -

Talking Avatar Service Demo: TTS Configuration, Avatar Configuration (Custom Avatar, Transparent Background, Video Crop), Avatar Control Panel, Avatar Video, Logs
diff --git a/samples/python/web/avatar/chat.html b/samples/python/web/avatar/chat.html deleted file mode 100644 index 8538299ad..000000000 --- a/samples/python/web/avatar/chat.html +++ /dev/null @@ -1,92 +0,0 @@

Talking Avatar Chat Demo: Chat Configuration (Enable On Your Data), Speech Configuration (Continuous Conversation), Avatar Configuration (Custom Avatar, Auto Reconnect, Use Local Video for Idle), Type Message
- - diff --git a/samples/python/web/avatar/requirements.txt b/samples/python/web/avatar/requirements.txt deleted file mode 100644 index 12b252995..000000000 --- a/samples/python/web/avatar/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -azure-cognitiveservices-speech -azure-identity -flask -pytz -requests diff --git a/samples/python/web/avatar/static/css/styles.css b/samples/python/web/avatar/static/css/styles.css deleted file mode 100644 index e030ae4d9..000000000 --- a/samples/python/web/avatar/static/css/styles.css +++ /dev/null @@ -1,350 +0,0 @@ -/* - * Copyright (c) 2015 The WebRTC project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. - */ -.hidden { - display: none; -} - -.highlight { - background-color: #eee; - font-size: 1.2em; - margin: 0 0 30px 0; - padding: 0.2em 1.5em; -} - -.warning { - color: red; - font-weight: 400; -} - -@media screen and (min-width: 1000px) { - /* hack! to detect non-touch devices */ - div#links a { - line-height: 0.8em; - } -} - -audio { - max-width: 100%; -} - -body { - font-family: 'Roboto', sans-serif; - font-weight: 300; - margin: 0; - padding: 1em; - word-break: break-word; -} - -button { - background-color: #d84a38; - border: none; - border-radius: 2px; - color: white; - font-family: 'Roboto', sans-serif; - font-size: 0.8em; - margin: 10px 0 1em 0; - padding: 0.5em 0.7em 0.6em 0.7em; -} - -button:active { - background-color: #cf402f; -} - -button:hover { - background-color: #cf402f; -} - -button[disabled] { - color: #ccc; -} - -button[disabled]:hover { - background-color: #d84a38; -} - -canvas { - background-color: #ccc; - max-width: 100%; - width: 100%; -} - -code { - font-family: 'Roboto', sans-serif; - font-weight: 400; -} - -div#container { - margin: 0 auto 0 auto; - max-width: 60em; - padding: 1em 1.5em 1.3em 1.5em; -} - -div#links { - padding: 0.5em 0 0 0; -} - -h1 { - border-bottom: 1px solid #ccc; - font-family: 'Roboto', sans-serif; - font-weight: 500; - margin: 0 0 0.8em 0; - padding: 0 0 0.2em 0; -} - -h2 { - color: #444; - font-weight: 500; -} - -h3 { - border-top: 1px solid #eee; - color: #666; - font-weight: 500; - margin: 10px 0 10px 0; - white-space: nowrap; -} - -li { - margin: 0 0 0.4em 0; -} - -html { - /* avoid annoying page width change - when moving from the home page */ - overflow-y: scroll; -} - -img { - border: none; - max-width: 100%; -} - -input[type=radio] { - position: relative; - top: -1px; -} - -p { - color: #444; - font-weight: 300; -} - -p#data { - border-top: 1px dotted #666; - font-family: Courier New, monospace; - line-height: 1.3em; - max-height: 1000px; - overflow-y: auto; - padding: 1em 0 0 0; -} - -p.borderBelow { - border-bottom: 1px solid #aaa; - padding: 0 0 20px 0; -} - -section p:last-of-type { - margin: 0; -} - -section { - border-bottom: 1px solid #eee; - margin: 0 0 30px 0; - padding: 0 0 20px 0; -} - -section:last-of-type { - border-bottom: none; - padding: 0 0 1em 0; -} - -select { - margin: 0 1em 1em 0; - position: relative; - top: -1px; -} - -h1 span { - white-space: nowrap; -} - -a { - color: #1D6EEE; - font-weight: 300; - text-decoration: none; -} - -h1 a { - font-weight: 300; - margin: 0 10px 0 0; - white-space: nowrap; -} - -a:hover { - color: #3d85c6; - text-decoration: underline; -} - -a#viewSource { - display: block; - margin: 1.3em 0 0 0; - border-top: 1px solid #999; - padding: 1em 0 0 0; -} - -div#errorMsg p { - color: #F00; -} - -div#links a { - display: block; 
- line-height: 1.3em; - margin: 0 0 1.5em 0; -} - -div.outputSelector { - margin: -1.3em 0 2em 0; -} - -p.description { - margin: 0 0 0.5em 0; -} - -strong { - font-weight: 500; -} - -textarea { - font-family: 'Roboto', sans-serif; -} - -video { - background: #222; - margin: 0 0 20px 0; - --width: 100%; - width: var(--width); - height: calc(var(--width) * 0.75); -} - -ul { - margin: 0 0 0.5em 0; -} - -@media screen and (max-width: 650px) { - .highlight { - font-size: 1em; - margin: 0 0 20px 0; - padding: 0.2em 1em; - } - - h1 { - font-size: 24px; - } -} - -@media screen and (max-width: 550px) { - button:active { - background-color: darkRed; - } - - h1 { - font-size: 22px; - } -} - -@media screen and (max-width: 450px) { - h1 { - font-size: 20px; - } -} - -textarea { - width: 800px; - min-height: 60px; - margin-bottom: 10px; -} - -#webrtc textarea { - width: 800px; - min-height: 100px; - resize: none; -} - -#knownIssues { - color: red; -} - -/* The switch - the box around the slider */ -.switch { - position: relative; - display: inline-block; - width: 48px; - height: 26px; -} - -/* Hide default HTML checkbox */ -.switch input { - opacity: 0; - width: 0; - height: 0; -} - -/* The slider */ -.slider { - position: absolute; - cursor: pointer; - top: 0; - left: 0; - right: 0; - bottom: 0; - background-color: #ccc; - -webkit-transition: .4s; - transition: .4s; -} - -.slider:before { - position: absolute; - content: ""; - height: 20px; - width: 20px; - left: 4px; - bottom: 3px; - background-color: white; - -webkit-transition: .4s; - transition: .4s; -} - -input:checked + .slider { - background-color: #d84a38; -} - -input:focus + .slider { - box-shadow: 0 0 1px #d84a38; -} - -input:checked + .slider:before { - -webkit-transform: translateX(20px); - -ms-transform: translateX(20px); - transform: translateX(20px); -} - -/* Rounded sliders */ -.slider.round { - border-radius: 34px; -} - -.slider.round:before { - border-radius: 50%; -} - -.switchLabel { - display: inline-block; - vertical-align: middle; - position: relative; - margin-top: 5px; -} \ No newline at end of file diff --git a/samples/python/web/avatar/static/image/background.png b/samples/python/web/avatar/static/image/background.png deleted file mode 100644 index 31296f566..000000000 Binary files a/samples/python/web/avatar/static/image/background.png and /dev/null differ diff --git a/samples/python/web/avatar/static/image/favicon.ico b/samples/python/web/avatar/static/image/favicon.ico deleted file mode 100644 index 63e859b47..000000000 Binary files a/samples/python/web/avatar/static/image/favicon.ico and /dev/null differ diff --git a/samples/python/web/avatar/static/js/basic.js b/samples/python/web/avatar/static/js/basic.js deleted file mode 100644 index 95b552ef2..000000000 --- a/samples/python/web/avatar/static/js/basic.js +++ /dev/null @@ -1,323 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. - -// Global objects -var clientId -var peerConnection -var previousAnimationFrameTimestamp = 0; - -// Logger -const log = msg => { - document.getElementById('logging').innerHTML += msg + '
' -} - -// Setup WebRTC -function setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) { - // Create WebRTC peer connection - peerConnection = new RTCPeerConnection({ - iceServers: [{ - urls: [ iceServerUrl ], - username: iceServerUsername, - credential: iceServerCredential - }], - iceTransportPolicy: 'relay' - }) - - // Fetch WebRTC video stream and mount it to an HTML video element - peerConnection.ontrack = function (event) { - // Clean up existing video element if there is any - remoteVideoDiv = document.getElementById('remoteVideo') - for (var i = 0; i < remoteVideoDiv.childNodes.length; i++) { - if (remoteVideoDiv.childNodes[i].localName === event.track.kind) { - remoteVideoDiv.removeChild(remoteVideoDiv.childNodes[i]) - } - } - - const mediaPlayer = document.createElement(event.track.kind) - mediaPlayer.id = event.track.kind - mediaPlayer.srcObject = event.streams[0] - mediaPlayer.autoplay = true - document.getElementById('remoteVideo').appendChild(mediaPlayer) - document.getElementById('videoLabel').hidden = true - document.getElementById('overlayArea').hidden = false - - if (event.track.kind === 'video') { - mediaPlayer.playsInline = true - remoteVideoDiv = document.getElementById('remoteVideo') - canvas = document.getElementById('canvas') - if (document.getElementById('transparentBackground').checked) { - remoteVideoDiv.style.width = '0.1px' - canvas.getContext('2d').clearRect(0, 0, canvas.width, canvas.height) - canvas.hidden = false - } else { - canvas.hidden = true - } - - mediaPlayer.addEventListener('play', () => { - if (document.getElementById('transparentBackground').checked) { - window.requestAnimationFrame(makeBackgroundTransparent) - } else { - remoteVideoDiv.style.width = mediaPlayer.videoWidth / 2 + 'px' - } - }) - } - else - { - // Mute the audio player to make sure it can auto play, will unmute it when speaking - // Refer to https://developer.mozilla.org/en-US/docs/Web/Media/Autoplay_guide - mediaPlayer.muted = true - } - } - - // Listen to data channel, to get the event from the server - peerConnection.addEventListener("datachannel", event => { - const dataChannel = event.channel - dataChannel.onmessage = e => { - console.log("[" + (new Date()).toISOString() + "] WebRTC event received: " + e.data) - } - }) - - // This is a workaround to make sure the data channel listening is working by creating a data channel from the client side - c = peerConnection.createDataChannel("eventChannel") - - // Make necessary update to the web page when the connection state changes - peerConnection.oniceconnectionstatechange = e => { - log("WebRTC status: " + peerConnection.iceConnectionState) - - if (peerConnection.iceConnectionState === 'connected') { - document.getElementById('stopSession').disabled = false - document.getElementById('speak').disabled = false - document.getElementById('configuration').hidden = true - } - - if (peerConnection.iceConnectionState === 'disconnected' || peerConnection.iceConnectionState === 'failed') { - document.getElementById('speak').disabled = true - document.getElementById('stopSpeaking').disabled = true - document.getElementById('stopSession').disabled = true - document.getElementById('startSession').disabled = false - document.getElementById('configuration').hidden = false - } - } - - // Offer to receive 1 audio, and 1 video track - peerConnection.addTransceiver('video', { direction: 'sendrecv' }) - peerConnection.addTransceiver('audio', { direction: 'sendrecv' }) - - // Connect to avatar service when ICE candidates gathering is done 
- iceGatheringDone = false - - peerConnection.onicecandidate = e => { - if (!e.candidate && !iceGatheringDone) { - iceGatheringDone = true - connectToAvatarService(peerConnection) - } - } - - peerConnection.createOffer().then(sdp => { - peerConnection.setLocalDescription(sdp).then(() => { setTimeout(() => { - if (!iceGatheringDone) { - iceGatheringDone = true - connectToAvatarService(peerConnection) - } - }, 2000) }) - }) -} - -// Connect to TTS Avatar Service -function connectToAvatarService(peerConnection) { - let localSdp = btoa(JSON.stringify(peerConnection.localDescription)) - let headers = { - 'ClientId': clientId, - 'AvatarCharacter': document.getElementById('talkingAvatarCharacter').value, - 'AvatarStyle': document.getElementById('talkingAvatarStyle').value, - 'BackgroundColor': document.getElementById('backgroundColor').value, - 'BackgroundImageUrl': document.getElementById('backgroundImageUrl').value, - 'IsCustomAvatar': document.getElementById('customizedAvatar').checked, - 'TransparentBackground': document.getElementById('transparentBackground').checked, - 'VideoCrop': document.getElementById('videoCrop').checked - } - - if (document.getElementById('customVoiceEndpointId').value !== '') { - headers['CustomVoiceEndpointId'] = document.getElementById('customVoiceEndpointId').value - } - - fetch('/api/connectAvatar', { - method: 'POST', - headers: headers, - body: localSdp - }) - .then(response => { - if (response.ok) { - response.text().then(text => { - const remoteSdp = text - peerConnection.setRemoteDescription(new RTCSessionDescription(JSON.parse(atob(remoteSdp)))) - }) - } else { - document.getElementById('startSession').disabled = false; - document.getElementById('configuration').hidden = false; - throw new Error(`Failed connecting to the Avatar service: ${response.status} ${response.statusText}`) - } - }) -} - -// Make video background transparent by matting -function makeBackgroundTransparent(timestamp) { - // Throttle the frame rate to 30 FPS to reduce CPU usage - if (timestamp - previousAnimationFrameTimestamp > 30) { - video = document.getElementById('video') - tmpCanvas = document.getElementById('tmpCanvas') - tmpCanvasContext = tmpCanvas.getContext('2d', { willReadFrequently: true }) - tmpCanvasContext.drawImage(video, 0, 0, video.videoWidth, video.videoHeight) - if (video.videoWidth > 0) { - let frame = tmpCanvasContext.getImageData(0, 0, video.videoWidth, video.videoHeight) - for (let i = 0; i < frame.data.length / 4; i++) { - let r = frame.data[i * 4 + 0] - let g = frame.data[i * 4 + 1] - let b = frame.data[i * 4 + 2] - if (g - 150 > r + b) { - // Set alpha to 0 for pixels that are close to green - frame.data[i * 4 + 3] = 0 - } else if (g + g > r + b) { - // Reduce green part of the green pixels to avoid green edge issue - adjustment = (g - (r + b) / 2) / 3 - r += adjustment - g -= adjustment * 2 - b += adjustment - frame.data[i * 4 + 0] = r - frame.data[i * 4 + 1] = g - frame.data[i * 4 + 2] = b - // Reduce alpha part for green pixels to make the edge smoother - a = Math.max(0, 255 - adjustment * 4) - frame.data[i * 4 + 3] = a - } - } - - canvas = document.getElementById('canvas') - canvasContext = canvas.getContext('2d') - canvasContext.putImageData(frame, 0, 0); - } - - previousAnimationFrameTimestamp = timestamp - } - - window.requestAnimationFrame(makeBackgroundTransparent) -} - -// Do HTML encoding on given text -function htmlEncode(text) { - const entityMap = { - '&': '&', - '<': '<', - '>': '>', - '"': '"', - "'": ''', - '/': '/' - }; - - return 
String(text).replace(/[&<>"'\/]/g, (match) => entityMap[match]) -} - -window.onload = () => { - clientId = document.getElementById('clientId').value -} - -window.startSession = () => { - document.getElementById('startSession').disabled = true - - fetch('/api/getIceToken', { - method: 'GET', - }) - .then(response => { - if (response.ok) { - response.json().then(data => { - const iceServerUrl = data.Urls[0] - const iceServerUsername = data.Username - const iceServerCredential = data.Password - setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) - }) - } else { - throw new Error(`Failed fetching ICE token: ${response.status} ${response.statusText}`) - } - }) -} - -window.speak = () => { - document.getElementById('speak').disabled = true; - document.getElementById('stopSpeaking').disabled = false - document.getElementById('audio').muted = false - let spokenText = document.getElementById('spokenText').value - let ttsVoice = document.getElementById('ttsVoice').value - let personalVoiceSpeakerProfileID = document.getElementById('personalVoiceSpeakerProfileID').value - let spokenSsml = `${htmlEncode(spokenText)}` - console.log("[" + (new Date()).toISOString() + "] Speak request sent.") - - fetch('/api/speak', { - method: 'POST', - headers: { - 'ClientId': clientId, - 'Content-Type': 'application/ssml+xml' - }, - body: spokenSsml - }) - .then(response => { - document.getElementById('speak').disabled = false - document.getElementById('stopSpeaking').disabled = true - if (response.ok) { - response.text().then(text => { - console.log(`[${new Date().toISOString()}] Speech synthesized to speaker for text [ ${spokenText} ]. Result ID: ${text}`) - }) - } else { - throw new Error(`[${new Date().toISOString()}] Unable to speak text. ${response.status} ${response.statusText}`) - } - }) -} - -window.stopSpeaking = () => { - document.getElementById('stopSpeaking').disabled = true - - fetch('/api/stopSpeaking', { - method: 'POST', - headers: { - 'ClientId': clientId - }, - body: '' - }) - .then(response => { - if (response.ok) { - console.log(`[${new Date().toISOString()}] Speaking stopped.`) - document.getElementById('speak').disabled = false - document.getElementById('stopSpeaking').disabled = false - } else { - throw new Error(`[${new Date().toISOString()}] Unable to stop speaking. 
${response.status} ${response.statusText}`) - } - }) -} - -window.stopSession = () => { - document.getElementById('speak').disabled = true - document.getElementById('stopSpeaking').disabled = true - document.getElementById('stopSession').disabled = true - - fetch('/api/disconnectAvatar', { - method: 'POST', - headers: { - 'ClientId': clientId - }, - body: '' - }) -} - -window.updataTransparentBackground = () => { - if (document.getElementById('transparentBackground').checked) { - document.body.background = './static/image/background.png' - document.getElementById('backgroundColor').value = '#00FF00FF' - document.getElementById('backgroundColor').disabled = true - document.getElementById('backgroundImageUrl').value = '' - document.getElementById('backgroundImageUrl').disabled = true - } else { - document.body.background = '' - document.getElementById('backgroundColor').value = '#FFFFFFFF' - document.getElementById('backgroundColor').disabled = false - document.getElementById('backgroundImageUrl').disabled = false - } -} diff --git a/samples/python/web/avatar/static/js/chat.js b/samples/python/web/avatar/static/js/chat.js deleted file mode 100644 index bc366ed0e..000000000 --- a/samples/python/web/avatar/static/js/chat.js +++ /dev/null @@ -1,545 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. - -// Global objects -var clientId -var speechRecognizer -var peerConnection -var isSpeaking = false -var sessionActive = false -var lastSpeakTime - -// Connect to avatar service -function connectAvatar() { - document.getElementById('startSession').disabled = true - - fetch('/api/getIceToken', { - method: 'GET', - }) - .then(response => { - if (response.ok) { - response.json().then(data => { - const iceServerUrl = data.Urls[0] - const iceServerUsername = data.Username - const iceServerCredential = data.Password - setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) - }) - } else { - throw new Error(`Failed fetching ICE token: ${response.status} ${response.statusText}`) - } - }) - - document.getElementById('configuration').hidden = true -} - -// Create speech recognizer -function createSpeechRecognizer() { - fetch('/api/getSpeechToken', { - method: 'GET', - }) - .then(response => { - if (response.ok) { - const speechRegion = response.headers.get('SpeechRegion') - const speechPrivateEndpoint = response.headers.get('SpeechPrivateEndpoint') - response.text().then(text => { - const speechToken = text - const speechRecognitionConfig = speechPrivateEndpoint ? 
- SpeechSDK.SpeechConfig.fromEndpoint(new URL(`wss://${speechPrivateEndpoint.replace('https://', '')}/stt/speech/universal/v2`), '') : - SpeechSDK.SpeechConfig.fromEndpoint(new URL(`wss://${speechRegion}.stt.speech.microsoft.com/speech/universal/v2`), '') - speechRecognitionConfig.authorizationToken = speechToken - speechRecognitionConfig.setProperty(SpeechSDK.PropertyId.SpeechServiceConnection_LanguageIdMode, "Continuous") - var sttLocales = document.getElementById('sttLocales').value.split(',') - var autoDetectSourceLanguageConfig = SpeechSDK.AutoDetectSourceLanguageConfig.fromLanguages(sttLocales) - speechRecognizer = SpeechSDK.SpeechRecognizer.FromConfig(speechRecognitionConfig, autoDetectSourceLanguageConfig, SpeechSDK.AudioConfig.fromDefaultMicrophoneInput()) - }) - } else { - throw new Error(`Failed fetching speech token: ${response.status} ${response.statusText}`) - } - }) -} - -// Disconnect from avatar service -function disconnectAvatar(closeSpeechRecognizer = false) { - fetch('/api/disconnectAvatar', { - method: 'POST', - headers: { - 'ClientId': clientId - }, - body: '' - }) - - if (speechRecognizer !== undefined) { - speechRecognizer.stopContinuousRecognitionAsync() - if (closeSpeechRecognizer) { - speechRecognizer.close() - } - } - - sessionActive = false -} - -// Setup WebRTC -function setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) { - // Create WebRTC peer connection - peerConnection = new RTCPeerConnection({ - iceServers: [{ - urls: [ iceServerUrl ], - username: iceServerUsername, - credential: iceServerCredential - }], - iceTransportPolicy: 'relay' - }) - - // Fetch WebRTC video stream and mount it to an HTML video element - peerConnection.ontrack = function (event) { - if (event.track.kind === 'audio') { - let audioElement = document.createElement('audio') - audioElement.id = 'audioPlayer' - audioElement.srcObject = event.streams[0] - audioElement.autoplay = true - - audioElement.onplaying = () => { - console.log(`WebRTC ${event.track.kind} channel connected.`) - } - - document.getElementById('remoteVideo').appendChild(audioElement) - } - - if (event.track.kind === 'video') { - let videoElement = document.createElement('video') - videoElement.id = 'videoPlayer' - videoElement.srcObject = event.streams[0] - videoElement.autoplay = true - videoElement.playsInline = true - - videoElement.onplaying = () => { - // Clean up existing video element if there is any - remoteVideoDiv = document.getElementById('remoteVideo') - for (var i = 0; i < remoteVideoDiv.childNodes.length; i++) { - if (remoteVideoDiv.childNodes[i].localName === event.track.kind) { - remoteVideoDiv.removeChild(remoteVideoDiv.childNodes[i]) - } - } - - // Append the new video element - document.getElementById('remoteVideo').appendChild(videoElement) - - console.log(`WebRTC ${event.track.kind} channel connected.`) - document.getElementById('microphone').disabled = false - document.getElementById('stopSession').disabled = false - document.getElementById('remoteVideo').style.width = '960px' - document.getElementById('chatHistory').hidden = false - document.getElementById('showTypeMessage').disabled = false - - if (document.getElementById('useLocalVideoForIdle').checked) { - document.getElementById('localVideo').hidden = true - if (lastSpeakTime === undefined) { - lastSpeakTime = new Date() - } - } - - setTimeout(() => { sessionActive = true }, 5000) // Set session active after 5 seconds - } - } - } - - // Listen to data channel, to get the event from the server - 
peerConnection.addEventListener("datachannel", event => { - const dataChannel = event.channel - dataChannel.onmessage = e => { - console.log("[" + (new Date()).toISOString() + "] WebRTC event received: " + e.data) - } - }) - - // This is a workaround to make sure the data channel listening is working by creating a data channel from the client side - c = peerConnection.createDataChannel("eventChannel") - - // Make necessary update to the web page when the connection state changes - peerConnection.oniceconnectionstatechange = e => { - console.log("WebRTC status: " + peerConnection.iceConnectionState) - if (peerConnection.iceConnectionState === 'disconnected') { - if (document.getElementById('useLocalVideoForIdle').checked) { - document.getElementById('localVideo').hidden = false - document.getElementById('remoteVideo').style.width = '0.1px' - } - } - } - - // Offer to receive 1 audio, and 1 video track - peerConnection.addTransceiver('video', { direction: 'sendrecv' }) - peerConnection.addTransceiver('audio', { direction: 'sendrecv' }) - - // Connect to avatar service when ICE candidates gathering is done - iceGatheringDone = false - - peerConnection.onicecandidate = e => { - if (!e.candidate && !iceGatheringDone) { - iceGatheringDone = true - connectToAvatarService(peerConnection) - } - } - - peerConnection.createOffer().then(sdp => { - peerConnection.setLocalDescription(sdp).then(() => { setTimeout(() => { - if (!iceGatheringDone) { - iceGatheringDone = true - connectToAvatarService(peerConnection) - } - }, 2000) }) - }) -} - -// Connect to TTS Avatar Service -function connectToAvatarService(peerConnection) { - let localSdp = btoa(JSON.stringify(peerConnection.localDescription)) - let headers = { - 'ClientId': clientId, - 'AvatarCharacter': document.getElementById('talkingAvatarCharacter').value, - 'AvatarStyle': document.getElementById('talkingAvatarStyle').value, - 'IsCustomAvatar': document.getElementById('customizedAvatar').checked - } - - if (document.getElementById('azureOpenAIDeploymentName').value !== '') { - headers['AoaiDeploymentName'] = document.getElementById('azureOpenAIDeploymentName').value - } - - if (document.getElementById('enableOyd').checked && document.getElementById('azureCogSearchIndexName').value !== '') { - headers['CognitiveSearchIndexName'] = document.getElementById('azureCogSearchIndexName').value - } - - if (document.getElementById('ttsVoice').value !== '') { - headers['TtsVoice'] = document.getElementById('ttsVoice').value - } - - if (document.getElementById('customVoiceEndpointId').value !== '') { - headers['CustomVoiceEndpointId'] = document.getElementById('customVoiceEndpointId').value - } - - if (document.getElementById('personalVoiceSpeakerProfileID').value !== '') { - headers['PersonalVoiceSpeakerProfileId'] = document.getElementById('personalVoiceSpeakerProfileID').value - } - - fetch('/api/connectAvatar', { - method: 'POST', - headers: headers, - body: localSdp - }) - .then(response => { - if (response.ok) { - response.text().then(text => { - const remoteSdp = text - peerConnection.setRemoteDescription(new RTCSessionDescription(JSON.parse(atob(remoteSdp)))) - }) - } else { - document.getElementById('startSession').disabled = false; - document.getElementById('configuration').hidden = false; - throw new Error(`Failed connecting to the Avatar service: ${response.status} ${response.statusText}`) - } - }) -} - -// Handle user query. Send user query to the chat API and display the response. 
-function handleUserQuery(userQuery) { - fetch('/api/chat', { - method: 'POST', - headers: { - 'ClientId': clientId, - 'SystemPrompt': document.getElementById('prompt').value, - 'Content-Type': 'text/plain' - }, - body: userQuery - }) - .then(response => { - if (!response.ok) { - throw new Error(`Chat API response status: ${response.status} ${response.statusText}`) - } - - let chatHistoryTextArea = document.getElementById('chatHistory') - chatHistoryTextArea.innerHTML += 'Assistant: ' - - const reader = response.body.getReader() - - // Function to recursively read chunks from the stream - function read() { - return reader.read().then(({ value, done }) => { - // Check if there is still data to read - if (done) { - // Stream complete - return - } - - // Process the chunk of data (value) - let chunkString = new TextDecoder().decode(value, { stream: true }) - - chatHistoryTextArea.innerHTML += `${chunkString}` - chatHistoryTextArea.scrollTop = chatHistoryTextArea.scrollHeight - - // Continue reading the next chunk - return read() - }) - } - - // Start reading the stream - return read() - }) -} - -// Handle local video. If the user is not speaking for 15 seconds, switch to local video. -function handleLocalVideo() { - if (lastSpeakTime === undefined) { - return - } - - let currentTime = new Date() - if (currentTime - lastSpeakTime > 15000) { - if (document.getElementById('useLocalVideoForIdle').checked && sessionActive && !isSpeaking) { - disconnectAvatar() - document.getElementById('localVideo').hidden = false - document.getElementById('remoteVideo').style.width = '0.1px' - sessionActive = false - } - } -} - -// Check whether the avatar video stream is hung -function checkHung() { - // Check whether the avatar video stream is hung, by checking whether the video time is advancing - let videoElement = document.getElementById('videoPlayer') - if (videoElement !== null && videoElement !== undefined && sessionActive) { - let videoTime = videoElement.currentTime - setTimeout(() => { - // Check whether the video time is advancing - if (videoElement.currentTime === videoTime) { - // Check whether the session is active to avoid duplicatedly triggering reconnect - if (sessionActive) { - sessionActive = false - if (document.getElementById('autoReconnectAvatar').checked) { - console.log(`[${(new Date()).toISOString()}] The video stream got disconnected, need reconnect.`) - connectAvatar() - createSpeechRecognizer() - } - } - } - }, 2000) - } -} - -// Fetch speaking status from backend. 
-function checkSpeakingStatus() { - fetch('/api/getSpeakingStatus', { - method: 'GET', - headers: { - 'ClientId': clientId - } - }) - .then(response => { - if (response.ok) { - response.json().then(data => { - isSpeaking = data.isSpeaking - if (data.lastSpeakTime !== null) { - lastSpeakTime = new Date(data.lastSpeakTime) - } - - if (isSpeaking) { - document.getElementById('stopSpeaking').disabled = false - } else { - document.getElementById('stopSpeaking').disabled = true - } - - handleLocalVideo() - }) - } else { - throw new Error(`Failed to get speaking status: ${response.status} ${response.statusText}`) - } - }) -} - -window.onload = () => { - clientId = document.getElementById('clientId').value - setInterval(() => { - checkHung() - checkSpeakingStatus() - }, 2000) // Check session activity every 2 seconds -} - -window.startSession = () => { - createSpeechRecognizer() - if (document.getElementById('useLocalVideoForIdle').checked) { - document.getElementById('startSession').disabled = true - document.getElementById('configuration').hidden = true - document.getElementById('microphone').disabled = false - document.getElementById('stopSession').disabled = false - document.getElementById('localVideo').hidden = false - document.getElementById('remoteVideo').style.width = '0.1px' - document.getElementById('chatHistory').hidden = false - document.getElementById('showTypeMessage').disabled = false - return - } - - connectAvatar() -} - -window.stopSpeaking = () => { - document.getElementById('stopSpeaking').disabled = true - - fetch('/api/stopSpeaking', { - method: 'POST', - headers: { - 'ClientId': clientId - }, - body: '' - }) - .then(response => { - if (response.ok) { - checkSpeakingStatus() - } else { - throw new Error(`Failed to stop speaking: ${response.status} ${response.statusText}`) - } - }) -} - -window.stopSession = () => { - document.getElementById('startSession').disabled = false - document.getElementById('microphone').disabled = true - document.getElementById('stopSession').disabled = true - document.getElementById('configuration').hidden = false - document.getElementById('chatHistory').hidden = true - document.getElementById('showTypeMessage').checked = false - document.getElementById('showTypeMessage').disabled = true - document.getElementById('userMessageBox').hidden = true - if (document.getElementById('useLocalVideoForIdle').checked) { - document.getElementById('localVideo').hidden = true - } - - disconnectAvatar(true) -} - -window.clearChatHistory = () => { - fetch('/api/chat/clearHistory', { - method: 'POST', - headers: { - 'ClientId': clientId, - 'SystemPrompt': document.getElementById('prompt').value - }, - body: '' - }) - .then(response => { - if (response.ok) { - document.getElementById('chatHistory').innerHTML = '' - } else { - throw new Error(`Failed to clear chat history: ${response.status} ${response.statusText}`) - } - }) -} - -window.microphone = () => { - if (document.getElementById('microphone').innerHTML === 'Stop Microphone') { - // Stop microphone - document.getElementById('microphone').disabled = true - speechRecognizer.stopContinuousRecognitionAsync( - () => { - document.getElementById('microphone').innerHTML = 'Start Microphone' - document.getElementById('microphone').disabled = false - }, (err) => { - console.log("Failed to stop continuous recognition:", err) - document.getElementById('microphone').disabled = false - }) - - return - } - - if (document.getElementById('useLocalVideoForIdle').checked) { - if (!sessionActive) { - connectAvatar() - } - - 
setTimeout(() => { - document.getElementById('audioPlayer').play() - }, 5000) - } else { - document.getElementById('audioPlayer').play() - } - - document.getElementById('microphone').disabled = true - speechRecognizer.recognized = async (s, e) => { - if (e.result.reason === SpeechSDK.ResultReason.RecognizedSpeech) { - let userQuery = e.result.text.trim() - if (userQuery === '') { - return - } - - // Auto stop microphone when a phrase is recognized, when it's not continuous conversation mode - if (!document.getElementById('continuousConversation').checked) { - document.getElementById('microphone').disabled = true - speechRecognizer.stopContinuousRecognitionAsync( - () => { - document.getElementById('microphone').innerHTML = 'Start Microphone' - document.getElementById('microphone').disabled = false - }, (err) => { - console.log("Failed to stop continuous recognition:", err) - document.getElementById('microphone').disabled = false - }) - } - - let chatHistoryTextArea = document.getElementById('chatHistory') - if (chatHistoryTextArea.innerHTML !== '' && !chatHistoryTextArea.innerHTML.endsWith('\n\n')) { - chatHistoryTextArea.innerHTML += '\n\n' - } - - chatHistoryTextArea.innerHTML += "User: " + userQuery + '\n\n' - chatHistoryTextArea.scrollTop = chatHistoryTextArea.scrollHeight - - handleUserQuery(userQuery) - } - } - - speechRecognizer.startContinuousRecognitionAsync( - () => { - document.getElementById('microphone').innerHTML = 'Stop Microphone' - document.getElementById('microphone').disabled = false - }, (err) => { - console.log("Failed to start continuous recognition:", err) - document.getElementById('microphone').disabled = false - }) -} - -window.updataEnableOyd = () => { - if (document.getElementById('enableOyd').checked) { - document.getElementById('cogSearchConfig').hidden = false - } else { - document.getElementById('cogSearchConfig').hidden = true - } -} - -window.updateTypeMessageBox = () => { - if (document.getElementById('showTypeMessage').checked) { - document.getElementById('userMessageBox').hidden = false - document.getElementById('userMessageBox').addEventListener('keyup', (e) => { - if (e.key === 'Enter') { - const userQuery = document.getElementById('userMessageBox').value - if (userQuery !== '') { - let chatHistoryTextArea = document.getElementById('chatHistory') - if (chatHistoryTextArea.innerHTML !== '' && !chatHistoryTextArea.innerHTML.endsWith('\n\n')) { - chatHistoryTextArea.innerHTML += '\n\n' - } - - chatHistoryTextArea.innerHTML += "User: " + userQuery.trim('\n') + '\n\n' - chatHistoryTextArea.scrollTop = chatHistoryTextArea.scrollHeight - - handleUserQuery(userQuery.trim('\n')) - document.getElementById('userMessageBox').value = '' - } - } - }) - } else { - document.getElementById('userMessageBox').hidden = true - } -} - -window.updateLocalVideoForIdle = () => { - if (document.getElementById('useLocalVideoForIdle').checked) { - document.getElementById('showTypeMessageCheckbox').hidden = true - } else { - document.getElementById('showTypeMessageCheckbox').hidden = false - } -} diff --git a/samples/swift/ios/embedded-speech/embedded-speech-to-text/embedded-speech-to-text/ViewController.swift b/samples/swift/ios/embedded-speech/embedded-speech-to-text/embedded-speech-to-text/ViewController.swift index 44f9c29f5..37b05dd88 100644 --- a/samples/swift/ios/embedded-speech/embedded-speech-to-text/embedded-speech-to-text/ViewController.swift +++ b/samples/swift/ios/embedded-speech/embedded-speech-to-text/embedded-speech-to-text/ViewController.swift @@ -22,9 +22,8 @@ 
let EmbeddedSpeechRecognitionModelFolderName = "STT" /// For example: "en-US" or "Microsoft Speech Recognizer en-US FP Model V8" let EmbeddedSpeechRecognitionModelName = "YourEmbeddedSpeechRecognitionModelName" -/// Decryption key of the (encrypted) embedded speech recognition model. -/// WARNING: The key may be visible in the program binary if hard-coded as a plain string. -let EmbeddedSpeechRecognitionModelKey = "YourEmbeddedSpeechRecognitionModelKey" +/// Embedded speech model license (text). +let EmbeddedSpeechModelLicense = "YourEmbeddedSpeechModelLicense" class ViewController: UIViewController { @@ -42,7 +41,7 @@ class ViewController: UIViewController { do { embeddedSpeechConfig = try SPXEmbeddedSpeechConfiguration(fromPath: absoluteModelPath) embeddedSpeechConfig?.setSpeechRecognitionModel(EmbeddedSpeechRecognitionModelName, - key: EmbeddedSpeechRecognitionModelKey) + license: EmbeddedSpeechModelLicense) } catch { print("Error: \(error) in initializing embedded speech configuration.") diff --git a/samples/swift/ios/embedded-speech/embedded-speech-translation/embedded-speech-translation/ViewController.swift b/samples/swift/ios/embedded-speech/embedded-speech-translation/embedded-speech-translation/ViewController.swift index 2bfe43005..0a6d5b866 100644 --- a/samples/swift/ios/embedded-speech/embedded-speech-translation/embedded-speech-translation/ViewController.swift +++ b/samples/swift/ios/embedded-speech/embedded-speech-translation/embedded-speech-translation/ViewController.swift @@ -21,9 +21,8 @@ let EmbeddedSpeechTranslationModelFolderName = "ST" /// For example: "Microsoft Speech Translator Many-to-English Model V3" let EmbeddedSpeechTranslationModelName = "YourEmbeddedSpeechTranslationModelName" -/// Decryption key of the (encrypted) embedded speech translation model. -/// WARNING: The key may be visible in the program binary if hard-coded as a plain string. -let EmbeddedSpeechTranslationModelKey = "YourEmbeddedSpeechTranslationModelKey" +/// Embedded speech model license (text). +let EmbeddedSpeechModelLicense = "YourEmbeddedSpeechModelLicense" class ViewController: UIViewController { @@ -41,7 +40,7 @@ class ViewController: UIViewController { do { embeddedSpeechConfig = try SPXEmbeddedSpeechConfiguration(fromPath: absoluteModelPath) embeddedSpeechConfig?.setSpeechTranslationModel(EmbeddedSpeechTranslationModelName, - key: EmbeddedSpeechTranslationModelKey) + license: EmbeddedSpeechModelLicense) } catch { print("Error: \(error) in initializing embedded speech configuration.") diff --git a/samples/swift/ios/embedded-speech/embedded-text-to-speech/embedded-text-to-speech/ViewController.swift b/samples/swift/ios/embedded-speech/embedded-text-to-speech/embedded-text-to-speech/ViewController.swift index 8dd3b2af5..3da9c62c6 100644 --- a/samples/swift/ios/embedded-speech/embedded-text-to-speech/embedded-text-to-speech/ViewController.swift +++ b/samples/swift/ios/embedded-speech/embedded-text-to-speech/embedded-text-to-speech/ViewController.swift @@ -19,9 +19,8 @@ let EmbeddedSpeechSynthesisVoicesFolderName = "TTS" /// For example: "en-US-JennyNeural" or "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)" let EmbeddedSpeechSynthesisVoiceName = "YourEmbeddedSpeechSynthesisVoiceName" -/// Decryption key of the (encrypted) embedded speech synthesis voice. -/// WARNING: The key may be visible in the program binary if hard-coded as a plain string. -let EmbeddedSpeechSynthesisVoiceKey = "YourEmbeddedSpeechSynthesisVoiceKey" +/// Embedded speech model license (text). 
+let EmbeddedSpeechModelLicense = "YourEmbeddedSpeechModelLicense" class ViewController: UIViewController, UITextFieldDelegate{ @@ -39,7 +38,7 @@ class ViewController: UIViewController, UITextFieldDelegate{ do { embeddedSpeechConfig = try SPXEmbeddedSpeechConfiguration(fromPath: absoluteModelPath) embeddedSpeechConfig?.setSpeechSynthesisVoice(EmbeddedSpeechSynthesisVoiceName, - key: EmbeddedSpeechSynthesisVoiceKey) + license: EmbeddedSpeechModelLicense) } catch { print("Error: \(error) in initializing embedded speech configuration.") diff --git a/samples/swift/ios/speech-samples/speech-samples/ViewController.swift b/samples/swift/ios/speech-samples/speech-samples/ViewController.swift index 04da1f9e1..d18377818 100644 --- a/samples/swift/ios/speech-samples/speech-samples/ViewController.swift +++ b/samples/swift/ios/speech-samples/speech-samples/ViewController.swift @@ -115,7 +115,7 @@ class ViewController: UIViewController { if (path == nil) { print("Cannot find audio file!"); self.updateLabel(text: "Cannot find audio file", color: UIColor.red) - return + return; } print("pronunciation assessment audio file path: ", path!) @@ -233,7 +233,7 @@ class ViewController: UIViewController { if (path == nil) { print("Cannot find audio file!"); self.updateLabel(text: "Cannot find audio file", color: UIColor.red) - return + return; } print("pronunciation assessment audio file path: ", path!) @@ -305,7 +305,7 @@ class ViewController: UIViewController { if (path == nil) { print("Cannot find audio file!"); self.updateLabel(text: "Cannot find audio file", color: UIColor.red) - return + return; } print("pronunciation assessment audio file path: ", path!) @@ -422,7 +422,7 @@ class ViewController: UIViewController { if (path == nil) { print("Cannot find audio file!"); self.updateLabel(text: "Cannot find audio file", color: UIColor.red) - return + return; } print("pronunciation assessment audio file path: ", path!) diff --git a/samples/swift/macos/speech-keyword-recognition/helloworld/Podfile b/samples/swift/macos/speech-keyword-recognition/helloworld/Podfile index a0413bbe8..3d95c1e77 100644 --- a/samples/swift/macos/speech-keyword-recognition/helloworld/Podfile +++ b/samples/swift/macos/speech-keyword-recognition/helloworld/Podfile @@ -1,5 +1,5 @@ target 'helloworld' do platform :osx, 10.14 - pod 'MicrosoftCognitiveServicesSpeech-macOS', '~> 1.38' + pod 'MicrosoftCognitiveServicesSpeech-macOS', '~> 1.40' use_frameworks! end diff --git a/scenarios/cpp/windows/captioning/captioning/captioning.vcxproj b/scenarios/cpp/windows/captioning/captioning/captioning.vcxproj index 488bb86a9..fbc79f689 100644 --- a/scenarios/cpp/windows/captioning/captioning/captioning.vcxproj +++ b/scenarios/cpp/windows/captioning/captioning/captioning.vcxproj @@ -159,12 +159,12 @@ - + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. 
- + \ No newline at end of file diff --git a/scenarios/cpp/windows/captioning/captioning/packages.config b/scenarios/cpp/windows/captioning/captioning/packages.config index 2d8ad1382..b2fd9e72f 100644 --- a/scenarios/cpp/windows/captioning/captioning/packages.config +++ b/scenarios/cpp/windows/captioning/captioning/packages.config @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/scenarios/csharp/dotnetcore/captioning/captioning/captioning.csproj b/scenarios/csharp/dotnetcore/captioning/captioning/captioning.csproj index 78d9b4ee0..25ecc11cd 100644 --- a/scenarios/csharp/dotnetcore/captioning/captioning/captioning.csproj +++ b/scenarios/csharp/dotnetcore/captioning/captioning/captioning.csproj @@ -8,7 +8,7 @@ - + \ No newline at end of file diff --git a/scenarios/java/jre/console/captioning/pom.xml b/scenarios/java/jre/console/captioning/pom.xml index f85240713..0f22430f1 100644 --- a/scenarios/java/jre/console/captioning/pom.xml +++ b/scenarios/java/jre/console/captioning/pom.xml @@ -46,7 +46,7 @@ mvn clean dependency:copy-dependencies com.microsoft.cognitiveservices.speech client-sdk - 1.38.0 + 1.40.0 \ No newline at end of file diff --git a/scenarios/python/console/captioning/captioning.py b/scenarios/python/console/captioning/captioning.py index 58c93146c..57c875bbe 100644 --- a/scenarios/python/console/captioning/captioning.py +++ b/scenarios/python/console/captioning/captioning.py @@ -219,7 +219,7 @@ def audio_config_from_user_config(self) -> helper.Read_Only_Dict : "audio_stream_format" : None, "pull_input_audio_stream_callback" : None, "pull_input_audio_stream" : None - }) + }); else : audio_stream_format = None if not self._user_config["use_compressed_audio"] : @@ -350,4 +350,4 @@ def stopped_handler(e : speechsdk.SessionEventArgs) : captioning.initialize() speech_recognizer_data = captioning.speech_recognizer_from_user_config() captioning.recognize_continuous(speech_recognizer=speech_recognizer_data["speech_recognizer"], format=speech_recognizer_data["audio_stream_format"], callback=speech_recognizer_data["pull_input_audio_stream_callback"], stream=speech_recognizer_data["pull_input_audio_stream"]) - captioning.finish() + captioning.finish() \ No newline at end of file
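
Note on the embedded-speech change in the Swift ViewController diffs above: the samples now pass a model *license* string to `SPXEmbeddedSpeechConfiguration` instead of a model decryption key. The following is only a rough sketch of the post-change call shape, not code taken from the samples; the bundle-based folder lookup, the helper function name, and all placeholder values are assumptions added for illustration.

```swift
import MicrosoftCognitiveServicesSpeech

// Sketch only: configure embedded speech recognition using the new
// license-string parameter (the call shape shown in the diffs above).
func makeEmbeddedSpeechConfig() -> SPXEmbeddedSpeechConfiguration? {
    // Placeholder values; supply your real model folder, model name, and license text.
    let modelFolderName = "STT"                                     // assumption: model folder bundled with the app
    let modelName = "YourEmbeddedSpeechRecognitionModelName"
    let modelLicense = "YourEmbeddedSpeechModelLicense"

    // Assumption: the model folder is shipped inside the app bundle.
    guard let modelPath = Bundle.main.path(forResource: modelFolderName, ofType: nil) else {
        print("Embedded speech model folder not found in the app bundle.")
        return nil
    }

    do {
        let config = try SPXEmbeddedSpeechConfiguration(fromPath: modelPath)
        // New signature: a license string replaces the former decryption-key parameter.
        config.setSpeechRecognitionModel(modelName, license: modelLicense)
        return config
    } catch {
        print("Error: \(error) in initializing embedded speech configuration.")
        return nil
    }
}
```

As the other two ViewController diffs show, the translation and synthesis samples follow the same pattern through `setSpeechTranslationModel(_:license:)` and `setSpeechSynthesisVoice(_:license:)`.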