Update Chinese word segmentation and score calculation and sessionId (#…

…2633) * csharp_cpp_java update * split cn words with short audio * remove the BOM of project file --------- Co-authored-by: Steven Zhao (Shanghai Centific Technology) <[email protected]> Co-authored-by: Ke Wang <[email protected]>
Azure-Samples · Oct 24, 2024 · bb361bc · bb361bc
1 parent eeac4cd
commit bb361bc
Show file tree

Hide file tree

Showing 7 changed files with 282 additions and 33 deletions.
diff --git a/samples/cpp/windows/console/samples/speech_recognition_samples.cpp b/samples/cpp/windows/console/samples/speech_recognition_samples.cpp
@@ -568,6 +568,10 @@ void PronunciationAssessmentWithMicrophone()
     // Creates a speech recognizer using microphone as audio input.
     auto recognizer = SpeechRecognizer::FromConfig(config, "en-US");
 
+    recognizer->SessionStarted.Connect([](const SessionEventArgs& e) {
+        std::cout << "SESSION ID: " << e.SessionId << std::endl;
+        });
+
     while (true)
     {
         // Receives reference text from console input.
@@ -632,6 +636,10 @@ void PronunciationAssessmentWithStreamInternalAsync(shared_ptr<SpeechConfig> spe
 
     pronAssessmentConfig->EnableProsodyAssessment();
 
+    speechRecognizer->SessionStarted.Connect([](const SessionEventArgs& e) {
+        std::cout << "SESSION ID: " << e.SessionId << std::endl;
+        });
+
     pronAssessmentConfig->ApplyTo(speechRecognizer);
 
     audioInputStream->Write(audioData.data(), static_cast<uint32_t>(audioData.size()));
@@ -712,6 +720,10 @@ void PronunciationAssessmentConfiguredWithJson()
     // Creates a speech recognizer.
     auto recognizer = SpeechRecognizer::FromConfig(config, "en-US", audioConfig);
 
+    recognizer->SessionStarted.Connect([](const SessionEventArgs& e) {
+        std::cout << "SESSION ID: " << e.SessionId << std::endl;
+        });
+
     pronunciationConfig->ApplyTo(recognizer);
 
     // Starts speech recognition, and returns after a single utterance is recognized.
@@ -769,6 +781,10 @@ void PronunciationAssessmentWithContentAssessment()
     // Creates a speech recognizer.
     auto recognizer = SpeechRecognizer::FromConfig(config, "en-US", audioConfig);
 
+    recognizer->SessionStarted.Connect([](const SessionEventArgs& e) {
+        std::cout << "SESSION ID: " << e.SessionId << std::endl;
+        });
+
     pronunciationConfig->ApplyTo(recognizer);
 
     vector<string> recognizedTexts;

diff --git a/samples/csharp/dotnet-windows/console/samples/samples.csproj b/samples/csharp/dotnet-windows/console/samples/samples.csproj
@@ -139,6 +139,15 @@
       <Link>LanguageDetection_enUS.wav</Link>
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
     </Content>
+    <Content Include="zhcn_continuous_mode_sample.txt">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </Content>
+    <Content Include="zhcn_continuous_mode_sample.wav">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </Content>
+    <Content Include="zhcn_short_dummy_sample.wav">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </Content>
   </ItemGroup>
   <ItemGroup>
     <PackageReference Include="DiffPlex">

diff --git a/samples/csharp/dotnet-windows/console/samples/zhcn_continuous_mode_sample.txt b/samples/csharp/dotnet-windows/console/samples/zhcn_continuous_mode_sample.txt
@@ -0,0 +1 @@
+秋天总是那么富有诗意。树叶渐渐变红，街道旁的银杏树也开始落叶。人们穿上厚重的外套，享受着凉爽的秋风。黄昏时分，夕阳洒在街道上，给忙碌的一天增添了一抹温暖。无论是散步还是小憩，这个季节总能带来宁静和满足。清晨，薄雾笼罩大地，空气中弥漫着一丝清新的凉意。中午阳光明媚，照在身上暖洋洋的，仿佛是一场心灵的抚慰。傍晚时分，天空被染成了金黄和橙红，街上的行人脚步也不由得慢了下来，享受这份静谧和美好。你最喜欢哪个季节？
diff --git a/samples/csharp/dotnet-windows/console/samples/zhcn_continuous_mode_sample.wav b/samples/csharp/dotnet-windows/console/samples/zhcn_continuous_mode_sample.wav
diff --git a/samples/csharp/dotnet-windows/console/samples/zhcn_short_dummy_sample.wav b/samples/csharp/dotnet-windows/console/samples/zhcn_short_dummy_sample.wav
diff --git a/samples/csharp/sharedcontent/console/speech_recognition_samples.cs b/samples/csharp/sharedcontent/console/speech_recognition_samples.cs
@@ -16,6 +16,8 @@
 using DiffPlex;
 using DiffPlex.DiffBuilder;
 using DiffPlex.DiffBuilder.Model;
+using System.Text.Json;
+using System.Text;
 // </toplevel>
 
 namespace MicrosoftSpeechSDKSamples
@@ -995,6 +997,10 @@ public static async Task PronunciationAssessmentWithMicrophoneAsync()
             // Creates a speech recognizer for the specified language, using microphone as audio input.
             using (var recognizer = new SpeechRecognizer(config, language))
             {
+                recognizer.SessionStarted += (s, e) => {
+                    Console.WriteLine($"SESSION ID: {e.SessionId}");
+                };
+
                 while (true)
                 {
                     // Receives reference text from console input.
@@ -1096,6 +1102,10 @@ private static async Task PronunciationAssessmentWithStreamInternalAsync(SpeechC
 
                 pronAssessmentConfig.EnableProsodyAssessment();
 
+                speechRecognizer.SessionStarted += (s, e) => {
+                    Console.WriteLine($"SESSION ID: {e.SessionId}");
+                };
+
                 pronAssessmentConfig.ApplyTo(speechRecognizer);
 
                 audioInputStream.Write(audioData);
@@ -1142,6 +1152,11 @@ public static async Task PronunciationAssessmentConfiguredWithJson()
             // Creates a speech recognizer for the specified language
             using (var recognizer = new SpeechRecognizer(config, language, audioConfig))
             {
+
+                recognizer.SessionStarted += (s, e) => {
+                    Console.WriteLine($"SESSION ID: {e.SessionId}");
+                };
+
                 // Starts recognizing.
                 pronunciationConfig.ApplyTo(recognizer);
 
@@ -1185,37 +1200,110 @@ public static async Task PronunciationAssessmentConfiguredWithJson()
             }
         }
 
+        /// <summary>
+        /// Splits <paramref name="referenceText"/> into words by running one pronunciation
+        /// assessment (miscue enabled) against an audio file and reading the word list back
+        /// from the JSON response of the recognition result.
+        /// </summary>
+        /// <param name="waveFilename">Path of the WAV file used as recognizer input.</param>
+        /// <param name="referenceText">Reference text handed to the pronunciation assessment config.</param>
+        /// <param name="language">Recognition language passed to the recognizer, e.g. "zh-CN".</param>
+        /// <param name="speechConfig">Speech configuration used to create the recognizer; it is not modified.</param>
+        /// <returns>
+        /// The "Word" values of the first NBest entry whose error type is not "Insertion";
+        /// null when nothing was recognized, recognition was canceled, or the response
+        /// carries no NBest/Words data. Callers must check for null before using the list.
+        /// </returns>
+        public static List<string> GetReferenceWords(string waveFilename, string referenceText, string language, SpeechConfig speechConfig)
+        {
+            // The language goes to the recognizer constructor rather than into the shared
+            // SpeechConfig, so the caller's configuration is left untouched. Both SDK objects
+            // are disposed when the method returns.
+            using (var audioConfig = AudioConfig.FromWavFileInput(waveFilename))
+            using (var speechRecognizer = new SpeechRecognizer(speechConfig, language, audioConfig))
+            {
+                // Create pronunciation assessment config, set grading system, granularity, and enable miscue based on requirement
+                bool enableMiscue = true;
+                var pronunciationConfig = new PronunciationAssessmentConfig(referenceText,
+                    GradingSystem.HundredMark, Granularity.Phoneme, enableMiscue);
+
+                // Apply pronunciation assessment config to speech recognizer
+                pronunciationConfig.ApplyTo(speechRecognizer);
+
+                // Perform speech recognition (blocking: this helper is synchronous)
+                var result = speechRecognizer.RecognizeOnceAsync().Result;
+
+                if (result.Reason == ResultReason.RecognizedSpeech)
+                {
+                    var responseJson = result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult);
+
+                    // JsonDocument rents pooled buffers; dispose it once the strings are copied out.
+                    using (JsonDocument doc = JsonDocument.Parse(responseJson))
+                    {
+                        // Guard against a response without NBest[0].Words instead of throwing.
+                        if (!doc.RootElement.TryGetProperty("NBest", out JsonElement nBest)
+                            || nBest.ValueKind != JsonValueKind.Array
+                            || nBest.GetArrayLength() == 0
+                            || !nBest[0].TryGetProperty("Words", out JsonElement words))
+                        {
+                            Console.WriteLine("Recognition result contains no word details");
+                            return null;
+                        }
+
+                        var referenceWords = new List<string>();
+                        foreach (JsonElement item in words.EnumerateArray())
+                        {
+                            string errorType = null;
+                            if (item.TryGetProperty("PronunciationAssessment", out JsonElement assessment)
+                                && assessment.TryGetProperty("ErrorType", out JsonElement errorTypeElement))
+                            {
+                                errorType = errorTypeElement.GetString();
+                            }
+
+                            // Words flagged "Insertion" are dropped; every other word is kept.
+                            if (errorType != "Insertion")
+                            {
+                                referenceWords.Add(item.GetProperty("Word").GetString());
+                            }
+                        }
+
+                        return referenceWords;
+                    }
+                }
+
+                if (result.Reason == ResultReason.NoMatch)
+                {
+                    Console.WriteLine("No speech could be recognized");
+                }
+                else if (result.Reason == ResultReason.Canceled)
+                {
+                    var cancellation = CancellationDetails.FromResult(result);
+                    Console.WriteLine($"Speech Recognition canceled: {cancellation.Reason}");
+                    if (cancellation.Reason == CancellationReason.Error)
+                    {
+                        Console.WriteLine($"Error details: {cancellation.ErrorDetails}");
+                    }
+                }
+
+                // Any outcome other than RecognizedSpeech yields no word list.
+                return null;
+            }
+        }
+
         // Pronunciation assessment continuous from file
         // See more information at https://aka.ms/csspeech/pa
         public static async Task PronunciationAssessmentContinuousWithFile()
         {
             // Creates an instance of a speech config with specified subscription key and service region.
             // Replace with your own subscription key and service region (e.g., "westus").
             var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
+            var waveFileName = @"zhcn_continuous_mode_sample.wav";
+            var scriptFileName = @"zhcn_continuous_mode_sample.txt";
+            var referenceText = File.ReadAllText(scriptFileName);
+
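+            // To switch to another language, for example Spanish, change "zh-CN" to "es-ES" and point waveFileName/scriptFileName at matching content. Language name is not case sensitive for the service, but the "zh-CN" checks in this sample compare case-sensitively, so keep that exact casing for Chinese.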
+            var language = "zh-CN";
+            if (language == "zh-CN")
+            {
+                Console.OutputEncoding = Encoding.UTF8;
+            }
 
             // Creates a speech recognizer using file as audio input. 
-            using (var audioInput = AudioConfig.FromWavFileInput(@"whatstheweatherlike.wav"))
+            using (var audioInput = AudioConfig.FromWavFileInput(waveFileName))
             {
-                // Switch to other languages for example Spanish, change language "en-US" to "es-ES". Language name is not case sensitive.
-                var language = "en-US";
 
                 using (var recognizer = new SpeechRecognizer(config, language, audioInput))
                 {
-                    var referenceText = "what's the weather like";
 
                     bool enableMiscue = true;
 
                     var pronConfig = new PronunciationAssessmentConfig(referenceText, GradingSystem.HundredMark, Granularity.Phoneme, enableMiscue);
 
                     pronConfig.EnableProsodyAssessment();
-
+
+                    recognizer.SessionStarted += (s, e) => {
+                        Console.WriteLine($"SESSION ID: {e.SessionId}");
+                    };
+
                     pronConfig.ApplyTo(recognizer);
 
                     var recognizedWords = new List<string>();
                     var pronWords = new List<Word>();
                     var finalWords = new List<Word>();
-                    var fluency_scores = new List<double>();
                     var prosody_scores = new List<double>();
+                    var startOffset = 0L;
+                    var endOffset = 0L;
                     var durations = new List<int>();
                     var done = false;
 
@@ -1234,7 +1322,6 @@ public static async Task PronunciationAssessmentContinuousWithFile()
                         var pronResult = PronunciationAssessmentResult.FromResult(e.Result);
                         Console.WriteLine($"    Accuracy score: {pronResult.AccuracyScore}, prosody score:{pronResult.ProsodyScore}, pronunciation score: {pronResult.PronunciationScore}, completeness score: {pronResult.CompletenessScore}, fluency score: {pronResult.FluencyScore}");
 
-                        fluency_scores.Add(pronResult.FluencyScore);
                         prosody_scores.Add(pronResult.ProsodyScore);
 
                         foreach(var word in pronResult.Words)
@@ -1245,9 +1332,12 @@ public static async Task PronunciationAssessmentContinuousWithFile()
 
                         foreach (var result in e.Result.Best())
                         {
-                            durations.Add(result.Words.Sum(item => item.Duration));
+                            durations.AddRange(result.Words.Select(item => item.Duration + 100000).ToList());
                             recognizedWords.AddRange(result.Words.Select(item => item.Word).ToList());
 
+                            if (startOffset == 0) startOffset = result.Words.First().Offset;
+
+                            endOffset = result.Words.Last().Offset + result.Words.Last().Duration + 100000;
                         }
                     };
 
@@ -1263,13 +1353,26 @@ public static async Task PronunciationAssessmentContinuousWithFile()
                     // Waits for completion.
                     await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
 
+                    // set the duration of Word in pronWords
+                    pronWords.Zip(durations, (word, duration) => word.Duration = duration).ToList();
+
                     // For continuous pronunciation assessment mode, the service won't return the words with `Insertion` or `Omission`
                     // even if miscue is enabled.
                     // We need to compare with the reference text after received all recognized words to get these error words.
-                    string[] referenceWords = referenceText.ToLower().Split(' ');
-                    for (int j = 0; j < referenceWords.Length; j++)
+                    string[] referenceWords;
+
+                    if (language == "zh-CN")
                     {
-                        referenceWords[j] = Regex.Replace(referenceWords[j], "^[\\p{P}\\s]+|[\\p{P}\\s]+$", "");
+                        // Split words for Chinese using the reference text and any short wave file
+                        referenceWords = GetReferenceWords(@"zhcn_short_dummy_sample.wav", referenceText, language, config).ToArray();
+                    }
+                    else
+                    {
+                        referenceWords = referenceText.ToLower().Split(' ');
+                        for (int j = 0; j < referenceWords.Length; j++)
+                        {
+                            referenceWords[j] = Regex.Replace(referenceWords[j], "^[\\p{P}\\s]+|[\\p{P}\\s]+$", "");
+                        }
                     }
 
                     if (enableMiscue)
@@ -1297,7 +1400,7 @@ public static async Task PronunciationAssessmentContinuousWithFile()
 
                             if (delta.Type == ChangeType.Inserted || delta.Type == ChangeType.Modified)
                             {
-                                Word w = pronWords[currentIdx];
+                                Word w = new Word(pronWords[currentIdx].WordText, pronWords[currentIdx].ErrorType, pronWords[currentIdx].AccuracyScore, pronWords[currentIdx].Duration);
                                 if (w.ErrorType == "None")
                                 {
                                     w.ErrorType = "Insertion";
@@ -1321,13 +1424,20 @@ public static async Task PronunciationAssessmentContinuousWithFile()
                     var prosodyScore = prosody_scores.Average();
 
                     // Recalculate fluency score
-                    var fluencyScore = fluency_scores.Zip(durations, (x, y) => x * y).Sum() / durations.Sum();
+                    var durations_sum = finalWords.Where(item => item.ErrorType == "None")
+                        .Sum(item => item.Duration);
+
+                    var fluencyScore = durations_sum * 1.0 / (endOffset - startOffset) * 100;
 
                     // Calculate whole completeness score
-                    var completenessScore = (double)pronWords.Count(item => item.ErrorType == "None") / referenceWords.Length * 100;
+                    var completenessScore = (double)finalWords.Count(item => item.ErrorType == "None") / filteredWords.Count() * 100;
                     completenessScore = completenessScore <= 100 ? completenessScore : 100;
 
-                    Console.WriteLine("Paragraph accuracy score: {0}, prosody score: {1} completeness score: {2}, fluency score: {3}", accuracyScore, prosodyScore, completenessScore, fluencyScore);
+                    List<double> scores_list = new List<double> {accuracyScore, prosodyScore, completenessScore, fluencyScore };
+
+                    double pronunciationScore = scores_list.Sum(n => n * 0.2) + scores_list.Min() * 0.2;
+
+                    Console.WriteLine("Paragraph accuracy score: {0}, prosody score: {1} completeness score: {2}, fluency score: {3}, pronunciation score: {4}", accuracyScore, prosodyScore, completenessScore, fluencyScore, pronunciationScore);
 
                     for (int idx = 0; idx < finalWords.Count(); idx++)
                     {
@@ -1364,6 +1474,10 @@ public static async Task PronunciationAssessmentWithContentAssessment()
                     pronConfig.EnableProsodyAssessment();
                     pronConfig.EnableContentAssessmentWithTopic(theTopic);
 
+                    recognizer.SessionStarted += (s, e) => {
+                        Console.WriteLine($"SESSION ID: {e.SessionId}");
+                    };
+
                     pronConfig.ApplyTo(recognizer);
 
                     var recognizedTexts = new List<string>();
@@ -1921,18 +2035,25 @@ public class Word
         public string WordText { get; set; }
         public string ErrorType { get; set; }
         public double AccuracyScore { get; set; }
+        public double Duration { get; set; }
 
         /// <summary>
         /// Creates a word from its text and error type; AccuracyScore and Duration both start at 0.
         /// </summary>
         /// <param name="wordText">Value stored in WordText.</param>
         /// <param name="errorType">Value stored in ErrorType.</param>
         public Word(string wordText, string errorType)
         {
             WordText = wordText;
             ErrorType = errorType;
             AccuracyScore = 0;
+            Duration = 0;
         }
 
         /// <summary>
         /// Creates a word with an explicit accuracy score; the other fields are initialized
         /// by the two-argument constructor.
         /// </summary>
         /// <param name="wordText">Value stored in WordText.</param>
         /// <param name="errorType">Value stored in ErrorType.</param>
         /// <param name="accuracyScore">Value stored in AccuracyScore.</param>
         public Word(string wordText, string errorType, double accuracyScore) : this(wordText, errorType)
         {
             AccuracyScore = accuracyScore;
         }
+
+        public Word(string wordText, string errorType, double accuracyScore, double duration) : this(wordText, errorType, accuracyScore)
+        {
+            Duration = duration; // replaces the 0 assigned by the two-argument constructor at the end of the this(...) chain
+        }
     }
 
 }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		秋天总是那么富有诗意。树叶渐渐变红，街道旁的银杏树也开始落叶。人们穿上厚重的外套，享受着凉爽的秋风。黄昏时分，夕阳洒在街道上，给忙碌的一天增添了一抹温暖。无论是散步还是小憩，这个季节总能带来宁静和满足。清晨，薄雾笼罩大地，空气中弥漫着一丝清新的凉意。中午阳光明媚，照在身上暖洋洋的，仿佛是一场心灵的抚慰。傍晚时分，天空被染成了金黄和橙红，街上的行人脚步也不由得慢了下来，享受这份静谧和美好。你最喜欢哪个季节？