Skip to content

Commit

Permalink
Update Chinese word segmentation and score calculation and sessionId (#…
Browse files Browse the repository at this point in the history
…2633)

* csharp_cpp_java update

* split cn words with short audio

* remove the BOM of project file

---------

Co-authored-by: Steven Zhao (Shanghai Centific Technology) <[email protected]>
Co-authored-by: Ke Wang <[email protected]>
  • Loading branch information
3 people authored Oct 24, 2024
1 parent eeac4cd commit bb361bc
Show file tree
Hide file tree
Showing 7 changed files with 282 additions and 33 deletions.
16 changes: 16 additions & 0 deletions samples/cpp/windows/console/samples/speech_recognition_samples.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,10 @@ void PronunciationAssessmentWithMicrophone()
// Creates a speech recognizer using microphone as audio input.
auto recognizer = SpeechRecognizer::FromConfig(config, "en-US");

recognizer->SessionStarted.Connect([](const SessionEventArgs& e) {
std::cout << "SESSION ID: " << e.SessionId << std::endl;
});

while (true)
{
// Receives reference text from console input.
Expand Down Expand Up @@ -632,6 +636,10 @@ void PronunciationAssessmentWithStreamInternalAsync(shared_ptr<SpeechConfig> spe

pronAssessmentConfig->EnableProsodyAssessment();

speechRecognizer->SessionStarted.Connect([](const SessionEventArgs& e) {
std::cout << "SESSION ID: " << e.SessionId << std::endl;
});

pronAssessmentConfig->ApplyTo(speechRecognizer);

audioInputStream->Write(audioData.data(), static_cast<uint32_t>(audioData.size()));
Expand Down Expand Up @@ -712,6 +720,10 @@ void PronunciationAssessmentConfiguredWithJson()
// Creates a speech recognizer.
auto recognizer = SpeechRecognizer::FromConfig(config, "en-US", audioConfig);

recognizer->SessionStarted.Connect([](const SessionEventArgs& e) {
std::cout << "SESSION ID: " << e.SessionId << std::endl;
});

pronunciationConfig->ApplyTo(recognizer);

// Starts speech recognition, and returns after a single utterance is recognized.
Expand Down Expand Up @@ -769,6 +781,10 @@ void PronunciationAssessmentWithContentAssessment()
// Creates a speech recognizer.
auto recognizer = SpeechRecognizer::FromConfig(config, "en-US", audioConfig);

recognizer->SessionStarted.Connect([](const SessionEventArgs& e) {
std::cout << "SESSION ID: " << e.SessionId << std::endl;
});

pronunciationConfig->ApplyTo(recognizer);

vector<string> recognizedTexts;
Expand Down
9 changes: 9 additions & 0 deletions samples/csharp/dotnet-windows/console/samples/samples.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,15 @@
<Link>LanguageDetection_enUS.wav</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="zhcn_continuous_mode_sample.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="zhcn_continuous_mode_sample.wav">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="zhcn_short_dummy_sample.wav">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
</ItemGroup>
<ItemGroup>
<PackageReference Include="DiffPlex">
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
秋天总是那么富有诗意。树叶渐渐变红,街道旁的银杏树也开始落叶。人们穿上厚重的外套,享受着凉爽的秋风。黄昏时分,夕阳洒在街道上,给忙碌的一天增添了一抹温暖。无论是散步还是小憩,这个季节总能带来宁静和满足。清晨,薄雾笼罩大地,空气中弥漫着一丝清新的凉意。中午阳光明媚,照在身上暖洋洋的,仿佛是一场心灵的抚慰。傍晚时分,天空被染成了金黄和橙红,街上的行人脚步也不由得慢了下来,享受这份静谧和美好。你最喜欢哪个季节?
Binary file not shown.
Binary file not shown.
151 changes: 136 additions & 15 deletions samples/csharp/sharedcontent/console/speech_recognition_samples.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
using DiffPlex;
using DiffPlex.DiffBuilder;
using DiffPlex.DiffBuilder.Model;
using System.Text.Json;
using System.Text;
// </toplevel>

namespace MicrosoftSpeechSDKSamples
Expand Down Expand Up @@ -995,6 +997,10 @@ public static async Task PronunciationAssessmentWithMicrophoneAsync()
// Creates a speech recognizer for the specified language, using microphone as audio input.
using (var recognizer = new SpeechRecognizer(config, language))
{
recognizer.SessionStarted += (s, e) => {
Console.WriteLine($"SESSION ID: {e.SessionId}");
};

while (true)
{
// Receives reference text from console input.
Expand Down Expand Up @@ -1096,6 +1102,10 @@ private static async Task PronunciationAssessmentWithStreamInternalAsync(SpeechC

pronAssessmentConfig.EnableProsodyAssessment();

speechRecognizer.SessionStarted += (s, e) => {
Console.WriteLine($"SESSION ID: {e.SessionId}");
};

pronAssessmentConfig.ApplyTo(speechRecognizer);

audioInputStream.Write(audioData);
Expand Down Expand Up @@ -1142,6 +1152,11 @@ public static async Task PronunciationAssessmentConfiguredWithJson()
// Creates a speech recognizer for the specified language
using (var recognizer = new SpeechRecognizer(config, language, audioConfig))
{

recognizer.SessionStarted += (s, e) => {
Console.WriteLine($"SESSION ID: {e.SessionId}");
};

// Starts recognizing.
pronunciationConfig.ApplyTo(recognizer);

Expand Down Expand Up @@ -1185,37 +1200,110 @@ public static async Task PronunciationAssessmentConfiguredWithJson()
}
}

/// <summary>
/// Obtains the word segmentation of <paramref name="referenceText"/> by running one
/// pronunciation-assessment recognition (miscue enabled) against a wave file and reading
/// the word list from the service's JSON response.
/// </summary>
/// <param name="waveFilename">Path of the wave file used as audio input for the recognition.</param>
/// <param name="referenceText">Reference text passed to the pronunciation assessment config.</param>
/// <param name="language">Recognition language; written to <paramref name="speechConfig"/>.</param>
/// <param name="speechConfig">
/// Speech configuration used to build the recognizer. Its <c>SpeechRecognitionLanguage</c>
/// is overwritten with <paramref name="language"/> as a side effect.
/// </param>
/// <returns>
/// The words from the first NBest entry whose <c>ErrorType</c> is not "Insertion", in response order;
/// <c>null</c> when the recognition ends in NoMatch, is canceled, or finishes with any other reason.
/// </returns>
public static List<string> GetReferenceWords(string waveFilename, string referenceText, string language, SpeechConfig speechConfig)
{
    // AudioConfig and SpeechRecognizer hold native resources; dispose them deterministically.
    using (var audioConfig = AudioConfig.FromWavFileInput(waveFilename))
    {
        speechConfig.SpeechRecognitionLanguage = language;

        using (var speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig))
        {
            // Create pronunciation assessment config, set grading system, granularity, and enable miscue based on requirement
            bool enableMiscue = true;
            var pronunciationConfig = new PronunciationAssessmentConfig(referenceText,
                GradingSystem.HundredMark, Granularity.Phoneme, enableMiscue);

            // Apply pronunciation assessment config to speech recognizer
            pronunciationConfig.ApplyTo(speechRecognizer);

            // Perform speech recognition. This method is synchronous, so it blocks on the async call.
            var result = speechRecognizer.RecognizeOnceAsync().Result;

            if (result.Reason == ResultReason.RecognizedSpeech)
            {
                var referenceWords = new List<string>();
                var responseJson = result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult);

                // JsonDocument rents pooled buffers, so it must be disposed once the words are copied out.
                using (JsonDocument doc = JsonDocument.Parse(responseJson))
                {
                    // Only the first (best) NBest entry is inspected.
                    JsonElement words = doc.RootElement.GetProperty("NBest")[0].GetProperty("Words");
                    foreach (JsonElement item in words.EnumerateArray())
                    {
                        string wordItem = item.GetProperty("Word").GetString();
                        string errorTypeItem = item.GetProperty("PronunciationAssessment").GetProperty("ErrorType").GetString();

                        // Skip "Insertion" entries.
                        // NOTE(review): presumably these are words heard in the audio but absent from the reference text - confirm.
                        if (errorTypeItem != "Insertion")
                        {
                            referenceWords.Add(wordItem);
                        }
                    }
                }

                return referenceWords;
            }

            if (result.Reason == ResultReason.NoMatch)
            {
                Console.WriteLine("No speech could be recognized");
            }
            else if (result.Reason == ResultReason.Canceled)
            {
                var cancellation = CancellationDetails.FromResult(result);
                Console.WriteLine($"Speech Recognition canceled: {cancellation.Reason}");
                if (cancellation.Reason == CancellationReason.Error)
                {
                    Console.WriteLine($"Error details: {cancellation.ErrorDetails}");
                }
            }

            // Every non-recognized outcome reports failure as null, as before.
            return null;
        }
    }
}

// Continuous pronunciation assessment from file
// See more information at https://aka.ms/csspeech/pa
public static async Task PronunciationAssessmentContinuousWithFile()
{
// Creates an instance of a speech config with specified subscription key and service region.
// Replace with your own subscription key and service region (e.g., "westus").
var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");
var waveFileName = @"zhcn_continuous_mode_sample.wav";
var scriptFileName = @"zhcn_continuous_mode_sample.txt";
var referenceText = File.ReadAllText(scriptFileName);

// Switch to other languages for example Spanish, change language "en-US" to "es-ES". Language name is not case sensitive.
var language = "zh-CN";
if (language == "zh-CN")
{
Console.OutputEncoding = Encoding.UTF8;
}

// Creates a speech recognizer using file as audio input.
using (var audioInput = AudioConfig.FromWavFileInput(@"whatstheweatherlike.wav"))
using (var audioInput = AudioConfig.FromWavFileInput(waveFileName))
{
// Switch to other languages for example Spanish, change language "en-US" to "es-ES". Language name is not case sensitive.
var language = "en-US";

using (var recognizer = new SpeechRecognizer(config, language, audioInput))
{
var referenceText = "what's the weather like";

bool enableMiscue = true;

var pronConfig = new PronunciationAssessmentConfig(referenceText, GradingSystem.HundredMark, Granularity.Phoneme, enableMiscue);

pronConfig.EnableProsodyAssessment();


recognizer.SessionStarted += (s, e) => {
Console.WriteLine($"SESSION ID: {e.SessionId}");
};

pronConfig.ApplyTo(recognizer);

var recognizedWords = new List<string>();
var pronWords = new List<Word>();
var finalWords = new List<Word>();
var fluency_scores = new List<double>();
var prosody_scores = new List<double>();
var startOffset = 0L;
var endOffset = 0L;
var durations = new List<int>();
var done = false;

Expand All @@ -1234,7 +1322,6 @@ public static async Task PronunciationAssessmentContinuousWithFile()
var pronResult = PronunciationAssessmentResult.FromResult(e.Result);
Console.WriteLine($" Accuracy score: {pronResult.AccuracyScore}, prosody score:{pronResult.ProsodyScore}, pronunciation score: {pronResult.PronunciationScore}, completeness score: {pronResult.CompletenessScore}, fluency score: {pronResult.FluencyScore}");

fluency_scores.Add(pronResult.FluencyScore);
prosody_scores.Add(pronResult.ProsodyScore);

foreach(var word in pronResult.Words)
Expand All @@ -1245,9 +1332,12 @@ public static async Task PronunciationAssessmentContinuousWithFile()

foreach (var result in e.Result.Best())
{
durations.Add(result.Words.Sum(item => item.Duration));
durations.AddRange(result.Words.Select(item => item.Duration + 100000).ToList());
recognizedWords.AddRange(result.Words.Select(item => item.Word).ToList());

if (startOffset == 0) startOffset = result.Words.First().Offset;

endOffset = result.Words.Last().Offset + result.Words.Last().Duration + 100000;
}
};

Expand All @@ -1263,13 +1353,26 @@ public static async Task PronunciationAssessmentContinuousWithFile()
// Waits for completion.
await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);

// set the duration of Word in pronWords
pronWords.Zip(durations, (word, duration) => word.Duration = duration).ToList();

// For continuous pronunciation assessment mode, the service won't return the words with `Insertion` or `Omission`
// even if miscue is enabled.
// We need to compare with the reference text after received all recognized words to get these error words.
string[] referenceWords = referenceText.ToLower().Split(' ');
for (int j = 0; j < referenceWords.Length; j++)
string[] referenceWords;

if (language == "zh-CN")
{
referenceWords[j] = Regex.Replace(referenceWords[j], "^[\\p{P}\\s]+|[\\p{P}\\s]+$", "");
// Split words for Chinese using the reference text and any short wave file
referenceWords = GetReferenceWords(@"zhcn_short_dummy_sample.wav", referenceText, language, config).ToArray();
}
else
{
referenceWords = referenceText.ToLower().Split(' ');
for (int j = 0; j < referenceWords.Length; j++)
{
referenceWords[j] = Regex.Replace(referenceWords[j], "^[\\p{P}\\s]+|[\\p{P}\\s]+$", "");
}
}

if (enableMiscue)
Expand Down Expand Up @@ -1297,7 +1400,7 @@ public static async Task PronunciationAssessmentContinuousWithFile()

if (delta.Type == ChangeType.Inserted || delta.Type == ChangeType.Modified)
{
Word w = pronWords[currentIdx];
Word w = new Word(pronWords[currentIdx].WordText, pronWords[currentIdx].ErrorType, pronWords[currentIdx].AccuracyScore, pronWords[currentIdx].Duration);
if (w.ErrorType == "None")
{
w.ErrorType = "Insertion";
Expand All @@ -1321,13 +1424,20 @@ public static async Task PronunciationAssessmentContinuousWithFile()
var prosodyScore = prosody_scores.Average();

// Recalculate fluency score
var fluencyScore = fluency_scores.Zip(durations, (x, y) => x * y).Sum() / durations.Sum();
var durations_sum = finalWords.Where(item => item.ErrorType == "None")
.Sum(item => item.Duration);

var fluencyScore = durations_sum * 1.0 / (endOffset - startOffset) * 100;

// Calculate whole completeness score
var completenessScore = (double)pronWords.Count(item => item.ErrorType == "None") / referenceWords.Length * 100;
var completenessScore = (double)finalWords.Count(item => item.ErrorType == "None") / filteredWords.Count() * 100;
completenessScore = completenessScore <= 100 ? completenessScore : 100;

Console.WriteLine("Paragraph accuracy score: {0}, prosody score: {1} completeness score: {2}, fluency score: {3}", accuracyScore, prosodyScore, completenessScore, fluencyScore);
List<double> scores_list = new List<double> {accuracyScore, prosodyScore, completenessScore, fluencyScore };

double pronunciationScore = scores_list.Sum(n => n * 0.2) + scores_list.Min() * 0.2;

Console.WriteLine("Paragraph accuracy score: {0}, prosody score: {1} completeness score: {2}, fluency score: {3}, pronunciation score: {4}", accuracyScore, prosodyScore, completenessScore, fluencyScore, pronunciationScore);

for (int idx = 0; idx < finalWords.Count(); idx++)
{
Expand Down Expand Up @@ -1364,6 +1474,10 @@ public static async Task PronunciationAssessmentWithContentAssessment()
pronConfig.EnableProsodyAssessment();
pronConfig.EnableContentAssessmentWithTopic(theTopic);

recognizer.SessionStarted += (s, e) => {
Console.WriteLine($"SESSION ID: {e.SessionId}");
};

pronConfig.ApplyTo(recognizer);

var recognizedTexts = new List<string>();
Expand Down Expand Up @@ -1921,18 +2035,25 @@ public class Word
/// <summary>The text of the word.</summary>
public string WordText { get; set; }
/// <summary>
/// Error classification of the word as a string; values used elsewhere in this file
/// include "None" and "Insertion".
/// </summary>
public string ErrorType { get; set; }
/// <summary>Accuracy score of the word; constructors that do not take a score set it to 0.</summary>
public double AccuracyScore { get; set; }
/// <summary>
/// Duration attributed to the word; constructors that do not take a duration set it to 0.
/// NOTE(review): the unit is not visible here - presumably the Speech SDK's 100-nanosecond ticks; confirm.
/// </summary>
public double Duration { get; set; }

/// <summary>
/// Creates a word from its text and error type; the accuracy score and duration both start at 0.
/// </summary>
/// <param name="wordText">Text of the word.</param>
/// <param name="errorType">Error classification of the word.</param>
public Word(string wordText, string errorType)
{
    // Numeric members are zeroed first; the assignments are independent of each other.
    this.AccuracyScore = this.Duration = 0;
    this.ErrorType = errorType;
    this.WordText = wordText;
}

/// <summary>
/// Creates a word from its text, error type and accuracy score; the duration starts at 0.
/// </summary>
/// <param name="wordText">Text of the word.</param>
/// <param name="errorType">Error classification of the word.</param>
/// <param name="accuracyScore">Accuracy score to store for the word.</param>
public Word(string wordText, string errorType, double accuracyScore)
    : this(wordText, errorType)
{
    // The two-argument constructor has already set the text, error type and zero defaults.
    this.AccuracyScore = accuracyScore;
}

/// <summary>
/// Creates a fully specified word: text, error type, accuracy score and duration.
/// </summary>
/// <param name="wordText">Text of the word.</param>
/// <param name="errorType">Error classification of the word.</param>
/// <param name="accuracyScore">Accuracy score to store for the word.</param>
/// <param name="duration">Duration to store for the word.</param>
public Word(string wordText, string errorType, double accuracyScore, double duration)
    : this(wordText, errorType, accuracyScore)
{
    // Everything except the duration is handled by the three-argument constructor.
    this.Duration = duration;
}
}

}
Loading

0 comments on commit bb361bc

Please sign in to comment.