Skip to content

Commit

Permalink
Merge branch 'master' into rhurey/samples_1.41
Browse files Browse the repository at this point in the history
  • Loading branch information
yulin-li authored Oct 11, 2024
2 parents 318a15d + cbe01b6 commit 45e5bf8
Show file tree
Hide file tree
Showing 8 changed files with 402 additions and 26 deletions.
51 changes: 39 additions & 12 deletions samples/csharp/web/avatar/Controllers/AvatarController.cs
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,17 @@ public async Task<IActionResult> ConnectAvatar()
speechConfig.EndpointId = customVoiceEndpointId;
}

var speechSynthesizer = new SpeechSynthesizer(speechConfig);
var speechSynthesizer = new SpeechSynthesizer(speechConfig, null);
clientContext.SpeechSynthesizer = speechSynthesizer;

if (ClientSettings.EnableAudioAudit)
{
speechSynthesizer.Synthesizing += (o, e) =>
{
Console.WriteLine($"Audio chunk received: {e.Result.AudioData.Length} bytes.");
};
}

if (string.IsNullOrEmpty(GlobalVariables.IceToken))
{
return BadRequest("IceToken is missing or invalid.");
Expand All @@ -168,7 +176,7 @@ public async Task<IActionResult> ConnectAvatar()
{
iceTokenObj = new Dictionary<string, object>
{
{ "Urls", string.IsNullOrEmpty(_clientSettings.IceServerUrlRemote) ? [_clientSettings.IceServerUrl] : new[] { _clientSettings.IceServerUrlRemote } },
{ "Urls", string.IsNullOrEmpty(_clientSettings.IceServerUrlRemote) ? new JArray(_clientSettings.IceServerUrl) : new JArray(_clientSettings.IceServerUrlRemote) },
{ "Username", _clientSettings.IceServerUsername },
{ "Password", _clientSettings.IceServerPassword }
};
Expand All @@ -189,7 +197,7 @@ public async Task<IActionResult> ConnectAvatar()
var videoCrop = Request.Headers["VideoCrop"].FirstOrDefault() ?? "false";

// Configure avatar settings
var urlsArray = iceTokenObj?.TryGetValue("Urls", out var value) == true ? value as string[] : null;
var urlsArray = iceTokenObj?.TryGetValue("Urls", out var value) == true ? value as JArray : null;

var firstUrl = urlsArray?.FirstOrDefault()?.ToString();

Expand All @@ -213,7 +221,8 @@ public async Task<IActionResult> ConnectAvatar()
username = iceTokenObj!["Username"],
credential = iceTokenObj["Password"]
}
}
},
auditAudio = ClientSettings.EnableAudioAudit
}
},
format = new
Expand Down Expand Up @@ -255,7 +264,7 @@ public async Task<IActionResult> ConnectAvatar()
connection.SetMessageProperty("speech.config", "context", JsonConvert.SerializeObject(avatarConfig));

var speechSynthesisResult = speechSynthesizer.SpeakTextAsync("").Result;
Console.WriteLine($"Result ID: {speechSynthesisResult.ResultId}");
Console.WriteLine($"Result ID: {speechSynthesisResult.ResultId}");
if (speechSynthesisResult.Reason == ResultReason.Canceled)
{
var cancellationDetails = SpeechSynthesisCancellationDetails.FromResult(speechSynthesisResult);
Expand Down Expand Up @@ -456,7 +465,7 @@ public async Task HandleUserQuery(string userQuery, Guid clientId, HttpResponse
// We return some quick reply here before the chat API returns to mitigate.
if (ClientSettings.EnableQuickReply)
{
await SpeakWithQueue(ClientSettings.QuickReplies[new Random().Next(ClientSettings.QuickReplies.Count)], 2000, clientId);
await SpeakWithQueue(ClientSettings.QuickReplies[new Random().Next(ClientSettings.QuickReplies.Count)], 2000, clientId, httpResponse);
}

// Process the responseContent as needed
Expand Down Expand Up @@ -507,9 +516,13 @@ public async Task HandleUserQuery(string userQuery, Guid clientId, HttpResponse
responseToken = ClientSettings.OydDocRegex.Replace(responseToken, string.Empty);
}

await httpResponse.WriteAsync(responseToken).ConfigureAwait(false);
if (!ClientSettings.EnableDisplayTextAlignmentWithSpeech)
{
await httpResponse.WriteAsync(responseToken).ConfigureAwait(false);
}

assistantReply.Append(responseToken);
spokenSentence.Append(responseToken); // build up the spoken sentence
if (responseToken == "\n" || responseToken == "\n\n")
{
if (isFirstSentence)
Expand All @@ -520,13 +533,12 @@ public async Task HandleUserQuery(string userQuery, Guid clientId, HttpResponse
isFirstSentence = false;
}

await SpeakWithQueue(spokenSentence.ToString().Trim(), 0, clientId);
await SpeakWithQueue(spokenSentence.ToString(), 0, clientId, httpResponse);
spokenSentence.Clear();
}
else
{
responseToken = responseToken.Replace("\n", string.Empty);
spokenSentence.Append(responseToken); // build up the spoken sentence
if (responseToken.Length == 1 || responseToken.Length == 2)
{
foreach (var punctuation in ClientSettings.SentenceLevelPunctuations)
Expand All @@ -541,7 +553,7 @@ public async Task HandleUserQuery(string userQuery, Guid clientId, HttpResponse
isFirstSentence = false;
}

await SpeakWithQueue(spokenSentence.ToString().Trim(), 0, clientId);
await SpeakWithQueue(spokenSentence.ToString(), 0, clientId, httpResponse);
spokenSentence.Clear();
break;
}
Expand All @@ -553,11 +565,21 @@ public async Task HandleUserQuery(string userQuery, Guid clientId, HttpResponse

if (spokenSentence.Length > 0)
{
await SpeakWithQueue(spokenSentence.ToString().Trim(), 0, clientId);
await SpeakWithQueue(spokenSentence.ToString(), 0, clientId, httpResponse);
}

var assistantMessage = new AssistantChatMessage(assistantReply.ToString());
messages.Add(assistantMessage);

if (ClientSettings.EnableDisplayTextAlignmentWithSpeech)
{
while (clientContext.SpokenTextQueue.Count > 0)
{
await Task.Delay(200);
}

await Task.Delay(200);
}
}

public void InitializeChatContext(string systemPrompt, Guid clientId)
Expand All @@ -572,7 +594,7 @@ public void InitializeChatContext(string systemPrompt, Guid clientId)
}

// Speak the given text. If speech is already in progress, add the text to the queue instead. Used in the chat scenario.
public Task SpeakWithQueue(string text, int endingSilenceMs, Guid clientId)
public Task SpeakWithQueue(string text, int endingSilenceMs, Guid clientId, HttpResponse httpResponse)
{
var clientContext = _clientService.GetClientContext(clientId);

Expand All @@ -595,6 +617,11 @@ public Task SpeakWithQueue(string text, int endingSilenceMs, Guid clientId)
while (spokenTextQueue.Count > 0)
{
var currentText = spokenTextQueue.Dequeue();
if (ClientSettings.EnableDisplayTextAlignmentWithSpeech)
{
httpResponse.WriteAsync(currentText);
}

await SpeakText(currentText, ttsVoice!, personalVoiceSpeakerProfileId!, endingSilenceMs, clientId);
clientContext.LastSpeakTime = DateTime.UtcNow;
}
Expand Down
4 changes: 4 additions & 0 deletions samples/csharp/web/avatar/Models/ClientSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ public class ClientSettings

public static readonly bool EnableQuickReply = false;

public static readonly bool EnableDisplayTextAlignmentWithSpeech = false;

public static readonly bool EnableAudioAudit = false;

public string? SpeechRegion { get; set; }

public string? SpeechKey { get; set; }
Expand Down
2 changes: 1 addition & 1 deletion samples/csharp/web/avatar/Views/Home/chat.cshtml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
<label style="font-size: medium;" for="sttLocale">STT Locale(s):</label>
<input id="sttLocales" type="text" size="64" style="font-size: medium;" value="en-US"></input><br />
<label style="font-size: medium;" for="ttsVoice">TTS Voice:</label>
<input id="ttsVoice" type="text" size="32" style="font-size: medium;" value="en-US-JennyNeural"></input><br />
<input id="ttsVoice" type="text" size="32" style="font-size: medium;" value="en-US-AvaNeural"></input><br />
<label style="font-size: medium;" for="customVoiceEndpointId">Custom Voice Deployment ID (Endpoint ID):</label>
<input id="customVoiceEndpointId" type="text" size="32" style="font-size: medium;" value=""></input><br />
<label style="font-size: medium;" for="personalVoiceSpeakerProfileID">Personal Voice Speaker Profile ID:</label>
Expand Down
24 changes: 17 additions & 7 deletions samples/js/browser/avatar/js/chat.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ var messages = []
var messageInitiated = false
var dataSources = []
var sentenceLevelPunctuations = [ '.', '?', '!', ':', ';', '。', '?', '!', ':', ';' ]
var enableDisplayTextAlignmentWithSpeech = true
var enableQuickReply = false
var quickReplies = [ 'Let me take a look.', 'Let me check.', 'One moment, please.' ]
var byodDocRegex = new RegExp(/\[doc(\d+)\]/g)
Expand Down Expand Up @@ -322,6 +323,12 @@ function speakNext(text, endingSilenceMs = 0) {
ssml = `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='${ttsVoice}'><mstts:ttsembedding speakerProfileId='${personalVoiceSpeakerProfileID}'><mstts:leadingsilence-exact value='0'/>${htmlEncode(text)}<break time='${endingSilenceMs}ms' /></mstts:ttsembedding></voice></speak>`
}

if (enableDisplayTextAlignmentWithSpeech) {
let chatHistoryTextArea = document.getElementById('chatHistory')
chatHistoryTextArea.innerHTML += text.replace(/\n/g, '<br/>')
chatHistoryTextArea.scrollTop = chatHistoryTextArea.scrollHeight
}

lastSpeakTime = new Date()
isSpeaking = true
document.getElementById('stopSpeaking').disabled = false
Expand Down Expand Up @@ -506,17 +513,18 @@ function handleUserQuery(userQuery, userQueryHTML, imgUrlPath) {
// console.log(`Current token: ${responseToken}`)

if (responseToken === '\n' || responseToken === '\n\n') {
speak(spokenSentence.trim())
spokenSentence += responseToken
speak(spokenSentence)
spokenSentence = ''
} else {
responseToken = responseToken.replace(/\n/g, '')
spokenSentence += responseToken // build up the spoken sentence

responseToken = responseToken.replace(/\n/g, '')
if (responseToken.length === 1 || responseToken.length === 2) {
for (let i = 0; i < sentenceLevelPunctuations.length; ++i) {
let sentenceLevelPunctuation = sentenceLevelPunctuations[i]
if (responseToken.startsWith(sentenceLevelPunctuation)) {
speak(spokenSentence.trim())
speak(spokenSentence)
spokenSentence = ''
break
}
Expand All @@ -531,9 +539,11 @@ function handleUserQuery(userQuery, userQueryHTML, imgUrlPath) {
}
})

chatHistoryTextArea.innerHTML += `${displaySentence}`
chatHistoryTextArea.scrollTop = chatHistoryTextArea.scrollHeight
displaySentence = ''
if (!enableDisplayTextAlignmentWithSpeech) {
chatHistoryTextArea.innerHTML += displaySentence.replace(/\n/g, '<br/>')
chatHistoryTextArea.scrollTop = chatHistoryTextArea.scrollHeight
displaySentence = ''
}

// Continue reading the next chunk
return read()
Expand All @@ -545,7 +555,7 @@ function handleUserQuery(userQuery, userQueryHTML, imgUrlPath) {
})
.then(() => {
if (spokenSentence !== '') {
speak(spokenSentence.trim())
speak(spokenSentence)
spokenSentence = ''
}

Expand Down
Loading

0 comments on commit 45e5bf8

Please sign in to comment.