diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ebb016749..45d7fadba0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -150,6 +150,7 @@ Breaking changes in this release: - The `botframework-webchat` package now uses CSS modules for styling purposes, in PR [#5666](https://github.com/microsoft/BotFramework-WebChat/pull/5666), in PR [#5677](https://github.com/microsoft/BotFramework-WebChat/pull/5677) by [@OEvgeny](https://github.com/OEvgeny) - 👷🏻 Added `npm run build-browser` script for building test harness package only, in PR [#5667](https://github.com/microsoft/BotFramework-WebChat/pull/5667), by [@compulim](https://github.com/compulim) - Added pull-based capabilities system for dynamically discovering adapter capabilities at runtime, in PR [#5679](https://github.com/microsoft/BotFramework-WebChat/pull/5679), by [@pranavjoshi001](https://github.com/pranavjoshi001) +- Added Speech-to-Speech (S2S) support for real-time voice conversations, in PR [#5654](https://github.com/microsoft/BotFramework-WebChat/pull/5654), by [@pranavjoshi001](https://github.com/pranavjoshi001) ### Changed diff --git a/__tests__/assets/esm/speechToSpeech/mockAudioPlayback.js b/__tests__/assets/esm/speechToSpeech/mockAudioPlayback.js new file mode 100644 index 0000000000..f7bba9cbf5 --- /dev/null +++ b/__tests__/assets/esm/speechToSpeech/mockAudioPlayback.js @@ -0,0 +1,23 @@ +/* global AudioContext */ + +/** + * Mocks AudioContext.createBuffer to return buffers with a minimum duration. + * + */ +export function setupMockAudioPlayback() { + const originalCreateBuffer = AudioContext.prototype.createBuffer; + + AudioContext.prototype.createBuffer = function (numberOfChannels, length, sampleRate) { + // Ensure minimum duration of 0.5 seconds for testing + const minSamples = Math.floor(sampleRate * 0.5); + const actualLength = Math.max(length, minSamples); + + return originalCreateBuffer.call(this, numberOfChannels, actualLength, sampleRate); + }; + + return { + restore: () => { + AudioContext.prototype.createBuffer = originalCreateBuffer; + } + }; +} diff --git a/__tests__/assets/esm/speechToSpeech/mockMediaDevices.js b/__tests__/assets/esm/speechToSpeech/mockMediaDevices.js new file mode 100644 index 0000000000..fb69332445 --- /dev/null +++ b/__tests__/assets/esm/speechToSpeech/mockMediaDevices.js @@ -0,0 +1,87 @@ +/* global clearInterval, MessageChannel, navigator, setInterval, URL, window */ + +/** + * Mocks browser audio APIs for speechToSpeech testing.
+ * + * - Intercepts AudioContext.audioWorklet.addModule() to prevent blob execution + * - Mocks AudioWorkletNode for the 'audio-recorder' processor + * - Mocks navigator.mediaDevices.getUserMedia() to return a test audio stream + */ +export function setupMockMediaDevices() { + const OriginalAudioContext = window.AudioContext; + + // Intercept AudioContext to mock audioWorklet.addModule + window.AudioContext = function (options) { + const ctx = new OriginalAudioContext(options); + + ctx.audioWorklet.addModule = url => { + if (url.startsWith('blob:')) { + URL.revokeObjectURL(url); + } + return Promise.resolve(); + }; + + return ctx; + }; + + Object.setPrototypeOf(window.AudioContext, OriginalAudioContext); + window.AudioContext.prototype = OriginalAudioContext.prototype; + + // Mock AudioWorkletNode - uses GainNode as base so source.connect() works + window.AudioWorkletNode = function (context, name, options) { + const node = context.createGain(); + const channel = new MessageChannel(); + let recording = false; + let intervalId = null; + + node.port = channel.port1; + + // port1 is exposed as worklet.port to the real code + // Real code sends to port1 → received by port2.onmessage (commands) + // Mock sends from port2 → received by port1.onmessage (audio chunks) + channel.port2.onmessage = ({ data }) => { + if (data.command === 'START') { + recording = true; + const bufferSize = options?.processorOptions?.bufferSize || 2400; + + // Send chunks at ~100ms intervals while recording + // Use port2.postMessage so port1.onmessage (set by real code) receives it + intervalId = setInterval(() => { + if (recording) { + channel.port2.postMessage({ eventType: 'audio', audioData: new Float32Array(bufferSize) }); + } + }, 100); + } else if (data.command === 'STOP') { + recording = false; + if (intervalId) { + clearInterval(intervalId); + intervalId = null; + } + } + }; + + return node; + }; + + // Mock getUserMedia with oscillator-based test stream + navigator.mediaDevices.getUserMedia = constraints => { + const sampleRate = constraints?.audio?.sampleRate || 24000; + const ctx = new OriginalAudioContext({ sampleRate }); + const oscillator = ctx.createOscillator(); + const destination = ctx.createMediaStreamDestination(); + + oscillator.connect(destination); + oscillator.start(); + + destination.stream.getTracks().forEach(track => { + const originalStop = track.stop.bind(track); + track.stop = () => { + oscillator.stop(); + ctx.close(); + originalStop(); + }; + }); + + return Promise.resolve(destination.stream); + }; +} diff --git a/__tests__/html2/speechToSpeech/barge.in.html b/__tests__/html2/speechToSpeech/barge.in.html new file mode 100644 index 0000000000..d12f20c51f --- /dev/null +++ b/__tests__/html2/speechToSpeech/barge.in.html @@ -0,0 +1,197 @@ + +
+ + + + + + + + + + + + + + + + diff --git a/__tests__/html2/speechToSpeech/basic.sendbox.with.mic.html b/__tests__/html2/speechToSpeech/basic.sendbox.with.mic.html new file mode 100644 index 0000000000..56b2608a7f --- /dev/null +++ b/__tests__/html2/speechToSpeech/basic.sendbox.with.mic.html @@ -0,0 +1,80 @@ + + + + + + + + + + + + + + + + + diff --git a/__tests__/html2/speechToSpeech/basic.sendbox.with.mic.html.snap-1.png b/__tests__/html2/speechToSpeech/basic.sendbox.with.mic.html.snap-1.png new file mode 100644 index 0000000000..de8e85d3d3 Binary files /dev/null and b/__tests__/html2/speechToSpeech/basic.sendbox.with.mic.html.snap-1.png differ diff --git a/__tests__/html2/speechToSpeech/csp.recording.html b/__tests__/html2/speechToSpeech/csp.recording.html new file mode 100644 index 0000000000..eda24c8721 --- /dev/null +++ b/__tests__/html2/speechToSpeech/csp.recording.html @@ -0,0 +1,138 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/__tests__/html2/speechToSpeech/dtmf.input.html b/__tests__/html2/speechToSpeech/dtmf.input.html new file mode 100644 index 0000000000..124a7605be --- /dev/null +++ b/__tests__/html2/speechToSpeech/dtmf.input.html @@ -0,0 +1,196 @@ + + + + + + + + + + + + + + + + + + + diff --git a/__tests__/html2/speechToSpeech/dtmf.input.html.snap-1.png b/__tests__/html2/speechToSpeech/dtmf.input.html.snap-1.png new file mode 100644 index 0000000000..c29b07065a Binary files /dev/null and b/__tests__/html2/speechToSpeech/dtmf.input.html.snap-1.png differ diff --git a/__tests__/html2/speechToSpeech/dtmf.input.html.snap-2.png b/__tests__/html2/speechToSpeech/dtmf.input.html.snap-2.png new file mode 100644 index 0000000000..be9f100b54 Binary files /dev/null and b/__tests__/html2/speechToSpeech/dtmf.input.html.snap-2.png differ diff --git a/__tests__/html2/speechToSpeech/happy.path.html b/__tests__/html2/speechToSpeech/happy.path.html new file mode 100644 index 0000000000..e326982a9c --- /dev/null +++ b/__tests__/html2/speechToSpeech/happy.path.html @@ -0,0 +1,175 @@ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/__tests__/html2/speechToSpeech/happy.path.html.snap-1.png b/__tests__/html2/speechToSpeech/happy.path.html.snap-1.png new file mode 100644 index 0000000000..f9e16e326a Binary files /dev/null and b/__tests__/html2/speechToSpeech/happy.path.html.snap-1.png differ diff --git a/__tests__/html2/speechToSpeech/happy.path.html.snap-2.png b/__tests__/html2/speechToSpeech/happy.path.html.snap-2.png new file mode 100644 index 0000000000..d3a3724d22 Binary files /dev/null and b/__tests__/html2/speechToSpeech/happy.path.html.snap-2.png differ diff --git a/__tests__/html2/speechToSpeech/multiple.turns.html b/__tests__/html2/speechToSpeech/multiple.turns.html new file mode 100644 index 0000000000..7a5ccc5971 --- /dev/null +++ b/__tests__/html2/speechToSpeech/multiple.turns.html @@ -0,0 +1,331 @@ + + + + + + + + + + + + + + + + + + + diff --git a/__tests__/html2/speechToSpeech/multiple.turns.html.snap-1.png b/__tests__/html2/speechToSpeech/multiple.turns.html.snap-1.png new file mode 100644 index 0000000000..65531ebe9c Binary files /dev/null and b/__tests__/html2/speechToSpeech/multiple.turns.html.snap-1.png differ diff --git a/__tests__/html2/speechToSpeech/outgoing.audio.interval.html b/__tests__/html2/speechToSpeech/outgoing.audio.interval.html new file mode 100644 index 0000000000..d2167aba2a --- /dev/null +++ b/__tests__/html2/speechToSpeech/outgoing.audio.interval.html @@ -0,0 +1,140 @@ + + + + + + + + + + + + + + + + + + + diff --git 
a/packages/api/src/StyleOptions.ts b/packages/api/src/StyleOptions.ts index 875a02022f..109ae1e23a 100644 --- a/packages/api/src/StyleOptions.ts +++ b/packages/api/src/StyleOptions.ts @@ -998,6 +998,15 @@ type StyleOptions = { * New in 4.19.0. */ disableFileUpload?: boolean; + /** + * Controls microphone button visibility in Fluent theme send box. + * + * - `'auto'` - Show microphone button if the chat adapter supports voice (has voiceConfiguration capability) + * - `'hide'` - Do not show microphone button regardless of adapter capabilities + * + * @default 'auto' + */ + showMicrophoneButton?: 'auto' | 'hide'; }; // StrictStyleOptions is only used internally in Web Chat and for simplifying our code: diff --git a/packages/api/src/boot/hook.ts b/packages/api/src/boot/hook.ts index cd0cbff82b..dda4464d10 100644 --- a/packages/api/src/boot/hook.ts +++ b/packages/api/src/boot/hook.ts @@ -61,7 +61,9 @@ export { useSetNotification, useShouldSpeakIncomingActivity, useStartDictate, + useStartVoice, useStopDictate, + useStopVoice, useStyleOptions, useSubmitSendBox, useSuggestedActions, @@ -74,5 +76,6 @@ export { useUIState, useUserID, useUsername, - useVoiceSelector + useVoiceSelector, + useVoiceState } from '../hooks/index'; diff --git a/packages/api/src/boot/internal.ts b/packages/api/src/boot/internal.ts index 1bb222c21f..1deec63da8 100644 --- a/packages/api/src/boot/internal.ts +++ b/packages/api/src/boot/internal.ts @@ -1,4 +1,6 @@ export { default as LowPriorityDecoratorComposer } from '../decorator/internal/LowPriorityDecoratorComposer'; +export { default as usePostVoiceActivity } from '../hooks/internal/usePostVoiceActivity'; export { default as useSetDictateState } from '../hooks/internal/useSetDictateState'; +export { default as useShouldShowMicrophoneButton } from '../hooks/internal/useShouldShowMicrophoneButton'; export { LegacyActivityContextProvider, type LegacyActivityContextType } from '../legacy/LegacyActivityBridgeContext'; export { default as StyleOptionsComposer } from '../providers/StyleOptions/StyleOptionsComposer'; diff --git a/packages/api/src/decorator/ActivityBorder/ActivityBorderDecorator.tsx b/packages/api/src/decorator/ActivityBorder/ActivityBorderDecorator.tsx index e8a27830e5..15b100540c 100644 --- a/packages/api/src/decorator/ActivityBorder/ActivityBorderDecorator.tsx +++ b/packages/api/src/decorator/ActivityBorder/ActivityBorderDecorator.tsx @@ -1,4 +1,9 @@ -import { getActivityLivestreamingMetadata, type WebChatActivity } from 'botframework-webchat-core'; +import { + getActivityLivestreamingMetadata, + getVoiceActivityRole, + isVoiceActivity, + type WebChatActivity +} from 'botframework-webchat-core'; import React, { memo, useMemo, type ReactNode } from 'react'; import { @@ -25,7 +30,12 @@ function ActivityBorderDecorator({ activity, children }: ActivityBorderDecorator const { type } = getActivityLivestreamingMetadata(activity) || {}; return { - from: supportedActivityRoles.includes(activity?.from?.role) ? activity?.from?.role : undefined, + from: isVoiceActivity(activity) + ? getVoiceActivityRole(activity) + : supportedActivityRoles.includes(activity?.from?.role) + ? activity?.from?.role + : undefined, + modality: new Set(isVoiceActivity(activity) ? ['audio', 'text'] : ['text']), livestreamingState: type === 'final activity' ? 
'completing' diff --git a/packages/api/src/decorator/ActivityBorder/private/ActivityBorderDecoratorRequestContext.ts b/packages/api/src/decorator/ActivityBorder/private/ActivityBorderDecoratorRequestContext.ts index 0e7c379146..0c8e01453a 100644 --- a/packages/api/src/decorator/ActivityBorder/private/ActivityBorderDecoratorRequestContext.ts +++ b/packages/api/src/decorator/ActivityBorder/private/ActivityBorderDecoratorRequestContext.ts @@ -20,6 +20,14 @@ type ActivityBorderDecoratorRequest = Readonly<{ * - `undefined` - the sender is unknown */ from: 'bot' | 'channel' | `user` | undefined; + + /** + * Gets the modalities of the activity. + * + * - `'audio'` - the activity originated from audio/voice input + * - `'text'` - the activity originated from text input + */ + modality: Set<'audio' | 'text'>; }>; type ActivityBorderDecoratorRequestContextType = Readonly<{ @@ -30,6 +38,7 @@ const ActivityBorderDecoratorRequestContext = createContext
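The two ESM mock modules near the top of this diff are consumed by the HTML test pages under `__tests__/html2/speechToSpeech/` (whose markup is not reproduced above). The snippet below is only a sketch, based on the exported shapes of those modules — `setupMockMediaDevices()` and `setupMockAudioPlayback()`, the latter returning a `restore()` handle — and the import paths are illustrative, not taken from the test pages.

```ts
// Hypothetical test-page setup — the real wiring lives in the HTML tests, which this diff does not show.
import { setupMockAudioPlayback } from '/__tests__/assets/esm/speechToSpeech/mockAudioPlayback.js';
import { setupMockMediaDevices } from '/__tests__/assets/esm/speechToSpeech/mockMediaDevices.js';

// Install the browser audio mocks before Web Chat starts recording or playing audio:
// getUserMedia() returns an oscillator-backed stream and audioWorklet.addModule() becomes a no-op.
setupMockMediaDevices();

// Patch AudioContext.createBuffer so decoded chunks last at least 0.5 s; keep the handle to undo it.
const audioPlayback = setupMockAudioPlayback();

// ... drive the speech-to-speech scenario under test ...

// Restore the original AudioContext.createBuffer once the scenario finishes.
audioPlayback.restore();
```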
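For the runtime API surface added above (the `showMicrophoneButton` style option and the `useStartVoice` / `useStopVoice` / `useVoiceState` hooks), the diff only shows declarations and re-exports. The sketch below is a hypothetical consumer: it assumes the new hooks surface through the bundle's `hooks` export like the existing ones do, that `useStartVoice()` / `useStopVoice()` return parameterless callbacks, and that `useVoiceState()` follows the usual `[value]` hook shape — none of these signatures are shown in this PR. `Components.Composer`, `BasicWebChat`, `styleOptions` and `directLine` are existing Web Chat APIs.

```tsx
// Hypothetical usage sketch — hook signatures are assumed from the export names, not from this diff.
import React from 'react';
import { Components, hooks } from 'botframework-webchat';

const { BasicWebChat, Composer } = Components;
const { useStartVoice, useStopVoice, useVoiceState } = hooks;

// A minimal custom voice control, rendered inside <Composer> so the hooks have Web Chat context.
function VoiceControls() {
  // Assumption: useVoiceState() follows the usual Web Chat "[value]" hook shape.
  const [voiceState] = useVoiceState();
  // Assumption: useStartVoice() / useStopVoice() return parameterless callbacks.
  const startVoice = useStartVoice();
  const stopVoice = useStopVoice();

  return (
    <div>
      <button onClick={() => startVoice()} type="button">
        Start voice
      </button>
      <button onClick={() => stopVoice()} type="button">
        Stop voice
      </button>
      {/* Rendered verbatim; the possible voice state values are not documented in this diff. */}
      <small>{String(voiceState)}</small>
    </div>
  );
}

// 'hide' suppresses the built-in Fluent send box microphone button per the StyleOptions JSDoc above;
// 'auto' (the documented default) shows it when the chat adapter reports voice capability.
export default function App({ directLine }: Readonly<{ directLine: any }>) {
  return (
    <Composer directLine={directLine} styleOptions={{ showMicrophoneButton: 'hide' }}>
      <VoiceControls />
      <BasicWebChat />
    </Composer>
  );
}
```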