diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0ebb016749..45d7fadba0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -150,6 +150,7 @@ Breaking changes in this release:
 - The `botframework-webchat` package now uses CSS modules for styling purposes, in PR [#5666](https://github.com/microsoft/BotFramework-WebChat/pull/5666), in PR [#5677](https://github.com/microsoft/BotFramework-WebChat/pull/5677) by [@OEvgeny](https://github.com/OEvgeny)
 - πŸ‘·πŸ» Added `npm run build-browser` script for building test harness package only, in PR [#5667](https://github.com/microsoft/BotFramework-WebChat/pull/5667), by [@compulim](https://github.com/compulim)
 - Added pull-based capabilities system for dynamically discovering adapter capabilities at runtime, in PR [#5679](https://github.com/microsoft/BotFramework-WebChat/pull/5679), by [@pranavjoshi001](https://github.com/pranavjoshi001)
+- Added Speech-to-Speech (S2S) support for real-time voice conversations, in PR [#5654](https://github.com/microsoft/BotFramework-WebChat/pull/5654), by [@pranavjoshi001](https://github.com/pranavjoshi001)

 ### Changed

diff --git a/__tests__/assets/esm/speechToSpeech/mockAudioPlayback.js b/__tests__/assets/esm/speechToSpeech/mockAudioPlayback.js
new file mode 100644
index 0000000000..f7bba9cbf5
--- /dev/null
+++ b/__tests__/assets/esm/speechToSpeech/mockAudioPlayback.js
@@ -0,0 +1,23 @@
+/* global AudioContext */
+
+/**
+ * Mocks AudioContext.createBuffer to return buffers with minimum duration.
+ *
+ */
+export function setupMockAudioPlayback() {
+  const originalCreateBuffer = AudioContext.prototype.createBuffer;
+
+  AudioContext.prototype.createBuffer = function (numberOfChannels, length, sampleRate) {
+    // Ensure minimum duration of 0.5 seconds for testing
+    const minSamples = Math.floor(sampleRate * 0.5);
+    const actualLength = Math.max(length, minSamples);
+
+    return originalCreateBuffer.call(this, numberOfChannels, actualLength, sampleRate);
+  };
+
+  return {
+    restore: () => {
+      AudioContext.prototype.createBuffer = originalCreateBuffer;
+    }
+  };
+}
diff --git a/__tests__/assets/esm/speechToSpeech/mockMediaDevices.js b/__tests__/assets/esm/speechToSpeech/mockMediaDevices.js
new file mode 100644
index 0000000000..fb69332445
--- /dev/null
+++ b/__tests__/assets/esm/speechToSpeech/mockMediaDevices.js
@@ -0,0 +1,87 @@
+/* global clearInterval, MessageChannel, navigator, setInterval, URL, window */
+
+/**
+ * Mocks browser audio APIs for speechToSpeech testing.
+ * + * - Intercepts AudioContext.audioWorklet.addModule() to prevent blob execution + * - Mocks AudioWorkletNode for the 'audio-recorder' processor + * - Mocks navigator.mediaDevices.getUserMedia() to return a test audio stream + */ +export function setupMockMediaDevices() { + const OriginalAudioContext = window.AudioContext; + + // Intercept AudioContext to mock audioWorklet.addModule + window.AudioContext = function (options) { + const ctx = new OriginalAudioContext(options); + + ctx.audioWorklet.addModule = url => { + if (url.startsWith('blob:')) { + URL.revokeObjectURL(url); + } + return Promise.resolve(); + }; + + return ctx; + }; + + Object.setPrototypeOf(window.AudioContext, OriginalAudioContext); + window.AudioContext.prototype = OriginalAudioContext.prototype; + + // Mock AudioWorkletNode - uses GainNode as base so source.connect() works + window.AudioWorkletNode = function (context, name, options) { + const node = context.createGain(); + const channel = new MessageChannel(); + let recording = false; + let intervalId = null; + + node.port = channel.port1; + + // port1 is exposed as worklet.port to the real code + // Real code sends to port1 β†’ received by port2.onmessage (commands) + // Mock sends from port2 β†’ received by port1.onmessage (audio chunks) + channel.port2.onmessage = ({ data }) => { + if (data.command === 'START') { + recording = true; + const bufferSize = options?.processorOptions?.bufferSize || 2400; + + // Send chunks at ~100ms intervals while recording + // Use port2.postMessage so port1.onmessage (set by real code) receives it + intervalId = setInterval(() => { + if (recording) { + channel.port2.postMessage({ eventType: 'audio', audioData: new Float32Array(bufferSize) }); + } + }, 100); + } else if (data.command === 'STOP') { + recording = false; + if (intervalId) { + clearInterval(intervalId); + intervalId = null; + } + } + }; + + return node; + }; + + // Mock getUserMedia with oscillator-based test stream + navigator.mediaDevices.getUserMedia = constraints => { + const sampleRate = constraints?.audio?.sampleRate || 24000; + const ctx = new OriginalAudioContext({ sampleRate }); + const oscillator = ctx.createOscillator(); + const destination = ctx.createMediaStreamDestination(); + + oscillator.connect(destination); + oscillator.start(); + + destination.stream.getTracks().forEach(track => { + const originalStop = track.stop.bind(track); + track.stop = () => { + oscillator.stop(); + ctx.close(); + originalStop(); + }; + }); + + return Promise.resolve(destination.stream); + }; +} diff --git a/__tests__/html2/speechToSpeech/barge.in.html b/__tests__/html2/speechToSpeech/barge.in.html new file mode 100644 index 0000000000..d12f20c51f --- /dev/null +++ b/__tests__/html2/speechToSpeech/barge.in.html @@ -0,0 +1,197 @@ + + + + + + + + + + + + + +
+ + + + + diff --git a/__tests__/html2/speechToSpeech/basic.sendbox.with.mic.html b/__tests__/html2/speechToSpeech/basic.sendbox.with.mic.html new file mode 100644 index 0000000000..56b2608a7f --- /dev/null +++ b/__tests__/html2/speechToSpeech/basic.sendbox.with.mic.html @@ -0,0 +1,80 @@ + + + + + + + + + + + + + +
+ + + diff --git a/__tests__/html2/speechToSpeech/basic.sendbox.with.mic.html.snap-1.png b/__tests__/html2/speechToSpeech/basic.sendbox.with.mic.html.snap-1.png new file mode 100644 index 0000000000..de8e85d3d3 Binary files /dev/null and b/__tests__/html2/speechToSpeech/basic.sendbox.with.mic.html.snap-1.png differ diff --git a/__tests__/html2/speechToSpeech/csp.recording.html b/__tests__/html2/speechToSpeech/csp.recording.html new file mode 100644 index 0000000000..eda24c8721 --- /dev/null +++ b/__tests__/html2/speechToSpeech/csp.recording.html @@ -0,0 +1,138 @@ + + + + + + + + + + + + + + + +
+ + + + diff --git a/__tests__/html2/speechToSpeech/dtmf.input.html b/__tests__/html2/speechToSpeech/dtmf.input.html new file mode 100644 index 0000000000..124a7605be --- /dev/null +++ b/__tests__/html2/speechToSpeech/dtmf.input.html @@ -0,0 +1,196 @@ + + + + + + + + + + + + + +
+ + + + + diff --git a/__tests__/html2/speechToSpeech/dtmf.input.html.snap-1.png b/__tests__/html2/speechToSpeech/dtmf.input.html.snap-1.png new file mode 100644 index 0000000000..c29b07065a Binary files /dev/null and b/__tests__/html2/speechToSpeech/dtmf.input.html.snap-1.png differ diff --git a/__tests__/html2/speechToSpeech/dtmf.input.html.snap-2.png b/__tests__/html2/speechToSpeech/dtmf.input.html.snap-2.png new file mode 100644 index 0000000000..be9f100b54 Binary files /dev/null and b/__tests__/html2/speechToSpeech/dtmf.input.html.snap-2.png differ diff --git a/__tests__/html2/speechToSpeech/happy.path.html b/__tests__/html2/speechToSpeech/happy.path.html new file mode 100644 index 0000000000..e326982a9c --- /dev/null +++ b/__tests__/html2/speechToSpeech/happy.path.html @@ -0,0 +1,175 @@ + + + + + + + + + + + + + +
+ + + + \ No newline at end of file diff --git a/__tests__/html2/speechToSpeech/happy.path.html.snap-1.png b/__tests__/html2/speechToSpeech/happy.path.html.snap-1.png new file mode 100644 index 0000000000..f9e16e326a Binary files /dev/null and b/__tests__/html2/speechToSpeech/happy.path.html.snap-1.png differ diff --git a/__tests__/html2/speechToSpeech/happy.path.html.snap-2.png b/__tests__/html2/speechToSpeech/happy.path.html.snap-2.png new file mode 100644 index 0000000000..d3a3724d22 Binary files /dev/null and b/__tests__/html2/speechToSpeech/happy.path.html.snap-2.png differ diff --git a/__tests__/html2/speechToSpeech/multiple.turns.html b/__tests__/html2/speechToSpeech/multiple.turns.html new file mode 100644 index 0000000000..7a5ccc5971 --- /dev/null +++ b/__tests__/html2/speechToSpeech/multiple.turns.html @@ -0,0 +1,331 @@ + + + + + + + + + + + + + +
+ + + + + diff --git a/__tests__/html2/speechToSpeech/multiple.turns.html.snap-1.png b/__tests__/html2/speechToSpeech/multiple.turns.html.snap-1.png new file mode 100644 index 0000000000..65531ebe9c Binary files /dev/null and b/__tests__/html2/speechToSpeech/multiple.turns.html.snap-1.png differ diff --git a/__tests__/html2/speechToSpeech/outgoing.audio.interval.html b/__tests__/html2/speechToSpeech/outgoing.audio.interval.html new file mode 100644 index 0000000000..d2167aba2a --- /dev/null +++ b/__tests__/html2/speechToSpeech/outgoing.audio.interval.html @@ -0,0 +1,140 @@ + + + + + + + + + + + + + +
+ + + + + diff --git a/packages/api/src/StyleOptions.ts b/packages/api/src/StyleOptions.ts index 875a02022f..109ae1e23a 100644 --- a/packages/api/src/StyleOptions.ts +++ b/packages/api/src/StyleOptions.ts @@ -998,6 +998,15 @@ type StyleOptions = { * New in 4.19.0. */ disableFileUpload?: boolean; + /** + * Controls microphone button visibility in Fluent theme send box. + * + * - `'auto'` - Show microphone button if the chat adapter supports voice (has voiceConfiguration capability) + * - `'hide'` - Do not show microphone button regardless of adapter capabilities + * + * @default 'auto' + */ + showMicrophoneButton?: 'auto' | 'hide'; }; // StrictStyleOptions is only used internally in Web Chat and for simplifying our code: diff --git a/packages/api/src/boot/hook.ts b/packages/api/src/boot/hook.ts index cd0cbff82b..dda4464d10 100644 --- a/packages/api/src/boot/hook.ts +++ b/packages/api/src/boot/hook.ts @@ -61,7 +61,9 @@ export { useSetNotification, useShouldSpeakIncomingActivity, useStartDictate, + useStartVoice, useStopDictate, + useStopVoice, useStyleOptions, useSubmitSendBox, useSuggestedActions, @@ -74,5 +76,6 @@ export { useUIState, useUserID, useUsername, - useVoiceSelector + useVoiceSelector, + useVoiceState } from '../hooks/index'; diff --git a/packages/api/src/boot/internal.ts b/packages/api/src/boot/internal.ts index 1bb222c21f..1deec63da8 100644 --- a/packages/api/src/boot/internal.ts +++ b/packages/api/src/boot/internal.ts @@ -1,4 +1,6 @@ export { default as LowPriorityDecoratorComposer } from '../decorator/internal/LowPriorityDecoratorComposer'; +export { default as usePostVoiceActivity } from '../hooks/internal/usePostVoiceActivity'; export { default as useSetDictateState } from '../hooks/internal/useSetDictateState'; +export { default as useShouldShowMicrophoneButton } from '../hooks/internal/useShouldShowMicrophoneButton'; export { LegacyActivityContextProvider, type LegacyActivityContextType } from '../legacy/LegacyActivityBridgeContext'; export { default as StyleOptionsComposer } from '../providers/StyleOptions/StyleOptionsComposer'; diff --git a/packages/api/src/decorator/ActivityBorder/ActivityBorderDecorator.tsx b/packages/api/src/decorator/ActivityBorder/ActivityBorderDecorator.tsx index e8a27830e5..15b100540c 100644 --- a/packages/api/src/decorator/ActivityBorder/ActivityBorderDecorator.tsx +++ b/packages/api/src/decorator/ActivityBorder/ActivityBorderDecorator.tsx @@ -1,4 +1,9 @@ -import { getActivityLivestreamingMetadata, type WebChatActivity } from 'botframework-webchat-core'; +import { + getActivityLivestreamingMetadata, + getVoiceActivityRole, + isVoiceActivity, + type WebChatActivity +} from 'botframework-webchat-core'; import React, { memo, useMemo, type ReactNode } from 'react'; import { @@ -25,7 +30,12 @@ function ActivityBorderDecorator({ activity, children }: ActivityBorderDecorator const { type } = getActivityLivestreamingMetadata(activity) || {}; return { - from: supportedActivityRoles.includes(activity?.from?.role) ? activity?.from?.role : undefined, + from: isVoiceActivity(activity) + ? getVoiceActivityRole(activity) + : supportedActivityRoles.includes(activity?.from?.role) + ? activity?.from?.role + : undefined, + modality: new Set(isVoiceActivity(activity) ? ['audio', 'text'] : ['text']), livestreamingState: type === 'final activity' ? 
'completing' diff --git a/packages/api/src/decorator/ActivityBorder/private/ActivityBorderDecoratorRequestContext.ts b/packages/api/src/decorator/ActivityBorder/private/ActivityBorderDecoratorRequestContext.ts index 0e7c379146..0c8e01453a 100644 --- a/packages/api/src/decorator/ActivityBorder/private/ActivityBorderDecoratorRequestContext.ts +++ b/packages/api/src/decorator/ActivityBorder/private/ActivityBorderDecoratorRequestContext.ts @@ -20,6 +20,14 @@ type ActivityBorderDecoratorRequest = Readonly<{ * - `undefined` - the sender is unknown */ from: 'bot' | 'channel' | `user` | undefined; + + /** + * Gets the modalities of the activity. + * + * - `'audio'` - the activity originated from audio/voice input + * - `'text'` - the activity originated from text input + */ + modality: Set<'audio' | 'text'>; }>; type ActivityBorderDecoratorRequestContextType = Readonly<{ @@ -30,6 +38,7 @@ const ActivityBorderDecoratorRequestContext = createContext(), livestreamingState: undefined }) }) diff --git a/packages/api/src/defaultStyleOptions.ts b/packages/api/src/defaultStyleOptions.ts index 841a5dd5bd..a77ae7d48b 100644 --- a/packages/api/src/defaultStyleOptions.ts +++ b/packages/api/src/defaultStyleOptions.ts @@ -317,7 +317,10 @@ const DEFAULT_OPTIONS: Required = { // Send box attachment bar sendBoxAttachmentBarMaxHeight: 114, - sendBoxAttachmentBarMaxThumbnail: 3 + sendBoxAttachmentBarMaxThumbnail: 3, + + // Speech-to-speech options + showMicrophoneButton: 'auto' }; export default DEFAULT_OPTIONS; diff --git a/packages/api/src/hooks/Composer.tsx b/packages/api/src/hooks/Composer.tsx index 046b24218d..3b1794d86b 100644 --- a/packages/api/src/hooks/Composer.tsx +++ b/packages/api/src/hooks/Composer.tsx @@ -30,8 +30,10 @@ import { singleToArray, startDictate, startSpeakingActivity, + startVoiceRecording, stopDictate, stopSpeakingActivity, + stopVoiceRecording, submitSendBox, type DirectLineJSBotConnection, type GlobalScopePonyfill, @@ -63,6 +65,7 @@ import ActivityTypingComposer from '../providers/ActivityTyping/ActivityTypingCo import CapabilitiesComposer from '../providers/Capabilities/CapabilitiesComposer'; import GroupActivitiesComposer from '../providers/GroupActivities/GroupActivitiesComposer'; import PonyfillComposer from '../providers/Ponyfill/PonyfillComposer'; +import { SpeechToSpeechComposer } from '../providers/SpeechToSpeech/SpeechToSpeechComposer'; import StyleOptionsComposer from '../providers/StyleOptions/StyleOptionsComposer'; import { type ActivityStatusMiddleware, type RenderActivityStatus } from '../types/ActivityStatusMiddleware'; import AttachmentForScreenReaderMiddleware from '../types/AttachmentForScreenReaderMiddleware'; @@ -81,6 +84,7 @@ import isObject from '../utils/isObject'; import mapMap from '../utils/mapMap'; import normalizeLanguage from '../utils/normalizeLanguage'; import Tracker from './internal/Tracker'; +import useVoiceHandlers from './internal/useVoiceHandlers'; import WebChatAPIContext, { type WebChatAPIContextType } from './internal/WebChatAPIContext'; import WebChatReduxContext, { useDispatch } from './internal/WebChatReduxContext'; import defaultSelectVoice from './internal/defaultSelectVoice'; @@ -297,6 +301,7 @@ const ComposerCore = ({ const [styleOptions] = useStyleOptions(); const dispatch = useDispatch(); const telemetryDimensionsRef = useRef({}); + const [voiceHandlers] = useVoiceHandlers(); const patchedDir = useMemo(() => (dir === 'ltr' || dir === 'rtl' ? 
dir : 'auto'), [dir]); const patchedGrammars = useMemo(() => grammars || [], [grammars]); @@ -367,6 +372,15 @@ const ComposerCore = ({ [dispatch] ); + const startVoice = useCallback(() => { + dispatch(startVoiceRecording()); + }, [dispatch]); + + const stopVoice = useCallback(() => { + voiceHandlers.forEach(handler => handler.stopAllAudio()); + dispatch(stopVoiceRecording()); + }, [dispatch, voiceHandlers]); + const patchedLocalizedStrings = useMemo( () => mergeStringsOverrides(getAllLocalizedStrings()[normalizeLanguage(locale)], locale, overrideLocalizedStrings), [locale, overrideLocalizedStrings] @@ -554,6 +568,8 @@ const ComposerCore = ({ scrollToEndButtonRenderer, selectVoice: patchedSelectVoice, sendTypingIndicator, + startVoice, + stopVoice, telemetryDimensionsRef, toastRenderer: patchedToastRenderer, trackDimension, @@ -583,6 +599,8 @@ const ComposerCore = ({ renderMarkdown, scrollToEndButtonRenderer, sendTypingIndicator, + startVoice, + stopVoice, telemetryDimensionsRef, trackDimension, uiState, @@ -601,7 +619,9 @@ const ComposerCore = ({ - {typeof children === 'function' ? children(context) : children} + + {typeof children === 'function' ? children(context) : children} + diff --git a/packages/api/src/hooks/index.ts b/packages/api/src/hooks/index.ts index da6e0151a4..f1fb41c3ce 100644 --- a/packages/api/src/hooks/index.ts +++ b/packages/api/src/hooks/index.ts @@ -59,7 +59,9 @@ import useSendTypingIndicator from './useSendTypingIndicator'; import useSetNotification from './useSetNotification'; import useShouldSpeakIncomingActivity from './useShouldSpeakIncomingActivity'; import useStartDictate from './useStartDictate'; +import useStartVoice from './useStartVoice'; import useStopDictate from './useStopDictate'; +import useStopVoice from './useStopVoice'; import useStyleOptions from './useStyleOptions'; import useSubmitSendBox from './useSubmitSendBox'; import useSuggestedActions from './useSuggestedActions'; @@ -72,6 +74,7 @@ import useUIState from './useUIState'; import useUserID from './useUserID'; import useUsername from './useUsername'; import useVoiceSelector from './useVoiceSelector'; +import useVoiceState from './useVoiceState'; export { useBuildRenderActivityCallback } from '@msinternal/botframework-webchat-api-middleware'; export { useSuggestedActionsHooks } from '@msinternal/botframework-webchat-redux-store'; @@ -138,7 +141,9 @@ export { useSetNotification, useShouldSpeakIncomingActivity, useStartDictate, + useStartVoice, useStopDictate, + useStopVoice, useStyleOptions, useSubmitSendBox, useSuggestedActions, @@ -150,5 +155,6 @@ export { useUIState, useUserID, useUsername, - useVoiceSelector + useVoiceSelector, + useVoiceState }; diff --git a/packages/api/src/hooks/internal/WebChatAPIContext.ts b/packages/api/src/hooks/internal/WebChatAPIContext.ts index 0981b4ad8c..a0bc434b51 100644 --- a/packages/api/src/hooks/internal/WebChatAPIContext.ts +++ b/packages/api/src/hooks/internal/WebChatAPIContext.ts @@ -71,8 +71,10 @@ export type WebChatAPIContextType = { setSendTimeout?: (timeout: number) => void; startDictate?: () => void; startSpeakingActivity?: () => void; + startVoice?: () => void; stopDictate?: () => void; stopSpeakingActivity?: () => void; + stopVoice?: () => void; submitSendBox?: (method?: string, { channelData }?: { channelData: any }) => void; telemetryDimensionsRef?: React.Ref; toastRenderer?: RenderToast; diff --git a/packages/api/src/hooks/internal/usePostVoiceActivity.ts b/packages/api/src/hooks/internal/usePostVoiceActivity.ts new file mode 100644 index 
0000000000..18f735cff5 --- /dev/null +++ b/packages/api/src/hooks/internal/usePostVoiceActivity.ts @@ -0,0 +1,18 @@ +import { postVoiceActivity, type WebChatActivity } from 'botframework-webchat-core'; +import { useCallback } from 'react'; +import { useDispatch } from './WebChatReduxContext'; + +/** + * Hook to post voice activities (fire-and-forget, no echo back). + * Use this for DTMF and other voice-related event activities. + */ +export default function usePostVoiceActivity(): (activity: WebChatActivity) => void { + const dispatch = useDispatch(); + + return useCallback( + (activity: WebChatActivity) => { + dispatch(postVoiceActivity(activity)); + }, + [dispatch] + ); +} diff --git a/packages/api/src/hooks/internal/useRegisterVoiceHandler.ts b/packages/api/src/hooks/internal/useRegisterVoiceHandler.ts new file mode 100644 index 0000000000..149bd97f2a --- /dev/null +++ b/packages/api/src/hooks/internal/useRegisterVoiceHandler.ts @@ -0,0 +1,23 @@ +import { registerVoiceHandler, unregisterVoiceHandler, type VoiceHandler } from 'botframework-webchat-core'; +import { useCallback } from 'react'; +import randomId from '../../utils/randomId'; +import { useDispatch } from './WebChatReduxContext'; + +/** + * Internal hook to register a voice handler for audio playback. + * @returns A function that registers a voice handler and returns an unregister function. + */ +export default function useRegisterVoiceHandler(): (voiceHandler: VoiceHandler) => () => void { + const dispatch = useDispatch(); + + return useCallback( + (voiceHandler: VoiceHandler) => { + const id = randomId(); + dispatch(registerVoiceHandler(id, voiceHandler)); + return () => { + dispatch(unregisterVoiceHandler(id)); + }; + }, + [dispatch] + ); +} diff --git a/packages/api/src/hooks/internal/useShouldShowMicrophoneButton.ts b/packages/api/src/hooks/internal/useShouldShowMicrophoneButton.ts new file mode 100644 index 0000000000..a428812227 --- /dev/null +++ b/packages/api/src/hooks/internal/useShouldShowMicrophoneButton.ts @@ -0,0 +1,22 @@ +import useStyleOptions from '../useStyleOptions'; +import useCapabilities from '../../providers/Capabilities/useCapabilities'; + +/** + * Internal hook to determine if the microphone button should be shown based on: + * - `showMicrophoneButton` style option ('auto' | 'hide') + * - Adapter voice capability (voiceConfiguration) + * + * - 'auto': Show if adapter has voiceConfiguration capability, hide otherwise + * - 'hide': Never show + */ +export default function useShouldShowMicrophoneButton(): boolean { + const [{ showMicrophoneButton }] = useStyleOptions(); + // If adapter has voice capability, voiceConfiguration will be defined, + const voiceConfiguration = useCapabilities(caps => caps.voiceConfiguration); + + if (showMicrophoneButton === 'hide') { + return false; + } + + return !!voiceConfiguration; +} diff --git a/packages/api/src/hooks/internal/useVoiceHandlers.ts b/packages/api/src/hooks/internal/useVoiceHandlers.ts new file mode 100644 index 0000000000..63eb4c3925 --- /dev/null +++ b/packages/api/src/hooks/internal/useVoiceHandlers.ts @@ -0,0 +1,11 @@ +import type { VoiceHandler } from 'botframework-webchat-core'; +import { useSelector } from './WebChatReduxContext'; + +/** + * Internal hook to get all registered voice handlers from Redux state. 
+ */ +export default function useVoiceHandlers(): readonly [ReadonlyMap] { + return Object.freeze([ + useSelector((state: { voice: { voiceHandlers: Map } }) => state.voice.voiceHandlers) + ]); +} diff --git a/packages/api/src/hooks/internal/useVoiceStateWritable.ts b/packages/api/src/hooks/internal/useVoiceStateWritable.ts new file mode 100644 index 0000000000..4d3a409edb --- /dev/null +++ b/packages/api/src/hooks/internal/useVoiceStateWritable.ts @@ -0,0 +1,18 @@ +import { setVoiceState, type VoiceState } from 'botframework-webchat-core'; +import { useCallback } from 'react'; +import { useDispatch, useSelector } from './WebChatReduxContext'; + +/** + * Internal hook to set the voice state. + */ +export default function useVoiceStateWritable(): readonly [VoiceState, (state: VoiceState) => void] { + const dispatch = useDispatch(); + const setter = useCallback( + (state: VoiceState) => { + dispatch(setVoiceState(state)); + }, + [dispatch] + ); + const value = useSelector(({ voice }) => voice.voiceState); + return Object.freeze([value, setter]); +} diff --git a/packages/api/src/hooks/useStartVoice.ts b/packages/api/src/hooks/useStartVoice.ts new file mode 100644 index 0000000000..952d2b9d38 --- /dev/null +++ b/packages/api/src/hooks/useStartVoice.ts @@ -0,0 +1,9 @@ +import useWebChatAPIContext from './internal/useWebChatAPIContext'; + +/** + * Hook to start voice mode (turns on microphone and enables audio synthesis). + * This starts speech-to-speech interaction. + */ +export default function useStartVoice(): () => void { + return useWebChatAPIContext().startVoice; +} diff --git a/packages/api/src/hooks/useStopVoice.ts b/packages/api/src/hooks/useStopVoice.ts new file mode 100644 index 0000000000..b7be0f5447 --- /dev/null +++ b/packages/api/src/hooks/useStopVoice.ts @@ -0,0 +1,9 @@ +import useWebChatAPIContext from './internal/useWebChatAPIContext'; + +/** + * Hook to stop voice mode (turns off microphone and stops audio playback). + * This ends speech-to-speech interaction. + */ +export default function useStopVoice(): () => void { + return useWebChatAPIContext().stopVoice; +} diff --git a/packages/api/src/hooks/useVoiceState.ts b/packages/api/src/hooks/useVoiceState.ts new file mode 100644 index 0000000000..a3898aade7 --- /dev/null +++ b/packages/api/src/hooks/useVoiceState.ts @@ -0,0 +1,15 @@ +import type { VoiceState } from 'botframework-webchat-core'; +import useVoiceStateWritable from './internal/useVoiceStateWritable'; + +/** + * Hook to get the voice state. + * The voice state represents the current state of the speech-to-speech interaction: + * - 'idle': No active speech session, microphone and audio playback are off + * - 'listening': Microphone is active + * - 'user_speaking': User is actively speaking + * - 'processing': User finished speaking, server is processing + * - 'bot_speaking': Bot is speaking (audio playback) + */ +export default function useVoiceState(): readonly [VoiceState] { + return Object.freeze([useVoiceStateWritable()[0]]); +} diff --git a/packages/api/src/localization/en-US.json b/packages/api/src/localization/en-US.json index 74bfece917..7e22ba4c18 100644 --- a/packages/api/src/localization/en-US.json +++ b/packages/api/src/localization/en-US.json @@ -6,6 +6,8 @@ "_ACCESS_KEY_FOR_MAC_ALT.comment": "These are keyboard modifier keys for screen reader on a Mac.", "ACTIVITY_BOT_SAID_ALT": "Bot $1 said:", "_ACTIVITY_BOT_SAID_ALT.comment": "This is for screen reader only. $1 is the initials for the bot (e.g. 
\"JD\").", + "ACTIVITY_STATUS_VOICE_TRANSCRIPT_AGENT_LABEL": "Agent", + "_ACTIVITY_STATUS_VOICE_TRANSCRIPT_AGENT_LABEL.comment": "Label shown before timestamp for incoming voice transcript messages from the agent.", "ACTIVITY_YOU_SAID_ALT": "You said:", "_ACTIVITY_YOU_SAID_ALT.comment": "This is for screen reader only.", "ACTIVITY_BOT_ATTACHED_ALT": "Bot attached:", @@ -131,6 +133,14 @@ "TEXT_INPUT_ALT": "Message input box", "_TEXT_INPUT_ALT.comment": "This is for screen reader for the label of the message input box.", "TEXT_INPUT_PLACEHOLDER": "Type your message", + "TEXT_INPUT_SPEECH_IDLE_PLACEHOLDER": "Start talking...", + "_TEXT_INPUT_SPEECH_IDLE_PLACEHOLDER.comment": "This is the placeholder text shown in the message input box when speech-to-speech is enabled and in idle state.", + "TEXT_INPUT_SPEECH_LISTENING_PLACEHOLDER": "Listening...", + "_TEXT_INPUT_SPEECH_LISTENING_PLACEHOLDER.comment": "This is the placeholder text shown in the message input box when speech-to-speech is enabled and actively listening to user speech.", + "TEXT_INPUT_SPEECH_PROCESSING_PLACEHOLDER": "Processing...", + "_TEXT_INPUT_SPEECH_PROCESSING_PLACEHOLDER.comment": "This is the placeholder text shown in the message input box when speech-to-speech is enabled and processing the user's speech input.", + "TEXT_INPUT_SPEECH_BOT_SPEAKING_PLACEHOLDER": "Talk to interrupt...", + "_TEXT_INPUT_SPEECH_BOT_SPEAKING_PLACEHOLDER.comment": "This is the placeholder text shown in the message input box when speech-to-speech is enabled and the bot is speaking.", "TEXT_INPUT_SEND_BUTTON_ALT": "Send", "TEXT_INPUT_SPEAK_BUTTON_ALT": "Speak", "TEXT_INPUT_UPLOAD_BUTTON_ALT": "Upload file", diff --git a/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx new file mode 100644 index 0000000000..da6ee79b7e --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx @@ -0,0 +1,20 @@ +import React, { type ReactNode } from 'react'; +import { VoiceHandlerBridge } from './private/VoiceHandlerBridge'; +import { VoiceRecorderBridge } from './private/VoiceRecorderBridge'; + +/** + * SpeechToSpeechComposer sets up the speech-to-speech infrastructure. + * + * This component renders invisible bridge components that: + * 1. VoiceHandlerBridge - registers audio player functions with Redux + * 2. VoiceRecorderBridge - reacts to recording state and manages microphone + * + * Use the `useVoiceState`, `useStartVoice`, and `useStopVoice` hooks to access state and controls. 
+ */ +export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }> = ({ children }) => ( + + + + {children} + +); diff --git a/packages/api/src/providers/SpeechToSpeech/private/VoiceHandlerBridge.tsx b/packages/api/src/providers/SpeechToSpeech/private/VoiceHandlerBridge.tsx new file mode 100644 index 0000000000..de52f17999 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/VoiceHandlerBridge.tsx @@ -0,0 +1,23 @@ +import { useEffect } from 'react'; +import { useAudioPlayer } from './useAudioPlayer'; +import useRegisterVoiceHandler from '../../../hooks/internal/useRegisterVoiceHandler'; +import useShouldShowMicrophoneButton from '../../../hooks/internal/useShouldShowMicrophoneButton'; + +/** + * VoiceHandlerBridge is an invisible component that registers the audio player + * functions (queueAudio, stopAllAudio) with Redux + */ +export const VoiceHandlerBridge = () => { + const { queueAudio, stopAllAudio } = useAudioPlayer(); + const registerVoiceHandler = useRegisterVoiceHandler(); + const shouldShowMicrophoneButton = useShouldShowMicrophoneButton(); + + useEffect(() => { + if (!shouldShowMicrophoneButton) { + return; + } + return registerVoiceHandler({ queueAudio, stopAllAudio }); + }, [queueAudio, registerVoiceHandler, shouldShowMicrophoneButton, stopAllAudio]); + + return null; +}; diff --git a/packages/api/src/providers/SpeechToSpeech/private/VoiceRecorderBridge.tsx b/packages/api/src/providers/SpeechToSpeech/private/VoiceRecorderBridge.tsx new file mode 100644 index 0000000000..ea00dbe81d --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/VoiceRecorderBridge.tsx @@ -0,0 +1,41 @@ +import { useEffect, useCallback } from 'react'; +import { useRecorder } from './useRecorder'; +import usePostVoiceActivity from '../../../hooks/internal/usePostVoiceActivity'; +import useVoiceState from '../../../hooks/useVoiceState'; + +/** + * VoiceRecorderBridge is an invisible component that bridges the Redux recording state + * with the actual microphone recording functionality. 
+ */ +export function VoiceRecorderBridge(): null { + const [voiceState] = useVoiceState(); + const postVoiceActivity = usePostVoiceActivity(); + + // Derive recording state from voiceState - recording is active when not idle + const recording = voiceState !== 'idle'; + + const handleAudioChunk = useCallback( + (base64: string, timestamp: string) => { + postVoiceActivity({ + name: 'media.chunk', + type: 'event', + value: { + contentType: 'audio/webm', + content: base64, + timestamp + } + } as any); + }, + [postVoiceActivity] + ); + + const { record } = useRecorder(handleAudioChunk); + + useEffect(() => { + if (recording) { + return record(); + } + }, [record, recording]); + + return null; +} diff --git a/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx new file mode 100644 index 0000000000..2497b0379a --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx @@ -0,0 +1,355 @@ +/** @jest-environment @happy-dom/jest-environment */ +/// +/// + +import { render, type RenderResult } from '@testing-library/react'; +import React, { type ComponentType } from 'react'; +import { useAudioPlayer } from './useAudioPlayer'; + +// Mock setVoiceState function +const mockSetVoiceState = jest.fn(); + +// Mock useVoiceStateWritable hook - returns [state, setVoiceState] array +jest.mock('../../../hooks/internal/useVoiceStateWritable', () => ({ + __esModule: true, + default: jest.fn(() => [undefined, mockSetVoiceState]) +})); + +jest.mock('../../Capabilities/useCapabilities', () => ({ + __esModule: true, + default: jest.fn((selector: (caps: { voiceConfiguration?: { sampleRate: number } }) => unknown) => + selector({ voiceConfiguration: { sampleRate: 24000 } }) + ) +})); + +// Mock AudioContext and related APIs +const mockAudioContext = { + close: jest.fn().mockResolvedValue(undefined), + createBuffer: jest.fn(), + createBufferSource: jest.fn(), + currentTime: 0, + destination: {}, + resume: jest.fn().mockResolvedValue(undefined), + sampleRate: 24000, + state: 'running' +}; + +const mockAudioBuffer = { + duration: 0.1, // 100ms + getChannelData: jest.fn().mockReturnValue(new Float32Array(2400)) +}; + +// Factory to create unique buffer source mocks +const createMockBufferSource = () => ({ + buffer: null as typeof mockAudioBuffer | null, + connect: jest.fn(), + disconnect: jest.fn(), + onended: null as (() => void) | null, + start: jest.fn(), + stop: jest.fn() +}); + +// Track all created buffer sources for assertions +let createdBufferSources: ReturnType[] = []; + +// Mock global AudioContext +global.AudioContext = jest.fn(() => mockAudioContext) as unknown as typeof AudioContext; +global.atob = jest.fn(str => str); // Simple mock for base64 decode + +type UseAudioPlayerReturn = ReturnType; + +describe('useAudioPlayer', () => { + let HookApp: ComponentType; + let hookData: UseAudioPlayerReturn | undefined; + // eslint-disable-next-line @typescript-eslint/no-unused-vars + let renderResult: RenderResult; + const originalAudioContext = global.AudioContext; + + beforeEach(() => { + jest.clearAllMocks(); + createdBufferSources = []; + mockAudioContext.currentTime = 0; + mockAudioContext.createBuffer.mockReturnValue(mockAudioBuffer); + mockAudioContext.createBufferSource.mockImplementation(() => { + const source = createMockBufferSource(); + createdBufferSources.push(source); + return source; + }); + + HookApp = () => { + hookData = useAudioPlayer(); + return null; + }; + }); + + 
afterEach(() => { + global.AudioContext = originalAudioContext; + }); + + describe('Initialization', () => { + test('should return queueAudio and stopAllAudio functions', () => { + render(); + + expect(typeof hookData?.queueAudio).toBe('function'); + expect(typeof hookData?.stopAllAudio).toBe('function'); + }); + + test('should create AudioContext on first queueAudio call', async () => { + render(); + + await hookData?.queueAudio('dGVzdA=='); // base64 for 'test' + + expect(AudioContext).toHaveBeenCalledWith({ sampleRate: 24000 }); + }); + + test('should reuse existing AudioContext on subsequent calls', async () => { + render(); + + await hookData?.queueAudio('dGVzdA=='); + await hookData?.queueAudio('dGVzdDI='); + + expect(AudioContext).toHaveBeenCalledTimes(1); + }); + }); + + describe('Audio playback', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should process base64 audio data correctly', async () => { + await hookData?.queueAudio('dGVzdA=='); + + expect(global.atob).toHaveBeenCalledWith('dGVzdA=='); + expect(mockAudioContext.createBuffer).toHaveBeenCalledWith(1, expect.any(Number), 24000); + expect(mockAudioContext.createBufferSource).toHaveBeenCalled(); + }); + + test('should set up audio buffer source correctly', async () => { + await hookData?.queueAudio('dGVzdA=='); + + const [source] = createdBufferSources; + expect(source.connect).toHaveBeenCalledWith(mockAudioContext.destination); + expect(source.start).toHaveBeenCalled(); + expect(source.buffer).toBe(mockAudioBuffer); + }); + + test('should resume AudioContext if needed', async () => { + await hookData?.queueAudio('dGVzdA=='); + + expect(mockAudioContext.resume).toHaveBeenCalled(); + }); + + test('should queue multiple audio chunks correctly', async () => { + mockAudioBuffer.duration = 0.1; // 100ms + + await hookData?.queueAudio('dGVzdA=='); + await hookData?.queueAudio('dGVzdDI='); + + expect(createdBufferSources).toHaveLength(2); + // First chunk starts at currentTime (0), second at 0.1 + expect(createdBufferSources[0].start).toHaveBeenCalledWith(0); + expect(createdBufferSources[1].start).toHaveBeenCalledWith(0.1); + }); + }); + + describe('Voice state management', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should set voice state to bot_speaking on first chunk', async () => { + mockAudioContext.currentTime = 0; + + await hookData?.queueAudio('dGVzdA=='); + + expect(mockSetVoiceState).toHaveBeenCalledWith('bot_speaking'); + }); + + test('should not set bot_speaking on subsequent chunks while playing', async () => { + mockAudioContext.currentTime = 0; + mockAudioBuffer.duration = 0.1; + + await hookData?.queueAudio('dGVzdA=='); // First chunk + mockSetVoiceState.mockClear(); + + await hookData?.queueAudio('dGVzdDI='); // Second chunk (while first is still playing) + + expect(mockSetVoiceState).not.toHaveBeenCalledWith('bot_speaking'); + }); + + test('should set voice state to listening when last audio ends', async () => { + await hookData?.queueAudio('dGVzdA=='); + mockSetVoiceState.mockClear(); + + // Simulate audio ended + const [source] = createdBufferSources; + if (source.onended) { + source.onended(); + } + + expect(mockSetVoiceState).toHaveBeenCalledWith('listening'); + }); + + test('should only trigger listening on the last source ended', async () => { + mockAudioBuffer.duration = 0.1; + + await hookData?.queueAudio('dGVzdA=='); + await hookData?.queueAudio('dGVzdDI='); + mockSetVoiceState.mockClear(); + + const [firstSource, lastSource] = createdBufferSources; 
+ + // Simulate first chunk ended (should not trigger listening) + if (firstSource.onended) { + firstSource.onended(); + } + + expect(mockSetVoiceState).not.toHaveBeenCalledWith('listening'); + + // Simulate last chunk ended (should trigger listening) + if (lastSource.onended) { + lastSource.onended(); + } + + expect(mockSetVoiceState).toHaveBeenCalledWith('listening'); + }); + }); + + describe('Audio cleanup', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should clean up buffer source on ended', async () => { + await hookData?.queueAudio('dGVzdA=='); + + const [source] = createdBufferSources; + // Simulate audio ended + if (source.onended) { + source.onended(); + } + + expect(source.disconnect).toHaveBeenCalled(); + expect(source.buffer).toBeNull(); + }); + + test('should stop all audio and close context', async () => { + await hookData?.queueAudio('dGVzdA=='); + + hookData?.stopAllAudio(); + + expect(mockAudioContext.close).toHaveBeenCalled(); + }); + + test('should clear lastSourceRef onended callback on stop', async () => { + await hookData?.queueAudio('dGVzdA=='); + const [source] = createdBufferSources; + const onEndedBefore = source.onended; + + expect(onEndedBefore).not.toBeNull(); + + hookData?.stopAllAudio(); + + // After stopAllAudio, the onended should be cleared + expect(source.onended).toBeNull(); + }); + }); + + describe('Error handling', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should handle invalid base64 data gracefully', async () => { + await expect(async () => { + await hookData?.queueAudio('invalid-base64!@#'); + }).not.toThrow(); + }); + + test('should handle AudioContext creation failure', async () => { + global.AudioContext = jest.fn(() => { + throw new Error('AudioContext not supported'); + }) as unknown as typeof AudioContext; + + await expect(async () => { + await hookData?.queueAudio('dGVzdA=='); + }).rejects.toThrow('AudioContext not supported'); + }); + }); + + describe('Real-world scenarios', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should handle streaming audio chunks', async () => { + mockAudioBuffer.duration = 0.05; // 50ms chunks + + // Simulate streaming 5 chunks + await Promise.all(Array.from({ length: 5 }, (_, i) => hookData?.queueAudio(`chunk${i}`))); + + expect(createdBufferSources).toHaveLength(5); + createdBufferSources.forEach(source => { + expect(source.start).toHaveBeenCalled(); + }); + // Should only call bot_speaking once (first chunk) + expect(mockSetVoiceState).toHaveBeenCalledWith('bot_speaking'); + expect(mockSetVoiceState).toHaveBeenCalledTimes(1); + }); + + test('should handle playback interruption', async () => { + await hookData?.queueAudio('dGVzdA=='); + + hookData?.stopAllAudio(); + + expect(mockAudioContext.close).toHaveBeenCalled(); + }); + + test('should handle resume after stop', async () => { + // Play, stop, then play again + await hookData?.queueAudio('dGVzdA=='); + hookData?.stopAllAudio(); + await hookData?.queueAudio('dGVzdDI='); + + expect(AudioContext).toHaveBeenCalledTimes(2); // New context after stop + }); + + test('should reset nextPlayTime after stop allowing immediate playback', async () => { + mockAudioBuffer.duration = 0.1; + + await hookData?.queueAudio('dGVzdA=='); + hookData?.stopAllAudio(); + mockSetVoiceState.mockClear(); + + await hookData?.queueAudio('dGVzdDI='); + + // Should trigger bot_speaking again since it's a fresh start + expect(mockSetVoiceState).toHaveBeenCalledWith('bot_speaking'); + }); + }); + + 
describe('Performance considerations', () => { + beforeEach(() => { + renderResult = render(); + }); + + test('should handle large audio data', async () => { + const largeBase64 = 'A'.repeat(10000); + + await expect(async () => { + await hookData?.queueAudio(largeBase64); + }).not.toThrow(); + }); + + test('should handle rapid successive calls', async () => { + // Ensure the mock "base64" data has an even length as Int16Array requires multiples of 2 bytes + await Promise.all(Array.from({ length: 100 }, (_, i) => hookData?.queueAudio(`chunk${i}`.padEnd(8, ' ')))); + + expect(createdBufferSources).toHaveLength(100); + createdBufferSources.forEach(source => { + expect(source.start).toHaveBeenCalled(); + }); + }); + }); +}); diff --git a/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts new file mode 100644 index 0000000000..478c447994 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts @@ -0,0 +1,92 @@ +import { useRef, useCallback, useMemo } from 'react'; +import useCapabilities from '../../Capabilities/useCapabilities'; +import useVoiceStateWritable from '../../../hooks/internal/useVoiceStateWritable'; + +const DEFAULT_SAMPLE_RATE = 24000; +const INT16_SCALE = 32768; + +export function useAudioPlayer() { + const audioCtxRef = useRef(undefined); + const lastSourceRef = useRef(undefined); + const nextPlayTimeRef = useRef(0); + const voiceConfiguration = useCapabilities(caps => caps.voiceConfiguration); + const [, setVoiceState] = useVoiceStateWritable(); + + const sampleRate = voiceConfiguration?.sampleRate ?? DEFAULT_SAMPLE_RATE; + + const queueAudio = useCallback( + async (base64: string) => { + if (!audioCtxRef.current) { + audioCtxRef.current = new AudioContext({ sampleRate }); + } + const audioCtx = audioCtxRef.current; + await audioCtx.resume(); + + try { + const bytes = Uint8Array.from(atob(base64), c => c.charCodeAt(0)); + const int16Bytes = new Int16Array(bytes.buffer); + const float32Bytes = new Float32Array(int16Bytes.length); + + for (let i = 0; i < int16Bytes.length; i++) { + float32Bytes[+i] = int16Bytes.at(i) / INT16_SCALE; + } + + const buffer = audioCtx.createBuffer(1, float32Bytes.length, audioCtx.sampleRate); + buffer.getChannelData(0).set(float32Bytes); + + const src = audioCtx.createBufferSource(); + src.buffer = buffer; + src.connect(audioCtx.destination); + + // Clear previous source's onended to avoid stale callbacks + if (lastSourceRef.current) { + lastSourceRef.current.onended = null; + } + + src.onended = () => { + src.disconnect(); + src.buffer = null; + // Only the last source's onended should trigger state change to 'listening' + if (lastSourceRef.current === src) { + setVoiceState('listening'); + } + }; + + lastSourceRef.current = src; + const isFirstChunk = nextPlayTimeRef.current <= audioCtx.currentTime; + // Only dispatch bot_speaking on first chunk, we are resetting refs on stopAllAudio (bargein, mic off) + if (isFirstChunk) { + setVoiceState('bot_speaking'); + } + + nextPlayTimeRef.current = Math.max(nextPlayTimeRef.current, audioCtx.currentTime); + src.start(nextPlayTimeRef.current); + nextPlayTimeRef.current += buffer.duration; + } catch (error) { + console.warn('botframework-webchat: Error during audio playback in useAudioPlayer:', error); + } + }, + [setVoiceState, sampleRate] + ); + + const stopAllAudio = useCallback(() => { + nextPlayTimeRef.current = 0; + if (lastSourceRef.current) { + lastSourceRef.current.onended = null; + 
lastSourceRef.current = undefined; + } + if (audioCtxRef.current) { + audioCtxRef.current.close(); + audioCtxRef.current = undefined; + } + }, []); + + return useMemo( + () => + Object.freeze({ + queueAudio, + stopAllAudio + }), + [queueAudio, stopAllAudio] + ); +} diff --git a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx new file mode 100644 index 0000000000..6bb47cfa14 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx @@ -0,0 +1,221 @@ +/** @jest-environment @happy-dom/jest-environment */ +/// + +import { act, render, waitFor, type RenderResult } from '@testing-library/react'; +import React, { type ComponentType } from 'react'; +import { useRecorder } from './useRecorder'; + +jest.mock('../../Ponyfill/usePonyfill', () => ({ __esModule: true, default: jest.fn(() => [{ Date: global.Date }]) })); +jest.mock('../../Capabilities/useCapabilities', () => ({ + __esModule: true, + default: jest.fn((selector: (caps: { voiceConfiguration?: { sampleRate: number } }) => unknown) => + selector({ voiceConfiguration: { sampleRate: 24000 } }) + ) +})); + +const mockTrack = { + stop: jest.fn() +}; + +const mockMediaStream = { + getTracks: jest.fn(() => [mockTrack]) +}; + +const mockMediaDevices = { + getUserMedia: jest.fn().mockResolvedValue(mockMediaStream) +}; + +const mockWorkletPort = { + onmessage: null as ((event: { data: unknown }) => void) | null, + postMessage: jest.fn() +}; + +const mockWorkletNode = { + connect: jest.fn(), + disconnect: jest.fn(), + port: mockWorkletPort +}; + +const mockAudioContext = { + audioWorklet: { + addModule: jest.fn().mockResolvedValue(undefined) + }, + createMediaStreamSource: jest.fn(() => ({ + connect: jest.fn() + })), + destination: {}, + resume: jest.fn().mockResolvedValue(undefined), + state: 'running' +}; + +// --- Global Mocks Setup --- + +Object.defineProperty(global.navigator, 'mediaDevices', { + value: mockMediaDevices, + writable: true +}); + +global.AudioContext = jest.fn(() => mockAudioContext) as unknown as typeof AudioContext; +global.AudioWorkletNode = jest.fn(() => mockWorkletNode) as unknown as typeof AudioWorkletNode; +global.Blob = jest.fn(parts => ({ parts, type: (parts as { type?: string }[])[1]?.type })) as unknown as typeof Blob; +global.URL.createObjectURL = jest.fn(() => 'blob:http://localhost/mock-url'); +global.URL.revokeObjectURL = jest.fn(); +global.btoa = jest.fn(str => `btoa(${str})`); + +// --- Tests --- + +describe('useRecorder', () => { + let onAudioChunk: jest.Mock; + let HookApp: ComponentType<{ onAudioChunk: (base64: string, timestamp: string) => void }>; + let hookData: ReturnType | undefined; + // eslint-disable-next-line @typescript-eslint/no-unused-vars + let renderResult: RenderResult; + + beforeEach(() => { + // Clear all mocks before each test + jest.clearAllMocks(); + onAudioChunk = jest.fn(); + hookData = undefined; + mockWorkletPort.onmessage = null; + (mockAudioContext.state as string) = 'running'; + + HookApp = ({ onAudioChunk: onChunk }) => { + hookData = useRecorder(onChunk); + return null; + }; + }); + + test('should return record function', () => { + render(); + expect(typeof hookData?.record).toBe('function'); + }); + + test('should start recording when record is called', async () => { + renderResult = render(); + + act(() => { + hookData?.record(); + }); + + await waitFor(() => { + expect(navigator.mediaDevices.getUserMedia).toHaveBeenCalledTimes(1); + }); + + 
expect(global.AudioContext).toHaveBeenCalledTimes(1); + expect(mockAudioContext.audioWorklet.addModule).toHaveBeenCalledTimes(1); + expect(global.AudioWorkletNode).toHaveBeenCalledWith(expect.anything(), 'audio-recorder', { + processorOptions: { bufferSize: 2400 } + }); + expect(mockWorkletNode.connect).toHaveBeenCalledTimes(1); + expect(mockWorkletPort.postMessage).toHaveBeenCalledWith({ command: 'START' }); + }); + + test('should stop recording when returned cleanup function is called', async () => { + renderResult = render(); + + let stopRecording: (() => void) | undefined; + // Start recording + act(() => { + stopRecording = hookData?.record(); + }); + + // Wait for async startRecording to complete + await waitFor(() => { + expect(mockWorkletPort.postMessage).toHaveBeenCalledWith({ command: 'START' }); + }); + + // Stop recording + act(() => { + stopRecording?.(); + }); + + expect(mockWorkletPort.postMessage).toHaveBeenCalledWith({ command: 'STOP' }); + expect(mockWorkletNode.disconnect).toHaveBeenCalledTimes(1); + expect(mockTrack.stop).toHaveBeenCalledTimes(1); + }); + + test('should process audio chunks sent from the worklet', async () => { + render(); + + act(() => { + hookData?.record(); + }); + + await waitFor(() => expect(mockWorkletPort.onmessage).not.toBeNull()); + + // Simulate a message from the audio worklet + const mockAudioData = new Float32Array([0.1, 0.2, -0.1]); + act(() => { + mockWorkletPort.onmessage!({ + data: { + eventType: 'audio', + audioData: mockAudioData + } + }); + }); + + await waitFor(() => expect(onAudioChunk).toHaveBeenCalledTimes(1)); + expect(global.btoa).toHaveBeenCalled(); + // Check that timestamp is passed as second argument + expect(onAudioChunk).toHaveBeenCalledWith(expect.any(String), expect.any(String)); + }); + + test('should handle suspended audio context by resuming it', async () => { + (mockAudioContext.state as string) = 'suspended'; + render(); + + act(() => { + hookData?.record(); + }); + + await waitFor(() => { + expect(mockAudioContext.resume).toHaveBeenCalledTimes(1); + }); + }); + + test('should reuse existing AudioContext on subsequent calls', async () => { + render(); + + let stopRecording: (() => void) | undefined; + act(() => { + stopRecording = hookData?.record(); + }); + + await waitFor(() => { + expect(mockWorkletPort.postMessage).toHaveBeenCalledWith({ command: 'START' }); + }); + + act(() => { + stopRecording?.(); + }); + + act(() => { + hookData?.record(); + }); + + await waitFor(() => { + expect(mockWorkletPort.postMessage).toHaveBeenCalledTimes(3); // START, STOP, START + }); + + // AudioContext should only be created once + expect(global.AudioContext).toHaveBeenCalledTimes(1); + }); + + test('should request microphone with correct audio constraints', async () => { + render(); + + act(() => { + hookData?.record(); + }); + + await waitFor(() => { + expect(navigator.mediaDevices.getUserMedia).toHaveBeenCalledWith({ + audio: { + channelCount: 1, + echoCancellation: true, + sampleRate: 24000 + } + }); + }); + }); +}); diff --git a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts new file mode 100644 index 0000000000..05ed029003 --- /dev/null +++ b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts @@ -0,0 +1,146 @@ +import { useRef, useCallback, useMemo } from 'react'; +import useCapabilities from '../../Capabilities/useCapabilities'; +import usePonyfill from '../../Ponyfill/usePonyfill'; + +// Minimum AudioWorkletProcessor 
definition for TypeScript recognition +// adding reference of worker does not work +declare class AudioWorkletProcessor { + buffer: number[]; + bufferSize: number; + constructor(options?: AudioWorkletNodeOptions); + process(inputs: Float32Array[][], outputs: Float32Array[][], parameters: Record): boolean; + readonly port: MessagePort; + recording: boolean; +} +declare function registerProcessor(name: string, processorCtor: typeof AudioWorkletProcessor): void; + +/** + * CSP Compliant: check __tests__/html2/speechToSpeech/csp.recording.html for CSP compliance tests. + * NOTE: This code is stringified and run in an AudioWorklet context, so it must be plain JavaScript + * without any TypeScript annotations that could be transformed by the compiler. + */ +const audioProcessorCode = `(${function () { + class AudioRecorderProcessor extends AudioWorkletProcessor { + constructor(options: AudioWorkletNodeOptions) { + super(); + this.buffer = []; + this.bufferSize = options.processorOptions.bufferSize; + this.recording = false; + + this.port.onmessage = e => { + if (e.data.command === 'START') { + this.recording = true; + } else if (e.data.command === 'STOP') { + this.recording = false; + this.buffer = []; + } + }; + } + + process(inputs: Float32Array[][]) { + if (inputs[0] && inputs[0].length && this.recording) { + this.buffer.push(...inputs[0][0]); + while (this.buffer.length >= this.bufferSize) { + const chunk = this.buffer.splice(0, this.bufferSize); + this.port.postMessage({ eventType: 'audio', audioData: new Float32Array(chunk) }); + } + } + return true; + } + } + + registerProcessor('audio-recorder', AudioRecorderProcessor); +}})()`; + +const INT16_MIN = -32768; +const INT16_MAX = 32767; +const INT16_SCALE = 32767; +const DEFAULT_SAMPLE_RATE = 24000; +const DEFAULT_CHUNK_SIZE_IN_MS = 100; +const MS_IN_SECOND = 1000; + +export function useRecorder(onAudioChunk: (base64: string, timestamp: string) => void) { + const [{ Date }] = usePonyfill(); + const audioCtxRef = useRef(undefined); + const streamRef = useRef(undefined); + const voiceConfiguration = useCapabilities(caps => caps.voiceConfiguration); + const workletRef = useRef(undefined); + + const chunkIntervalMs = voiceConfiguration?.chunkIntervalMs ?? DEFAULT_CHUNK_SIZE_IN_MS; + const sampleRate = voiceConfiguration?.sampleRate ?? DEFAULT_SAMPLE_RATE; + + const stopRecording = useCallback(() => { + if (workletRef.current) { + workletRef.current.port.postMessage({ command: 'STOP' }); + workletRef.current.disconnect(); + workletRef.current = undefined; + } + if (streamRef.current) { + streamRef.current.getTracks().forEach(track => track.stop()); + streamRef.current = undefined; + } + }, [streamRef, workletRef]); + + const initAudio = useCallback(async () => { + if (audioCtxRef.current) { + return; + } + const audioCtx = new AudioContext({ sampleRate }); + const blob = new Blob([audioProcessorCode], { + type: 'application/javascript' + }); + // eslint-disable-next-line no-restricted-properties + const url = URL.createObjectURL(blob); + await audioCtx.audioWorklet.addModule(url); + URL.revokeObjectURL(url); + // eslint-disable-next-line require-atomic-updates + audioCtxRef.current = audioCtx; + }, [audioCtxRef, sampleRate]); + + const startRecording = useCallback(async () => { + await initAudio(); + const audioCtx = audioCtxRef.current!; // audioCtx must be available after initAudio(). 
+ if (audioCtx.state === 'suspended') { + await audioCtx.resume(); + } + const stream = await navigator.mediaDevices.getUserMedia({ + audio: { + channelCount: 1, + echoCancellation: true, + sampleRate + } + }); + streamRef.current = stream; + const source = audioCtx.createMediaStreamSource(stream); + const worklet = new AudioWorkletNode(audioCtx, 'audio-recorder', { + processorOptions: { + bufferSize: (sampleRate * chunkIntervalMs) / MS_IN_SECOND + } + }); + + worklet.port.onmessage = e => { + if (e.data.eventType === 'audio') { + const timestamp = new Date().toISOString(); + const float32 = e.data.audioData; + const int16 = new Int16Array(float32.length); + for (let i = 0; i < float32.length; i++) { + int16[+i] = Math.max(INT16_MIN, Math.min(INT16_MAX, float32.at(i) * INT16_SCALE)); + } + const base64 = btoa(String.fromCharCode(...new Uint8Array(int16.buffer))); + onAudioChunk(base64, timestamp); + } + }; + + source.connect(worklet); + worklet.connect(audioCtx.destination); + worklet.port.postMessage({ command: 'START' }); + workletRef.current = worklet; + }, [audioCtxRef, chunkIntervalMs, Date, initAudio, onAudioChunk, sampleRate]); + + const record = useCallback(() => { + startRecording(); + return stopRecording; + }, [startRecording, stopRecording]); + + return useMemo(() => ({ record }), [record]); +} diff --git a/packages/bundle/src/boot/actual/hook/minimal.ts b/packages/bundle/src/boot/actual/hook/minimal.ts index 616f8e59ef..b09c196177 100644 --- a/packages/bundle/src/boot/actual/hook/minimal.ts +++ b/packages/bundle/src/boot/actual/hook/minimal.ts @@ -76,7 +76,9 @@ export { useShouldReduceMotion, useShouldSpeakIncomingActivity, useStartDictate, + useStartVoice, useStopDictate, + useStopVoice, useStyleOptions, useStyleSet, useSubmitSendBox, @@ -95,6 +97,7 @@ export { useUserID, useUsername, useVoiceSelector, + useVoiceState, useWebSpeechPonyfill, type SendBoxFocusOptions } from 'botframework-webchat-component/hook'; diff --git a/packages/bundle/src/boot/actual/internal.ts b/packages/bundle/src/boot/actual/internal.ts index b416e8208e..5949642426 100644 --- a/packages/bundle/src/boot/actual/internal.ts +++ b/packages/bundle/src/boot/actual/internal.ts @@ -1,5 +1,10 @@ // We should review exports in this file to make sure 1P = 3P. 
-export { type ActivityMiddleware, type TypingIndicatorMiddleware } from 'botframework-webchat-api'; +export { + type ActivityMiddleware, + type ActivityStatusMiddleware, + type TypingIndicatorMiddleware +} from 'botframework-webchat-api'; +export { usePostVoiceActivity, useShouldShowMicrophoneButton } from 'botframework-webchat-api/internal'; export { CodeHighlighterComposer, createIconComponent, @@ -13,4 +18,11 @@ export { useLiveRegion, type HighlightCodeFn } from 'botframework-webchat-component/internal'; -export { getOrgSchemaMessage, type DirectLineCardAction, type WebChatActivity } from 'botframework-webchat-core'; +export { + getOrgSchemaMessage, + getVoiceActivityRole, + getVoiceActivityText, + isVoiceTranscriptActivity, + type DirectLineCardAction, + type WebChatActivity +} from 'botframework-webchat-core'; diff --git a/packages/component/src/Activity/StackedLayout.tsx b/packages/component/src/Activity/StackedLayout.tsx index f387eb41ad..9e173e36fb 100644 --- a/packages/component/src/Activity/StackedLayout.tsx +++ b/packages/component/src/Activity/StackedLayout.tsx @@ -3,7 +3,14 @@ import { hooks } from 'botframework-webchat-api'; import type { RenderAttachment } from 'botframework-webchat-api'; import { ActivityBorderDecorator } from 'botframework-webchat-api/decorator'; -import { getActivityLivestreamingMetadata, getOrgSchemaMessage, type WebChatActivity } from 'botframework-webchat-core'; +import { + getActivityLivestreamingMetadata, + getOrgSchemaMessage, + getVoiceActivityRole, + getVoiceActivityText, + isVoiceActivity, + type WebChatActivity +} from 'botframework-webchat-core'; import { useStyles } from '@msinternal/botframework-webchat-styles/react'; import cx from 'classnames'; import React, { memo, useCallback, useMemo, type ReactNode } from 'react'; @@ -124,7 +131,7 @@ const StackedLayout = ({ const isMessageOrTyping = activity.type === 'message' || activity.type === 'typing'; const attachments = useMemo(() => (isMessageOrTyping && activity.attachments) || [], [activity, isMessageOrTyping]); - const fromUser = activity.from.role === 'user'; + const fromUser = activity.from.role === 'user' || getVoiceActivityRole(activity) === 'user'; const messageBackDisplayText: string = (isMessageOrTyping && activity.channelData?.messageBack?.displayText) || ''; const messageThing = useMemo(() => getOrgSchemaMessage(activity.entities), [activity]); const isCollapsible = useMemo(() => messageThing?.keywords?.includes('Collapsible'), [messageThing]); @@ -134,7 +141,9 @@ const StackedLayout = ({ ? messageBackDisplayText || activity.text : isLivestreaming && 'text' in activity ? (activity.text as string) - : ''; + : isVoiceActivity(activity) + ? getVoiceActivityText(activity) + : ''; const initials = fromUser ? userInitials : botInitials; const nubOffset = fromUser ? 
bubbleFromUserNubOffset : bubbleNubOffset; diff --git a/packages/component/src/Middleware/Activity/createCoreMiddleware.tsx b/packages/component/src/Middleware/Activity/createCoreMiddleware.tsx index a9479b8188..e6b6434cd1 100644 --- a/packages/component/src/Middleware/Activity/createCoreMiddleware.tsx +++ b/packages/component/src/Middleware/Activity/createCoreMiddleware.tsx @@ -1,43 +1,59 @@ /* eslint complexity: ["error", 21] */ import { ActivityMiddleware } from 'botframework-webchat-api'; -import { getActivityLivestreamingMetadata, getOrgSchemaMessage } from 'botframework-webchat-core'; +import { + getActivityLivestreamingMetadata, + getOrgSchemaMessage, + isVoiceTranscriptActivity +} from 'botframework-webchat-core'; import React from 'react'; import CarouselLayout from '../../Activity/CarouselLayout'; import StackedLayout from '../../Activity/StackedLayout'; +// TODO: [P4] Can we simplify these if-statement to something more readable? +function shouldFilterActivity(activity, messageThing) { + const { type } = activity; + if ( + type === 'conversationUpdate' || + (type === 'event' && !isVoiceTranscriptActivity(activity)) || + type === 'invoke' || + // Do not show content for contentless livestream interims, or finalized activity without content. + (type === 'typing' && + (getActivityLivestreamingMetadata(activity)?.type === 'contentless' || + !(activity['text'] || activity.attachments?.length > 0 || messageThing?.abstract))) || + (type === 'message' && + // Do not show postback + (activity.channelData?.postBack || + // Do not show messageBack if displayText is undefined + (activity.channelData?.messageBack && !activity.channelData.messageBack.displayText) || + // Do not show empty bubbles (no text and attachments) + !(activity.text || activity.attachments?.length || messageThing?.abstract))) + ) { + return true; + } + + return false; +} + export default function createCoreMiddleware(): ActivityMiddleware[] { return [ () => next => (...args) => { const [{ activity }] = args; + const isMessageOrTyping = activity.type === 'message' || activity.type === 'typing'; - // TODO: [P4] Can we simplify these if-statement to something more readable? - - const { type } = activity; const messageThing = getOrgSchemaMessage(activity.entities); // Filter out activities that should not visible. - if ( - type === 'conversationUpdate' || - type === 'event' || - type === 'invoke' || - // Do not show content for contentless livestream interims, or finalized activity without content. 
- (type === 'typing' && - (getActivityLivestreamingMetadata(activity)?.type === 'contentless' || - !(activity['text'] || activity.attachments?.length > 0 || messageThing?.abstract))) || - (type === 'message' && - // Do not show postback - (activity.channelData?.postBack || - // Do not show messageBack if displayText is undefined - (activity.channelData?.messageBack && !activity.channelData.messageBack.displayText) || - // Do not show empty bubbles (no text and attachments) - !(activity.text || activity.attachments?.length || messageThing?.abstract))) - ) { + if (shouldFilterActivity(activity, messageThing)) { return false; - } else if (type === 'message' || type === 'typing') { - if ((activity.attachments?.length || 0) > 1 && activity.attachmentLayout === 'carousel') { + } else if (isMessageOrTyping || isVoiceTranscriptActivity(activity)) { + if ( + isMessageOrTyping && + (activity.attachments?.length || 0) > 1 && + activity.attachmentLayout === 'carousel' + ) { // The following line is not a React functional component, it's a render function called by useCreateActivityRenderer() hook. // The function signature need to be compatible with older version of activity middleware, which was: // diff --git a/packages/component/src/TextArea/TextArea.tsx b/packages/component/src/TextArea/TextArea.tsx index 15ab98a91c..8ea830fb0b 100644 --- a/packages/component/src/TextArea/TextArea.tsx +++ b/packages/component/src/TextArea/TextArea.tsx @@ -37,6 +37,7 @@ const TextArea = forwardRef< onClick?: MouseEventHandler | undefined; onInput?: FormEventHandler | undefined; placeholder?: string | undefined; + readOnly?: boolean | undefined; startRows?: number | undefined; value?: string | undefined; }> @@ -45,7 +46,7 @@ const TextArea = forwardRef< const classNames = useStyles(styles); const isInCompositionRef = useRef(false); - const disabled = uiState === 'disabled'; + const disabled = uiState === 'disabled' || props.readOnly; const handleCompositionEnd = useCallback(() => { isInCompositionRef.current = false; diff --git a/packages/component/src/boot/hook.ts b/packages/component/src/boot/hook.ts index 91976e66fb..426948db69 100644 --- a/packages/component/src/boot/hook.ts +++ b/packages/component/src/boot/hook.ts @@ -59,7 +59,9 @@ export { useSetNotification, useShouldSpeakIncomingActivity, useStartDictate, + useStartVoice, useStopDictate, + useStopVoice, useStyleOptions, useSubmitSendBox, useSuggestedActions, @@ -72,7 +74,8 @@ export { useUIState, useUserID, useUsername, - useVoiceSelector + useVoiceSelector, + useVoiceState } from 'botframework-webchat-api/hook'; // #region Overrides diff --git a/packages/component/src/decorator/private/WebChatDecorator.tsx b/packages/component/src/decorator/private/WebChatDecorator.tsx index 33d76363ee..edce239b69 100644 --- a/packages/component/src/decorator/private/WebChatDecorator.tsx +++ b/packages/component/src/decorator/private/WebChatDecorator.tsx @@ -13,6 +13,16 @@ import BorderFlair from './BorderFlair'; import BorderLoader from './BorderLoader'; const middleware: readonly DecoratorMiddleware[] = Object.freeze([ + createActivityBorderMiddleware(function FluentBorderFlair({ request, Next, ...props }) { + if (request.modality.has('audio') && request.from === 'bot') { + return ( + + + + ); + } + return ; + }), createActivityBorderMiddleware(function BorderFlairDecorator({ request, Next, ...props }) { return ( diff --git a/packages/core/src/actions/postVoiceActivity.ts b/packages/core/src/actions/postVoiceActivity.ts new file mode 100644 index 0000000000..8ab7087012 
--- /dev/null +++ b/packages/core/src/actions/postVoiceActivity.ts @@ -0,0 +1,21 @@ +import type { WebChatActivity } from '../types/WebChatActivity'; + +const VOICE_POST_ACTIVITY = 'WEB_CHAT/VOICE_POST_ACTIVITY' as const; + +type VoicePostActivityAction = { + type: typeof VOICE_POST_ACTIVITY; + payload: { activity: WebChatActivity }; +}; + +function postVoiceActivity(activity: WebChatActivity): VoicePostActivityAction { + return { + type: VOICE_POST_ACTIVITY, + payload: { activity } + }; +} + +export default postVoiceActivity; + +export { VOICE_POST_ACTIVITY }; + +export type { VoicePostActivityAction }; diff --git a/packages/core/src/actions/registerVoiceHandler.ts b/packages/core/src/actions/registerVoiceHandler.ts new file mode 100644 index 0000000000..3c0029835a --- /dev/null +++ b/packages/core/src/actions/registerVoiceHandler.ts @@ -0,0 +1,24 @@ +const VOICE_REGISTER_HANDLER = 'WEB_CHAT/VOICE_REGISTER_HANDLER' as const; + +type VoiceHandler = { + queueAudio: (base64: string) => void; + stopAllAudio: () => void; +}; + +type VoiceRegisterHandlerAction = { + type: typeof VOICE_REGISTER_HANDLER; + payload: { id: string; voiceHandler: VoiceHandler }; +}; + +function registerVoiceHandler(id: string, voiceHandler: VoiceHandler): VoiceRegisterHandlerAction { + return { + type: VOICE_REGISTER_HANDLER, + payload: { id, voiceHandler } + }; +} + +export default registerVoiceHandler; + +export { VOICE_REGISTER_HANDLER }; + +export type { VoiceHandler, VoiceRegisterHandlerAction }; diff --git a/packages/core/src/actions/setVoiceState.ts b/packages/core/src/actions/setVoiceState.ts new file mode 100644 index 0000000000..53fc12b7c2 --- /dev/null +++ b/packages/core/src/actions/setVoiceState.ts @@ -0,0 +1,21 @@ +const VOICE_SET_STATE = 'WEB_CHAT/VOICE_SET_STATE' as const; + +type VoiceState = 'idle' | 'listening' | 'user_speaking' | 'processing' | 'bot_speaking'; + +type VoiceSetStateAction = { + type: typeof VOICE_SET_STATE; + payload: { voiceState: VoiceState }; +}; + +function setVoiceState(voiceState: VoiceState): VoiceSetStateAction { + return { + type: VOICE_SET_STATE, + payload: { voiceState } + }; +} + +export default setVoiceState; + +export { VOICE_SET_STATE }; + +export type { VoiceState, VoiceSetStateAction }; diff --git a/packages/core/src/actions/startVoiceRecording.ts b/packages/core/src/actions/startVoiceRecording.ts new file mode 100644 index 0000000000..cbf050b392 --- /dev/null +++ b/packages/core/src/actions/startVoiceRecording.ts @@ -0,0 +1,17 @@ +const VOICE_START_RECORDING = 'WEB_CHAT/VOICE_START_RECORDING' as const; + +type VoiceStartRecordingAction = { + type: typeof VOICE_START_RECORDING; +}; + +function startVoiceRecording(): VoiceStartRecordingAction { + return { + type: VOICE_START_RECORDING + }; +} + +export default startVoiceRecording; + +export { VOICE_START_RECORDING }; + +export type { VoiceStartRecordingAction }; diff --git a/packages/core/src/actions/stopVoiceRecording.ts b/packages/core/src/actions/stopVoiceRecording.ts new file mode 100644 index 0000000000..f547fe9b81 --- /dev/null +++ b/packages/core/src/actions/stopVoiceRecording.ts @@ -0,0 +1,17 @@ +const VOICE_STOP_RECORDING = 'WEB_CHAT/VOICE_STOP_RECORDING' as const; + +type VoiceStopRecordingAction = { + type: typeof VOICE_STOP_RECORDING; +}; + +function stopVoiceRecording(): VoiceStopRecordingAction { + return { + type: VOICE_STOP_RECORDING + }; +} + +export default stopVoiceRecording; + +export { VOICE_STOP_RECORDING }; + +export type { VoiceStopRecordingAction }; diff --git 
a/packages/core/src/actions/unregisterVoiceHandler.ts b/packages/core/src/actions/unregisterVoiceHandler.ts new file mode 100644 index 0000000000..59fedcc27a --- /dev/null +++ b/packages/core/src/actions/unregisterVoiceHandler.ts @@ -0,0 +1,19 @@ +const VOICE_UNREGISTER_HANDLER = 'WEB_CHAT/VOICE_UNREGISTER_HANDLER' as const; + +type VoiceUnregisterHandlerAction = { + type: typeof VOICE_UNREGISTER_HANDLER; + payload: { id: string }; +}; + +function unregisterVoiceHandler(id: string): VoiceUnregisterHandlerAction { + return { + type: VOICE_UNREGISTER_HANDLER, + payload: { id } + }; +} + +export default unregisterVoiceHandler; + +export { VOICE_UNREGISTER_HANDLER }; + +export type { VoiceUnregisterHandlerAction }; diff --git a/packages/core/src/createReducer.ts b/packages/core/src/createReducer.ts index 1e10dfc220..e20f908304 100644 --- a/packages/core/src/createReducer.ts +++ b/packages/core/src/createReducer.ts @@ -17,6 +17,7 @@ import sendTypingIndicator from './reducers/sendTypingIndicator'; import shouldSpeakIncomingActivity from './reducers/shouldSpeakIncomingActivity'; import suggestedActions from './reducers/suggestedActions'; import suggestedActionsOriginActivity from './reducers/suggestedActionsOriginActivity'; +import voiceActivity from './reducers/voiceActivity'; import type { GlobalScopePonyfill } from './types/GlobalScopePonyfill'; import type { RestrictedStoreDebugAPI } from './types/StoreDebugAPI'; @@ -41,7 +42,8 @@ export default function createReducer(ponyfill: GlobalScopePonyfill, restrictedS shouldSpeakIncomingActivity, suggestedActions, suggestedActionsOriginActivity, - typing: createTypingReducer(ponyfill) + typing: createTypingReducer(ponyfill), + voice: voiceActivity }) ); } diff --git a/packages/core/src/createSagas.ts b/packages/core/src/createSagas.ts index 916b62598d..1fe2cac6e3 100644 --- a/packages/core/src/createSagas.ts +++ b/packages/core/src/createSagas.ts @@ -24,6 +24,7 @@ import startSpeakActivityOnPostActivitySaga from './sagas/startSpeakActivityOnPo import stopDictateOnCardActionSaga from './sagas/stopDictateOnCardActionSaga'; import stopSpeakingActivityOnInputSaga from './sagas/stopSpeakingActivityOnInputSaga'; import submitSendBoxSaga from './sagas/submitSendBoxSaga'; +import postVoiceActivitySaga from './sagas/postVoiceActivitySaga'; import { type GlobalScopePonyfill } from './types/GlobalScopePonyfill'; type CreateSagasOptions = { @@ -44,6 +45,7 @@ export default function createSagas({ ponyfill }: CreateSagasOptions): Saga { yield fork(markAllAsSpokenOnStopSpeakActivitySaga); yield fork(observeActivitySaga); yield fork(postActivitySaga, ponyfill); + yield fork(postVoiceActivitySaga, ponyfill); yield fork(queueIncomingActivitySaga, ponyfill); yield fork(sendEventToPostActivitySaga); yield fork(sendFilesToPostActivitySaga); diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index e2534aa94d..27fde76d63 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -40,6 +40,18 @@ import getActivityLivestreamingMetadata from './utils/getActivityLivestreamingMe import getOrgSchemaMessage from './utils/getOrgSchemaMessage'; import onErrorResumeNext from './utils/onErrorResumeNext'; import singleToArray from './utils/singleToArray'; +import isVoiceActivity from './utils/voiceActivity/isVoiceActivity'; +import isVoiceTranscriptActivity from './utils/voiceActivity/isVoiceTranscriptActivity'; +import getVoiceActivityRole from './utils/voiceActivity/getVoiceActivityRole'; +import getVoiceActivityText from 
'./utils/voiceActivity/getVoiceActivityText'; +import startVoiceRecording from './actions/startVoiceRecording'; +import stopVoiceRecording from './actions/stopVoiceRecording'; +import setVoiceState from './actions/setVoiceState'; +import registerVoiceHandler from './actions/registerVoiceHandler'; +import unregisterVoiceHandler from './actions/unregisterVoiceHandler'; +import postVoiceActivity from './actions/postVoiceActivity'; +import type { VoiceState } from './actions/setVoiceState'; +import type { VoiceHandler } from './actions/registerVoiceHandler'; export { isForbiddenPropertyName, @@ -99,6 +111,10 @@ export { emitTypingIndicator, getActivityLivestreamingMetadata, getOrgSchemaMessage, + getVoiceActivityRole, + getVoiceActivityText, + isVoiceActivity, + isVoiceTranscriptActivity, markActivity, onErrorResumeNext, parseAction, @@ -109,6 +125,9 @@ export { parseThing, parseVoteAction, postActivity, + postVoiceActivity, + registerVoiceHandler, + unregisterVoiceHandler, sendEvent, sendFiles, sendMessage, @@ -122,11 +141,14 @@ export { setSendBoxAttachments, setSendTimeout, setSendTypingIndicator, + setVoiceState, singleToArray, startDictate, startSpeakingActivity, + startVoiceRecording, stopDictate, stopSpeakingActivity, + stopVoiceRecording, submitSendBox }; @@ -155,6 +177,8 @@ export type { OrgSchemaThing, OrgSchemaUserReview, SendBoxAttachment, + VoiceHandler, + VoiceState, WebChatActivity }; diff --git a/packages/core/src/reducers/voiceActivity.ts b/packages/core/src/reducers/voiceActivity.ts new file mode 100644 index 0000000000..d7f6953e49 --- /dev/null +++ b/packages/core/src/reducers/voiceActivity.ts @@ -0,0 +1,80 @@ +import { VOICE_REGISTER_HANDLER } from '../actions/registerVoiceHandler'; +import { VOICE_SET_STATE } from '../actions/setVoiceState'; +import { VOICE_START_RECORDING } from '../actions/startVoiceRecording'; +import { VOICE_STOP_RECORDING } from '../actions/stopVoiceRecording'; +import { VOICE_UNREGISTER_HANDLER } from '../actions/unregisterVoiceHandler'; + +import type { VoiceHandler, VoiceRegisterHandlerAction } from '../actions/registerVoiceHandler'; +import type { VoiceSetStateAction, VoiceState } from '../actions/setVoiceState'; +import type { VoiceStartRecordingAction } from '../actions/startVoiceRecording'; +import type { VoiceStopRecordingAction } from '../actions/stopVoiceRecording'; +import type { VoiceUnregisterHandlerAction } from '../actions/unregisterVoiceHandler'; + +type VoiceActivityActions = + | VoiceRegisterHandlerAction + | VoiceSetStateAction + | VoiceStartRecordingAction + | VoiceStopRecordingAction + | VoiceUnregisterHandlerAction; + +interface VoiceActivityState { + voiceState: VoiceState; + voiceHandlers: Map; +} + +const DEFAULT_STATE: VoiceActivityState = { + voiceState: 'idle', + voiceHandlers: new Map() +}; + +export default function voiceActivity( + state: VoiceActivityState = DEFAULT_STATE, + action: VoiceActivityActions +): VoiceActivityState { + switch (action.type) { + case VOICE_REGISTER_HANDLER: { + const newHandlers = new Map(state.voiceHandlers); + newHandlers.set(action.payload.id, action.payload.voiceHandler); + return { + ...state, + voiceHandlers: newHandlers + }; + } + + case VOICE_UNREGISTER_HANDLER: { + const newHandlers = new Map(state.voiceHandlers); + newHandlers.delete(action.payload.id); + return { + ...state, + voiceHandlers: newHandlers + }; + } + + case VOICE_SET_STATE: + return { + ...state, + voiceState: action.payload.voiceState + }; + + case VOICE_START_RECORDING: + if (state.voiceState !== 'idle') { + 
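// This transition is unexpected but non-fatal; warn and still move to 'listening' so recording is not blocked. +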
console.warn(`botframework-webchat: Should not transition from "${state.voiceState}" to "listening"`); + } + + return { + ...state, + voiceState: 'listening' + }; + + case VOICE_STOP_RECORDING: + return { + ...state, + voiceState: 'idle' + }; + + default: + return state; + } +} + +export type { VoiceActivityState }; diff --git a/packages/core/src/sagas/observeActivitySaga.ts b/packages/core/src/sagas/observeActivitySaga.ts index 3378c4c655..9ee290076e 100644 --- a/packages/core/src/sagas/observeActivitySaga.ts +++ b/packages/core/src/sagas/observeActivitySaga.ts @@ -1,9 +1,12 @@ -import { put } from 'redux-saga/effects'; +import { put, select } from 'redux-saga/effects'; import updateIn from 'simple-update-in'; import observeEach from './effects/observeEach'; import queueIncomingActivity from '../actions/queueIncomingActivity'; +import setVoiceState from '../actions/setVoiceState'; import whileConnected from './effects/whileConnected'; +import isVoiceActivity from '../utils/voiceActivity/isVoiceActivity'; +import isVoiceTranscriptActivity from '../utils/voiceActivity/isVoiceTranscriptActivity'; import type { DirectLineActivity } from '../types/external/DirectLineActivity'; import type { DirectLineJSBotConnection } from '../types/external/DirectLineJSBotConnection'; import type { WebChatActivity } from '../types/WebChatActivity'; @@ -75,6 +78,53 @@ function patchFromName(activity: DirectLineActivity) { function* observeActivity({ directLine, userID }: { directLine: DirectLineJSBotConnection; userID?: string }) { yield observeEach(directLine.activity$, function* observeActivity(activity: DirectLineActivity) { + // Handle voice activities separately - don't store them in Redux (except transcripts) + if (isVoiceActivity(activity) && !isVoiceTranscriptActivity(activity)) { + const { recording, voiceHandlers } = yield select(state => ({ + recording: state.voice.voiceState !== 'idle', + voiceHandlers: state.voice.voiceHandlers + })); + + // Only process voice chunks if speech-to-speech is enabled. + if (!recording) { + return; + } + + switch (activity.name) { + case 'media.chunk': { + const audioContent = activity?.value?.content; + if (audioContent) { + voiceHandlers.forEach(handler => handler.queueAudio(audioContent)); + } + break; + } + + case 'request.update': { + const state = activity?.value?.state; + + switch (state) { + case 'detected': + voiceHandlers.forEach(handler => handler.stopAllAudio()); + yield put(setVoiceState('user_speaking')); + break; + + case 'processing': + yield put(setVoiceState('processing')); + break; + + default: + break; + } + break; + } + + default: + break; + } + + return; + } + // TODO: [P2] #3953 Move the patching logic to a DirectLineJS wrapper, instead of too close to inners of Web Chat.
activity = patchNullAsUndefined(activity); activity = patchActivityWithFromRole(activity, userID); diff --git a/packages/core/src/sagas/postVoiceActivitySaga.ts b/packages/core/src/sagas/postVoiceActivitySaga.ts new file mode 100644 index 0000000000..ad7390deb0 --- /dev/null +++ b/packages/core/src/sagas/postVoiceActivitySaga.ts @@ -0,0 +1,76 @@ +import { select, takeEvery } from 'redux-saga/effects'; +import { VOICE_POST_ACTIVITY } from '../actions/postVoiceActivity'; +import languageSelector from '../selectors/language'; +import dateToLocaleISOString from '../utils/dateToLocaleISOString'; +import whileConnected from './effects/whileConnected'; +import observeOnce from './effects/observeOnce'; + +import type { DirectLineJSBotConnection } from '../types/external/DirectLineJSBotConnection'; +import type { DirectLineActivity } from '../types/external/DirectLineActivity'; +import type { GlobalScopePonyfill } from '../types/GlobalScopePonyfill'; +import type { VoicePostActivityAction } from '../actions/postVoiceActivity'; + +/** + * Saga for handling outgoing voice activities. + * + * Unlike regular postActivitySaga, this saga: + * - Does NOT wait for echo back + * - Does NOT store activity in Redux + * - Does NOT dispatch PENDING/FULFILLED/REJECTED actions + * - Fire and forget - just send to WebSocket + * + * This prevents memory leaks from storing thousands of voice chunks. + */ +function* postVoiceActivity( + directLine: DirectLineJSBotConnection, + userID: string, + username: string, + { payload: { activity } }: VoicePostActivityAction, + ponyfill: GlobalScopePonyfill +) { + const locale: string = yield select(languageSelector); + const localTimeZone = + typeof window.Intl === 'undefined' ? undefined : new Intl.DateTimeFormat().resolvedOptions().timeZone; + const now = new ponyfill.Date(); + + const outgoingActivity = { + ...activity, + channelId: 'webchat', + from: { + id: userID, + name: username, + role: 'user' + }, + locale, + localTimestamp: dateToLocaleISOString(now), + localTimezone: localTimeZone, + ...(activity.type === 'event' + ? 
{ + name: activity.name, + value: activity.value + } + : {}) + }; + + try { + yield observeOnce(directLine.postActivity(outgoingActivity as DirectLineActivity)); + } catch (error) { + console.error('botframework-webchat: Failed to post voice activity to chat adapter.', error); + } +} + +export default function* voiceActivitySaga(ponyfill: GlobalScopePonyfill) { + yield whileConnected(function* voiceActivityWhileConnected({ + directLine, + userID, + username + }: { + directLine: DirectLineJSBotConnection; + userID: string; + username: string; + }) { + yield takeEvery(VOICE_POST_ACTIVITY, function* (action: VoicePostActivityAction) { + yield* postVoiceActivity(directLine, userID, username, action, ponyfill); + }); + }); +} diff --git a/packages/core/src/utils/voiceActivity/getVoiceActivityRole.spec.ts b/packages/core/src/utils/voiceActivity/getVoiceActivityRole.spec.ts new file mode 100644 index 0000000000..e8dcba4ad7 --- /dev/null +++ b/packages/core/src/utils/voiceActivity/getVoiceActivityRole.spec.ts @@ -0,0 +1,150 @@ +import getVoiceActivityRole from './getVoiceActivityRole'; +import { WebChatActivity } from '../../types/WebChatActivity'; + +// Mock activity factory for testing +const createMockActivity = (type: string = 'event', name?: string, value?: any, valueType?: string): WebChatActivity => + ({ + type: type as any, + id: 'test-activity-id', + from: { id: 'test-user' }, + channelData: { + 'webchat:sequence-id': 1 + }, + ...(name && { name }), + ...(value && { value }), + ...(valueType && { valueType }) + }) as WebChatActivity; + +const createMockVoiceActivity = ( + name: string, + value: Record, + valueType: string = 'application/vnd.microsoft.activity.azure.directline.audio.chunk' +): WebChatActivity => createMockActivity('event', name, value, valueType); + +const createMockTranscriptActivity = ( + origin: 'user' | 'agent', + transcription: string = 'test', + valueType: string = 'application/vnd.microsoft.activity.azure.directline.audio.transcript' +): WebChatActivity => createMockActivity('event', 'media.end', { transcription, origin }, valueType); + +describe('getVoiceActivityRole', () => { + describe.each([ + ['user', 'user', 'Hello world'], + ['user', 'user', ''] + ] as const)('Voice transcript activities with origin %s', (expectedRole, origin, transcription) => { + test(`should return "${expectedRole}" for media.end with origin ${origin}${transcription ? '' : ' and empty transcription'}`, () => { + const activity = createMockTranscriptActivity(origin, transcription); + + const result = getVoiceActivityRole(activity); + + expect(result).toBe(expectedRole); + }); + }); + + describe.each([ + ['bot', 'agent', 'Hello, how can I help you?'], + ['bot', 'agent', ''] + ] as const)('Voice transcript activities with origin %s', (expectedRole, origin, transcription) => { + test(`should return "${expectedRole}" for media.end with origin ${origin}${transcription ? 
'' : ' and empty transcription'}`, () => { + const activity = createMockTranscriptActivity(origin, transcription); + + const result = getVoiceActivityRole(activity); + + expect(result).toBe(expectedRole); + }); + }); + + describe('Non-transcript voice activities', () => { + test.each([ + ['media.chunk', { content: 'base64' }, 'application/vnd.microsoft.activity.azure.directline.audio.chunk'], + ['request.update', { state: 'detected' }, 'application/vnd.microsoft.activity.azure.directline.audio.state'] + ])('should return undefined for %s', (name, value, valueType) => { + const activity = createMockVoiceActivity(name, value, valueType); + + const result = getVoiceActivityRole(activity); + + expect(result).toBeUndefined(); + }); + }); + + describe('Non-voice activities', () => { + test.each([ + ['message', 'regular message activity'], + ['typing', 'typing activity'] + ])('should return undefined for %s', type => { + const activity = createMockActivity(type); + + const result = getVoiceActivityRole(activity); + + expect(result).toBeUndefined(); + }); + + test('should return undefined for event activity without audio valueType', () => { + const activity = createMockActivity('event', 'test', { someOtherData: 'test' }, 'application/json'); + + const result = getVoiceActivityRole(activity); + + expect(result).toBeUndefined(); + }); + }); + + describe('Real-world scenarios', () => { + test('should correctly identify user transcript in conversation flow', () => { + const userActivities = [ + createMockVoiceActivity( + 'request.update', + { state: 'detected' }, + 'application/vnd.microsoft.activity.azure.directline.audio.state' + ), + createMockVoiceActivity( + 'media.chunk', + { content: 'base64' }, + 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ), + createMockTranscriptActivity('user', 'What is the weather today?') + ]; + + const roles = userActivities.map(activity => getVoiceActivityRole(activity)); + + expect(roles).toEqual([undefined, undefined, 'user']); + }); + + test('should correctly identify bot transcript in conversation flow', () => { + const botActivities = [ + createMockVoiceActivity( + 'media.chunk', + { content: 'chunk1' }, + 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ), + createMockVoiceActivity( + 'media.chunk', + { content: 'chunk2' }, + 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ), + createMockTranscriptActivity('agent', 'Today will be sunny with a high of 75 degrees.'), + createMockVoiceActivity( + 'request.update', + { state: 'processing' }, + 'application/vnd.microsoft.activity.azure.directline.audio.state' + ) + ]; + + const roles = botActivities.map(activity => getVoiceActivityRole(activity)); + + expect(roles).toEqual([undefined, undefined, 'bot', undefined]); + }); + + test('should handle mixed activity types in conversation', () => { + const mixedActivities = [ + createMockActivity('message'), + createMockTranscriptActivity('user', 'Hello'), + createMockActivity('typing'), + createMockTranscriptActivity('agent', 'Hi there!') + ]; + + const roles = mixedActivities.map(activity => getVoiceActivityRole(activity)); + + expect(roles).toEqual([undefined, 'user', undefined, 'bot']); + }); + }); +}); diff --git a/packages/core/src/utils/voiceActivity/getVoiceActivityRole.ts b/packages/core/src/utils/voiceActivity/getVoiceActivityRole.ts new file mode 100644 index 0000000000..2801514e92 --- /dev/null +++ b/packages/core/src/utils/voiceActivity/getVoiceActivityRole.ts @@ -0,0 +1,16 @@ +import { WebChatActivity 
} from '../../types/WebChatActivity'; +import isVoiceTranscriptActivity from './isVoiceTranscriptActivity'; + +const getVoiceActivityRole = (activity: WebChatActivity): 'bot' | 'user' | undefined => { + if (isVoiceTranscriptActivity(activity)) { + if (activity.value.origin === 'agent') { + return 'bot'; + } else if (activity.value.origin === 'user') { + return 'user'; + } + } + + return undefined; +}; + +export default getVoiceActivityRole; diff --git a/packages/core/src/utils/voiceActivity/getVoiceActivityText.spec.ts b/packages/core/src/utils/voiceActivity/getVoiceActivityText.spec.ts new file mode 100644 index 0000000000..85197891de --- /dev/null +++ b/packages/core/src/utils/voiceActivity/getVoiceActivityText.spec.ts @@ -0,0 +1,100 @@ +import getVoiceActivityText from './getVoiceActivityText'; +import { WebChatActivity } from '../../types/WebChatActivity'; + +// Mock activity factory for testing +const createMockActivity = (type: string = 'event', name?: string, value?: any, valueType?: string): WebChatActivity => + ({ + type: type as any, + id: 'test-activity-id', + from: { id: 'test-user' }, + channelData: { + 'webchat:sequence-id': 1 + }, + ...(name && { name }), + ...(value && { value }), + ...(valueType && { valueType }) + }) as WebChatActivity; + +const createMockTranscriptActivity = ( + transcription: string | undefined, + origin: 'user' | 'agent' = 'user', + valueType: string = 'application/vnd.microsoft.activity.azure.directline.audio.transcript' +): WebChatActivity => + createMockActivity( + 'event', + 'media.end', + transcription !== undefined ? { transcription, origin } : { origin }, + valueType + ); + +describe('getVoiceActivityText', () => { + describe('Voice transcript activities', () => { + test.each([ + ['Hello world', 'Hello world'], + ['How can I help you today?', 'How can I help you today?'], + ['', ''] + ])('should return %p for media.end with transcription %p', (expected, transcription) => { + const activity = createMockTranscriptActivity(transcription); + + const result = getVoiceActivityText(activity); + + expect(result).toBe(expected); + }); + + test('should return undefined for media.end without transcript property', () => { + const activity = createMockTranscriptActivity(undefined); + + const result = getVoiceActivityText(activity); + + expect(result).toBeUndefined(); + }); + }); + + describe('Non-transcript voice activities', () => { + test.each([['media.chunk'], ['request.update']])('should return undefined for %s activity', name => { + const activity = createMockActivity( + 'event', + name, + { content: 'base64' }, + 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ); + + const result = getVoiceActivityText(activity); + + expect(result).toBeUndefined(); + }); + }); + + describe('Non-voice activities', () => { + test.each([ + ['message', undefined, undefined, undefined], + ['event', undefined, { someData: 'test' }, undefined] + ])('should return undefined for %s', (type, name, value, valueType) => { + const activity = createMockActivity(type, name, value, valueType); + + const result = getVoiceActivityText(activity); + + expect(result).toBeUndefined(); + }); + }); + + describe('Edge cases', () => { + test('should handle transcript with whitespace only', () => { + const activity = createMockTranscriptActivity(' '); + + const result = getVoiceActivityText(activity); + + expect(result).toBe(' '); + }); + + test('should handle very long transcript', () => { + const longText = 'A'.repeat(10000); + const activity = 
createMockTranscriptActivity(longText); + + const result = getVoiceActivityText(activity); + + expect(result).toBe(longText); + expect(result?.length).toBe(10000); + }); + }); +}); diff --git a/packages/core/src/utils/voiceActivity/getVoiceActivityText.ts b/packages/core/src/utils/voiceActivity/getVoiceActivityText.ts new file mode 100644 index 0000000000..9bd60ddee0 --- /dev/null +++ b/packages/core/src/utils/voiceActivity/getVoiceActivityText.ts @@ -0,0 +1,11 @@ +import { WebChatActivity } from '../../types/WebChatActivity'; +import isVoiceTranscriptActivity from './isVoiceTranscriptActivity'; + +const getVoiceActivityText = (activity: WebChatActivity): string | undefined => { + if (isVoiceTranscriptActivity(activity)) { + return activity.value.transcription; + } + return undefined; +}; + +export default getVoiceActivityText; diff --git a/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts b/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts new file mode 100644 index 0000000000..5b57506669 --- /dev/null +++ b/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts @@ -0,0 +1,202 @@ +import isVoiceActivity from './isVoiceActivity'; +import { WebChatActivity } from '../../types/WebChatActivity'; + +// Mock activity factory for testing +const createMockActivity = (type: string = 'event', name?: string, value?: any, valueType?: string): WebChatActivity => + ({ + type: type as any, + id: 'test-activity-id', + from: { id: 'test-user' }, + channelData: { + 'webchat:sequence-id': 1 + }, + ...(name && { name }), + ...(value && { value }), + ...(valueType && { valueType }) + }) as WebChatActivity; + +const createMockVoiceActivity = ( + name: string, + value: Record, + valueType: string = 'application/vnd.microsoft.activity.azure.directline.audio.chunk' +): WebChatActivity => createMockActivity('event', name, value, valueType); + +const createMockDtmfActivity = (name: string, value: Record): WebChatActivity => + createMockActivity('event', name, value, 'application/vnd.microsoft.activity.ccv2.dtmf'); + +describe('isVoiceActivity', () => { + describe('Valid voice activities', () => { + test('should return true for event activity with azure directline audio valueType', () => { + const activity = createMockVoiceActivity( + 'media.chunk', + { content: 'base64', contentType: 'audio/webm' }, + 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ); + + const result = isVoiceActivity(activity); + + expect(result).toBe(true); + }); + + test('should return true for event activity with ccv2 audio valueType', () => { + const activity = createMockVoiceActivity( + 'media.chunk', + { content: 'base64' }, + 'application/vnd.microsoft.activity.ccv2.audio.chunk' + ); + + const result = isVoiceActivity(activity); + + expect(result).toBe(true); + }); + + test('should return true for event activity with dtmf valueType', () => { + const activity = createMockDtmfActivity('media.end', { key: '1' }); + + const result = isVoiceActivity(activity); + + expect(result).toBe(true); + }); + + test('should return true for request.update with audio.state valueType', () => { + const activity = createMockVoiceActivity( + 'request.update', + { state: 'detected', message: 'Your request is identified' }, + 'application/vnd.microsoft.activity.azure.directline.audio.state' + ); + + const result = isVoiceActivity(activity); + + expect(result).toBe(true); + }); + }); + + describe('Invalid activities', () => { + const testCases = [ + { + name: 'message activity with audio valueType', + activity: () 
=> + createMockActivity( + 'message', + 'media.chunk', + { content: 'base64' }, + 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ) + }, + { + name: 'typing activity', + activity: () => createMockActivity('typing') + }, + { + name: 'event activity with non-audio valueType', + activity: () => createMockActivity('event', 'test', { data: 'test' }, 'application/json') + }, + { + name: 'event activity without valueType', + activity: () => createMockActivity('event', 'test', { someData: 'value' }) + }, + { + name: 'event activity with no value', + activity: () => + createMockActivity( + 'event', + 'test', + undefined, + 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ) + }, + { + name: 'event activity with no name', + activity: () => + createMockActivity( + 'event', + undefined, + { data: 'test' }, + 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ) + } + ]; + + test.each(testCases)('should return false for $name', ({ activity }) => { + const result = isVoiceActivity(activity()); + + expect(result).toBe(false); + }); + }); + + describe('Real-world voice activity scenarios', () => { + const voiceScenarios = [ + { + name: 'request.update with speech detected state', + eventName: 'request.update', + value: { state: 'detected', message: 'Your request is identified' }, + valueType: 'application/vnd.microsoft.activity.azure.directline.audio.state' + }, + { + name: 'request.update with processing state', + eventName: 'request.update', + value: { state: 'processing', message: 'Your request is being processed' }, + valueType: 'application/vnd.microsoft.activity.azure.directline.audio.state' + }, + { + name: 'media.end with user transcription', + eventName: 'media.end', + value: { transcription: 'My destination is bangalore', origin: 'user' }, + valueType: 'application/vnd.microsoft.activity.azure.directline.audio.transcript' + }, + { + name: 'media.chunk with server audio response', + eventName: 'media.chunk', + value: { content: 'base64chunk', contentType: 'audio/webm' }, + valueType: 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + }, + { + name: 'media.end with bot transcription', + eventName: 'media.end', + value: { transcription: 'Your destination is at 1000m above sea level', origin: 'agent' }, + valueType: 'application/vnd.microsoft.activity.azure.directline.audio.transcript' + } + ]; + + test.each(voiceScenarios)('should return true for $name', ({ eventName, value, valueType }) => { + const activity = createMockVoiceActivity(eventName, value, valueType); + + const result = isVoiceActivity(activity); + + expect(result).toBe(true); + }); + }); + + describe('Real-world DTMF activity scenarios', () => { + const dtmfScenarios = [ + { + name: 'DTMF with digit 1', + eventName: 'media.end', + value: { key: '1' } + }, + { + name: 'DTMF with star key', + eventName: 'media.end', + value: { key: '*' } + }, + { + name: 'DTMF with hash key', + eventName: 'media.end', + value: { key: '#' } + }, + { + name: 'DTMF with digit 5', + eventName: 'media.end', + value: { key: '5' } + } + ]; + + test.each(dtmfScenarios)('should return true for $name', ({ eventName, value }) => { + const activity = createMockDtmfActivity(eventName, value); + + const result = isVoiceActivity(activity); + + expect(result).toBe(true); + }); + }); +}); diff --git a/packages/core/src/utils/voiceActivity/isVoiceActivity.ts b/packages/core/src/utils/voiceActivity/isVoiceActivity.ts new file mode 100644 index 0000000000..f6c86b8dcc --- /dev/null +++ 
b/packages/core/src/utils/voiceActivity/isVoiceActivity.ts @@ -0,0 +1,22 @@ +import { check, literal, looseObject, object, pipe, safeParse, string, type InferOutput } from 'valibot'; + +import { WebChatActivity } from '../../types/WebChatActivity'; + +// Activity spec proposal - https://github.com/microsoft/Agents/issues/416 +// valueType: contains 'audio' or 'dtmf' (works with any server prefix like azure.directline, ccv2, etc.) +const VoiceActivitySchema = object({ + name: string(), + type: literal('event'), + value: looseObject({}), + valueType: pipe( + string(), + check(value => value.includes('audio') || value.includes('dtmf')) + ) +}); + +const isVoiceActivity = ( + activity: WebChatActivity +): activity is WebChatActivity & InferOutput => + safeParse(VoiceActivitySchema, activity).success; + +export default isVoiceActivity; diff --git a/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.spec.ts b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.spec.ts new file mode 100644 index 0000000000..d3631fd019 --- /dev/null +++ b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.spec.ts @@ -0,0 +1,224 @@ +import isVoiceTranscriptActivity from './isVoiceTranscriptActivity'; +import { WebChatActivity } from '../../types/WebChatActivity'; + +// Mock activity factory for testing +const createMockActivity = (type: string = 'event', name?: string, value?: any, valueType?: string): WebChatActivity => + ({ + type: type as any, + id: 'test-activity-id', + from: { id: 'test-user' }, + channelData: { + 'webchat:sequence-id': 1 + }, + ...(name && { name }), + ...(value && { value }), + ...(valueType && { valueType }) + }) as WebChatActivity; + +const createMockVoiceActivity = ( + name: string, + value: Record, + valueType: string = 'application/vnd.microsoft.activity.azure.directline.audio.chunk' +): WebChatActivity => createMockActivity('event', name, value, valueType); + +const createMockTranscriptActivity = ( + transcription: string, + origin: 'user' | 'agent', + valueType: string = 'application/vnd.microsoft.activity.azure.directline.audio.transcript' +): WebChatActivity => createMockActivity('event', 'media.end', { transcription, origin }, valueType); + +describe('isVoiceTranscriptActivity', () => { + describe('Valid transcript activities', () => { + test('should return true for media.end with user transcription', () => { + const activity = createMockTranscriptActivity('Hello world', 'user'); + + const result = isVoiceTranscriptActivity(activity); + + expect(result).toBe(true); + }); + + test('should return true for media.end with agent transcription', () => { + const activity = createMockTranscriptActivity('Hi there!', 'agent'); + + const result = isVoiceTranscriptActivity(activity); + + expect(result).toBe(true); + }); + + test('should return true for media.end with empty transcription string', () => { + const activity = createMockTranscriptActivity('', 'user'); + + const result = isVoiceTranscriptActivity(activity); + + expect(result).toBe(true); + }); + + test('should return true for ccv2 transcript valueType', () => { + const activity = createMockTranscriptActivity( + 'Test transcript', + 'user', + 'application/vnd.microsoft.activity.ccv2.audio.transcript' + ); + + const result = isVoiceTranscriptActivity(activity); + + expect(result).toBe(true); + }); + }); + + describe('Invalid activities', () => { + const testCases = [ + { + name: 'media.chunk voice activity', + activity: () => + createMockVoiceActivity( + 'media.chunk', + { content: 'base64' }, + 
'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ) + }, + { + name: 'request.update voice activity', + activity: () => + createMockVoiceActivity( + 'request.update', + { state: 'detected' }, + 'application/vnd.microsoft.activity.azure.directline.audio.state' + ) + }, + { + name: 'media.end without transcription', + activity: () => + createMockActivity( + 'event', + 'media.end', + { origin: 'user' }, + 'application/vnd.microsoft.activity.azure.directline.audio.transcript' + ) + }, + { + name: 'media.end with non-string transcription', + activity: () => + createMockActivity( + 'event', + 'media.end', + { transcription: 123, origin: 'user' }, + 'application/vnd.microsoft.activity.azure.directline.audio.transcript' + ) + }, + { + name: 'media.end with null transcription', + activity: () => + createMockActivity( + 'event', + 'media.end', + { transcription: null, origin: 'user' }, + 'application/vnd.microsoft.activity.azure.directline.audio.transcript' + ) + }, + { + name: 'regular message activity', + activity: () => createMockActivity('message', 'test') + }, + { + name: 'typing activity', + activity: () => createMockActivity('typing') + }, + { + name: 'media.end with non-transcript valueType', + activity: () => + createMockActivity( + 'event', + 'media.end', + { transcription: 'test', origin: 'user' }, + 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ) + }, + { + name: 'event activity without valueType', + activity: () => createMockActivity('event', 'media.end', { transcription: 'test', origin: 'user' }) + }, + { + name: 'event activity without name', + activity: () => + createMockActivity( + 'event', + undefined, + { transcription: 'test', origin: 'user' }, + 'application/vnd.microsoft.activity.azure.directline.audio.transcript' + ) + } + ]; + + test.each(testCases)('should return false for $name', ({ activity }) => { + const result = isVoiceTranscriptActivity(activity()); + + expect(result).toBe(false); + }); + }); + + describe('Real-world scenarios', () => { + test('should identify user transcript in conversation flow', () => { + const conversationActivities = [ + createMockVoiceActivity( + 'request.update', + { state: 'detected' }, + 'application/vnd.microsoft.activity.azure.directline.audio.state' + ), + createMockVoiceActivity( + 'request.update', + { state: 'processing' }, + 'application/vnd.microsoft.activity.azure.directline.audio.state' + ), + createMockTranscriptActivity('What is the weather today?', 'user') + ]; + + const transcriptResults = conversationActivities.map(activity => isVoiceTranscriptActivity(activity)); + + expect(transcriptResults).toEqual([false, false, true]); + }); + + test('should identify agent transcript in response flow', () => { + const responseActivities = [ + createMockVoiceActivity( + 'request.update', + { state: 'response.available' }, + 'application/vnd.microsoft.activity.azure.directline.audio.state' + ), + createMockVoiceActivity( + 'media.chunk', + { content: 'chunk1' }, + 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ), + createMockVoiceActivity( + 'media.chunk', + { content: 'chunk2' }, + 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ), + createMockTranscriptActivity('Today will be sunny with a high of 75 degrees.', 'agent') + ]; + + const transcriptResults = responseActivities.map(activity => isVoiceTranscriptActivity(activity)); + + expect(transcriptResults).toEqual([false, false, false, true]); + }); + + test('should handle complete conversation with mixed 
activities', () => { + const mixedActivities = [ + createMockActivity('message', 'test'), + createMockTranscriptActivity('Hello', 'user'), + createMockVoiceActivity( + 'media.chunk', + { content: 'audio' }, + 'application/vnd.microsoft.activity.azure.directline.audio.chunk' + ), + createMockTranscriptActivity('Hi there!', 'agent'), + createMockActivity('typing') + ]; + + const transcriptResults = mixedActivities.map(activity => isVoiceTranscriptActivity(activity)); + + expect(transcriptResults).toEqual([false, true, false, true, false]); + }); + }); +}); diff --git a/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts new file mode 100644 index 0000000000..8b861d1783 --- /dev/null +++ b/packages/core/src/utils/voiceActivity/isVoiceTranscriptActivity.ts @@ -0,0 +1,24 @@ +import { check, literal, object, picklist, pipe, safeParse, string, type InferOutput } from 'valibot'; + +import { WebChatActivity } from '../../types/WebChatActivity'; + +// valueType: contains 'audio.transcript' (e.g., azure.directline.audio.transcript) +const VoiceTranscriptActivitySchema = object({ + name: literal('media.end'), + type: literal('event'), + value: object({ + origin: picklist(['agent', 'user']), + transcription: string() + }), + valueType: pipe( + string(), + check(value => value.includes('audio.transcript')) + ) +}); + +const isVoiceTranscriptActivity = ( + activity: WebChatActivity +): activity is WebChatActivity & InferOutput => + safeParse(VoiceTranscriptActivitySchema, activity).success; + +export default isVoiceTranscriptActivity; diff --git a/packages/fluent-theme/src/components/activity/PartGroupingDecorator.tsx b/packages/fluent-theme/src/components/activity/PartGroupingDecorator.tsx index 6957fb797d..31b4e6626d 100644 --- a/packages/fluent-theme/src/components/activity/PartGroupingDecorator.tsx +++ b/packages/fluent-theme/src/components/activity/PartGroupingDecorator.tsx @@ -1,5 +1,10 @@ import { reactNode, validateProps } from '@msinternal/botframework-webchat-react-valibot'; -import { getOrgSchemaMessage, PartGrouping, type WebChatActivity } from 'botframework-webchat/internal'; +import { + getOrgSchemaMessage, + getVoiceActivityRole, + PartGrouping, + type WebChatActivity +} from 'botframework-webchat/internal'; import cx from 'classnames'; import React, { memo, useMemo, type ReactNode } from 'react'; import { array, custom, object, optional, pipe, readonly, safeParse } from 'valibot'; @@ -38,8 +43,12 @@ function PartGroupingDecorator(props: PartGroupingDecoratorProps) { [activity, restActivities.length] ); - const isFromUser = activity?.from?.role === 'user'; - const isFromBot = activity?.from?.role === 'bot'; + // S2S-both user and bot transcript comes from server (RT-LLM) hence need to check role explicitly. + // voiceActivityRole takes precedence over from.role since S2S activities always come from 'bot' + const voiceActivityRole = activity && getVoiceActivityRole(activity); + + const isFromBot = voiceActivityRole ? voiceActivityRole === 'bot' : activity?.from?.role === 'bot'; + const isFromUser = voiceActivityRole ? voiceActivityRole === 'user' : activity?.from?.role === 'user'; return (
; + +function VoiceTranscriptActivityStatus({ activity }: VoiceTranscriptActivityStatusProps) { + const classNames = useStyles(styles); + const localize = useLocalizer(); + const { timestamp } = activity; + const role = getVoiceActivityRole(activity); + const text = getVoiceActivityText(activity); + + const agentLabel = localize('ACTIVITY_STATUS_VOICE_TRANSCRIPT_AGENT_LABEL'); + + if (!text) { + return null; + } + + return ( + + {role === 'bot' && ( + + {agentLabel} + {timestamp && {'|'}} + + )} + {timestamp && } + + ); +} + +export default memo(VoiceTranscriptActivityStatus); diff --git a/packages/fluent-theme/src/components/icon/FluentIcon.module.css b/packages/fluent-theme/src/components/icon/FluentIcon.module.css index a091a1eee6..f6f274c23a 100644 --- a/packages/fluent-theme/src/components/icon/FluentIcon.module.css +++ b/packages/fluent-theme/src/components/icon/FluentIcon.module.css @@ -50,4 +50,11 @@ --webchat__fluent-icon--mask: url('data:image/svg+xml;utf8,'); } +:global(.webchat) .icon--microphone { + --webchat__fluent-icon--mask: url('data:image/svg+xml;utf8,'); +} + +:global(.webchat) .icon--audio-playing { + --webchat__fluent-icon--mask: url('data:image/svg+xml;utf8,'); +} /* #endregion */ diff --git a/packages/fluent-theme/src/components/sendBox/MicrophoneToolbarButton.tsx b/packages/fluent-theme/src/components/sendBox/MicrophoneToolbarButton.tsx new file mode 100644 index 0000000000..946f2a5e88 --- /dev/null +++ b/packages/fluent-theme/src/components/sendBox/MicrophoneToolbarButton.tsx @@ -0,0 +1,57 @@ +import { hooks } from 'botframework-webchat'; +import cx from 'classnames'; +import React, { memo, useCallback } from 'react'; + +import { useStyles } from '../../styles'; +import testIds from '../../testIds'; +import { FluentIcon } from '../icon'; +import { ToolbarButton } from './Toolbar'; + +import styles from './Toolbar.module.css'; + +const { useVoiceState, useStartVoice, useStopVoice, useLocalizer } = hooks; + +function MicrophoneToolbarButton() { + const [voiceState] = useVoiceState(); + const classNames = useStyles(styles); + const localize = useLocalizer(); + const startVoice = useStartVoice(); + const stopVoice = useStopVoice(); + + const recording = voiceState !== 'idle'; + + const handleMicrophoneClick = useCallback(() => { + if (recording) { + stopVoice(); // Stop recognition and synthesis. + } else { + startVoice(); // If it was stopped, will start recognition. It will synthesize when the bot respond. + } + }, [recording, startVoice, stopVoice]); + + const ariaLabel = localize( + recording ? 
'SPEECH_INPUT_MICROPHONE_BUTTON_OPEN_ALT' : 'SPEECH_INPUT_MICROPHONE_BUTTON_CLOSE_ALT' + ); + + const isBotSpeaking = voiceState === 'bot_speaking'; + const isUserSpeaking = voiceState === 'user_speaking'; + + return ( + + + + ); +} + +MicrophoneToolbarButton.displayName = 'SendBox.MicrophoneToolbarButton'; + +export default memo(MicrophoneToolbarButton); diff --git a/packages/fluent-theme/src/components/sendBox/SendBox.module.css b/packages/fluent-theme/src/components/sendBox/SendBox.module.css index 473a71e55e..522205a042 100644 --- a/packages/fluent-theme/src/components/sendBox/SendBox.module.css +++ b/packages/fluent-theme/src/components/sendBox/SendBox.module.css @@ -55,7 +55,7 @@ transition: clip-path var(--webchat-durationUltraFast) var(--webchat-curveAccelerateMid); } - &:focus-within::after { + &:not(:has(textarea[readonly][aria-disabled='true'])):focus-within::after { clip-path: inset(calc(100% - var(--webchat-strokeWidthThicker)) 0 0 0); transition: clip-path var(--webchat-durationNormal) var(--webchat-curveDecelerateMid); } diff --git a/packages/fluent-theme/src/components/sendBox/SendBox.tsx b/packages/fluent-theme/src/components/sendBox/SendBox.tsx index 5b1264c6e2..05fb2bf184 100644 --- a/packages/fluent-theme/src/components/sendBox/SendBox.tsx +++ b/packages/fluent-theme/src/components/sendBox/SendBox.tsx @@ -1,9 +1,11 @@ import { Components, hooks } from 'botframework-webchat'; +import { usePostVoiceActivity, useShouldShowMicrophoneButton } from 'botframework-webchat/internal'; import cx from 'classnames'; import React, { memo, ReactNode, useCallback, + useMemo, useRef, useState, type FormEventHandler, @@ -19,12 +21,14 @@ import { SuggestedActions } from '../suggestedActions'; import { TelephoneKeypadSurrogate, useTelephoneKeypadShown, type DTMF } from '../telephoneKeypad'; import AddAttachmentButton from './AddAttachmentButton'; import ErrorMessage from './ErrorMessage'; +import useSpeechStateMessage from './private/useSpeechStateMessage'; import useSubmitError from './private/useSubmitError'; import useTranscriptNavigation from './private/useTranscriptNavigation'; import useUniqueId from './private/useUniqueId'; import styles from './SendBox.module.css'; import TelephoneKeypadToolbarButton from './TelephoneKeypadToolbarButton'; import { Toolbar, ToolbarButton, ToolbarSeparator } from './Toolbar'; +import MicrophoneToolbarButton from './MicrophoneToolbarButton'; const { useFocus, @@ -35,7 +39,8 @@ const { useSendBoxValue, useSendMessage, useStyleOptions, - useUIState + useUIState, + useVoiceState } = hooks; const { AttachmentBar, TextArea } = Components; @@ -54,23 +59,35 @@ function SendBox(props: Props) { const [localMessage, setLocalMessage] = useState(''); const [telephoneKeypadShown] = useTelephoneKeypadShown(); const [uiState] = useUIState(); + const [voiceState] = useVoiceState(); const classNames = useStyles(styles); const variantClassName = useVariantClassName(styles); const errorMessageId = useUniqueId('sendbox__error-message-id'); const inputRef = useRef(null); const localize = useLocalizer(); const makeThumbnail = useMakeThumbnail(); + const postVoiceActivity = usePostVoiceActivity(); const sendMessage = useSendMessage(); const setFocus = useFocus(); + const showMicrophoneButton = useShouldShowMicrophoneButton(); + const speechStateMessage = useSpeechStateMessage(); const message = props.isPrimary ? globalMessage : localMessage; + const recording = voiceState !== 'idle'; const setMessage = props.isPrimary ? 
setGlobalMessage : setLocalMessage; const isBlueprint = uiState === 'blueprint'; const [errorMessage, commitLatestError] = useSubmitError({ message, attachments }); const isMessageLengthExceeded = !!maxMessageLength && message.length > maxMessageLength; - const shouldShowMessageLength = - !isBlueprint && !telephoneKeypadShown && maxMessageLength && isFinite(maxMessageLength); + const shouldShowMessageLength = useMemo( + () => + !isBlueprint && + !telephoneKeypadShown && + !!maxMessageLength && + isFinite(maxMessageLength) && + !showMicrophoneButton, + [isBlueprint, telephoneKeypadShown, maxMessageLength, showMicrophoneButton] + ); const shouldShowTelephoneKeypad = !isBlueprint && telephoneKeypadShown; useRegisterFocusSendBox( @@ -156,9 +173,21 @@ function SendBox(props: Props) { ); const handleTelephoneKeypadButtonClick = useCallback( - // TODO: We need more official way of sending DTMF. - (dtmf: DTMF) => sendMessage(`/DTMFKey ${dtmf}`), - [sendMessage] + (dtmf: DTMF) => { + if (recording) { + postVoiceActivity({ + name: 'media.end', + type: 'event', + value: { + key: dtmf + } + } as any); + } else { + // TODO: We need more official way of sending DTMF. + sendMessage(`/DTMFKey ${dtmf}`); + } + }, + [postVoiceActivity, recording, sendMessage] ); const handleTranscriptNavigation = useTranscriptNavigation(); @@ -193,7 +222,10 @@ function SendBox(props: Props) { hidden={shouldShowTelephoneKeypad} onClick={handleClick} onInput={handleMessageChange} - placeholder={props.placeholder ?? localize('TEXT_INPUT_PLACEHOLDER')} + placeholder={ + props.placeholder ?? (showMicrophoneButton ? speechStateMessage : localize('TEXT_INPUT_PLACEHOLDER')) + } + readOnly={showMicrophoneButton} ref={inputRef} value={message} /> @@ -226,14 +258,18 @@ function SendBox(props: Props) { {!hideTelephoneKeypadButton && } {!disableFileUpload && } - - - + {showMicrophoneButton ? ( + + ) : ( + + + + )}
{!disableFileUpload && } diff --git a/packages/fluent-theme/src/components/sendBox/Toolbar.module.css b/packages/fluent-theme/src/components/sendBox/Toolbar.module.css index c60a842bb9..1ffae08533 100644 --- a/packages/fluent-theme/src/components/sendBox/Toolbar.module.css +++ b/packages/fluent-theme/src/components/sendBox/Toolbar.module.css @@ -46,6 +46,54 @@ color: var(--webchat-colorNeutralForegroundDisabled); cursor: not-allowed; } + + &.sendbox__toolbar-button--active { + --webchat__toolbar-button--pulse-start-size: 30px; + --webchat__toolbar-button--pulse-end-size: 58px; + --webchat__toolbar-button--pulse-opacity: 0.5; + --webchat__toolbar-button--background-gradient-opacity: 0.15; + --webchat__toolbar-button--gradient-color-1: var(--webchat-colorBrandForeground1, #0078d4); + --webchat__toolbar-button--gradient-color-2: #2db4ff; + --webchat__toolbar-button--gradient-color-3: #d660ff; + --webchat__toolbar-button--gradient-color-4: #fea874; + + background-color: var(--webchat-colorNeutralForeground2BrandSelected); + border-radius: 50%; + color: var(--webchat-colorNeutralBackground1); + + @media (hover: hover) { + &:hover { + color: var(--webchat-colorNeutralBackground1); + } + } + } + + &.sendbox__toolbar-button--with-pulse::before { + animation: toolbar-button__pulse 1s linear infinite alternate; + background-color: var(--webchat-colorNeutralForeground2BrandSelected); + border-radius: 50%; + content: ''; + height: var(--webchat__toolbar-button--pulse-start-size); + opacity: var(--webchat__toolbar-button--pulse-opacity); + position: absolute; + width: var(--webchat__toolbar-button--pulse-start-size); + } + + &.sendbox__toolbar-button--with-gradient::after { + background: linear-gradient( + 90deg, + var(--webchat__toolbar-button--gradient-color-1) 0%, + color-mix(in srgb, var(--webchat__toolbar-button--gradient-color-1), var(--webchat__toolbar-button--gradient-color-2)) 33%, + color-mix(in srgb, var(--webchat__toolbar-button--gradient-color-1), var(--webchat__toolbar-button--gradient-color-3)) 66%, + color-mix(in srgb, var(--webchat__toolbar-button--gradient-color-1), var(--webchat__toolbar-button--gradient-color-4)) 100% + ); + border-radius: 50%; + content: ''; + height: var(--webchat__toolbar-button--pulse-end-size); + opacity: var(--webchat__toolbar-button--background-gradient-opacity); + position: absolute; + width: var(--webchat__toolbar-button--pulse-end-size); + } } :global(.webchat-fluent) .sendbox__toolbar-separator { @@ -59,3 +107,15 @@ display: none; } } + +@keyframes toolbar-button__pulse { + 0% { + height: var(--webchat__toolbar-button--pulse-start-size); + width: var(--webchat__toolbar-button--pulse-start-size); + } + + 100% { + height: var(--webchat__toolbar-button--pulse-end-size); + width: var(--webchat__toolbar-button--pulse-end-size); + } +} diff --git a/packages/fluent-theme/src/components/sendBox/private/useSpeechStateMessage.ts b/packages/fluent-theme/src/components/sendBox/private/useSpeechStateMessage.ts new file mode 100644 index 0000000000..cdc42e33b9 --- /dev/null +++ b/packages/fluent-theme/src/components/sendBox/private/useSpeechStateMessage.ts @@ -0,0 +1,29 @@ +import { hooks } from 'botframework-webchat'; +import { useMemo } from 'react'; + +const { useLocalizer, useVoiceState } = hooks; + +export default function useSpeechStateMessage(): string { + const [voiceState] = useVoiceState(); + const localize = useLocalizer(); + + return useMemo(() => { + switch (voiceState) { + case 'bot_speaking': + return
localize('TEXT_INPUT_SPEECH_BOT_SPEAKING_PLACEHOLDER'); + + case 'idle': + return localize('TEXT_INPUT_SPEECH_IDLE_PLACEHOLDER'); + + case 'listening': + case 'user_speaking': + return localize('TEXT_INPUT_SPEECH_LISTENING_PLACEHOLDER'); + + case 'processing': + return localize('TEXT_INPUT_SPEECH_PROCESSING_PLACEHOLDER'); + + default: + return localize('TEXT_INPUT_SPEECH_IDLE_PLACEHOLDER'); + } + }, [voiceState, localize]); +} diff --git a/packages/fluent-theme/src/external.umd/botframework-webchat-api/internal.ts b/packages/fluent-theme/src/external.umd/botframework-webchat-api/internal.ts index d3920b80ab..c2713f0336 100644 --- a/packages/fluent-theme/src/external.umd/botframework-webchat-api/internal.ts +++ b/packages/fluent-theme/src/external.umd/botframework-webchat-api/internal.ts @@ -1,5 +1,3 @@ /// -module.exports = { - internal: (globalThis as any).WebChat.internal -}; +module.exports = (globalThis as any).WebChat.internal; diff --git a/packages/fluent-theme/src/private/FluentThemeProvider.tsx b/packages/fluent-theme/src/private/FluentThemeProvider.tsx index bfbc8fc02d..fcb0613a43 100644 --- a/packages/fluent-theme/src/private/FluentThemeProvider.tsx +++ b/packages/fluent-theme/src/private/FluentThemeProvider.tsx @@ -8,10 +8,16 @@ import { WebChatDecorator, type DecoratorMiddleware } from 'botframework-webchat/decorator'; -import { type ActivityMiddleware, type TypingIndicatorMiddleware } from 'botframework-webchat/internal'; +import { + isVoiceTranscriptActivity, + type ActivityMiddleware, + type ActivityStatusMiddleware, + type TypingIndicatorMiddleware +} from 'botframework-webchat/internal'; import React, { memo, useMemo } from 'react'; import { custom, object, optional, pipe, readonly, string, type InferInput } from 'valibot'; +import VoiceTranscriptActivityStatus from '../components/activityStatus/VoiceTranscriptActivityStatus'; import ActivityLoader from '../components/activity/ActivityLoader'; import PartGroupDecorator from '../components/activity/PartGroupingDecorator'; import AssetComposer from '../components/assets/AssetComposer'; @@ -73,6 +79,17 @@ const decoratorMiddleware: readonly DecoratorMiddleware[] = Object.freeze([ }) ]); +const activityStatusMiddleware: readonly ActivityStatusMiddleware[] = Object.freeze([ + () => + next => + ({ activity, ...args }) => { + if (isVoiceTranscriptActivity(activity)) { + return ; + } + return next({ activity, ...args }); + } +]); + const typingIndicatorMiddleware: readonly TypingIndicatorMiddleware[] = Object.freeze([ () => next => @@ -99,6 +116,7 @@ function FluentThemeProvider(props: FluentThemeProviderProps) { { + // Auto-handle voice activities (continuous sending by mic) without requiring actPostActivity + // Voice activities are fire-and-forget and don't echo back + if (outgoingActivity.type === 'event' && outgoingActivity.name.includes('media')) { + const id = uniqueId(); + + return new Observable(observer => { + try { + observer.next(id); + observer.complete(); + } catch (error) { + observer.error(error); + } + }); + } + const returnPostActivityWithResolvers = withResolvers(); const deferred = postActivityCallDeferreds.shift(); @@ -185,6 +200,18 @@ export default function createDirectLineEmulator({ autoConnect = true, ponyfill 1000 )); }, + emulateIncomingVoiceActivity: activity => { + activity = updateIn(activity, ['timestamp'], timestamp => + typeof timestamp === 'number' + ? new Date(now + timestamp).toISOString() + : 'timestamp' in activity + ? 
timestamp + : getTimestamp() + ); + activity = updateIn(activity, ['type'], type => type || 'event'); + + activityDeferredObservable.next(activity); + }, emulateOutgoingActivity: (activity, options) => { if (typeof activity === 'string') { activity = {