From 61fa1d38e7a3d60fcdb90ecea83f9d331b142593 Mon Sep 17 00:00:00 2001 From: AssemblyAI Date: Thu, 4 Jun 2026 15:42:04 -0400 Subject: [PATCH] Project import generated by Copybara. GitOrigin-RevId: dad6730c09aacd7349fcad9dc6fda1a96a3677d3 --- package.json | 2 +- .../README.md | 24 ++++ .../index.html | 12 ++ .../streaming-dual-channel-mic-system/main.ts | 45 ++++++++ src/services/streaming/service.ts | 21 +++- src/types/streaming/index.ts | 35 +++++- tests/unit/streaming.test.ts | 105 ++++++++++++++++++ 7 files changed, 241 insertions(+), 3 deletions(-) diff --git a/package.json b/package.json index 103b952..14e57a3 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "assemblyai", - "version": "4.34.0", + "version": "4.34.4", "description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.", "engines": { "node": ">=18" diff --git a/samples/streaming-dual-channel-mic-system/README.md b/samples/streaming-dual-channel-mic-system/README.md index 8172f8e..318f7fd 100644 --- a/samples/streaming-dual-channel-mic-system/README.md +++ b/samples/streaming-dual-channel-mic-system/README.md @@ -109,6 +109,30 @@ between consecutive words. `channel` reliably reports the physical source (VAD-derived); `speaker_label` is AAI's acoustic diarization on the mixed mono stream. Either change is a real boundary. +### Speaker revisions + +With `speakerLabels` enabled, the server may revise the diarization of +_earlier_ turns once it has more context (offline reclustering). It sends a +`SpeakerRevision` event — one message per resolve, listing only the turns whose +labels actually changed: + +```ts +transcriber.on("speakerRevision", (event) => { + for (const rev of event.revisions) { + // rev.turn_order: which earlier turn to correct + // rev.speaker_label: corrected turn-level label + // rev.words[i].speaker: corrected per-word labels + } +}) +``` + +The sample keeps each finalized turn in the DOM and, on a revision, re-renders +the matching `turn_order` in place with the corrected labels — outlining the +bubble in purple and appending `(revised)` to its rollup line. Revision words +carry the new `speaker` but no client-side `channel`, so the sample merges +speakers onto the original words by timestamp and leaves channel attribution +untouched. + ## Platform caveats - **macOS:** `getDisplayMedia({ audio: true })` does **not** capture system diff --git a/samples/streaming-dual-channel-mic-system/index.html b/samples/streaming-dual-channel-mic-system/index.html index 8361bd5..5925fcb 100644 --- a/samples/streaming-dual-channel-mic-system/index.html +++ b/samples/streaming-dual-channel-mic-system/index.html @@ -80,6 +80,9 @@ #events .event-line.event-Termination { color: #fecaca; } + #events .event-line.event-SpeakerRevision { + color: #ddd6fe; + } .turn-container { display: flex; flex-direction: column; @@ -146,6 +149,15 @@ font-size: 0.75rem; padding: 0.15rem 0.5rem; } + .rollup-line.revised { + color: #6d28d9; + font-weight: bold; + } + /* A turn corrected by a later SpeakerRevision: outline its bubbles. */ + .turn-container.revised .turn-row { + outline: 2px solid #8b5cf6; + outline-offset: 1px; + } .description { background: #fef9c3; border: 1px solid #facc15; diff --git a/samples/streaming-dual-channel-mic-system/main.ts b/samples/streaming-dual-channel-mic-system/main.ts index 00cf44c..5a7a4cd 100644 --- a/samples/streaming-dual-channel-mic-system/main.ts +++ b/samples/streaming-dual-channel-mic-system/main.ts @@ -219,6 +219,13 @@ async function start(): Promise { // contiguous same-channel run within the turn) plus the rollup line on // end_of_turn. Keyed by turn_order so partials replace the same container. const turnContainers = new Map() + // Finalized turns we keep around so a later SpeakerRevision can re-render + // them in place. Keyed by turn_order; holds the container, its rollup line, + // and the last-rendered TurnEvent (so revisions merge onto known words). + const finalizedTurns = new Map< + number, + { container: HTMLElement; rollup: HTMLElement; turn: TurnEvent } + >() // Last committed turn-level (channel, speaker_label) composite — used to // detect speaker changes between consecutive *finalized turns*. let lastFinalTurnComposite: string | undefined @@ -333,6 +340,43 @@ async function start(): Promise { } | Most active channel: ${turn.channel ?? "unknown"}` container.after(rollupLine) output.scrollTop = output.scrollHeight + + // Retain the finalized turn so a later SpeakerRevision can correct it. + finalizedTurns.set(turn.turn_order, { container, rollup: rollupLine, turn }) + }) + + // SpeakerRevision: diarization-only, emitted once per offline-recluster + // resolve. Each revision is an *earlier* finalized turn whose speaker labels + // changed — we re-render that turn in place with the corrected labels and + // flag it as revised. Unchanged turns are not included. + transcriber.on("speakerRevision", (event) => { + logEvent("SpeakerRevision", event) + for (const rev of event.revisions) { + const entry = finalizedTurns.get(rev.turn_order) + if (!entry) continue // turn isn't in view (e.g. session was reset) + + // Revision words carry the corrected `speaker` but no client-side + // `channel` attribution, so merge speakers onto the original words by + // timestamp and keep each word's channel/channelResolved intact. + const speakerByStart = new Map(rev.words.map((w) => [w.start, w.speaker])) + const revisedWords = entry.turn.words.map((w) => ({ + ...w, + speaker: speakerByStart.get(w.start) ?? w.speaker, + })) + entry.turn = { + ...entry.turn, + speaker_label: rev.speaker_label ?? entry.turn.speaker_label, + words: revisedWords, + } + + renderTurnContainer(entry.container, entry.turn, true) + entry.container.classList.add("revised") + entry.rollup.className = "rollup-line revised" + entry.rollup.textContent = `-- End of turn #${rev.turn_order}: speaker_label: ${ + entry.turn.speaker_label ?? "?" + } | Most active channel: ${entry.turn.channel ?? "unknown"} (revised)` + } + output.scrollTop = output.scrollHeight }) // Expose so stop() can reset session-scoped state. @@ -340,6 +384,7 @@ async function start(): Promise { globalThis as unknown as { __resetTurnState?: () => void } ).__resetTurnState = () => { turnContainers.clear() + finalizedTurns.clear() lastFinalTurnComposite = undefined } diff --git a/src/services/streaming/service.ts b/src/services/streaming/service.ts index 7930d47..b1249dd 100644 --- a/src/services/streaming/service.ts +++ b/src/services/streaming/service.ts @@ -15,6 +15,7 @@ import { StreamingEventMessage, TurnEvent, LLMGatewayResponseEvent, + SpeakerRevisionEvent, StreamingUpdateConfiguration, StreamingForceEndpoint, WarningEvent, @@ -262,6 +263,10 @@ export class StreamingTranscriber { searchParams.set("prompt", this.params.prompt); } + if (this.params.agentContext) { + searchParams.set("agent_context", this.params.agentContext); + } + if (this.params.filterProfanity) { searchParams.set( "filter_profanity", @@ -274,7 +279,9 @@ export class StreamingTranscriber { "[Deprecation Warning] The speech model `u3-pro` is deprecated and will be removed in a future release. Please use `u3-rt-pro` instead.", ); } - searchParams.set("speech_model", this.params.speechModel.toString()); + if (this.params.speechModel !== undefined) { + searchParams.set("speech_model", this.params.speechModel.toString()); + } if (this.params.languageDetection !== undefined) { searchParams.set( @@ -389,6 +396,10 @@ export class StreamingTranscriber { searchParams.set("redact_pii_sub", this.params.redactPiiSub); } + if (this.params.mode !== undefined) { + searchParams.set("mode", this.params.mode); + } + if (this.params.llmGateway !== undefined) { searchParams.set("llm_gateway", JSON.stringify(this.params.llmGateway)); } @@ -404,6 +415,10 @@ export class StreamingTranscriber { event: "llmGatewayResponse", listener: (event: LLMGatewayResponseEvent) => void, ): void; + on( + event: "speakerRevision", + listener: (event: SpeakerRevisionEvent) => void, + ): void; on(event: "warning", listener: (event: WarningEvent) => void): void; on(event: "vad", listener: (event: VadFrame) => void): void; on(event: "error", listener: (error: Error) => void): void; @@ -508,6 +523,10 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c this.listeners.llmGatewayResponse?.(message); break; } + case "SpeakerRevision": { + this.listeners.speakerRevision?.(message); + break; + } case "Warning": { const warning = message as WarningEvent; console.warn( diff --git a/src/types/streaming/index.ts b/src/types/streaming/index.ts index 0eb8731..91e4516 100644 --- a/src/types/streaming/index.ts +++ b/src/types/streaming/index.ts @@ -91,7 +91,8 @@ export type StreamingTranscriberParams = { keyterms?: string[]; keytermsPrompt?: string[]; prompt?: string; - speechModel: StreamingSpeechModel; + agentContext?: string; + speechModel?: StreamingSpeechModel; languageDetection?: boolean; domain?: StreamingDomain; inactivityTimeout?: number; @@ -107,6 +108,7 @@ export type StreamingTranscriberParams = { redactPii?: boolean; redactPiiPolicies?: StreamingPiiPolicy[]; redactPiiSub?: StreamingPiiSubstitution; + mode?: StreamingMode; llmGateway?: LLMGatewayConfig; webhookUrl?: string; webhookAuthHeaderName?: string; @@ -146,6 +148,7 @@ export type StreamingEvents = | "turn" | "speechStarted" | "llmGatewayResponse" + | "speakerRevision" | "warning" | "vad" | "error"; @@ -156,6 +159,7 @@ export type StreamingListeners = { turn?: (event: TurnEvent) => void; speechStarted?: (event: SpeechStartedEvent) => void; llmGatewayResponse?: (event: LLMGatewayResponseEvent) => void; + speakerRevision?: (event: SpeakerRevisionEvent) => void; warning?: (event: WarningEvent) => void; vad?: (event: VadFrame) => void; error?: (error: Error) => void; @@ -165,11 +169,14 @@ export type StreamingSpeechModel = | "universal-streaming-english" | "universal-streaming-multilingual" | "u3-rt-pro" + | "u3-rt-pro-beta-1" | "whisper-rt" | "u3-pro"; export type StreamingDomain = "medical-v1"; +export type StreamingMode = "max_accuracy" | "min_latency" | "balanced"; + export type VoiceFocusModel = "near-field" | "far-field"; export type StreamingPiiSubstitution = "hash" | "entity_name"; @@ -330,6 +337,7 @@ export type StreamingUpdateConfiguration = { format_turns?: boolean; keyterms_prompt?: string[]; prompt?: string; + agent_context?: string; filter_profanity?: boolean; interruption_delay?: number; turn_left_pad_ms?: number; @@ -358,12 +366,37 @@ export type LLMGatewayResponseEvent = { data: unknown; }; +/** + * A single earlier Turn whose speaker labels were revised by reclustering. + * Match by `turn_order` against the original Turn; replace its per-word + * `speaker` assignments (and the turn-level `speaker_label`) with these. Text + * and word timestamps are unchanged from the original Turn. + */ +export type SpeakerRevisionItem = { + turn_order: number; + speaker_label?: string; + words: StreamingWord[]; +}; + +/** + * Server-side correction to previously-emitted Turns' speaker labels. + * Diarization-only (emitted only when `speakerLabels` is enabled). Sent once + * per offline-recluster resolve; `revisions` carries one entry per earlier + * Turn whose label actually changed (unchanged turns are omitted). Apply each + * entry by matching its `turn_order`. + */ +export type SpeakerRevisionEvent = { + type: "SpeakerRevision"; + revisions: SpeakerRevisionItem[]; +}; + export type StreamingEventMessage = | BeginEvent | TurnEvent | SpeechStartedEvent | TerminationEvent | LLMGatewayResponseEvent + | SpeakerRevisionEvent | ErrorEvent | WarningEvent; diff --git a/tests/unit/streaming.test.ts b/tests/unit/streaming.test.ts index 4b86651..dacecea 100644 --- a/tests/unit/streaming.test.ts +++ b/tests/unit/streaming.test.ts @@ -204,6 +204,37 @@ describe("streaming", () => { ); }); + it("should include agent_context in updateConfiguration message", async () => { + rt.updateConfiguration({ agent_context: "What is your account number?" }); + await expect(server).toReceiveMessage( + JSON.stringify({ + type: "UpdateConfiguration", + agent_context: "What is your account number?", + }), + ); + }); + + it("should include agent_context in connection URL", async () => { + await cleanup(); + WS.clean(); + + const wsUrl = + `${websocketBaseUrl}?token=123&sample_rate=16000` + + `&agent_context=${encodeURIComponent("What is your account number?")}` + + `&speech_model=u3-rt-pro`; + server = new WS(wsUrl); + rt = new StreamingTranscriber({ + websocketBaseUrl, + token: "123", + sampleRate: 16_000, + speechModel: "u3-rt-pro", + agentContext: "What is your account number?", + }); + onOpen = jest.fn(); + rt.on("open", onOpen); + await connect(rt, server); + }); + it("should include turn_left_pad_ms in connection URL", async () => { await cleanup(); WS.clean(); @@ -254,6 +285,24 @@ describe("streaming", () => { await connect(rt, server); }); + it("should include mode in connection URL", async () => { + await cleanup(); + WS.clean(); + + const wsUrl = `${websocketBaseUrl}?token=123&sample_rate=16000&speech_model=u3-rt-pro&mode=max_accuracy`; + server = new WS(wsUrl); + rt = new StreamingTranscriber({ + websocketBaseUrl, + token: "123", + sampleRate: 16_000, + speechModel: "u3-rt-pro", + mode: "max_accuracy", + }); + onOpen = jest.fn(); + rt.on("open", onOpen); + await connect(rt, server); + }); + it("should include whisper-rt speech model in connection URL", async () => { await cleanup(); WS.clean(); @@ -346,4 +395,60 @@ describe("streaming", () => { model: "claude-3-5-sonnet", }); }); + + it("should parse and dispatch SpeakerRevision event", async () => { + const revisionPromise = new Promise<{ + revisions: { + turn_order: number; + speaker_label?: string; + words: { speaker?: string }[]; + }[]; + }>((resolve) => { + rt.on("speakerRevision", (event) => resolve(event)); + }); + + // One message per recluster resolve, carrying a list of revised turns. + // Revision words use the same word schema as Turn. + server.send( + JSON.stringify({ + type: "SpeakerRevision", + revisions: [ + { + turn_order: 3, + speaker_label: "B", + words: [ + { + start: 1000, + end: 1200, + confidence: 0.9, + text: "hello", + word_is_final: true, + speaker: "B", + }, + { + start: 1210, + end: 1400, + confidence: 0.88, + text: "world", + word_is_final: true, + speaker: "A", + }, + ], + }, + { + turn_order: 7, + speaker_label: "A", + words: [], + }, + ], + }), + ); + + const event = await revisionPromise; + expect(event.revisions.map((r) => r.turn_order)).toEqual([3, 7]); + expect(event.revisions[0].speaker_label).toBe("B"); + expect(event.revisions[0].words.map((w) => w.speaker)).toEqual(["B", "A"]); + expect(event.revisions[1].speaker_label).toBe("A"); + expect(event.revisions[1].words).toEqual([]); + }); });