From 61fa1d38e7a3d60fcdb90ecea83f9d331b142593 Mon Sep 17 00:00:00 2001
From: AssemblyAI <engineering.sdk@assemblyai.com>
Date: Thu, 4 Jun 2026 15:42:04 -0400
Subject: [PATCH] Project import generated by Copybara.

GitOrigin-RevId: dad6730c09aacd7349fcad9dc6fda1a96a3677d3
---
 package.json                                  |   2 +-
 .../README.md                                 |  24 ++++
 .../index.html                                |  12 ++
 .../streaming-dual-channel-mic-system/main.ts |  45 ++++++++
 src/services/streaming/service.ts             |  21 +++-
 src/types/streaming/index.ts                  |  35 +++++-
 tests/unit/streaming.test.ts                  | 105 ++++++++++++++++++
 7 files changed, 241 insertions(+), 3 deletions(-)
diff --git a/package.json b/package.json
index 103b952..14e57a3 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "assemblyai",
-  "version": "4.34.0",
+  "version": "4.34.4",
   "description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
   "engines": {
     "node": ">=18"
diff --git a/samples/streaming-dual-channel-mic-system/README.md b/samples/streaming-dual-channel-mic-system/README.md
index 8172f8e..318f7fd 100644
--- a/samples/streaming-dual-channel-mic-system/README.md
+++ b/samples/streaming-dual-channel-mic-system/README.md
@@ -109,6 +109,30 @@ between consecutive words. `channel` reliably reports the physical source
 (VAD-derived); `speaker_label` is AAI's acoustic diarization on the mixed
 mono stream. Either change is a real boundary.
 
+### Speaker revisions
+
+With `speakerLabels` enabled, the server may revise the diarization of
+_earlier_ turns once it has more context (offline reclustering). It sends a
+`SpeakerRevision` event — one message per resolve, listing only the turns whose
+labels actually changed:
+
+```ts
+transcriber.on("speakerRevision", (event) => {
+  for (const rev of event.revisions) {
+    // rev.turn_order: which earlier turn to correct
+    // rev.speaker_label: corrected turn-level label
+    // rev.words[i].speaker: corrected per-word labels
+  }
+})
+```
+
+The sample keeps each finalized turn in the DOM and, on a revision, re-renders
+the matching `turn_order` in place with the corrected labels — outlining the
+bubble in purple and appending `(revised)` to its rollup line. Revision words
+carry the new `speaker` but no client-side `channel`, so the sample merges
+speakers onto the original words by timestamp and leaves channel attribution
+untouched.
+
 ## Platform caveats
 
 - **macOS:** `getDisplayMedia({ audio: true })` does **not** capture system
diff --git a/samples/streaming-dual-channel-mic-system/index.html b/samples/streaming-dual-channel-mic-system/index.html
index 8361bd5..5925fcb 100644
--- a/samples/streaming-dual-channel-mic-system/index.html
+++ b/samples/streaming-dual-channel-mic-system/index.html
@@ -80,6 +80,9 @@
       #events .event-line.event-Termination {
         color: #fecaca;
       }
+      #events .event-line.event-SpeakerRevision {
+        color: #ddd6fe;
+      }
       .turn-container {
         display: flex;
         flex-direction: column;
@@ -146,6 +149,15 @@
         font-size: 0.75rem;
         padding: 0.15rem 0.5rem;
       }
+      .rollup-line.revised {
+        color: #6d28d9;
+        font-weight: bold;
+      }
+      /* A turn corrected by a later SpeakerRevision: outline its bubbles. */
+      .turn-container.revised .turn-row {
+        outline: 2px solid #8b5cf6;
+        outline-offset: 1px;
+      }
       .description {
         background: #fef9c3;
         border: 1px solid #facc15;
diff --git a/samples/streaming-dual-channel-mic-system/main.ts b/samples/streaming-dual-channel-mic-system/main.ts
index 00cf44c..5a7a4cd 100644
--- a/samples/streaming-dual-channel-mic-system/main.ts
+++ b/samples/streaming-dual-channel-mic-system/main.ts
@@ -219,6 +219,13 @@ async function start(): Promise<void> {
   // contiguous same-channel run within the turn) plus the rollup line on
   // end_of_turn. Keyed by turn_order so partials replace the same container.
   const turnContainers = new Map<number, HTMLElement>()
+  // Finalized turns we keep around so a later SpeakerRevision can re-render
+  // them in place. Keyed by turn_order; holds the container, its rollup line,
+  // and the last-rendered TurnEvent (so revisions merge onto known words).
+  const finalizedTurns = new Map<
+    number,
+    { container: HTMLElement; rollup: HTMLElement; turn: TurnEvent }
+  >()
   // Last committed turn-level (channel, speaker_label) composite — used to
   // detect speaker changes between consecutive *finalized turns*.
   let lastFinalTurnComposite: string | undefined
@@ -333,6 +340,43 @@ async function start(): Promise<void> {
     } | Most active channel: ${turn.channel ?? "unknown"}`
     container.after(rollupLine)
     output.scrollTop = output.scrollHeight
+
+    // Retain the finalized turn so a later SpeakerRevision can correct it.
+    finalizedTurns.set(turn.turn_order, { container, rollup: rollupLine, turn })
+  })
+
+  // SpeakerRevision: diarization-only, emitted once per offline-recluster
+  // resolve. Each revision is an *earlier* finalized turn whose speaker labels
+  // changed — we re-render that turn in place with the corrected labels and
+  // flag it as revised. Unchanged turns are not included.
+  transcriber.on("speakerRevision", (event) => {
+    logEvent("SpeakerRevision", event)
+    for (const rev of event.revisions) {
+      const entry = finalizedTurns.get(rev.turn_order)
+      if (!entry) continue // turn isn't in view (e.g. session was reset)
+
+      // Revision words carry the corrected `speaker` but no client-side
+      // `channel` attribution, so merge speakers onto the original words by
+      // timestamp and keep each word's channel/channelResolved intact.
+      const speakerByStart = new Map(rev.words.map((w) => [w.start, w.speaker]))
+      const revisedWords = entry.turn.words.map((w) => ({
+        ...w,
+        speaker: speakerByStart.get(w.start) ?? w.speaker,
+      }))
+      entry.turn = {
+        ...entry.turn,
+        speaker_label: rev.speaker_label ?? entry.turn.speaker_label,
+        words: revisedWords,
+      }
+
+      renderTurnContainer(entry.container, entry.turn, true)
+      entry.container.classList.add("revised")
+      entry.rollup.className = "rollup-line revised"
+      entry.rollup.textContent = `-- End of turn #${rev.turn_order}: speaker_label: ${
+        entry.turn.speaker_label ?? "?"
+      } | Most active channel: ${entry.turn.channel ?? "unknown"}  (revised)`
+    }
+    output.scrollTop = output.scrollHeight
   })
 
   // Expose so stop() can reset session-scoped state.
@@ -340,6 +384,7 @@ async function start(): Promise<void> {
     globalThis as unknown as { __resetTurnState?: () => void }
   ).__resetTurnState = () => {
     turnContainers.clear()
+    finalizedTurns.clear()
     lastFinalTurnComposite = undefined
   }
 
diff --git a/src/services/streaming/service.ts b/src/services/streaming/service.ts
index 7930d47..b1249dd 100644
--- a/src/services/streaming/service.ts
+++ b/src/services/streaming/service.ts
@@ -15,6 +15,7 @@ import {
   StreamingEventMessage,
   TurnEvent,
   LLMGatewayResponseEvent,
+  SpeakerRevisionEvent,
   StreamingUpdateConfiguration,
   StreamingForceEndpoint,
   WarningEvent,
@@ -262,6 +263,10 @@ export class StreamingTranscriber {
       searchParams.set("prompt", this.params.prompt);
     }
 
+    if (this.params.agentContext) {
+      searchParams.set("agent_context", this.params.agentContext);
+    }
+
     if (this.params.filterProfanity) {
       searchParams.set(
         "filter_profanity",
@@ -274,7 +279,9 @@ export class StreamingTranscriber {
         "[Deprecation Warning] The speech model `u3-pro` is deprecated and will be removed in a future release. Please use `u3-rt-pro` instead.",
       );
     }
-    searchParams.set("speech_model", this.params.speechModel.toString());
+    if (this.params.speechModel !== undefined) {
+      searchParams.set("speech_model", this.params.speechModel.toString());
+    }
 
     if (this.params.languageDetection !== undefined) {
       searchParams.set(
@@ -389,6 +396,10 @@ export class StreamingTranscriber {
       searchParams.set("redact_pii_sub", this.params.redactPiiSub);
     }
 
+    if (this.params.mode !== undefined) {
+      searchParams.set("mode", this.params.mode);
+    }
+
     if (this.params.llmGateway !== undefined) {
       searchParams.set("llm_gateway", JSON.stringify(this.params.llmGateway));
     }
@@ -404,6 +415,10 @@ export class StreamingTranscriber {
     event: "llmGatewayResponse",
     listener: (event: LLMGatewayResponseEvent) => void,
   ): void;
+  on(
+    event: "speakerRevision",
+    listener: (event: SpeakerRevisionEvent) => void,
+  ): void;
   on(event: "warning", listener: (event: WarningEvent) => void): void;
   on(event: "vad", listener: (event: VadFrame) => void): void;
   on(event: "error", listener: (error: Error) => void): void;
@@ -508,6 +523,10 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
             this.listeners.llmGatewayResponse?.(message);
             break;
           }
+          case "SpeakerRevision": {
+            this.listeners.speakerRevision?.(message);
+            break;
+          }
           case "Warning": {
             const warning = message as WarningEvent;
             console.warn(
diff --git a/src/types/streaming/index.ts b/src/types/streaming/index.ts
index 0eb8731..91e4516 100644
--- a/src/types/streaming/index.ts
+++ b/src/types/streaming/index.ts
@@ -91,7 +91,8 @@ export type StreamingTranscriberParams = {
   keyterms?: string[];
   keytermsPrompt?: string[];
   prompt?: string;
-  speechModel: StreamingSpeechModel;
+  agentContext?: string;
+  speechModel?: StreamingSpeechModel;
   languageDetection?: boolean;
   domain?: StreamingDomain;
   inactivityTimeout?: number;
@@ -107,6 +108,7 @@ export type StreamingTranscriberParams = {
   redactPii?: boolean;
   redactPiiPolicies?: StreamingPiiPolicy[];
   redactPiiSub?: StreamingPiiSubstitution;
+  mode?: StreamingMode;
   llmGateway?: LLMGatewayConfig;
   webhookUrl?: string;
   webhookAuthHeaderName?: string;
@@ -146,6 +148,7 @@ export type StreamingEvents =
   | "turn"
   | "speechStarted"
   | "llmGatewayResponse"
+  | "speakerRevision"
   | "warning"
   | "vad"
   | "error";
@@ -156,6 +159,7 @@ export type StreamingListeners = {
   turn?: (event: TurnEvent) => void;
   speechStarted?: (event: SpeechStartedEvent) => void;
   llmGatewayResponse?: (event: LLMGatewayResponseEvent) => void;
+  speakerRevision?: (event: SpeakerRevisionEvent) => void;
   warning?: (event: WarningEvent) => void;
   vad?: (event: VadFrame) => void;
   error?: (error: Error) => void;
@@ -165,11 +169,14 @@ export type StreamingSpeechModel =
   | "universal-streaming-english"
   | "universal-streaming-multilingual"
   | "u3-rt-pro"
+  | "u3-rt-pro-beta-1"
   | "whisper-rt"
   | "u3-pro";
 
 export type StreamingDomain = "medical-v1";
 
+export type StreamingMode = "max_accuracy" | "min_latency" | "balanced";
+
 export type VoiceFocusModel = "near-field" | "far-field";
 
 export type StreamingPiiSubstitution = "hash" | "entity_name";
@@ -330,6 +337,7 @@ export type StreamingUpdateConfiguration = {
   format_turns?: boolean;
   keyterms_prompt?: string[];
   prompt?: string;
+  agent_context?: string;
   filter_profanity?: boolean;
   interruption_delay?: number;
   turn_left_pad_ms?: number;
@@ -358,12 +366,37 @@ export type LLMGatewayResponseEvent = {
   data: unknown;
 };
 
+/**
+ * A single earlier Turn whose speaker labels were revised by reclustering.
+ * Match by `turn_order` against the original Turn; replace its per-word
+ * `speaker` assignments (and the turn-level `speaker_label`) with these. Text
+ * and word timestamps are unchanged from the original Turn.
+ */
+export type SpeakerRevisionItem = {
+  turn_order: number;
+  speaker_label?: string;
+  words: StreamingWord[];
+};
+
+/**
+ * Server-side correction to previously-emitted Turns' speaker labels.
+ * Diarization-only (emitted only when `speakerLabels` is enabled). Sent once
+ * per offline-recluster resolve; `revisions` carries one entry per earlier
+ * Turn whose label actually changed (unchanged turns are omitted). Apply each
+ * entry by matching its `turn_order`.
+ */
+export type SpeakerRevisionEvent = {
+  type: "SpeakerRevision";
+  revisions: SpeakerRevisionItem[];
+};
+
 export type StreamingEventMessage =
   | BeginEvent
   | TurnEvent
   | SpeechStartedEvent
   | TerminationEvent
   | LLMGatewayResponseEvent
+  | SpeakerRevisionEvent
   | ErrorEvent
   | WarningEvent;
 
diff --git a/tests/unit/streaming.test.ts b/tests/unit/streaming.test.ts
index 4b86651..dacecea 100644
--- a/tests/unit/streaming.test.ts
+++ b/tests/unit/streaming.test.ts
@@ -204,6 +204,37 @@ describe("streaming", () => {
     );
   });
 
+  it("should include agent_context in updateConfiguration message", async () => {
+    rt.updateConfiguration({ agent_context: "What is your account number?" });
+    await expect(server).toReceiveMessage(
+      JSON.stringify({
+        type: "UpdateConfiguration",
+        agent_context: "What is your account number?",
+      }),
+    );
+  });
+
+  it("should include agent_context in connection URL", async () => {
+    await cleanup();
+    WS.clean();
+
+    const wsUrl =
+      `${websocketBaseUrl}?token=123&sample_rate=16000` +
+      `&agent_context=${encodeURIComponent("What is your account number?")}` +
+      `&speech_model=u3-rt-pro`;
+    server = new WS(wsUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "123",
+      sampleRate: 16_000,
+      speechModel: "u3-rt-pro",
+      agentContext: "What is your account number?",
+    });
+    onOpen = jest.fn();
+    rt.on("open", onOpen);
+    await connect(rt, server);
+  });
+
   it("should include turn_left_pad_ms in connection URL", async () => {
     await cleanup();
     WS.clean();
@@ -254,6 +285,24 @@ describe("streaming", () => {
     await connect(rt, server);
   });
 
+  it("should include mode in connection URL", async () => {
+    await cleanup();
+    WS.clean();
+
+    const wsUrl = `${websocketBaseUrl}?token=123&sample_rate=16000&speech_model=u3-rt-pro&mode=max_accuracy`;
+    server = new WS(wsUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "123",
+      sampleRate: 16_000,
+      speechModel: "u3-rt-pro",
+      mode: "max_accuracy",
+    });
+    onOpen = jest.fn();
+    rt.on("open", onOpen);
+    await connect(rt, server);
+  });
+
   it("should include whisper-rt speech model in connection URL", async () => {
     await cleanup();
     WS.clean();
@@ -346,4 +395,60 @@ describe("streaming", () => {
       model: "claude-3-5-sonnet",
     });
   });
+
+  it("should parse and dispatch SpeakerRevision event", async () => {
+    const revisionPromise = new Promise<{
+      revisions: {
+        turn_order: number;
+        speaker_label?: string;
+        words: { speaker?: string }[];
+      }[];
+    }>((resolve) => {
+      rt.on("speakerRevision", (event) => resolve(event));
+    });
+
+    // One message per recluster resolve, carrying a list of revised turns.
+    // Revision words use the same word schema as Turn.
+    server.send(
+      JSON.stringify({
+        type: "SpeakerRevision",
+        revisions: [
+          {
+            turn_order: 3,
+            speaker_label: "B",
+            words: [
+              {
+                start: 1000,
+                end: 1200,
+                confidence: 0.9,
+                text: "hello",
+                word_is_final: true,
+                speaker: "B",
+              },
+              {
+                start: 1210,
+                end: 1400,
+                confidence: 0.88,
+                text: "world",
+                word_is_final: true,
+                speaker: "A",
+              },
+            ],
+          },
+          {
+            turn_order: 7,
+            speaker_label: "A",
+            words: [],
+          },
+        ],
+      }),
+    );
+
+    const event = await revisionPromise;
+    expect(event.revisions.map((r) => r.turn_order)).toEqual([3, 7]);
+    expect(event.revisions[0].speaker_label).toBe("B");
+    expect(event.revisions[0].words.map((w) => w.speaker)).toEqual(["B", "A"]);
+    expect(event.revisions[1].speaker_label).toBe("A");
+    expect(event.revisions[1].words).toEqual([]);
+  });
 });