Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "assemblyai",
"version": "4.34.0",
"version": "4.34.4",
"description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
"engines": {
"node": ">=18"
Expand Down
24 changes: 24 additions & 0 deletions samples/streaming-dual-channel-mic-system/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,30 @@ between consecutive words. `channel` reliably reports the physical source
(VAD-derived); `speaker_label` is AAI's acoustic diarization on the mixed
mono stream. Either change is a real boundary.

### Speaker revisions

With `speakerLabels` enabled, the server may revise the diarization of
_earlier_ turns once it has more context (offline reclustering). It sends a
`SpeakerRevision` event — one message per resolve, listing only the turns whose
labels actually changed:

```ts
transcriber.on("speakerRevision", (event) => {
for (const rev of event.revisions) {
// rev.turn_order: which earlier turn to correct
// rev.speaker_label: corrected turn-level label
// rev.words[i].speaker: corrected per-word labels
}
})
```

The sample keeps each finalized turn in the DOM and, on a revision, re-renders
the matching `turn_order` in place with the corrected labels — outlining the
bubble in purple and appending `(revised)` to its rollup line. Revision words
carry the new `speaker` but no client-side `channel`, so the sample merges
speakers onto the original words by timestamp and leaves channel attribution
untouched.

## Platform caveats

- **macOS:** `getDisplayMedia({ audio: true })` does **not** capture system
Expand Down
12 changes: 12 additions & 0 deletions samples/streaming-dual-channel-mic-system/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@
#events .event-line.event-Termination {
color: #fecaca;
}
#events .event-line.event-SpeakerRevision {
color: #ddd6fe;
}
.turn-container {
display: flex;
flex-direction: column;
Expand Down Expand Up @@ -146,6 +149,15 @@
font-size: 0.75rem;
padding: 0.15rem 0.5rem;
}
.rollup-line.revised {
color: #6d28d9;
font-weight: bold;
}
/* A turn corrected by a later SpeakerRevision: outline its bubbles. */
.turn-container.revised .turn-row {
outline: 2px solid #8b5cf6;
outline-offset: 1px;
}
.description {
background: #fef9c3;
border: 1px solid #facc15;
Expand Down
45 changes: 45 additions & 0 deletions samples/streaming-dual-channel-mic-system/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,13 @@ async function start(): Promise<void> {
// contiguous same-channel run within the turn) plus the rollup line on
// end_of_turn. Keyed by turn_order so partials replace the same container.
const turnContainers = new Map<number, HTMLElement>()
// Finalized turns we keep around so a later SpeakerRevision can re-render
// them in place. Keyed by turn_order; holds the container, its rollup line,
// and the last-rendered TurnEvent (so revisions merge onto known words).
const finalizedTurns = new Map<
number,
{ container: HTMLElement; rollup: HTMLElement; turn: TurnEvent }
>()
// Last committed turn-level (channel, speaker_label) composite — used to
// detect speaker changes between consecutive *finalized turns*.
let lastFinalTurnComposite: string | undefined
Expand Down Expand Up @@ -333,13 +340,51 @@ async function start(): Promise<void> {
} | Most active channel: ${turn.channel ?? "unknown"}`
container.after(rollupLine)
output.scrollTop = output.scrollHeight

// Retain the finalized turn so a later SpeakerRevision can correct it.
finalizedTurns.set(turn.turn_order, { container, rollup: rollupLine, turn })
})

// SpeakerRevision: diarization-only, emitted once per offline-recluster
// resolve. Each revision is an *earlier* finalized turn whose speaker labels
// changed — we re-render that turn in place with the corrected labels and
// flag it as revised. Unchanged turns are not included.
transcriber.on("speakerRevision", (event) => {
logEvent("SpeakerRevision", event)
for (const rev of event.revisions) {
const entry = finalizedTurns.get(rev.turn_order)
if (!entry) continue // turn isn't in view (e.g. session was reset)

// Revision words carry the corrected `speaker` but no client-side
// `channel` attribution, so merge speakers onto the original words by
// timestamp and keep each word's channel/channelResolved intact.
const speakerByStart = new Map(rev.words.map((w) => [w.start, w.speaker]))
const revisedWords = entry.turn.words.map((w) => ({
...w,
speaker: speakerByStart.get(w.start) ?? w.speaker,
}))
entry.turn = {
...entry.turn,
speaker_label: rev.speaker_label ?? entry.turn.speaker_label,
words: revisedWords,
}

renderTurnContainer(entry.container, entry.turn, true)
entry.container.classList.add("revised")
entry.rollup.className = "rollup-line revised"
entry.rollup.textContent = `-- End of turn #${rev.turn_order}: speaker_label: ${
entry.turn.speaker_label ?? "?"
} | Most active channel: ${entry.turn.channel ?? "unknown"} (revised)`
}
output.scrollTop = output.scrollHeight
})

// Expose so stop() can reset session-scoped state.
;(
globalThis as unknown as { __resetTurnState?: () => void }
).__resetTurnState = () => {
turnContainers.clear()
finalizedTurns.clear()
lastFinalTurnComposite = undefined
}

Expand Down
21 changes: 20 additions & 1 deletion src/services/streaming/service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
StreamingEventMessage,
TurnEvent,
LLMGatewayResponseEvent,
SpeakerRevisionEvent,
StreamingUpdateConfiguration,
StreamingForceEndpoint,
WarningEvent,
Expand Down Expand Up @@ -262,6 +263,10 @@ export class StreamingTranscriber {
searchParams.set("prompt", this.params.prompt);
}

if (this.params.agentContext) {
searchParams.set("agent_context", this.params.agentContext);
}

if (this.params.filterProfanity) {
searchParams.set(
"filter_profanity",
Expand All @@ -274,7 +279,9 @@ export class StreamingTranscriber {
"[Deprecation Warning] The speech model `u3-pro` is deprecated and will be removed in a future release. Please use `u3-rt-pro` instead.",
);
}
searchParams.set("speech_model", this.params.speechModel.toString());
if (this.params.speechModel !== undefined) {
searchParams.set("speech_model", this.params.speechModel.toString());
}

if (this.params.languageDetection !== undefined) {
searchParams.set(
Expand Down Expand Up @@ -389,6 +396,10 @@ export class StreamingTranscriber {
searchParams.set("redact_pii_sub", this.params.redactPiiSub);
}

if (this.params.mode !== undefined) {
searchParams.set("mode", this.params.mode);
}

if (this.params.llmGateway !== undefined) {
searchParams.set("llm_gateway", JSON.stringify(this.params.llmGateway));
}
Expand All @@ -404,6 +415,10 @@ export class StreamingTranscriber {
event: "llmGatewayResponse",
listener: (event: LLMGatewayResponseEvent) => void,
): void;
on(
event: "speakerRevision",
listener: (event: SpeakerRevisionEvent) => void,
): void;
on(event: "warning", listener: (event: WarningEvent) => void): void;
on(event: "vad", listener: (event: VadFrame) => void): void;
on(event: "error", listener: (error: Error) => void): void;
Expand Down Expand Up @@ -508,6 +523,10 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
this.listeners.llmGatewayResponse?.(message);
break;
}
case "SpeakerRevision": {
this.listeners.speakerRevision?.(message);
break;
}
case "Warning": {
const warning = message as WarningEvent;
console.warn(
Expand Down
35 changes: 34 additions & 1 deletion src/types/streaming/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ export type StreamingTranscriberParams = {
keyterms?: string[];
keytermsPrompt?: string[];
prompt?: string;
speechModel: StreamingSpeechModel;
agentContext?: string;
speechModel?: StreamingSpeechModel;
languageDetection?: boolean;
domain?: StreamingDomain;
inactivityTimeout?: number;
Expand All @@ -107,6 +108,7 @@ export type StreamingTranscriberParams = {
redactPii?: boolean;
redactPiiPolicies?: StreamingPiiPolicy[];
redactPiiSub?: StreamingPiiSubstitution;
mode?: StreamingMode;
llmGateway?: LLMGatewayConfig;
webhookUrl?: string;
webhookAuthHeaderName?: string;
Expand Down Expand Up @@ -146,6 +148,7 @@ export type StreamingEvents =
| "turn"
| "speechStarted"
| "llmGatewayResponse"
| "speakerRevision"
| "warning"
| "vad"
| "error";
Expand All @@ -156,6 +159,7 @@ export type StreamingListeners = {
turn?: (event: TurnEvent) => void;
speechStarted?: (event: SpeechStartedEvent) => void;
llmGatewayResponse?: (event: LLMGatewayResponseEvent) => void;
speakerRevision?: (event: SpeakerRevisionEvent) => void;
warning?: (event: WarningEvent) => void;
vad?: (event: VadFrame) => void;
error?: (error: Error) => void;
Expand All @@ -165,11 +169,14 @@ export type StreamingSpeechModel =
| "universal-streaming-english"
| "universal-streaming-multilingual"
| "u3-rt-pro"
| "u3-rt-pro-beta-1"
| "whisper-rt"
| "u3-pro";

export type StreamingDomain = "medical-v1";

export type StreamingMode = "max_accuracy" | "min_latency" | "balanced";

export type VoiceFocusModel = "near-field" | "far-field";

export type StreamingPiiSubstitution = "hash" | "entity_name";
Expand Down Expand Up @@ -330,6 +337,7 @@ export type StreamingUpdateConfiguration = {
format_turns?: boolean;
keyterms_prompt?: string[];
prompt?: string;
agent_context?: string;
filter_profanity?: boolean;
interruption_delay?: number;
turn_left_pad_ms?: number;
Expand Down Expand Up @@ -358,12 +366,37 @@ export type LLMGatewayResponseEvent = {
data: unknown;
};

/**
* A single earlier Turn whose speaker labels were revised by reclustering.
* Match by `turn_order` against the original Turn; replace its per-word
* `speaker` assignments (and the turn-level `speaker_label`) with these. Text
* and word timestamps are unchanged from the original Turn.
*/
export type SpeakerRevisionItem = {
turn_order: number;
speaker_label?: string;
words: StreamingWord[];
};

/**
* Server-side correction to previously-emitted Turns' speaker labels.
* Diarization-only (emitted only when `speakerLabels` is enabled). Sent once
* per offline-recluster resolve; `revisions` carries one entry per earlier
* Turn whose label actually changed (unchanged turns are omitted). Apply each
* entry by matching its `turn_order`.
*/
export type SpeakerRevisionEvent = {
type: "SpeakerRevision";
revisions: SpeakerRevisionItem[];
};

export type StreamingEventMessage =
| BeginEvent
| TurnEvent
| SpeechStartedEvent
| TerminationEvent
| LLMGatewayResponseEvent
| SpeakerRevisionEvent
| ErrorEvent
| WarningEvent;

Expand Down
Loading
Loading