From 2a54c44bf5f8ef217009c3a5393e9ca2297599ae Mon Sep 17 00:00:00 2001 From: AssemblyAI Date: Thu, 28 May 2026 11:01:42 -0600 Subject: [PATCH] Project import generated by Copybara. GitOrigin-RevId: 61622876f9640a36c6ee2b6e98a51f9fc22e5c93 --- assemblyai/__version__.py | 2 +- assemblyai/streaming/v3/__init__.py | 2 + assemblyai/streaming/v3/_base.py | 3 ++ assemblyai/streaming/v3/models.py | 17 ++++++ tests/unit/test_streaming.py | 83 +++++++++++++++++++++++++++++ 5 files changed, 106 insertions(+), 1 deletion(-) diff --git a/assemblyai/__version__.py b/assemblyai/__version__.py index 79afd54..dd34a94 100644 --- a/assemblyai/__version__.py +++ b/assemblyai/__version__.py @@ -1 +1 @@ -__version__ = "0.64.3" +__version__ = "0.64.4" diff --git a/assemblyai/streaming/v3/__init__.py b/assemblyai/streaming/v3/__init__.py index c7d0806..abee882 100644 --- a/assemblyai/streaming/v3/__init__.py +++ b/assemblyai/streaming/v3/__init__.py @@ -6,6 +6,7 @@ EventMessage, LLMGatewayResponseEvent, NoiseSuppressionModel, + SpeakerRevisionEvent, SpeechModel, SpeechStartedEvent, StreamingClientOptions, @@ -29,6 +30,7 @@ "EventMessage", "LLMGatewayResponseEvent", "NoiseSuppressionModel", + "SpeakerRevisionEvent", "SpeechModel", "SpeechStartedEvent", "StreamingClient", diff --git a/assemblyai/streaming/v3/_base.py b/assemblyai/streaming/v3/_base.py index d46f90a..e244ec1 100644 --- a/assemblyai/streaming/v3/_base.py +++ b/assemblyai/streaming/v3/_base.py @@ -33,6 +33,7 @@ ErrorEvent, EventMessage, LLMGatewayResponseEvent, + SpeakerRevisionEvent, SpeechStartedEvent, StreamingClientOptions, StreamingError, @@ -233,6 +234,8 @@ def _parse_message(cls, data: Dict[str, Any]) -> Optional[EventMessage]: return _parse_model(SpeechStartedEvent, data) elif event_type == StreamingEvents.LLMGatewayResponse: return _parse_model(LLMGatewayResponseEvent, data) + elif event_type == StreamingEvents.SpeakerRevision: + return _parse_model(SpeakerRevisionEvent, data) elif event_type == StreamingEvents.Error: return _parse_model(ErrorEvent, data) elif event_type == StreamingEvents.Warning: diff --git a/assemblyai/streaming/v3/models.py b/assemblyai/streaming/v3/models.py index c7cd5f4..a59d5a8 100644 --- a/assemblyai/streaming/v3/models.py +++ b/assemblyai/streaming/v3/models.py @@ -74,6 +74,21 @@ class LLMGatewayResponseEvent(BaseModel): data: Any +class SpeakerRevisionEvent(BaseModel): + """Server-side correction to a previously-emitted Turn's speaker labels. + + Emitted after offline reclustering refines the live tentative labels. + Match by `turn_order` against the original Turn; replace its per-word + speaker assignments (and the turn-level `speaker_label`) with these. + Text and word timestamps are unchanged from the original Turn. + """ + + type: Literal["SpeakerRevision"] = "SpeakerRevision" + turn_order: int + speaker_label: Optional[str] = None + words: List[Word] = [] + + EventMessage = Union[ BeginEvent, TerminationEvent, @@ -82,6 +97,7 @@ class LLMGatewayResponseEvent(BaseModel): ErrorEvent, WarningEvent, LLMGatewayResponseEvent, + SpeakerRevisionEvent, ] @@ -290,3 +306,4 @@ class StreamingEvents(Enum): Error = "Error" Warning = "Warning" LLMGatewayResponse = "LLMGatewayResponse" + SpeakerRevision = "SpeakerRevision" diff --git a/tests/unit/test_streaming.py b/tests/unit/test_streaming.py index a637eba..bb0b551 100644 --- a/tests/unit/test_streaming.py +++ b/tests/unit/test_streaming.py @@ -13,6 +13,7 @@ from assemblyai.streaming.v3 import ( NoiseSuppressionModel, + SpeakerRevisionEvent, SpeechModel, SpeechStartedEvent, StreamingClient, @@ -911,6 +912,88 @@ def test_turn_event_with_word_speakers(): assert [w.speaker for w in event.words] == ["A", "B"] +def test_speaker_revision_event_parses(): + # Given: a SpeakerRevision payload as emitted by the server (revision words + # use the same Word schema as Turn — start/end/confidence/text/word_is_final/speaker) + data = { + "type": "SpeakerRevision", + "turn_order": 3, + "speaker_label": "B", + "words": [ + { + "start": 1000, + "end": 1200, + "confidence": 0.9, + "text": "hello", + "word_is_final": True, + "speaker": "B", + }, + { + "start": 1210, + "end": 1400, + "confidence": 0.88, + "text": "world", + "word_is_final": True, + "speaker": "A", + }, + ], + } + + # When: parsed + event = SpeakerRevisionEvent.parse_obj(data) + + # Then: the revision carries the corrected per-word and turn-level speakers + assert event.type == "SpeakerRevision" + assert event.turn_order == 3 + assert event.speaker_label == "B" + assert [w.speaker for w in event.words] == ["B", "A"] + + +def test_speaker_revision_event_dispatched_to_handler(mocker: MockFixture): + # Given: a SpeakerRevision frame on the wire and a handler registered + revision_json = json.dumps( + { + "type": "SpeakerRevision", + "turn_order": 5, + "speaker_label": "A", + "words": [ + { + "start": 500, + "end": 700, + "confidence": 0.95, + "text": "yes", + "word_is_final": True, + "speaker": "A", + }, + ], + } + ) + fake_ws = _FakeWebSocket(recv_script=[revision_json]) + mocker.patch( + "assemblyai.streaming.v3.client.websocket_connect", + return_value=fake_ws, + ) + received = [] + client = StreamingClient( + StreamingClientOptions(api_key="test", api_host="api.example.com") + ) + client.on(StreamingEvents.SpeakerRevision, lambda _c, event: received.append(event)) + + # When: the client reads the frame + client.connect(_default_params()) + deadline = time.monotonic() + 2.0 + while time.monotonic() < deadline and not received: + time.sleep(0.02) + client.disconnect(terminate=False) + + # Then: the handler is invoked with a parsed SpeakerRevisionEvent + assert len(received) == 1 + assert isinstance(received[0], SpeakerRevisionEvent) + assert received[0].turn_order == 5 + assert received[0].speaker_label == "A" + assert [w.speaker for w in received[0].words] == ["A"] + + def test_speech_model_required(): """Test that omitting speech_model raises a validation error.""" with pytest.raises(ValidationError):