Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion assemblyai/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.64.3"
__version__ = "0.64.4"
2 changes: 2 additions & 0 deletions assemblyai/streaming/v3/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
EventMessage,
LLMGatewayResponseEvent,
NoiseSuppressionModel,
SpeakerRevisionEvent,
SpeechModel,
SpeechStartedEvent,
StreamingClientOptions,
Expand All @@ -29,6 +30,7 @@
"EventMessage",
"LLMGatewayResponseEvent",
"NoiseSuppressionModel",
"SpeakerRevisionEvent",
"SpeechModel",
"SpeechStartedEvent",
"StreamingClient",
Expand Down
3 changes: 3 additions & 0 deletions assemblyai/streaming/v3/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
ErrorEvent,
EventMessage,
LLMGatewayResponseEvent,
SpeakerRevisionEvent,
SpeechStartedEvent,
StreamingClientOptions,
StreamingError,
Expand Down Expand Up @@ -233,6 +234,8 @@ def _parse_message(cls, data: Dict[str, Any]) -> Optional[EventMessage]:
return _parse_model(SpeechStartedEvent, data)
elif event_type == StreamingEvents.LLMGatewayResponse:
return _parse_model(LLMGatewayResponseEvent, data)
elif event_type == StreamingEvents.SpeakerRevision:
return _parse_model(SpeakerRevisionEvent, data)
elif event_type == StreamingEvents.Error:
return _parse_model(ErrorEvent, data)
elif event_type == StreamingEvents.Warning:
Expand Down
17 changes: 17 additions & 0 deletions assemblyai/streaming/v3/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,21 @@ class LLMGatewayResponseEvent(BaseModel):
data: Any


class SpeakerRevisionEvent(BaseModel):
"""Server-side correction to a previously-emitted Turn's speaker labels.

Emitted after offline reclustering refines the live tentative labels.
Match by `turn_order` against the original Turn; replace its per-word
speaker assignments (and the turn-level `speaker_label`) with these.
Text and word timestamps are unchanged from the original Turn.
"""

type: Literal["SpeakerRevision"] = "SpeakerRevision"
turn_order: int
speaker_label: Optional[str] = None
words: List[Word] = []


EventMessage = Union[
BeginEvent,
TerminationEvent,
Expand All @@ -82,6 +97,7 @@ class LLMGatewayResponseEvent(BaseModel):
ErrorEvent,
WarningEvent,
LLMGatewayResponseEvent,
SpeakerRevisionEvent,
]


Expand Down Expand Up @@ -290,3 +306,4 @@ class StreamingEvents(Enum):
Error = "Error"
Warning = "Warning"
LLMGatewayResponse = "LLMGatewayResponse"
SpeakerRevision = "SpeakerRevision"
83 changes: 83 additions & 0 deletions tests/unit/test_streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from assemblyai.streaming.v3 import (
NoiseSuppressionModel,
SpeakerRevisionEvent,
SpeechModel,
SpeechStartedEvent,
StreamingClient,
Expand Down Expand Up @@ -911,6 +912,88 @@ def test_turn_event_with_word_speakers():
assert [w.speaker for w in event.words] == ["A", "B"]


def test_speaker_revision_event_parses():
# Given: a SpeakerRevision payload as emitted by the server (revision words
# use the same Word schema as Turn — start/end/confidence/text/word_is_final/speaker)
data = {
"type": "SpeakerRevision",
"turn_order": 3,
"speaker_label": "B",
"words": [
{
"start": 1000,
"end": 1200,
"confidence": 0.9,
"text": "hello",
"word_is_final": True,
"speaker": "B",
},
{
"start": 1210,
"end": 1400,
"confidence": 0.88,
"text": "world",
"word_is_final": True,
"speaker": "A",
},
],
}

# When: parsed
event = SpeakerRevisionEvent.parse_obj(data)

# Then: the revision carries the corrected per-word and turn-level speakers
assert event.type == "SpeakerRevision"
assert event.turn_order == 3
assert event.speaker_label == "B"
assert [w.speaker for w in event.words] == ["B", "A"]


def test_speaker_revision_event_dispatched_to_handler(mocker: MockFixture):
# Given: a SpeakerRevision frame on the wire and a handler registered
revision_json = json.dumps(
{
"type": "SpeakerRevision",
"turn_order": 5,
"speaker_label": "A",
"words": [
{
"start": 500,
"end": 700,
"confidence": 0.95,
"text": "yes",
"word_is_final": True,
"speaker": "A",
},
],
}
)
fake_ws = _FakeWebSocket(recv_script=[revision_json])
mocker.patch(
"assemblyai.streaming.v3.client.websocket_connect",
return_value=fake_ws,
)
received = []
client = StreamingClient(
StreamingClientOptions(api_key="test", api_host="api.example.com")
)
client.on(StreamingEvents.SpeakerRevision, lambda _c, event: received.append(event))

# When: the client reads the frame
client.connect(_default_params())
deadline = time.monotonic() + 2.0
while time.monotonic() < deadline and not received:
time.sleep(0.02)
client.disconnect(terminate=False)

# Then: the handler is invoked with a parsed SpeakerRevisionEvent
assert len(received) == 1
assert isinstance(received[0], SpeakerRevisionEvent)
assert received[0].turn_order == 5
assert received[0].speaker_label == "A"
assert [w.speaker for w in received[0].words] == ["A"]


def test_speech_model_required():
"""Test that omitting speech_model raises a validation error."""
with pytest.raises(ValidationError):
Expand Down
Loading