From 47e82cb5f5032bd8ccca1a861c779e0a7fc47689 Mon Sep 17 00:00:00 2001 From: fern-api <115122769+fern-api[bot]@users.noreply.github.com> Date: Wed, 20 May 2026 20:38:04 +0000 Subject: [PATCH 01/28] [fern-generated] Update SDK Generated by Fern CLI Version: unknown Generators: - fernapi/fern-python-sdk: 4.37.0 --- .fern/replay.lock | 10 + .fernignore | 3 + .gitattributes | 1 + .github/workflows/ci.yml | 5 +- .github/workflows/release.yml | 51 ---- README.md | 86 ++++++ reference.md | 30 +- src/agora_agent/agent_management/client.py | 2 + .../agent_management/raw_client.py | 2 + ..._management_request_on_listening_action.py | 4 +- .../agent_think_agent_management_response.py | 2 +- ...agent_think_request_on_listening_action.py | 5 - .../agent_think_request_on_speaking_action.py | 5 - .../agent_think_request_on_thinking_action.py | 5 - src/agora_agent/agents/client.py | 62 +++- src/agora_agent/agents/raw_client.py | 42 ++- .../agents/types/get_agents_response.py | 1 - .../types/get_agents_response_status.py | 2 +- .../agents/types/get_turns_agents_response.py | 26 ++ .../get_turns_agents_response_pagination.py} | 18 +- ...agents_response_turns_item_end_metadata.py | 4 +- ..._item_metrics_segmented_latency_ms_item.py | 12 +- .../agents/types/interrupt_agents_response.py | 2 +- ...t_agents_response_data_list_item_status.py | 2 +- ...ts_request_properties_advanced_features.py | 2 +- .../start_agents_request_properties_avatar.py | 1 + ...agents_request_properties_avatar_vendor.py | 2 +- ...ties_filler_words_content_static_config.py | 2 +- .../start_agents_request_properties_llm.py | 2 +- ...request_properties_llm_greeting_configs.py | 6 + .../start_agents_request_properties_mllm.py | 13 +- ..._request_properties_mllm_turn_detection.py | 2 +- ...s_mllm_turn_detection_server_vad_config.py | 2 +- ...t_agents_request_properties_mllm_vendor.py | 2 +- .../agents/types/start_agents_response.py | 1 - .../types/start_agents_response_status.py | 2 +- 
.../agents/types/update_agents_response.py | 1 - .../types/update_agents_response_status.py | 2 +- src/agora_agent/core/client_wrapper.py | 4 +- src/agora_agent/types/agent_error_response.py | 46 +++ .../types/agent_error_response_reason.py | 24 ++ src/agora_agent/types/open_ai_tts_params.py | 2 +- .../types/telephony_error_response.py | 34 +++ tests/custom/test_client.py | 268 +----------------- tests/custom/test_presets.py | 135 --------- 45 files changed, 407 insertions(+), 528 deletions(-) create mode 100644 .fern/replay.lock create mode 100644 .gitattributes delete mode 100644 .github/workflows/release.yml delete mode 100644 src/agora_agent/agent_management/types/agent_think_request_on_listening_action.py delete mode 100644 src/agora_agent/agent_management/types/agent_think_request_on_speaking_action.py delete mode 100644 src/agora_agent/agent_management/types/agent_think_request_on_thinking_action.py rename src/agora_agent/{agent_management/types/agent_think_response.py => agents/types/get_turns_agents_response_pagination.py} (56%) create mode 100644 src/agora_agent/types/agent_error_response.py create mode 100644 src/agora_agent/types/agent_error_response_reason.py create mode 100644 src/agora_agent/types/telephony_error_response.py delete mode 100644 tests/custom/test_presets.py diff --git a/.fern/replay.lock b/.fern/replay.lock new file mode 100644 index 0000000..536b6e4 --- /dev/null +++ b/.fern/replay.lock @@ -0,0 +1,10 @@ +# DO NOT EDIT MANUALLY - Managed by Fern Replay +version: "1.0" +generations: + - commit_sha: a217c8ecfd919345831eebaca8295e292d65ebcf + tree_hash: 707f496ae7e028b80fc9a2adc1e5d69468f170b3 + timestamp: 2026-05-20T20:38:02.180Z + cli_version: unknown + generator_versions: {} +current_generation: a217c8ecfd919345831eebaca8295e292d65ebcf +patches: [] diff --git a/.fernignore b/.fernignore index d85561f..1d34ba0 100644 --- a/.fernignore +++ b/.fernignore @@ -14,3 +14,6 @@ docs/ pyproject.toml poetry.lock requirements.txt 
+.fern/replay.lock +.fern/replay.yml +.gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..74928d6 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +.fern/replay.lock linguist-generated=true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d7dc922..f46ffcf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,6 +54,7 @@ jobs: - name: Publish to pypi run: | poetry config repositories.remote https://upload.pypi.org/legacy/ - poetry --no-interaction -v publish --build --repository remote --username "__token__" --password "$PYPI_API_TOKEN" + poetry --no-interaction -v publish --build --repository remote --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD" env: - PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} + PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index 2c4b0e7..0000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: release - -on: - push: - tags: - - "v*" - workflow_dispatch: - inputs: - tag: - description: "Existing tag to release, for example v1.4.0" - required: true - type: string - -permissions: - contents: write - -jobs: - github-release: - runs-on: ubuntu-latest - env: - RELEASE_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.tag || github.ref_name }} - steps: - - name: Checkout repo - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Extract release notes - run: | - TAG="${RELEASE_TAG}" - git rev-parse "$TAG" >/dev/null - awk -v tag="$TAG" ' - index($0, "## [" tag "]") == 1 { found = 1; next } - found && /^## \[/ { exit } - found { print } - ' changelog.md > release_notes.md - - if [ ! 
-s release_notes.md ]; then - echo "Release $TAG" > release_notes.md - fi - - - name: Create or update GitHub release - env: - GH_TOKEN: ${{ github.token }} - run: | - TAG="${RELEASE_TAG}" - if gh release view "$TAG" >/dev/null 2>&1; then - gh release edit "$TAG" --title "$TAG" --notes-file release_notes.md - else - gh release create "$TAG" --title "$TAG" --notes-file release_notes.md - fi diff --git a/README.md b/README.md index f0ecfe5..f83f96b 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ and multimodal flows (MLLM) for real-time audio processing. - [Documentation](#documentation) - [Reference](#reference) - [Mllm Flow Multimodal](#mllm-flow-multimodal) +- [Mllm Flow Multimodal](#mllm-flow-multimodal) - [Usage](#usage) - [Async Client](#async-client) - [Exception Handling](#exception-handling) @@ -278,6 +279,71 @@ client.agents.start( ``` +## MLLM Flow (Multimodal) + +For real-time audio processing using OpenAI's Realtime API or Google Gemini Live, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details. 
+ +```python +from agora_agent import Agora +from agora_agent.agents import ( + StartAgentsRequestProperties, + StartAgentsRequestPropertiesAdvancedFeatures, + StartAgentsRequestPropertiesMllm, + StartAgentsRequestPropertiesMllmVendor, + StartAgentsRequestPropertiesTts, + StartAgentsRequestPropertiesTtsVendor, + StartAgentsRequestPropertiesLlm, + StartAgentsRequestPropertiesTurnDetection, + StartAgentsRequestPropertiesTurnDetectionType, +) + +client = Agora( + customer_id="YOUR_CUSTOMER_ID", + customer_secret="YOUR_CUSTOMER_SECRET", +) + +client.agents.start( + appid="your_app_id", + name="mllm_agent", + properties=StartAgentsRequestProperties( + channel="channel_name", + token="your_token", + agent_rtc_uid="1001", + remote_rtc_uids=["1002"], + idle_timeout=120, + advanced_features=StartAgentsRequestPropertiesAdvancedFeatures( + enable_mllm=True, + ), + mllm=StartAgentsRequestPropertiesMllm( + url="wss://api.openai.com/v1/realtime", + api_key="", + vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, + params={ + "model": "gpt-4o-realtime-preview", + "voice": "alloy", + }, + input_modalities=["audio"], + output_modalities=["text", "audio"], + greeting_message="Hello! 
I'm ready to chat in real-time.", + ), + turn_detection=StartAgentsRequestPropertiesTurnDetection( + type=StartAgentsRequestPropertiesTurnDetectionType.SERVER_VAD, + threshold=0.5, + silence_duration_ms=500, + ), + # TTS and LLM are still required but not used when MLLM is enabled + tts=StartAgentsRequestPropertiesTts( + vendor=StartAgentsRequestPropertiesTtsVendor.MICROSOFT, + params={}, + ), + llm=StartAgentsRequestPropertiesLlm( + url="https://api.openai.com/v1/chat/completions", + ), + ), +) +``` + + ## Usage Instantiate and use the client with the following: @@ -288,6 +354,9 @@ from agora_agent.agents import ( StartAgentsRequestProperties, StartAgentsRequestPropertiesAsr, StartAgentsRequestPropertiesLlm, + StartAgentsRequestPropertiesTurnDetection, + StartAgentsRequestPropertiesTurnDetectionConfig, + StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, ) client = Agora( @@ -325,6 +394,13 @@ client.agents.start( greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", ), + turn_detection=StartAgentsRequestPropertiesTurnDetection( + config=StartAgentsRequestPropertiesTurnDetectionConfig( + end_of_speech=StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech( + mode="semantic", + ), + ), + ), ), ) ``` @@ -341,6 +417,9 @@ from agora_agent.agents import ( StartAgentsRequestProperties, StartAgentsRequestPropertiesAsr, StartAgentsRequestPropertiesLlm, + StartAgentsRequestPropertiesTurnDetection, + StartAgentsRequestPropertiesTurnDetectionConfig, + StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, ) client = AsyncAgora( @@ -381,6 +460,13 @@ async def main() -> None: greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", ), + turn_detection=StartAgentsRequestPropertiesTurnDetection( + config=StartAgentsRequestPropertiesTurnDetectionConfig( + end_of_speech=StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech( + mode="semantic", + ), + ), + ), ), ) diff --git 
a/reference.md b/reference.md index 356b016..55a516e 100644 --- a/reference.md +++ b/reference.md @@ -32,6 +32,9 @@ from agora_agent.agents import ( StartAgentsRequestProperties, StartAgentsRequestPropertiesAsr, StartAgentsRequestPropertiesLlm, + StartAgentsRequestPropertiesTurnDetection, + StartAgentsRequestPropertiesTurnDetectionConfig, + StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, ) client = Agora( @@ -69,6 +72,13 @@ client.agents.start( greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", ), + turn_detection=StartAgentsRequestPropertiesTurnDetection( + config=StartAgentsRequestPropertiesTurnDetectionConfig( + end_of_speech=StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech( + mode="semantic", + ), + ), + ), ), ) @@ -242,7 +252,6 @@ The agent state to filter by. Only one state can be specified per query: - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. -- `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. @@ -516,6 +525,22 @@ client.agents.get_turns(
+**page_index:** `typing.Optional[int]` — The page number. Starts from 1. + +
+
+ +
+
+ +**page_size:** `typing.Optional[int]` — The number of dialogue turns returned per page. + +
+
+ +
+
+ **request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
@@ -540,7 +565,7 @@ client.agents.get_turns(
-Stop the specified conversational agent instance. +Stop the specified conversational agent instance. The API responds after request parameters are validated, and the stop operation is processed asynchronously after the response is returned.
@@ -1015,6 +1040,7 @@ client.agent_management.agent_think( The action to take when the agent is in a listening state: - `inject`: Inject the custom text instruction into the current turn without interrupting it. +- `interrupt`: Immediately interrupt the current flow and initiate a new round of dialogue. - `ignore`: Ignore the request. diff --git a/src/agora_agent/agent_management/client.py b/src/agora_agent/agent_management/client.py index 71b3c62..6140649 100644 --- a/src/agora_agent/agent_management/client.py +++ b/src/agora_agent/agent_management/client.py @@ -72,6 +72,7 @@ def agent_think( on_listening_action : typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] The action to take when the agent is in a listening state: - `inject`: Inject the custom text instruction into the current turn without interrupting it. + - `interrupt`: Immediately interrupt the current flow and initiate a new round of dialogue. - `ignore`: Ignore the request. on_thinking_action : typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] @@ -186,6 +187,7 @@ async def agent_think( on_listening_action : typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] The action to take when the agent is in a listening state: - `inject`: Inject the custom text instruction into the current turn without interrupting it. + - `interrupt`: Immediately interrupt the current flow and initiate a new round of dialogue. - `ignore`: Ignore the request. 
on_thinking_action : typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] diff --git a/src/agora_agent/agent_management/raw_client.py b/src/agora_agent/agent_management/raw_client.py index 03a0838..4af36fe 100644 --- a/src/agora_agent/agent_management/raw_client.py +++ b/src/agora_agent/agent_management/raw_client.py @@ -65,6 +65,7 @@ def agent_think( on_listening_action : typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] The action to take when the agent is in a listening state: - `inject`: Inject the custom text instruction into the current turn without interrupting it. + - `interrupt`: Immediately interrupt the current flow and initiate a new round of dialogue. - `ignore`: Ignore the request. on_thinking_action : typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] @@ -167,6 +168,7 @@ async def agent_think( on_listening_action : typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] The action to take when the agent is in a listening state: - `inject`: Inject the custom text instruction into the current turn without interrupting it. + - `interrupt`: Immediately interrupt the current flow and initiate a new round of dialogue. - `ignore`: Ignore the request. 
on_thinking_action : typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] diff --git a/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_listening_action.py b/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_listening_action.py index 117d8cc..3812a6e 100644 --- a/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_listening_action.py +++ b/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_listening_action.py @@ -2,4 +2,6 @@ import typing -AgentThinkAgentManagementRequestOnListeningAction = typing.Union[typing.Literal["inject", "ignore"], typing.Any] +AgentThinkAgentManagementRequestOnListeningAction = typing.Union[ + typing.Literal["inject", "interrupt", "ignore"], typing.Any +] diff --git a/src/agora_agent/agent_management/types/agent_think_agent_management_response.py b/src/agora_agent/agent_management/types/agent_think_agent_management_response.py index 7e512eb..0c1f689 100644 --- a/src/agora_agent/agent_management/types/agent_think_agent_management_response.py +++ b/src/agora_agent/agent_management/types/agent_think_agent_management_response.py @@ -20,7 +20,7 @@ class AgentThinkAgentManagementResponse(UncheckedBaseModel): start_ts: typing.Optional[int] = pydantic.Field(default=None) """ - Timestamp indicating when the agent was created. + Unix timestamp in seconds when the think request was processed. """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/agent_management/types/agent_think_request_on_listening_action.py b/src/agora_agent/agent_management/types/agent_think_request_on_listening_action.py deleted file mode 100644 index 54cca4c..0000000 --- a/src/agora_agent/agent_management/types/agent_think_request_on_listening_action.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -AgentThinkRequestOnListeningAction = typing.Union[typing.Literal["inject", "ignore"], typing.Any] diff --git a/src/agora_agent/agent_management/types/agent_think_request_on_speaking_action.py b/src/agora_agent/agent_management/types/agent_think_request_on_speaking_action.py deleted file mode 100644 index 8329197..0000000 --- a/src/agora_agent/agent_management/types/agent_think_request_on_speaking_action.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -AgentThinkRequestOnSpeakingAction = typing.Union[typing.Literal["interrupt", "ignore"], typing.Any] diff --git a/src/agora_agent/agent_management/types/agent_think_request_on_thinking_action.py b/src/agora_agent/agent_management/types/agent_think_request_on_thinking_action.py deleted file mode 100644 index ee50877..0000000 --- a/src/agora_agent/agent_management/types/agent_think_request_on_thinking_action.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -AgentThinkRequestOnThinkingAction = typing.Union[typing.Literal["interrupt", "ignore"], typing.Any] diff --git a/src/agora_agent/agents/client.py b/src/agora_agent/agents/client.py index 6c4a48c..3f6af4c 100644 --- a/src/agora_agent/agents/client.py +++ b/src/agora_agent/agents/client.py @@ -89,6 +89,9 @@ def start( StartAgentsRequestProperties, StartAgentsRequestPropertiesAsr, StartAgentsRequestPropertiesLlm, + StartAgentsRequestPropertiesTurnDetection, + StartAgentsRequestPropertiesTurnDetectionConfig, + StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, ) client = Agora( @@ -126,6 +129,13 @@ def start( greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", ), + turn_detection=StartAgentsRequestPropertiesTurnDetection( + config=StartAgentsRequestPropertiesTurnDetectionConfig( + end_of_speech=StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech( + mode="semantic", + ), + ), + ), ), ) """ @@ -175,7 +185,6 @@ def list( - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. limit : typing.Optional[int] @@ -302,7 +311,13 @@ def get_history( return _response.data def get_turns( - self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None + self, + appid: str, + agent_id: str, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + request_options: typing.Optional[RequestOptions] = None, ) -> GetTurnsAgentsResponse: """ Query conversation turn information for a conversational AI agent session. @@ -319,6 +334,12 @@ def get_turns( agent_id : str The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + page_index : typing.Optional[int] + The page number. Starts from 1. 
+ + page_size : typing.Optional[int] + The number of dialogue turns returned per page. + request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -341,12 +362,14 @@ def get_turns( agent_id="agentId", ) """ - _response = self._raw_client.get_turns(appid, agent_id, request_options=request_options) + _response = self._raw_client.get_turns( + appid, agent_id, page_index=page_index, page_size=page_size, request_options=request_options + ) return _response.data def stop(self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None) -> None: """ - Stop the specified conversational agent instance. + Stop the specified conversational agent instance. The API responds after request parameters are validated, and the stop operation is processed asynchronously after the response is returned. Parameters ---------- @@ -623,6 +646,9 @@ async def start( StartAgentsRequestProperties, StartAgentsRequestPropertiesAsr, StartAgentsRequestPropertiesLlm, + StartAgentsRequestPropertiesTurnDetection, + StartAgentsRequestPropertiesTurnDetectionConfig, + StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, ) client = AsyncAgora( @@ -663,6 +689,13 @@ async def main() -> None: greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", ), + turn_detection=StartAgentsRequestPropertiesTurnDetection( + config=StartAgentsRequestPropertiesTurnDetectionConfig( + end_of_speech=StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech( + mode="semantic", + ), + ), + ), ), ) @@ -715,7 +748,6 @@ async def list( - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. 
limit : typing.Optional[int] @@ -867,7 +899,13 @@ async def main() -> None: return _response.data async def get_turns( - self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None + self, + appid: str, + agent_id: str, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + request_options: typing.Optional[RequestOptions] = None, ) -> GetTurnsAgentsResponse: """ Query conversation turn information for a conversational AI agent session. @@ -884,6 +922,12 @@ async def get_turns( agent_id : str The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + page_index : typing.Optional[int] + The page number. Starts from 1. + + page_size : typing.Optional[int] + The number of dialogue turns returned per page. + request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -914,12 +958,14 @@ async def main() -> None: asyncio.run(main()) """ - _response = await self._raw_client.get_turns(appid, agent_id, request_options=request_options) + _response = await self._raw_client.get_turns( + appid, agent_id, page_index=page_index, page_size=page_size, request_options=request_options + ) return _response.data async def stop(self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None) -> None: """ - Stop the specified conversational agent instance. + Stop the specified conversational agent instance. The API responds after request parameters are validated, and the stop operation is processed asynchronously after the response is returned. Parameters ---------- diff --git a/src/agora_agent/agents/raw_client.py b/src/agora_agent/agents/raw_client.py index c6eb03f..c0dfa01 100644 --- a/src/agora_agent/agents/raw_client.py +++ b/src/agora_agent/agents/raw_client.py @@ -144,7 +144,6 @@ def list( - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. 
- - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. limit : typing.Optional[int] @@ -293,7 +292,13 @@ def get_history( raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json) def get_turns( - self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None + self, + appid: str, + agent_id: str, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + request_options: typing.Optional[RequestOptions] = None, ) -> HttpResponse[GetTurnsAgentsResponse]: """ Query conversation turn information for a conversational AI agent session. @@ -310,6 +315,12 @@ def get_turns( agent_id : str The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + page_index : typing.Optional[int] + The page number. Starts from 1. + + page_size : typing.Optional[int] + The number of dialogue turns returned per page. + request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -321,6 +332,10 @@ def get_turns( _response = self._client_wrapper.httpx_client.request( f"v2/projects/{jsonable_encoder(appid)}/agents/{jsonable_encoder(agent_id)}/turns", method="GET", + params={ + "page_index": page_index, + "page_size": page_size, + }, request_options=request_options, ) try: @@ -342,7 +357,7 @@ def stop( self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None ) -> HttpResponse[None]: """ - Stop the specified conversational agent instance. + Stop the specified conversational agent instance. The API responds after request parameters are validated, and the stop operation is processed asynchronously after the response is returned. Parameters ---------- @@ -670,7 +685,6 @@ async def list( - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. 
- `FAILED` (6): The agent failed to execute. limit : typing.Optional[int] @@ -822,7 +836,13 @@ async def get_history( raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json) async def get_turns( - self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None + self, + appid: str, + agent_id: str, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + request_options: typing.Optional[RequestOptions] = None, ) -> AsyncHttpResponse[GetTurnsAgentsResponse]: """ Query conversation turn information for a conversational AI agent session. @@ -839,6 +859,12 @@ async def get_turns( agent_id : str The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + page_index : typing.Optional[int] + The page number. Starts from 1. + + page_size : typing.Optional[int] + The number of dialogue turns returned per page. + request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -850,6 +876,10 @@ async def get_turns( _response = await self._client_wrapper.httpx_client.request( f"v2/projects/{jsonable_encoder(appid)}/agents/{jsonable_encoder(agent_id)}/turns", method="GET", + params={ + "page_index": page_index, + "page_size": page_size, + }, request_options=request_options, ) try: @@ -871,7 +901,7 @@ async def stop( self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None ) -> AsyncHttpResponse[None]: """ - Stop the specified conversational agent instance. + Stop the specified conversational agent instance. The API responds after request parameters are validated, and the stop operation is processed asynchronously after the response is returned. 
Parameters ---------- diff --git a/src/agora_agent/agents/types/get_agents_response.py b/src/agora_agent/agents/types/get_agents_response.py index fe7e8e9..fd0aa2b 100644 --- a/src/agora_agent/agents/types/get_agents_response.py +++ b/src/agora_agent/agents/types/get_agents_response.py @@ -32,7 +32,6 @@ class GetAgentsResponse(UncheckedBaseModel): - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. """ diff --git a/src/agora_agent/agents/types/get_agents_response_status.py b/src/agora_agent/agents/types/get_agents_response_status.py index 2cef8c7..c170dc3 100644 --- a/src/agora_agent/agents/types/get_agents_response_status.py +++ b/src/agora_agent/agents/types/get_agents_response_status.py @@ -3,5 +3,5 @@ import typing GetAgentsResponseStatus = typing.Union[ - typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "RECOVERING", "FAILED"], typing.Any + typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "FAILED"], typing.Any ] diff --git a/src/agora_agent/agents/types/get_turns_agents_response.py b/src/agora_agent/agents/types/get_turns_agents_response.py index dca9ce1..653b3a5 100644 --- a/src/agora_agent/agents/types/get_turns_agents_response.py +++ b/src/agora_agent/agents/types/get_turns_agents_response.py @@ -5,10 +5,36 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from .get_turns_agents_response_pagination import GetTurnsAgentsResponsePagination from .get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem class GetTurnsAgentsResponse(UncheckedBaseModel): + agent_id: typing.Optional[str] = pydantic.Field(default=None) + """ + The unique identifier of the agent. + """ + + name: typing.Optional[str] = pydantic.Field(default=None) + """ + The name of the agent. 
+ """ + + channel: typing.Optional[str] = pydantic.Field(default=None) + """ + The name of the RTC channel the agent joined. + """ + + total_turn_count: typing.Optional[int] = pydantic.Field(default=None) + """ + The total number of dialogue turns in the current session. + """ + + pagination: typing.Optional[GetTurnsAgentsResponsePagination] = pydantic.Field(default=None) + """ + Pagination information. + """ + turns: typing.Optional[typing.List[GetTurnsAgentsResponseTurnsItem]] = pydantic.Field(default=None) """ A list of conversation turns for the agent session. diff --git a/src/agora_agent/agent_management/types/agent_think_response.py b/src/agora_agent/agents/types/get_turns_agents_response_pagination.py similarity index 56% rename from src/agora_agent/agent_management/types/agent_think_response.py rename to src/agora_agent/agents/types/get_turns_agents_response_pagination.py index 3a3c646..8330f7c 100644 --- a/src/agora_agent/agent_management/types/agent_think_response.py +++ b/src/agora_agent/agents/types/get_turns_agents_response_pagination.py @@ -7,20 +7,24 @@ from ...core.unchecked_base_model import UncheckedBaseModel -class AgentThinkResponse(UncheckedBaseModel): - agent_id: typing.Optional[str] = pydantic.Field(default=None) +class GetTurnsAgentsResponsePagination(UncheckedBaseModel): """ - Unique identifier of the agent instance. + Pagination information. """ - channel: typing.Optional[str] = pydantic.Field(default=None) + page_index: typing.Optional[int] = pydantic.Field(default=None) """ - The name of the RTC channel where the agent is located. + The current page number; starts from 1. """ - start_ts: typing.Optional[int] = pydantic.Field(default=None) + total_pages: typing.Optional[int] = pydantic.Field(default=None) """ - Timestamp indicating when the agent was created. + The total number of pages. + """ + + is_last_page: typing.Optional[bool] = pydantic.Field(default=None) + """ + True if the current page is the last page. 
""" if IS_PYDANTIC_V2: diff --git a/src/agora_agent/agents/types/get_turns_agents_response_turns_item_end_metadata.py b/src/agora_agent/agents/types/get_turns_agents_response_turns_item_end_metadata.py index 0d9c61e..1e69709 100644 --- a/src/agora_agent/agents/types/get_turns_agents_response_turns_item_end_metadata.py +++ b/src/agora_agent/agents/types/get_turns_agents_response_turns_item_end_metadata.py @@ -28,8 +28,8 @@ class GetTurnsAgentsResponseTurnsItemEndMetadata(UncheckedBaseModel): - `api_leave`: The turn was interrupted because the agent left the channel. When `type` is `ignored`, possible values are: - - `semantic`: The turn was ignored because semantic end-of-speech detection determined no response was required. - - `keywords`: The turn was ignored because the start keyword was not detected. + - `semantic`: The turn was ignored because semantic end-of-speech detection determined no response was required. Applies when `turn_detection.config.end_of_speech.mode` is set to `semantic`. + - `keywords`: The turn was ignored because the start keyword was not detected. Applies when `turn_detection.config.start_of_speech.mode` is set to `keywords`. - `disable`: The turn was ignored because interruption is disabled for this turn. """ diff --git a/src/agora_agent/agents/types/get_turns_agents_response_turns_item_metrics_segmented_latency_ms_item.py b/src/agora_agent/agents/types/get_turns_agents_response_turns_item_metrics_segmented_latency_ms_item.py index 345d970..e849fdc 100644 --- a/src/agora_agent/agents/types/get_turns_agents_response_turns_item_metrics_segmented_latency_ms_item.py +++ b/src/agora_agent/agents/types/get_turns_agents_response_turns_item_metrics_segmented_latency_ms_item.py @@ -14,16 +14,16 @@ class GetTurnsAgentsResponseTurnsItemMetricsSegmentedLatencyMsItem(UncheckedBase When the LLM input modality is `text`, the returned segments are: - `algorithm_processing`: Algorithm processing delay. 
- - `asr_ttlw`: ASR Time To Last Word (TTLW) in milliseconds. - - `llm_ttft`: LLM Time To First Token (TTFT) in milliseconds. - - `llm_ftfs`: LLM First Token To First Sentence (FTFS) in milliseconds. - - `tts_ttfb`: TTS Time To First Byte (TTFB) in milliseconds. + - `asr_ttlw`: The ASR Time To Last Word (TTLW) in milliseconds. Represents the delay from when the user finishes speaking to when the ASR module outputs the last word. + - `llm_ttft`: The LLM Time To First Token (TTFT) in milliseconds. Represents the delay from when the user finishes speaking to when the LLM outputs the first token. + - `llm_ftfs`: The LLM First Token To First Sentence (FTFS) in milliseconds. Represents the delay from when the LLM outputs the first token to when it outputs the first complete sentence. + - `tts_ttfb`: The TTS Time To First Byte (TTFB) in milliseconds. Represents the delay from when the TTS module receives a text request to when it outputs the first audio byte. - `transport`: Network transmission delay in milliseconds. Not returned when the user is connected using the RTC Web SDK. When the LLM input modality is `audio`, the returned segments are: - `algorithm_processing`: Algorithm processing delay. - - `asr_ttlw`: ASR Time To Last Word (TTLW) in milliseconds. - - `llm_ttfa`: LLM Time To First Audio Byte (TTFA) in milliseconds. + - `asr_ttlw`: The ASR Time To Last Word (TTLW) in milliseconds. Represents the delay from when the user finishes speaking to when the ASR module outputs the last word. + - `llm_ttfa`: The LLM Time To First Audio Byte (TTFA) in milliseconds. Represents the delay from when the user finishes speaking to when the LLM outputs the first audio byte. - `transport`: Network transmission delay in milliseconds. Not returned when the user is connected using the RTC Web SDK. 
""" diff --git a/src/agora_agent/agents/types/interrupt_agents_response.py b/src/agora_agent/agents/types/interrupt_agents_response.py index 79ff7f5..f8c59af 100644 --- a/src/agora_agent/agents/types/interrupt_agents_response.py +++ b/src/agora_agent/agents/types/interrupt_agents_response.py @@ -20,7 +20,7 @@ class InterruptAgentsResponse(UncheckedBaseModel): start_ts: typing.Optional[int] = pydantic.Field(default=None) """ - Timestamp when the broadcast started + Unix timestamp in seconds when the interrupt request was processed. """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/agents/types/list_agents_response_data_list_item_status.py b/src/agora_agent/agents/types/list_agents_response_data_list_item_status.py index 58c42e0..a00c386 100644 --- a/src/agora_agent/agents/types/list_agents_response_data_list_item_status.py +++ b/src/agora_agent/agents/types/list_agents_response_data_list_item_status.py @@ -3,5 +3,5 @@ import typing ListAgentsResponseDataListItemStatus = typing.Union[ - typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "RECOVERING", "FAILED"], typing.Any + typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "FAILED"], typing.Any ] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_advanced_features.py b/src/agora_agent/agents/types/start_agents_request_properties_advanced_features.py index 78250d7..e75f098 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_advanced_features.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_advanced_features.py @@ -14,7 +14,7 @@ class StartAgentsRequestPropertiesAdvancedFeatures(UncheckedBaseModel): enable_mllm: typing.Optional[bool] = pydantic.Field(default=None) """ - Use `mllm.enable` instead. Enable Multimodal Large Language Model for voice-to-voice processing. Enabling MLLM automatically disables ASR, LLM, and TTS since the MLLM handles end-to-end voice processing directly. 
See `turn_detection.mode` for turn detection options available with MLLM. + Use `mllm.enable` instead. Enable Multimodal Large Language Model for voice-to-voice processing. Enabling MLLM automatically disables ASR, LLM, and TTS since the MLLM handles end-to-end voice processing directly. See `turn_detection.type` for turn detection options available with MLLM. """ enable_rtm: typing.Optional[bool] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_avatar.py b/src/agora_agent/agents/types/start_agents_request_properties_avatar.py index 24e3888..8993b2c 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_avatar.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_avatar.py @@ -24,6 +24,7 @@ class StartAgentsRequestPropertiesAvatar(UncheckedBaseModel): - `akool`: Akool (Beta) - `liveavatar`: LiveAvatar (Beta) - `anam`: Anam (Beta) + - `generic`: Generic (Beta) """ params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_avatar_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_avatar_vendor.py index 2926d04..e5bcec5 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_avatar_vendor.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_avatar_vendor.py @@ -3,5 +3,5 @@ import typing StartAgentsRequestPropertiesAvatarVendor = typing.Union[ - typing.Literal["akool", "liveavatar", "anam", "heygen"], typing.Any + typing.Literal["akool", "liveavatar", "anam", "generic", "heygen"], typing.Any ] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_filler_words_content_static_config.py b/src/agora_agent/agents/types/start_agents_request_properties_filler_words_content_static_config.py index e220739..7fe264c 100644 --- 
a/src/agora_agent/agents/types/start_agents_request_properties_filler_words_content_static_config.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_filler_words_content_static_config.py @@ -15,7 +15,7 @@ class StartAgentsRequestPropertiesFillerWordsContentStaticConfig(UncheckedBaseMo Static filler word configuration. Used when `mode` is `static`. """ - phrases: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + phrases: typing.List[str] = pydantic.Field() """ List of filler word phrases. Maximum 100 filler words, each not exceeding 50 English words. """ diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm.py b/src/agora_agent/agents/types/start_agents_request_properties_llm.py index 20c391e..9ab0f62 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_llm.py @@ -44,7 +44,7 @@ class StartAgentsRequestPropertiesLlm(UncheckedBaseModel): """ LLM input modalities: - `["text"]`: Text only - - `["text", "image"]`: Text plus image; requires the selected LLM to support visual input + - `["text", "image"]`: Text plus image. Recommended configuration, requires the selected LLM to support visual input """ output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py index a8594ee..c0d7046 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py @@ -27,6 +27,12 @@ class StartAgentsRequestPropertiesLlmGreetingConfigs(UncheckedBaseModel): The delay in milliseconds before the agent plays the greeting message after a user joins the channel. 
""" + interruptable: typing.Optional[bool] = pydantic.Field(default=None) + """ + - `true`: Follows the global `interruption` configuration. + - `false`: Uninterruptible. The greeting plays in its entirety. If the user speaks multiple times while the greeting plays, the system merges the speech segments after the greeting ends and sends them to the LLM for a single response. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py index e84422c..0993ebc 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py @@ -59,17 +59,7 @@ class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): failure_message: typing.Optional[str] = pydantic.Field(default=None) """ - Message played when the MLLM call fails. - """ - - max_history: typing.Optional[int] = pydantic.Field(default=None) - """ - Maximum number of conversation history messages cached for the MLLM session. - """ - - predefined_tools: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - Predefined tools available to the MLLM provider. + Agent failure message. If provided, the agent speaks this message when an MLLM request fails. 
""" vendor: typing.Optional[StartAgentsRequestPropertiesMllmVendor] = pydantic.Field(default=None) @@ -78,6 +68,7 @@ class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): - `openai`: OpenAI Realtime API - `gemini`: Google Gemini Live - `vertexai`: Google Gemini Live (Vertex AI) + - `xai`: xAI Grok Realtime API """ turn_detection: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetection] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py index 9298a0c..032979d 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py @@ -26,7 +26,7 @@ class StartAgentsRequestPropertiesMllmTurnDetection(UncheckedBaseModel): """ Turn detection mode for MLLM: - `agora_vad`: Agora VAD-based detection. - - `server_vad`: Vendor-side VAD-based detection. Supported by OpenAI Realtime API and Gemini Live. + - `server_vad`: Vendor-side VAD-based detection. Supported by OpenAI Realtime API, Gemini Live, and xAI Grok. - `semantic_vad`: Semantic-based detection. Supported by OpenAI Realtime API only. """ diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py index d27b76e..c74d8d7 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py @@ -30,7 +30,7 @@ class StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig(UncheckedBase threshold: typing.Optional[float] = pydantic.Field(default=None) """ - VAD sensitivity threshold. Applicable to OpenAI Realtime API only. 
+ VAD sensitivity threshold. Applicable to OpenAI Realtime API and xAI Grok. """ idle_timeout_ms: typing.Optional[int] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py index 519b143..0233696 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py @@ -2,4 +2,4 @@ import typing -StartAgentsRequestPropertiesMllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai"], typing.Any] +StartAgentsRequestPropertiesMllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai", "xai"], typing.Any] diff --git a/src/agora_agent/agents/types/start_agents_response.py b/src/agora_agent/agents/types/start_agents_response.py index ee3f32e..1d69a34 100644 --- a/src/agora_agent/agents/types/start_agents_response.py +++ b/src/agora_agent/agents/types/start_agents_response.py @@ -27,7 +27,6 @@ class StartAgentsResponse(UncheckedBaseModel): - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. 
""" diff --git a/src/agora_agent/agents/types/start_agents_response_status.py b/src/agora_agent/agents/types/start_agents_response_status.py index 6926aca..894df21 100644 --- a/src/agora_agent/agents/types/start_agents_response_status.py +++ b/src/agora_agent/agents/types/start_agents_response_status.py @@ -3,5 +3,5 @@ import typing StartAgentsResponseStatus = typing.Union[ - typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "RECOVERING", "FAILED"], typing.Any + typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "FAILED"], typing.Any ] diff --git a/src/agora_agent/agents/types/update_agents_response.py b/src/agora_agent/agents/types/update_agents_response.py index 9f73945..8951b77 100644 --- a/src/agora_agent/agents/types/update_agents_response.py +++ b/src/agora_agent/agents/types/update_agents_response.py @@ -27,7 +27,6 @@ class UpdateAgentsResponse(UncheckedBaseModel): - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. 
""" diff --git a/src/agora_agent/agents/types/update_agents_response_status.py b/src/agora_agent/agents/types/update_agents_response_status.py index 7f4078f..c8e9133 100644 --- a/src/agora_agent/agents/types/update_agents_response_status.py +++ b/src/agora_agent/agents/types/update_agents_response_status.py @@ -3,5 +3,5 @@ import typing UpdateAgentsResponseStatus = typing.Union[ - typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "RECOVERING", "FAILED"], typing.Any + typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "FAILED"], typing.Any ] diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index c5a0e03..ed2ed34 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agent-server-sdk/v1.4.0", + "User-Agent": "agora-agent-server-sdk/v1.5.0", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agent-server-sdk", - "X-Fern-SDK-Version": "v1.4.0", + "X-Fern-SDK-Version": "v1.5.0", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header diff --git a/src/agora_agent/types/agent_error_response.py b/src/agora_agent/types/agent_error_response.py new file mode 100644 index 0000000..510ce74 --- /dev/null +++ b/src/agora_agent/types/agent_error_response.py @@ -0,0 +1,46 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .agent_error_response_reason import AgentErrorResponseReason + + +class AgentErrorResponse(UncheckedBaseModel): + """ + Error response returned when a Conversational AI Agent Management API request fails. 
+ + Common HTTP status codes include: + - `400`: Invalid request parameters. + - `401`: Authentication failed. + - `403`: Unauthorized access or service not enabled. + - `404`: Agent not found or has exited. + - `409`: Agent conflict. + - `422`: Access limit exceeded. + - `429`: Request rate limit exceeded. + - `500`: Internal server error. + - `502`: Gateway error. + - `503`: Agent startup failure. + - `504`: Request timeout. + """ + + detail: typing.Optional[str] = pydantic.Field(default=None) + """ + Error detail message. + """ + + reason: typing.Optional[AgentErrorResponseReason] = pydantic.Field(default=None) + """ + Error reason code. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/agent_error_response_reason.py b/src/agora_agent/types/agent_error_response_reason.py new file mode 100644 index 0000000..c7144cf --- /dev/null +++ b/src/agora_agent/types/agent_error_response_reason.py @@ -0,0 +1,24 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +AgentErrorResponseReason = typing.Union[ + typing.Literal[ + "ServiceNotEnabled", + "AccountSuspended", + "InternalError", + "InvalidPermission", + "InvalidRequestBody", + "MissingRequiredField", + "InvalidFieldValue", + "ResourceQuotaLimitExceeded", + "ConcurrencyLimitExceeded", + "ServiceUnavailable", + "ResourceAllocationFailed", + "TaskConflict", + "TaskNotFound", + "TaskOperationTimeout", + "NotImplemented", + ], + typing.Any, +] diff --git a/src/agora_agent/types/open_ai_tts_params.py b/src/agora_agent/types/open_ai_tts_params.py index 67a6e1d..f042c1f 100644 --- a/src/agora_agent/types/open_ai_tts_params.py +++ b/src/agora_agent/types/open_ai_tts_params.py @@ -14,7 +14,7 @@ class OpenAiTtsParams(UncheckedBaseModel): api_key: typing.Optional[str] = pydantic.Field(default=None) """ - OpenAI API key + OpenAI API key. Optional for preset-backed OpenAI TTS usage. """ voice: str = pydantic.Field() diff --git a/src/agora_agent/types/telephony_error_response.py b/src/agora_agent/types/telephony_error_response.py new file mode 100644 index 0000000..af4e7d4 --- /dev/null +++ b/src/agora_agent/types/telephony_error_response.py @@ -0,0 +1,34 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class TelephonyErrorResponse(UncheckedBaseModel): + """ + Error response returned when a Telephony or Phone Number Management API request fails. + + The response body includes `error_type` and `description` fields (not the Agent Management `detail`/`reason` shape). + """ + + error_type: typing.Optional[str] = pydantic.Field(default=None) + """ + The type of error that occurred. + """ + + description: typing.Optional[str] = pydantic.Field(default=None) + """ + A detailed description of the error. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/tests/custom/test_client.py b/tests/custom/test_client.py index 77fae36..ab04ce6 100644 --- a/tests/custom/test_client.py +++ b/tests/custom/test_client.py @@ -1,265 +1,7 @@ -from agora_agent.agentkit.agent import Agent, AdvancedFeatures, InterruptionConfig, MllmTurnDetectionConfig, TurnDetectionConfig -from agora_agent.agentkit.constants import TurnDetectionTypeValues -import asyncio -import warnings -from agora_agent.agentkit.agent_session import AgentSession, AsyncAgentSession -from agora_agent.agentkit.vendors import DeepgramTTS, HeyGenAvatar, MicrosoftTTS, OpenAI, OpenAIRealtime -from agora_agent.agentkit import AgentThinkResponse -from typing import Any, Dict, List, Tuple +import pytest -class _AgentManagementStub: - def __init__(self) -> None: - self.calls: List[Tuple[str, str, Dict[str, Any]]] = [] - - def agent_think(self, appid, agent_id, **kwargs): # noqa: ANN001 - self.calls.append((appid, agent_id, kwargs)) - return AgentThinkResponse(agent_id=agent_id, channel="room", start_ts=1) - - -class _ClientStub: - auth_mode = "basic" - - def __init__(self) -> None: - self.agents = object() - self.agent_management = _AgentManagementStub() - - -class _AsyncAgentManagementStub: - def __init__(self) -> None: - self.calls: List[Tuple[str, str, Dict[str, Any]]] = [] - - async def agent_think(self, appid, agent_id, **kwargs): # noqa: ANN001 - self.calls.append((appid, agent_id, kwargs)) - return AgentThinkResponse(agent_id=agent_id, channel="room", start_ts=1) - - -class _AsyncClientStub: - auth_mode = "basic" - - def __init__(self) -> None: - self.agents = object() - self.agent_management = _AsyncAgentManagementStub() - - -def test_agentkit_think_routes_to_agent_management() -> None: - client = 
_ClientStub() - session = AgentSession( - client=client, - agent=Agent(), - app_id="appid", - name="agent", - channel="room", - token="token", - agent_uid="1", - remote_uids=["2"], - ) - session._status = "running" - session._agent_id = "agent-1" - - response = session.think("Injected instruction", on_thinking_action="interrupt") - assert response.agent_id == "agent-1" - assert len(client.agent_management.calls) == 1 - appid, agent_id, kwargs = client.agent_management.calls[0] - assert appid == "appid" - assert agent_id == "agent-1" - assert kwargs["text"] == "Injected instruction" - assert kwargs["on_thinking_action"] == "interrupt" - - -def test_async_agentkit_think_routes_to_agent_management() -> None: - async def _run() -> None: - client = _AsyncClientStub() - session = AsyncAgentSession( - client=client, - agent=Agent(), - app_id="appid", - name="agent", - channel="room", - token="token", - agent_uid="1", - remote_uids=["2"], - ) - session._status = "running" - session._agent_id = "agent-1" - - response = await session.think("Injected instruction", on_thinking_action="interrupt") - assert response.agent_id == "agent-1" - assert len(client.agent_management.calls) == 1 - appid, agent_id, kwargs = client.agent_management.calls[0] - assert appid == "appid" - assert agent_id == "agent-1" - assert kwargs["text"] == "Injected instruction" - assert kwargs["on_thinking_action"] == "interrupt" - - asyncio.run(_run()) - - -def test_llm_vendor_headers_are_forwarded_to_properties() -> None: - agent = Agent().with_llm( - OpenAI( - api_key="openai-key", - model="gpt-4o-mini", - headers={"X-Trace-Id": "trace-123"}, - output_modalities=["text", "audio"], - greeting_configs={"mode": "single_first"}, - template_variables={"caller_name": "Ada"}, - ) - ).with_tts(MicrosoftTTS(key="tts-key", region="eastus", voice_name="en-US-JennyNeural")) - - props = agent.to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - ) - - assert props.llm is not 
None - assert props.llm.headers == {"X-Trace-Id": "trace-123"} - assert props.llm.output_modalities == ["text", "audio"] - assert props.llm.greeting_configs is not None - assert props.llm.greeting_configs.mode == "single_first" - assert props.llm.template_variables == {"caller_name": "Ada"} - - -def test_with_turn_detection_forwards_config() -> None: - turn_detection = TurnDetectionConfig( - type=TurnDetectionTypeValues.AGORA_VAD, - threshold=0.5, - ) - - props = Agent().with_turn_detection(turn_detection).to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - skip_vendor_validation=True, - ) - - assert props.turn_detection == turn_detection - - -def test_with_interruption_forwards_config() -> None: - interruption = InterruptionConfig( - enable=False, - disabled_config={"strategy": "ignore"}, - ) - - props = Agent().with_interruption(interruption).to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - skip_vendor_validation=True, - ) - - assert props.interruption == interruption - - -def test_mllm_turn_detection_is_forwarded_without_legacy_style() -> None: - mllm_turn_detection = MllmTurnDetectionConfig( - mode="server_vad", - server_vad_config={"idle_timeout_ms": 5000}, - ) - props = Agent().with_mllm( - OpenAIRealtime(api_key="openai-key", turn_detection=mllm_turn_detection) - ).to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - ) - - assert props.mllm is not None - assert props.mllm.vendor == "openai" - assert "style" not in props.mllm.dict() - assert props.mllm.turn_detection == mllm_turn_detection - - -def test_with_mllm_sets_mllm_enable_without_legacy_flag() -> None: - agent = Agent().with_mllm(OpenAIRealtime(api_key="openai-key")) - - props = agent.to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - ) - - assert props.mllm is not None - assert props.mllm.enable is True - assert 
props.advanced_features is None - - -def test_with_mllm_removes_deprecated_enable_mllm_from_existing_advanced_features() -> None: - agent = Agent( - advanced_features=AdvancedFeatures(enable_mllm=True, enable_rtm=True) - ).with_mllm(OpenAIRealtime(api_key="openai-key")) - - props = agent.to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - ) - - assert props.mllm is not None - assert props.mllm.enable is True - assert props.advanced_features is not None - assert props.advanced_features.enable_mllm is None - assert props.advanced_features.enable_rtm is True - - -def test_with_mllm_drops_advanced_features_when_only_deprecated_enable_mllm_was_set() -> None: - props = Agent( - advanced_features=AdvancedFeatures(enable_mllm=True) - ).with_mllm(OpenAIRealtime(api_key="openai-key")).to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - ) - - assert props.mllm is not None - assert props.mllm.enable is True - assert props.advanced_features is None - - -def test_with_tools_sets_enable_tools() -> None: - props = Agent().with_tools().to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - skip_vendor_validation=True, - ) - - assert props.advanced_features is not None - assert props.advanced_features.enable_tools is True - - -def test_heygen_avatar_emits_deprecation_warning() -> None: - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - HeyGenAvatar(api_key="heygen-key", quality="high", agora_uid="42") - - assert any("HeyGenAvatar is deprecated" in str(warning.message) for warning in caught) - - -def test_deepgram_tts_vendor_config() -> None: - tts = DeepgramTTS( - api_key="deepgram-key", - model="aura-2-thalia-en", - base_url="wss://api.deepgram.com/v1/speak", - sample_rate=24000, - params={"encoding": "linear16"}, - ).to_config() - - assert tts["vendor"] == "deepgram" - assert tts["params"] == { - "api_key": 
"deepgram-key", - "model": "aura-2-thalia-en", - "base_url": "wss://api.deepgram.com/v1/speak", - "sample_rate": 24000, - "encoding": "linear16", - } +# Get started with writing tests with pytest at https://docs.pytest.org +@pytest.mark.skip(reason="Unimplemented") +def test_client() -> None: + assert True diff --git a/tests/custom/test_presets.py b/tests/custom/test_presets.py deleted file mode 100644 index c05c477..0000000 --- a/tests/custom/test_presets.py +++ /dev/null @@ -1,135 +0,0 @@ -from agora_agent.agentkit.presets import resolve_session_presets - - -def test_minimax_preset_strips_group_id_and_url_when_no_key() -> None: - """When no key is provided, preset is inferred and credential fields are stripped.""" - properties = { - "tts": { - "vendor": "minimax", - "params": { - "group_id": "my-group", - "model": "speech-2.6-turbo", - "url": "wss://api-uw.minimax.io/ws/v1/t2a_v2", - "voice_setting": {"voice_id": "English_captivating_female1"}, - }, - } - } - preset, resolved = resolve_session_presets(None, properties) - assert preset == "minimax_speech_2_6_turbo" - params = resolved["tts"]["params"] - assert "group_id" not in params - assert "url" not in params - assert "model" not in params - assert params["voice_setting"]["voice_id"] == "English_captivating_female1" - - -def test_minimax_preset_strips_group_id_and_url_for_28_turbo() -> None: - properties = { - "tts": { - "vendor": "minimax", - "params": { - "group_id": "org-123", - "model": "speech-2.8-turbo", - "url": "wss://api.minimax.io/ws/v1/t2a_v2", - "voice_setting": {"voice_id": "some-voice"}, - }, - } - } - preset, resolved = resolve_session_presets(None, properties) - assert preset == "minimax_speech_2_8_turbo" - params = resolved["tts"]["params"] - assert "group_id" not in params - assert "url" not in params - assert "model" not in params - - -def test_minimax_preset_strips_group_id_and_url_with_underscore_model_name() -> None: - properties = { - "tts": { - "vendor": "minimax", - "params": { - 
"group_id": "my-group", - "model": "speech_2_6_turbo", - "url": "wss://api-uw.minimax.io/ws/v1/t2a_v2", - }, - } - } - preset, resolved = resolve_session_presets(None, properties) - assert preset == "minimax_speech_2_6_turbo" - params = resolved["tts"].get("params") or {} - assert "group_id" not in params - assert "url" not in params - assert "model" not in params - - -def test_minimax_preset_not_inferred_when_key_present() -> None: - """When user provides their own key, preset is NOT inferred and nothing is stripped.""" - properties = { - "tts": { - "vendor": "minimax", - "params": { - "key": "user-secret", - "group_id": "my-group", - "model": "speech-2.6-turbo", - }, - } - } - preset, resolved = resolve_session_presets(None, properties) - assert preset is None - params = resolved["tts"]["params"] - assert params.get("key") == "user-secret" - assert params.get("group_id") == "my-group" - - -def test_minimax_preset_not_inferred_when_explicit_preset_given() -> None: - """When an explicit tts preset is provided, tts inference is skipped.""" - properties = { - "tts": { - "vendor": "minimax", - "params": { - "group_id": "my-group", - "model": "speech-2.6-turbo", - }, - } - } - preset, resolved = resolve_session_presets("minimax_speech_2_6_turbo", properties) - assert preset == "minimax_speech_2_6_turbo" - # Explicit preset: tts inference is skipped, params are NOT stripped - params = resolved["tts"]["params"] - assert params.get("group_id") == "my-group" - - -def test_deepgram_preset_strips_model_and_api_key() -> None: - properties = { - "asr": { - "vendor": "deepgram", - "params": { - "model": "nova-3", - "language": "en-US", - }, - } - } - preset, resolved = resolve_session_presets(None, properties) - assert preset == "deepgram_nova_3" - params = resolved["asr"]["params"] - assert "model" not in params - assert "api_key" not in params - assert params.get("language") == "en-US" - - -def test_openai_llm_preset_strips_model_api_key_and_default_url() -> None: - 
properties = { - "llm": { - "vendor": "openai", - "url": "https://api.openai.com/v1/chat/completions", - "params": { - "model": "gpt-4o-mini", - }, - } - } - preset, resolved = resolve_session_presets(None, properties) - assert preset == "openai_gpt_4o_mini" - llm = resolved["llm"] - assert "api_key" not in llm - assert "url" not in llm - assert "model" not in (llm.get("params") or {}) From 6e30398b5dc6e8ff2681a442a4d6a49c7d866032 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 20 May 2026 20:55:47 -0400 Subject: [PATCH 02/28] chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases Bumps the package version to v1.5.0 and re-exports the generated v2.7 core types as public AgentKit aliases so callers can import config, response, vendor, and turn-detection types directly from `agora_agent` and `agora_agent.agentkit`. Adds the xAI Grok and Vertex AI MLLM wrappers to the vendor namespace alongside the existing generic and live avatar wrappers. --- pyproject.toml | 2 +- src/agora_agent/__init__.py | 9 ++++ src/agora_agent/agentkit/__init__.py | 54 ++++++++++++++++++++ src/agora_agent/agentkit/vendors/__init__.py | 7 ++- 4 files changed, 69 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 07f0004..4369ff3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "agora-agent-server-sdk" [tool.poetry] name = "agora-agent-server-sdk" -version = "v1.4.1" +version = "v1.5.0" description = "" readme = "README.md" authors = [] diff --git a/src/agora_agent/__init__.py b/src/agora_agent/__init__.py index 84a5d9f..b6296da 100644 --- a/src/agora_agent/__init__.py +++ b/src/agora_agent/__init__.py @@ -14,6 +14,9 @@ Agent, AgentSession, AgentSessionOptions, + GenericAvatar, + XaiGrok, + XaiRealtime, generate_rtc_token, GenerateTokenOptions, ) @@ -26,8 +29,11 @@ "Area": ".core.domain", "AsyncAgentSession": ".agentkit.agent_session", "AsyncAgora": ".pool_client", + "GenericAvatar": ".agentkit", + "XaiGrok": ".agentkit", "GenerateTokenOptions": ".agentkit", 
"Pool": ".core.domain", + "XaiRealtime": ".agentkit", "__version__": ".version", "agentkit": ".agentkit", "agents": ".agents", @@ -68,8 +74,11 @@ def __dir__(): "Area", "AsyncAgentSession", "AsyncAgora", + "GenericAvatar", + "XaiGrok", "GenerateTokenOptions", "Pool", + "XaiRealtime", "__version__", "agentkit", "agents", diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py index 1942bce..5ceda66 100644 --- a/src/agora_agent/agentkit/__init__.py +++ b/src/agora_agent/agentkit/__init__.py @@ -1,13 +1,30 @@ from .agent import ( Agent, + AgentConfig, + AgentConfigUpdate, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, RtcConfig, + SttConfig, + SttVendor, + TtsConfig, FillerWordsConfig, FillerWordsTrigger, FillerWordsTriggerFixedTimeConfig, FillerWordsContent, FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, TurnDetectionConfig, TurnDetectionNestedConfig, StartOfSpeechConfig, @@ -37,9 +54,14 @@ InterruptionMode, MllmTurnDetectionConfig, MllmTurnDetectionMode, + Labels, LlmGreetingConfigs, LlmGreetingConfigsMode, McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, ) from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession from ..agent_management.types.agent_think_agent_management_response import ( @@ -57,8 +79,10 @@ from .avatar_types import ( is_akool_avatar, is_anam_avatar, + is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, + is_rtc_avatar, validate_avatar_config, validate_tts_sample_rate, ) @@ -112,6 +136,7 @@ FishAudioTTS, Gemini, GeminiLive, + GenericAvatar, GoogleSTT, GoogleTTS, HeyGenAvatar, @@ -132,14 +157,27 @@ SarvamTTS, SpeechmaticsSTT, VertexAI, + XaiGrok, + XaiRealtime, LiveAvatarAvatar, ) __all__ = [ "Agent", + "AgentConfig", + 
"AgentConfigUpdate", # Return type of Agent.to_properties() "StartAgentsRequestProperties", # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", "GeofenceConfig", "RtcConfig", "FillerWordsConfig", @@ -147,6 +185,7 @@ "FillerWordsTriggerFixedTimeConfig", "FillerWordsContent", "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", # Turn detection types "TurnDetectionConfig", "TurnDetectionNestedConfig", @@ -181,6 +220,7 @@ "InterruptionMode", "MllmTurnDetectionConfig", "MllmTurnDetectionMode", + "Labels", # Type-safe constants "DataChannel", "AudioScenario", @@ -197,6 +237,15 @@ "AgentSession", "AsyncAgentSession", "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", "AgentThinkResponse", "AgentThinkRequestOnListeningAction", "AgentThinkRequestOnThinkingAction", @@ -253,14 +302,19 @@ "OpenAIRealtime", "GeminiLive", "VertexAI", + "XaiGrok", + "XaiRealtime", "HeyGenAvatar", "LiveAvatarAvatar", "AkoolAvatar", "AnamAvatar", + "GenericAvatar", "is_heygen_avatar", "is_live_avatar_avatar", "is_akool_avatar", "is_anam_avatar", + "is_generic_avatar", + "is_rtc_avatar", "validate_avatar_config", "validate_tts_sample_rate", ] diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py index 0320843..689eab1 100644 --- a/src/agora_agent/agentkit/vendors/__init__.py +++ b/src/agora_agent/agentkit/vendors/__init__.py @@ -11,9 +11,9 @@ OpenAISampleRate, SampleRate, ) -from .avatar import AkoolAvatar, AnamAvatar, HeyGenAvatar, LiveAvatarAvatar +from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI -from .mllm import GeminiLive, OpenAIRealtime, 
VertexAI +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime from .stt import ( AmazonSTT, AresSTT, @@ -82,8 +82,11 @@ "OpenAIRealtime", "GeminiLive", "VertexAI", + "XaiGrok", + "XaiRealtime", "HeyGenAvatar", "LiveAvatarAvatar", "AkoolAvatar", "AnamAvatar", + "GenericAvatar", ] From 9df782b46d872599f103078e30c5ded2053f2517 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 20 May 2026 20:57:54 -0400 Subject: [PATCH 03/28] feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 Adds xAI Grok Realtime and Vertex AI MLLM wrappers, and aligns MLLM config serialization with the generated core types. LLM vendors now accept typed greeting_configs and serialize them through the generated model shape, including interruptable. --- src/agora_agent/agentkit/vendors/llm.py | 31 ++++-- src/agora_agent/agentkit/vendors/mllm.py | 118 +++++++++++++++++------ 2 files changed, 113 insertions(+), 36 deletions(-) diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index 7465c9f..6f74b43 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -1,9 +1,14 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from pydantic import BaseModel, ConfigDict, Field +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, +) from .base import BaseLLM +LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" @@ -15,6 +20,14 @@ def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]] result.append(item) return result + +def _dump_optional_model(value: Any) -> Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + class OpenAIOptions(BaseModel): model_config = ConfigDict(extra="forbid") @@ -31,7 +44,7 @@ class OpenAIOptions(BaseModel): params: Optional[Dict[str, Any]] = Field(default=None) headers: Optional[Dict[str, str]] = Field(default=None) output_modalities: Optional[List[str]] = Field(default=None) - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) vendor: Optional[str] = Field(default=None) mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) @@ -74,7 +87,7 @@ def to_config(self) -> Dict[str, Any]: if self.options.output_modalities is not None: config["output_modalities"] = self.options.output_modalities if self.options.greeting_configs is not None: - config["greeting_configs"] = self.options.greeting_configs + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) if self.options.template_variables is not None: config["template_variables"] = self.options.template_variables if self.options.vendor is not None: @@ -104,7 +117,7 @@ class AzureOpenAIOptions(BaseModel): params: Optional[Dict[str, Any]] = Field(default=None) headers: Optional[Dict[str, str]] = Field(default=None) output_modalities: Optional[List[str]] = Field(default=None) - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) vendor: Optional[str] = Field(default=None) mcp_servers: 
Optional[List[Dict[str, Any]]] = Field(default=None) @@ -150,7 +163,7 @@ def to_config(self) -> Dict[str, Any]: if self.options.output_modalities is not None: config["output_modalities"] = self.options.output_modalities if self.options.greeting_configs is not None: - config["greeting_configs"] = self.options.greeting_configs + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) if self.options.template_variables is not None: config["template_variables"] = self.options.template_variables if self.options.mcp_servers is not None: @@ -177,7 +190,7 @@ class AnthropicOptions(BaseModel): params: Optional[Dict[str, Any]] = Field(default=None) headers: Optional[Dict[str, str]] = Field(default=None) output_modalities: Optional[List[str]] = Field(default=None) - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) vendor: Optional[str] = Field(default=None) mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) @@ -216,7 +229,7 @@ def to_config(self) -> Dict[str, Any]: if self.options.output_modalities is not None: config["output_modalities"] = self.options.output_modalities if self.options.greeting_configs is not None: - config["greeting_configs"] = self.options.greeting_configs + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) if self.options.template_variables is not None: config["template_variables"] = self.options.template_variables if self.options.vendor is not None: @@ -246,7 +259,7 @@ class GeminiOptions(BaseModel): params: Optional[Dict[str, Any]] = Field(default=None) headers: Optional[Dict[str, str]] = Field(default=None) output_modalities: Optional[List[str]] = Field(default=None) - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) template_variables: 
Optional[Dict[str, str]] = Field(default=None) vendor: Optional[str] = Field(default=None) mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) @@ -287,7 +300,7 @@ def to_config(self) -> Dict[str, Any]: if self.options.output_modalities is not None: config["output_modalities"] = self.options.output_modalities if self.options.greeting_configs is not None: - config["greeting_configs"] = self.options.greeting_configs + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) if self.options.template_variables is not None: config["template_variables"] = self.options.template_variables if self.options.vendor is not None: diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index 5f6f940..cd6cd07 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, List, Optional from pydantic import BaseModel, ConfigDict, Field @@ -22,9 +23,7 @@ class OpenAIRealtimeOptions(BaseModel): messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") failure_message: Optional[str] = Field(default=None, description="Message played on failure") - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") class OpenAIRealtime(BaseMLLM): def __init__(self, **kwargs: Any): @@ -53,18 +52,97 @@ def to_config(self) -> Dict[str, Any]: config["output_modalities"] = self.options.output_modalities if self.options.messages is not None: config["messages"] = self.options.messages - if self.options.predefined_tools is not None: - 
config["predefined_tools"] = self.options.predefined_tools if self.options.failure_message is not None: config["failure_message"] = self.options.failure_message - if self.options.max_history is not None: - config["max_history"] = self.options.max_history if self.options.turn_detection is not None: config["turn_detection"] = self.options.turn_detection return config +# xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name +# is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + +class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + +class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = 
dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + +class XaiRealtimeOptions(XaiGrokOptions): + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + def __init__(self, **data: Any): + warnings.warn( + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**data) + + +class XaiRealtime(XaiGrok): + """Deprecated: use :class:`XaiGrok` instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "XaiRealtime is deprecated; use XaiGrok instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**kwargs) + + class VertexAIOptions(BaseModel): model_config = ConfigDict(extra="forbid") @@ -81,28 +159,24 @@ class VertexAIOptions(BaseModel): messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") turn_detection: 
Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") failure_message: Optional[str] = Field(default=None, description="Message played on failure") - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") class VertexAI(BaseMLLM): def __init__(self, **kwargs: Any): self.options = VertexAIOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { - "model": self.options.model, - "project_id": self.options.project_id, - "location": self.options.location, - "adc_credentials_string": self.options.adc_credentials_string, - } - + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string if self.options.instructions is not None: params["instructions"] = self.options.instructions if self.options.voice is not None: params["voice"] = self.options.voice - if self.options.additional_params is not None: - params.update(self.options.additional_params) config: Dict[str, Any] = { "vendor": "vertexai", @@ -119,12 +193,8 @@ def to_config(self) -> Dict[str, Any]: config["output_modalities"] = self.options.output_modalities if self.options.messages is not None: config["messages"] = self.options.messages - if self.options.predefined_tools is not None: - config["predefined_tools"] = self.options.predefined_tools if self.options.failure_message is not None: config["failure_message"] = self.options.failure_message - if self.options.max_history is not None: - config["max_history"] = self.options.max_history if self.options.turn_detection is not None: 
config["turn_detection"] = self.options.turn_detection @@ -145,9 +215,7 @@ class GeminiLiveOptions(BaseModel): messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") failure_message: Optional[str] = Field(default=None, description="Message played on failure") - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") class GeminiLive(BaseMLLM): def __init__(self, **kwargs: Any): @@ -179,12 +247,8 @@ def to_config(self) -> Dict[str, Any]: config["output_modalities"] = self.options.output_modalities if self.options.messages is not None: config["messages"] = self.options.messages - if self.options.predefined_tools is not None: - config["predefined_tools"] = self.options.predefined_tools if self.options.failure_message is not None: config["failure_message"] = self.options.failure_message - if self.options.max_history is not None: - config["max_history"] = self.options.max_history if self.options.turn_detection is not None: config["turn_detection"] = self.options.turn_detection From 26706d73ae15d860d57daf926837632c01be7f10 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 20 May 2026 20:59:22 -0400 Subject: [PATCH 04/28] feat(agentkit): add GenericAvatar and session-aware avatar validation Adds the GenericAvatar vendor wrapper and extends avatar validation helpers for generic and RTC-backed avatars. Session-derived fields such as agora_appid, agora_channel, and agora_token can now be validated after AgentSession enrichment. 
--- src/agora_agent/agentkit/avatar_types.py | 35 +++++++++++++++++- src/agora_agent/agentkit/vendors/avatar.py | 42 ++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py index 9e132a9..a04809c 100644 --- a/src/agora_agent/agentkit/avatar_types.py +++ b/src/agora_agent/agentkit/avatar_types.py @@ -17,7 +17,21 @@ def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: return config.get("vendor") == "anam" -def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: +def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + +def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + ) + + +def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, +) -> None: """Validates avatar configuration at runtime. Parameters @@ -45,6 +59,8 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: f"Invalid quality for {label}: {params.get('quality')}. 
" f"Must be one of: {', '.join(valid_qualities)}" ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") elif is_akool_avatar(config): params = config.get("params", {}) if not params.get("api_key"): @@ -53,6 +69,23 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: params = config.get("params", {}) if not params.get("api_key"): raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") def validate_tts_sample_rate( diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py index b83a356..00cad8f 100644 --- a/src/agora_agent/agentkit/vendors/avatar.py +++ b/src/agora_agent/agentkit/vendors/avatar.py @@ -132,6 +132,48 @@ def to_config(self) -> Dict[str, Any]: return {"enable": enable, "vendor": "liveavatar", "params": params} +class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar 
ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + +class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + class AnamAvatarOptions(BaseModel): model_config = ConfigDict(extra="forbid") From 9f491c63a964c13c67ba4af3708379e1b75a92d8 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 20 May 2026 21:00:58 -0400 Subject: [PATCH 05/28] feat(agentkit): update Agent builder and session lifecycle for v2.7 Aligns Agent and AgentSession with the generated v2.7 request shape. 
MLLM sessions no longer require TTS, LLM, or STT, and enabled avatars are rejected when MLLM is configured. AgentSession now enriches generic and RTC avatars with session context, auto-generates avatar tokens, validates TTS sample rates from vendor-specific fields, and adds paginated get_turns/get_all_turns helpers with fail-fast pagination guards. --- src/agora_agent/agentkit/agent.py | 164 +++++++++++++-- src/agora_agent/agentkit/agent_session.py | 231 ++++++++++++++++++++-- 2 files changed, 360 insertions(+), 35 deletions(-) diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 70a1bdd..86a958e 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -8,6 +8,24 @@ from .agent_session import AgentSession, AsyncAgentSession from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties +from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr +from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor +from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar +from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor +from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm +from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle +from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm +from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor +from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties +from ..agents.types.get_agents_response import GetAgentsResponse +from ..agents.types.list_agents_response import ListAgentsResponse +from 
..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem +from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse +from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem +from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse +from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem +from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech @@ -46,10 +64,21 @@ from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule +from ..types.tts import Tts from .token import generate_convo_ai_token, _validate_expires_in from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS # Top-level aliases +LlmConfig = StartAgentsRequestPropertiesLlm +LlmStyle = StartAgentsRequestPropertiesLlmStyle +SttConfig = 
StartAgentsRequestPropertiesAsr +SttVendor = StartAgentsRequestPropertiesAsrVendor +TtsConfig = Tts +MllmConfig = StartAgentsRequestPropertiesMllm +MllmVendor = StartAgentsRequestPropertiesMllmVendor +AvatarConfig = StartAgentsRequestPropertiesAvatar +AvatarVendor = StartAgentsRequestPropertiesAvatarVendor TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection SalConfig = StartAgentsRequestPropertiesSal SalMode = StartAgentsRequestPropertiesSalSalMode @@ -93,6 +122,18 @@ InterruptionMode = StartAgentsRequestPropertiesInterruptionMode MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode +AgentConfig = StartAgentsRequestProperties +AgentConfigUpdate = UpdateAgentsRequestProperties +SessionInfo = GetAgentsResponse +SessionListResponse = ListAgentsResponse +SessionSummary = ListAgentsResponseDataListItem +ConversationHistory = GetHistoryAgentsResponse +ConversationTurn = GetHistoryAgentsResponseContentsItem +ConversationRole = GetHistoryAgentsResponseContentsItemRole +ConversationTurns = GetTurnsAgentsResponse +ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem +SpeakPriority = SpeakAgentsRequestPriority +Labels = typing.Dict[str, str] class SessionParamsInput(typing_extensions.TypedDict, total=False): @@ -116,6 +157,7 @@ class SessionParamsInput(typing_extensions.TypedDict, total=False): FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig +FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule class Agent: @@ -183,9 +225,20 @@ def with_llm(self, vendor: BaseLLM) -> "Agent": return new_agent def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + 
self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) new_agent = self._clone() new_agent._tts = vendor.to_config() - new_agent._tts_sample_rate = vendor.sample_rate + new_agent._tts_sample_rate = sample_rate return new_agent def with_stt(self, vendor: BaseSTT) -> "Agent": @@ -194,6 +247,9 @@ def with_stt(self, vendor: BaseSTT) -> "Agent": return new_agent def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. new_agent = self._clone() new_agent._mllm = vendor.to_config() if isinstance(new_agent._mllm, dict): @@ -202,7 +258,10 @@ def with_mllm(self, vendor: BaseMLLM) -> "Agent": advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): - advanced_features_model = new_agent._advanced_features.model_copy(update={"enable_mllm": None}) + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) if ( advanced_features_model.enable_rtm is None and advanced_features_model.enable_sal is None @@ -214,6 +273,10 @@ def with_mllm(self, vendor: BaseMLLM) -> "Agent": return new_agent def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. 
The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. required_sample_rate = vendor.required_sample_rate if ( required_sample_rate not in (None, 0) @@ -282,7 +345,10 @@ def with_tools(self, enabled: bool = True) -> "Agent": {**new_agent._advanced_features, "enable_tools": enabled}, ) else: - new_agent._advanced_features = new_agent._advanced_features.model_copy(update={"enable_tools": enabled}) + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) return new_agent def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": @@ -294,6 +360,23 @@ def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsI new_agent._parameters = parameters return new_agent + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + def with_failure_message(self, message: str) -> "Agent": """Returns a new Agent with the specified failure message. 
@@ -342,6 +425,33 @@ def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": new_agent._filler_words = filler_words return new_agent + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + @property def name(self) -> typing.Optional[str]: return self._name @@ -354,6 +464,10 @@ def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: return self._tts + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + @property def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: return self._stt @@ -536,6 +650,20 @@ def to_properties( expires_in: typing.Optional[int] = None, skip_vendor_validation: bool = False, ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token 
so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + if token is None: if app_id is None or app_certificate is None: raise ValueError("Either token or app_id+app_certificate must be provided") @@ -553,9 +681,6 @@ def to_properties( **token_kwargs, ) - mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True - is_mllm_mode = bool(mllm_flag or self._mllm is not None) - base_kwargs: typing.Dict[str, typing.Any] = { "channel": channel, "token": token, @@ -579,11 +704,12 @@ def to_properties( base_kwargs["avatar"] = self._avatar if self._advanced_features is not None: base_kwargs["advanced_features"] = self._advanced_features - if self._parameters is not None: - if isinstance(self._parameters, dict): - base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**self._parameters) + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) else: - base_kwargs["parameters"] = self._parameters + base_kwargs["parameters"] = parameters if self._geofence is not None: base_kwargs["geofence"] = self._geofence if self._labels is not None: @@ -596,12 +722,10 @@ def to_properties( if is_mllm_mode: if self._mllm is not None: mllm_config = dict(self._mllm) - if self._greeting: + if self._greeting is not None: mllm_config.setdefault("greeting_message", self._greeting) - if self._failure_message: + if 
self._failure_message is not None: mllm_config.setdefault("failure_message", self._failure_message) - if self._max_history is not None: - mllm_config.setdefault("max_history", self._max_history) base_kwargs["mllm"] = mllm_config return StartAgentsRequestProperties(**base_kwargs) @@ -617,14 +741,14 @@ def to_properties( llm_config = dict(self._llm) # Agent-level fields take priority over the vendor's defaults. # This matches the TS SDK where agent-level values override vendor config. - if self._instructions: + if self._instructions is not None: llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] - if self._greeting: - llm_config.setdefault("greeting_message", self._greeting) - if self._failure_message: - llm_config.setdefault("failure_message", self._failure_message) + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message if self._max_history is not None: - llm_config.setdefault("max_history", self._max_history) + llm_config["max_history"] = self._max_history base_kwargs["llm"] = llm_config base_kwargs["tts"] = self._tts diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index 2408659..e41a399 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -14,13 +14,16 @@ from ..agent_management.types.agent_think_agent_management_response import ( AgentThinkAgentManagementResponse as AgentThinkResponse, ) +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties from .agent import Agent from .avatar_types import ( is_akool_avatar, is_anam_avatar, + is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, + is_rtc_avatar, validate_avatar_config, validate_tts_sample_rate, ) @@ -182,17 +185,29 @@ def 
_validate_avatar_config(self) -> None: tts = self._agent.tts if not avatar or avatar.get("enable", True) is False: return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) if ( is_heygen_avatar(avatar) or is_live_avatar_avatar(avatar) or is_akool_avatar(avatar) or is_anam_avatar(avatar) + or is_generic_avatar(avatar) ): validate_avatar_config(avatar) tts_params = tts.get("params") if isinstance(tts, dict) else None - sample_rate = tts_params.get("sample_rate") if isinstance(tts_params, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) if isinstance(sample_rate, int): validate_tts_sample_rate(avatar, sample_rate) elif is_heygen_avatar(avatar): @@ -211,6 +226,50 @@ def _validate_avatar_config(self) -> None: "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." ) + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_rtc_avatar(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. 
" + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + account=str(params["agora_uid"]), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." + ) + + validate_avatar_config(avatar, require_session_fields=True) + @staticmethod def _dump_model(value: typing.Any) -> typing.Any: if hasattr(value, "model_dump"): @@ -238,12 +297,17 @@ def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> t **token_opts, ) properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) if self._is_mllm_mode(): if self._agent.mllm is not None: - mllm = dict(self._agent.mllm) - if self._agent.greeting: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) properties["mllm"] = mllm return properties @@ -251,20 +315,41 @@ def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> t properties["tts"] = self._dump_model(self._agent.tts) if self._agent.llm is not None: llm = dict(self._agent.llm) - if self._agent.instructions: + if self._agent.instructions is not None: llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] - if self._agent.greeting: - llm.setdefault("greeting_message", self._agent.greeting) - if self._agent.failure_message: - llm.setdefault("failure_message", 
self._agent.failure_message) + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message if self._agent.max_history is not None: - llm.setdefault("max_history", self._agent.max_history) + llm["max_history"] = self._agent.max_history properties["llm"] = llm if self._agent.stt is not None: properties["asr"] = self._dump_model(self._agent.stt) return properties + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + # ------------------------------------------------------------------ # Event handling # ------------------------------------------------------------------ @@ -484,7 +569,12 @@ def think( interruptable: typing.Optional[bool] = None, metadata: typing.Optional[typing.Dict[str, str]] = None, ) -> AgentThinkResponse: - """Inject a custom text instruction into the current session pipeline.""" + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ if self._status != "running": raise RuntimeError(f"Cannot think in {self._status} state") if not self._agent_id: @@ -547,15 +637,68 @@ def get_info(self) -> typing.Any: self._app_id, self._agent_id, request_options=self._request_options() ) - def get_turns(self) -> typing.Any: + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> GetTurnsAgentsResponse: """Get turn-by-turn analytics and timing details for this session.""" if not self._agent_id: raise RuntimeError("No agent ID available") return self._client.agents.get_turns( - self._app_id, self._agent_id, request_options=self._request_options() + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), ) + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + class AsyncAgentSession(_AgentSessionBase): """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. @@ -734,7 +877,12 @@ async def think( interruptable: typing.Optional[bool] = None, metadata: typing.Optional[typing.Dict[str, str]] = None, ) -> AgentThinkResponse: - """Inject a custom text instruction into the current session pipeline.""" + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ if self._status != "running": raise RuntimeError(f"Cannot think in {self._status} state") if not self._agent_id: @@ -797,11 +945,64 @@ async def get_info(self) -> typing.Any: self._app_id, self._agent_id, request_options=self._request_options() ) - async def get_turns(self) -> typing.Any: + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> GetTurnsAgentsResponse: """Get turn-by-turn analytics and timing details for this session.""" if not self._agent_id: raise RuntimeError("No agent ID available") return await self._client.agents.get_turns( - self._app_id, self._agent_id, request_options=self._request_options() + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) From 64703bda8ddb3bb24857b2adafbc4ce640d5f9d5 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 20 May 2026 21:01:51 -0400 Subject: [PATCH 06/28] test(agentkit): add custom tests for v1.5.0 AgentKit behavior Covers MLLM and avatar guardrails, generic avatar enrichment and token generation, vendor config serialization, turn pagination, and agent-level field overrides for both cascading and MLLM sessions. 
--- tests/custom/test_agentkit_agent.py | 298 ++++++++++++++++++++ tests/custom/test_agentkit_session.py | 383 ++++++++++++++++++++++++++ tests/custom/test_agentkit_vendors.py | 122 ++++++++ tests/custom/test_avatar_token.py | 12 + 4 files changed, 815 insertions(+) create mode 100644 tests/custom/test_agentkit_agent.py create mode 100644 tests/custom/test_agentkit_session.py create mode 100644 tests/custom/test_agentkit_vendors.py create mode 100644 tests/custom/test_avatar_token.py diff --git a/tests/custom/test_agentkit_agent.py b/tests/custom/test_agentkit_agent.py new file mode 100644 index 0000000..9719b04 --- /dev/null +++ b/tests/custom/test_agentkit_agent.py @@ -0,0 +1,298 @@ +from agora_agent.agentkit import ( + Agent, + AvatarConfig, + AvatarVendor, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + SttConfig, + SttVendor, + TtsConfig, +) +import pytest + +from agora_agent.agentkit.vendors import ( + AkoolAvatar, + ElevenLabsTTS, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, +) + + +def _parameter(config, key): + parameters = config["parameters"] + if isinstance(parameters, dict): + return parameters[key] + return getattr(parameters, key) + + +class _CopyOnlyModel: + def __init__(self, **values): + self.values = values + + def copy(self, update=None): + return _CopyOnlyModel(**{**self.values, **(update or {})}) + + +def test_generated_core_aliases_are_public(): + assert LlmConfig is not None + assert LlmStyle is not None + assert SttConfig is not None + assert SttVendor is not None + assert TtsConfig is not None + assert MllmConfig is not None + assert MllmVendor is not None + assert AvatarConfig is not None + assert AvatarVendor is not None + + +def test_model_copy_helper_supports_pydantic_v1_copy_api(): + copied = Agent._copy_model_update(_CopyOnlyModel(enable_rtm=True), {"data_channel": "rtm"}) # noqa: SLF001 + + assert copied.values == {"enable_rtm": True, "data_channel": "rtm"} + + +def test_with_audio_scenario_sets_session_parameter(): + 
agent = Agent(name="test").with_audio_scenario("chorus") + + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_with_audio_scenario_preserves_existing_parameters(): + agent = Agent(name="test", parameters={"enable_metrics": True}).with_audio_scenario( + "chorus" + ) + + assert _parameter(agent.config, "enable_metrics") is True + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_enable_rtm_defaults_data_channel_to_rtm(): + properties = Agent(name="test", advanced_features={"enable_rtm": True}).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "rtm" + + +def test_enable_rtm_preserves_explicit_data_channel(): + properties = Agent( + name="test", + advanced_features={"enable_rtm": True}, + parameters={"data_channel": "datastream"}, + ).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "datastream" + + +def test_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + + properties = agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + ) + + assert properties.llm.greeting_message == "agent greeting" + assert properties.llm.failure_message == "agent failure" + assert properties.llm.max_history == 2 + + +def test_avatar_sample_rate_validation_works_when_tts_added_after_avatar(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar(api_key="avatar-key", quality="medium", 
agora_uid="2") + ) + + with pytest.raises(ValueError, match="24000"): + agent.with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + + +def test_avatar_sample_rate_validation_uses_wrapper_sample_rate(): + agent = ( + Agent(name="test") + .with_avatar(AkoolAvatar(api_key="avatar-key")) + .with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + ) + + assert agent.tts_sample_rate == 16000 + + +def test_with_mllm_removes_deprecated_advanced_features_enable_mllm(): + properties = ( + Agent( + name="test", + advanced_features={"enable_mllm": True, "enable_rtm": True}, + greeting="hello from agent", + failure_message="try again", + max_history=5, + ) + .with_mllm(OpenAIRealtime(api_key="openai-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None + assert properties.mllm.enable is True + assert properties.mllm.greeting_message == "hello from agent" + assert properties.mllm.failure_message == "try again" + mllm_dump = properties.mllm.model_dump(exclude_none=True) + assert "max_history" not in mllm_dump + assert properties.advanced_features is not None + af_dump = properties.advanced_features.model_dump(exclude_none=True) + assert "enable_mllm" not in af_dump + assert af_dump.get("enable_rtm") is True + + +def test_to_properties_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_mllm_with_avatar_fires_before_token_generation(): + """The guard must fire before the token-generation step so callers get 
a + clear, actionable error even when app_id/app_certificate are empty. + """ + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + app_id="", + app_certificate="", + ) + + +def test_to_properties_rejects_mllm_with_default_enabled_avatar(): + """Avatar with no `enable` field should be treated as enabled.""" + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + agent._avatar = { # noqa: SLF001 + "vendor": "liveavatar", + "params": { + "api_key": "avatar-key", + "quality": "high", + "agora_uid": "200", + "agora_token": "avatar-token", + }, + } + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_allows_mllm_with_disabled_avatar_and_no_tts(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is not None and properties.avatar.enable is False + + +def test_to_properties_mllm_without_tts_or_llm_succeeds(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None 
and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is None diff --git a/tests/custom/test_agentkit_session.py b/tests/custom/test_agentkit_session.py new file mode 100644 index 0000000..198fcd0 --- /dev/null +++ b/tests/custom/test_agentkit_session.py @@ -0,0 +1,383 @@ +from types import SimpleNamespace + +import pytest + +from agora_agent.agentkit import Agent, AgentSession +from agora_agent.agentkit.vendors import ( + ElevenLabsTTS, + GenericAvatar, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, + RimeTTS, +) +from agora_agent.agents.types.get_turns_agents_response import GetTurnsAgentsResponse + + +APP_ID = "0" * 32 +APP_CERTIFICATE = "1" * 32 + + +class _Agents: + def __init__(self): + self.calls = [] + self.start_calls = [] + + def start(self, app_id, name, properties, preset=None, pipeline_id=None, request_options=None): + self.start_calls.append((app_id, name, properties, preset, pipeline_id, request_options)) + return SimpleNamespace(agent_id="agent-1") + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls.append((app_id, agent_id, page_index, page_size, request_options)) + is_last_page = page_index != 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={ + "page_index": page_index or 1, + "total_pages": 2, + "is_last_page": is_last_page, + }, + turns=[{"turn_id": float(page_index or 1)}], + ) + + +class _Client: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _Agents() + self.agent_management = object() + + +def _session(agent, warn=None): + return AgentSession( + client=_Client(), + agent=agent, + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + warn=warn, + ) + + +def 
test_generic_avatar_enrichment_adds_session_context_and_token(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + assert params["agora_token"] != properties["token"] + + +def test_generic_avatar_empty_session_fields_are_filled(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + agora_appid="", + agora_channel="", + agora_token="", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + + +def test_avatar_uid_matching_agent_uid_warns(): + warnings = [] + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="1", + ) + ) + session = _session(agent, warn=warnings.append) + + session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert any("matches agent_rtc_uid" in warning for warning in warnings) + + +def test_session_start_properties_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + 
.with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["llm"]["greeting_message"] == "agent greeting" + assert properties["llm"]["failure_message"] == "agent failure" + assert properties["llm"]["max_history"] == 2 + + +def test_session_start_properties_applies_mllm_agent_level_defaults(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "agent greeting" + assert properties["mllm"]["failure_message"] == "agent failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_properties_preserves_mllm_vendor_defaults(): + agent = ( + Agent(name="test") + .with_mllm( + OpenAIRealtime( + api_key="mllm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + ) + ) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "vendor greeting" + assert properties["mllm"]["failure_message"] == "vendor failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_allows_mllm_without_tts(): + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + session = _session(agent) + + assert session.start() == 
"agent-1" + + +def test_session_start_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + session = _session(agent) + + with pytest.raises(ValueError, match="cascading"): + session.start() + assert session._client.agents.start_calls == [] # noqa: SLF001 + + +def test_session_start_allows_mllm_with_disabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + ) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_avatar_sample_rate_validation_uses_serialized_vendor_keys(): + warnings = [] + agent = ( + Agent(name="test") + .with_avatar(LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2")) + .with_tts(RimeTTS(key="tts-key", speaker="speaker", sampling_rate=24000)) + ) + session = _session(agent, warn=warnings.append) + + session._validate_avatar_config() # noqa: SLF001 + + assert warnings == [] + + +def test_avatar_user_token_is_not_overwritten(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar( + api_key="live-key", + quality="medium", + agora_uid="2", + agora_token="user-token", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["avatar"]["params"]["agora_token"] == "user-token" + + +def test_get_turns_forwards_pagination_args(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # noqa: SLF001 + + session.get_turns(page_index=3, page_size=25) + + assert session._client.agents.calls[-1][:4] == (APP_ID, "agent-id", 3, 25) # noqa: SLF001 + + +def 
test_get_all_turns_aggregates_pages(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # noqa: SLF001 + + response = session.get_all_turns(page_size=1) + + assert [turn.turn_id for turn in response.turns] == [1.0, 2.0] + assert response.pagination.page_index == 2 + + +def test_get_all_turns_raises_when_pagination_does_not_advance(): + class _StuckAgents: + def __init__(self): + self.calls = 0 + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls += 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={"page_index": 1, "is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _StuckClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _StuckAgents() + self.agent_management = object() + + session = AgentSession( + client=_StuckClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="did not advance"): + session.get_all_turns(page_size=1) + + +def test_get_all_turns_raises_when_pagination_metadata_missing(): + class _NoMetaAgents: + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=1, + pagination={"is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _NoMetaClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _NoMetaAgents() + self.agent_management = object() + + session = AgentSession( + client=_NoMetaClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + 
remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="cannot continue"): + session.get_all_turns(page_size=1) diff --git a/tests/custom/test_agentkit_vendors.py b/tests/custom/test_agentkit_vendors.py new file mode 100644 index 0000000..8473821 --- /dev/null +++ b/tests/custom/test_agentkit_vendors.py @@ -0,0 +1,122 @@ +import pytest +from pydantic import ValidationError + +from agora_agent.agentkit import LlmGreetingConfigs +import warnings + +from agora_agent.agentkit.vendors import GenericAvatar, OpenAI, OpenAIRealtime, XaiGrok, XaiRealtime + + +def test_xai_grok_serializes_v27_shape_without_style(): + config = XaiGrok( + api_key="xai-key", + voice="eve", + language="en", + sample_rate=24000, + output_modalities=["audio", "text"], + params={"temperature": 0.2}, + ).to_config() + + assert config["vendor"] == "xai" + assert config["url"] == "wss://api.x.ai/v1/realtime" + assert config["api_key"] == "xai-key" + assert config["params"] == { + "temperature": 0.2, + "voice": "eve", + "language": "en", + "sample_rate": 24000, + } + assert config["output_modalities"] == ["audio", "text"] + assert "style" not in config + + +def test_xai_grok_emits_params_even_when_empty(): + assert XaiGrok(api_key="xai-key").to_config()["params"] == {} + + +def test_xai_realtime_deprecated_alias_emits_same_vendor(): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always", DeprecationWarning) + config = XaiRealtime(api_key="xai-key").to_config() + assert len(caught) == 1 + assert issubclass(caught[0].category, DeprecationWarning) + assert config["vendor"] == "xai" + + +def test_mllm_rejects_fields_not_in_core_contract(): + with pytest.raises(ValidationError): + OpenAIRealtime(api_key="openai-key", predefined_tools=["_publish_message"]) + + with pytest.raises(ValidationError): + XaiGrok(api_key="xai-key", max_history=10) + + +def test_generic_avatar_omits_session_enriched_fields_when_unset(): + 
config = GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ).to_config() + + assert config == { + "enable": True, + "vendor": "generic", + "params": { + "api_key": "avatar-key", + "api_base_url": "https://avatar.example.com", + "avatar_id": "avatar-1", + "agora_uid": "2", + }, + } + + +def test_vertex_ai_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import VertexAI + + config = VertexAI( + model="explicit-model", + project_id="explicit-project", + location="explicit-region", + adc_credentials_string="{}", + additional_params={ + "model": "should-be-overridden", + "project_id": "should-be-overridden", + "location": "should-be-overridden", + "adc_credentials_string": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["vendor"] == "vertexai" + assert config["params"]["model"] == "explicit-model" + assert config["params"]["project_id"] == "explicit-project" + assert config["params"]["location"] == "explicit-region" + assert config["params"]["adc_credentials_string"] == "{}" + assert config["params"]["extra_key"] == "kept" + + +def test_gemini_live_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import GeminiLive + + config = GeminiLive( + api_key="key", + model="explicit-model", + additional_params={ + "model": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["params"]["model"] == "explicit-model" + assert config["params"]["extra_key"] == "kept" + + +def test_llm_greeting_configs_interruptable_serializes(): + config = OpenAI( + api_key="openai-key", + greeting_configs=LlmGreetingConfigs(mode="single_first", interruptable=False), + ).to_config() + + assert config["greeting_configs"]["mode"] == "single_first" + assert config["greeting_configs"]["interruptable"] is False diff --git a/tests/custom/test_avatar_token.py b/tests/custom/test_avatar_token.py new 
file mode 100644 index 0000000..efe6ac0 --- /dev/null +++ b/tests/custom/test_avatar_token.py @@ -0,0 +1,12 @@ +from agora_agent.agentkit import generate_convo_ai_token + + +def test_avatar_tokens_use_convo_ai_token_path_with_avatar_uid(): + token = generate_convo_ai_token( + app_id="0" * 32, + app_certificate="1" * 32, + channel_name="room", + account="avatar-uid", + ) + + assert token.startswith("007") From 6c20f076c9a683de3c568c7c843135b46a7a8229 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 20 May 2026 21:02:54 -0400 Subject: [PATCH 07/28] docs(agentkit): update v1.5.0 guides, reference, and changelog Documents MLLM sessions without TTS, avatar cascading constraints, GenericAvatar session enrichment, turn pagination, and v2.7 error handling. Updates README examples and adds the errors reference page. --- README.md | 48 +++++++---------------------- changelog.md | 39 ++++++++++++++++++++--- docs/concepts/vendors.md | 12 ++++++-- docs/guides/avatars.md | 51 ++++++++++++++++++++++-------- docs/guides/mllm-flow.md | 59 ++++++++++++++++++++++------------- docs/index.md | 9 +++--- docs/reference/agent.md | 32 ++++++++++++++++--- docs/reference/errors.md | 29 +++++++++++++++++ docs/reference/session.md | 30 ++++++++++++++++-- docs/reference/vendors.md | 65 +++++++++++++++++++++++++++++++++------ 10 files changed, 277 insertions(+), 97 deletions(-) create mode 100644 docs/reference/errors.md diff --git a/README.md b/README.md index f83f96b..4a9f9d6 100644 --- a/README.md +++ b/README.md @@ -53,8 +53,10 @@ from agora_agent.agentkit import ( Agent, DataChannel, DeepgramSTT, + GenericAvatar, MiniMaxTTS, OpenAI, + XaiGrok, expires_in_hours, ) @@ -190,7 +192,7 @@ If you want to bring your own vendor credentials instead of using Agora-managed ## MLLM (Realtime / Multimodal) -Use `with_mllm()` for OpenAI Realtime or Gemini Live. No STT, LLM, or TTS vendor is needed when MLLM mode is enabled. 
+Use `with_mllm()` for OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok. No STT, LLM, or TTS vendor is needed when MLLM mode is enabled. ```python from agora_agent.agentkit import Agent, OpenAIRealtime @@ -216,18 +218,14 @@ A full reference for this library is available [here](https://github.com/AgoraIO ## MLLM Flow (Multimodal) -For real-time audio processing using OpenAI's Realtime API or Google Gemini Live, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details. +For real-time audio processing using OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. MLLM mode does not require separate TTS, STT, or LLM vendors. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details. ```python -from agora-agent-server-sdk import Agora -from agora-agent-server-sdk.agents import ( +from agora_agent import Agora +from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAdvancedFeatures, StartAgentsRequestPropertiesMllm, StartAgentsRequestPropertiesMllmVendor, - StartAgentsRequestPropertiesTts, - StartAgentsRequestPropertiesTtsVendor, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionType, ) @@ -246,10 +244,8 @@ client.agents.start( agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - advanced_features=StartAgentsRequestPropertiesAdvancedFeatures( - enable_mllm=True, - ), mllm=StartAgentsRequestPropertiesMllm( + enable=True, url="wss://api.openai.com/v1/realtime", api_key="", vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, @@ -266,14 +262,6 @@ client.agents.start( threshold=0.5, silence_duration_ms=500, ), - # TTS and LLM are still required but not used 
when MLLM is enabled - tts=StartAgentsRequestPropertiesTts( - vendor=StartAgentsRequestPropertiesTtsVendor.MICROSOFT, - params={}, - ), - llm=StartAgentsRequestPropertiesLlm( - url="https://api.openai.com/v1/chat/completions", - ), ), ) ``` @@ -281,18 +269,14 @@ client.agents.start( ## MLLM Flow (Multimodal) -For real-time audio processing using OpenAI's Realtime API or Google Gemini Live, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details. +For real-time audio processing using OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. MLLM mode does not require separate TTS, STT, or LLM vendors. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details. ```python -from agora-agent-server-sdk import Agora -from agora-agent-server-sdk.agents import ( +from agora_agent import Agora +from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAdvancedFeatures, StartAgentsRequestPropertiesMllm, StartAgentsRequestPropertiesMllmVendor, - StartAgentsRequestPropertiesTts, - StartAgentsRequestPropertiesTtsVendor, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionType, ) @@ -311,10 +295,8 @@ client.agents.start( agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - advanced_features=StartAgentsRequestPropertiesAdvancedFeatures( - enable_mllm=True, - ), mllm=StartAgentsRequestPropertiesMllm( + enable=True, url="wss://api.openai.com/v1/realtime", api_key="", vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, @@ -331,14 +313,6 @@ client.agents.start( threshold=0.5, silence_duration_ms=500, ), - # TTS and LLM are still required but not used when MLLM is enabled - 
tts=StartAgentsRequestPropertiesTts( - vendor=StartAgentsRequestPropertiesTtsVendor.MICROSOFT, - params={}, - ), - llm=StartAgentsRequestPropertiesLlm( - url="https://api.openai.com/v1/chat/completions", - ), ), ) ``` diff --git a/changelog.md b/changelog.md index dad1528..31784c8 100644 --- a/changelog.md +++ b/changelog.md @@ -4,6 +4,37 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/). +## [v1.5.0] — 2026-05-20 + +### Added + +- **`XaiGrok`** — New MLLM wrapper for xAI Grok (`mllm.vendor`: `"xai"`), including Realtime API URL, voice, language, sample rate, modalities, messages, and MLLM turn detection support. +- **`GenericAvatar`** — New generic avatar wrapper (`vendor: "generic"`) for custom avatar providers. +- **Avatar token enrichment** — `AgentSession.start()` now fills missing generic avatar `agora_appid` and `agora_channel` from the session and generates missing avatar `agora_token` values for HeyGen, LiveAvatar, and Generic avatars using each avatar's `agora_uid`. +- **Turn pagination** — `AgentSession.get_turns()` and `AsyncAgentSession.get_turns()` now accept `page_index` and `page_size`. New `get_all_turns()` helpers fetch and combine all pages. +- **Greeting interruption control** — LLM vendor `greeting_configs` now accepts the typed `LlmGreetingConfigs` shape, including v2.7 `interruptable`. +- **Type alias parity** — Added public aliases for v2.7 generated types such as `LlmConfig`, `TtsConfig`, `SttConfig`, `MllmConfig`, `AvatarConfig`, `AgentConfigUpdate`, `ConversationTurns`, `ConversationHistory`, `SessionInfo`, `Labels`, `SpeakPriority`, and `FillerWordsContentSelectionRule`. + +### Changed + +- **`XaiGrok` is the primary xAI MLLM class** — Matches the product name ([xAI Grok](https://docs.agora.io/en/conversational-ai/models/mllm/xai)) and the TypeScript/Go SDKs. `XaiRealtime` remains as a deprecated backward-compatible alias. 
+- **Package version** — Bumped to `v1.5.0` to match the Fern-generated SDK headers. +- **RTM data channel default** — When `advanced_features.enable_rtm=True`, AgentKit now defaults `parameters.data_channel` to `"rtm"` unless the caller explicitly sets a data channel. +- **Agent-level LLM overrides** — In the standard ASR + LLM + TTS pipeline, agent-level `greeting`, `failure_message`, and `max_history` now override vendor defaults, matching the TypeScript SDK. In MLLM mode, agent-level `greeting` and `failure_message` fill only missing fields. +- **MLLM core alignment** — MLLM wrappers no longer expose or emit unsupported `predefined_tools` or `max_history` fields because they are not present in the generated v2.7 core `mllm` type. +- **MLLM without TTS** — MLLM sessions no longer require separate TTS, STT, or LLM vendor configuration. +- **Avatar pipeline support** — Avatar vendors are now explicitly limited to the cascading ASR + LLM + TTS pipeline. Combining `with_avatar()` with `with_mllm()` is rejected at `Agent.to_properties()` and `AgentSession.start()` (matching the TypeScript SDK), with a disabled avatar (`enable=False`) still permitted alongside MLLM. +- **VertexAI parity** — `VertexAI.to_config()` now spreads `additional_params` first so explicit `model`, `project_id`, `location`, and `adc_credentials_string` fields always win, matching the TypeScript and Gemini Live wrappers. +- **Pagination guard parity** — `AgentSession.get_all_turns()` and `AsyncAgentSession.get_all_turns()` now raise `RuntimeError` if the server's pagination metadata is missing (`page_index`/`total_pages`/`is_last_page`) or if the next page does not advance, matching the TypeScript SDK. + +### Migration notes + +- **`XaiRealtime` → `XaiGrok`** — Import and use `XaiGrok` for xAI MLLM (`mllm.vendor`: `"xai"`). `XaiRealtime` still works but emits `DeprecationWarning`. Future xAI cascading vendors will use `XaiSTT` and `XaiTTS`, not `XaiRealtime`. 
+- **`think()` default** — The server default for `on_listening_action` changed from `inject` to `interrupt` in API v2.7. Pass `on_listening_action="inject"` explicitly to preserve the old behavior. +- **Turn analytics pagination** — Sessions with more than 50 turns must request additional pages via `get_turns(page_index=..., page_size=...)` or use `get_all_turns()`. +- **Error reasons** — API v2.7 adds status codes `401`, `429`, and `500`; `InvalidRequest` is split into `InvalidRequestBody`, `MissingRequiredField`, and `InvalidFieldValue`, with new reasons such as `ServiceNotEnabled`, `AccountSuspended`, and `ResourceAllocationFailed`. +- **Event `112`** — Webhook event `112 turns finished` can be used as an alternative batch delivery path for post-session turn data. + ## [v1.4.1] — 2026-05-18 ### Fixed @@ -58,8 +89,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Fixed - **`AresSTT`** — Removed redundant `language` key from the `params` dict. Language is now emitted only at the top level. `params` is only included when `additional_params` is provided. -- **`OpenAIRealtime` / `VertexAI` (MLLM)** — Agent-level `greeting`, `failure_message`, and `max_history` overrides are now correctly applied when the agent is in MLLM mode. Previously these values were silently dropped. -- **`VertexAI` (MLLM)** — `messages` is now correctly placed inside `params` (required by the Gemini Live API). Previously it was emitted at the top level and silently ignored. +- **`OpenAIRealtime` / `VertexAI` (MLLM)** — Agent-level `greeting` and `failure_message` defaults are now correctly applied when missing in MLLM mode. Previously these values were silently dropped. +- **`VertexAI` (MLLM)** — `messages` is emitted at the MLLM top level, matching the generated core SDK contract. ### Changed @@ -72,8 +103,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). - **`OpenAITTS`** — New optional parameters: `response_format` (str, e.g. 
`"pcm"`) and `speed` (float). - **`CartesiaTTS`** — `voice_id` user-facing field is preserved; voice is serialized to the required nested object format automatically. - **`RimeTTS`** — New optional parameters: `lang` (str), `sampling_rate` (int, serialized as `samplingRate`), `speed_alpha` (float, serialized as `speedAlpha`). -- **`OpenAIRealtime`** — New optional parameters: `predefined_tools` (List[str]), `failure_message` (str), `max_history` (int). -- **`VertexAI` (MLLM)** — New optional parameters: `predefined_tools` (List[str]), `failure_message` (str), `max_history` (int). +- **`OpenAIRealtime`** — New optional parameter: `failure_message` (str). +- **`VertexAI` (MLLM)** — New optional parameter: `failure_message` (str). - **`HeyGenAvatar`** — New fields: `agora_token` (str, optional), `avatar_id` (str, optional), `enable` (bool, optional, default `True`), `disable_idle_timeout` (bool, optional), `activity_idle_timeout` (int, optional). ## [v1.1.0] — 2026-03-17 diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 7f22d8e..81f585b 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -97,6 +97,9 @@ Used with `agent.with_mllm()` for the [MLLM flow](../guides/mllm-flow.md). These | `OpenAIRealtime` | OpenAI Realtime | `api_key`; optional `turn_detection` | | `GeminiLive` | Google Gemini Live API | `api_key`, `model`; optional `turn_detection` | | `VertexAI` | Vertex AI (Gemini Live) | `model`, `project_id`, `location`, `adc_credentials_string`; optional `turn_detection` | +| `XaiGrok` | xAI Grok (`mllm.vendor`: `xai`) | `api_key`; optional `voice`, `language`, `sample_rate`, `turn_detection` | + +`XaiRealtime` is deprecated (use `XaiGrok`). Future xAI STT/TTS wrappers will be `XaiSTT` / `XaiTTS`, not `XaiRealtime`. ```python @@ -107,12 +110,15 @@ mllm = OpenAIRealtime(api_key='your-openai-key', model='gpt-4o-realtime-preview' ## Avatar Vendors -Used with `agent.with_avatar()`. 
Avatars require specific TTS sample rates — see [Avatar Integration](../guides/avatars.md). +Used with `agent.with_avatar()` in the cascading ASR + LLM + TTS pipeline. Some avatars require specific TTS sample rates — see [Avatar Integration](../guides/avatars.md). | Class | Provider | Required Parameters | Required TTS Sample Rate | |---|---|---|---| -| `HeyGenAvatar` | HeyGen | `api_key`, `quality`, `agora_uid` | 24000 Hz | -| `AkoolAvatar` | Akool | `api_key`, `agora_uid` | 16000 Hz | +| `HeyGenAvatar` | HeyGen (deprecated alias) | `api_key`, `quality`, `agora_uid` | 24000 Hz | +| `LiveAvatarAvatar` | LiveAvatar | `api_key`, `quality`, `agora_uid` | 24000 Hz | +| `AkoolAvatar` | Akool | `api_key` | 16000 Hz | +| `AnamAvatar` | Anam | `api_key` | None | +| `GenericAvatar` | Generic Avatar | `api_key`, `api_base_url`, `avatar_id`, `agora_uid` | None | ```python diff --git a/docs/guides/avatars.md b/docs/guides/avatars.md index e015bb7..1f1200a 100644 --- a/docs/guides/avatars.md +++ b/docs/guides/avatars.md @@ -1,32 +1,43 @@ --- sidebar_position: 3 title: Avatar Integration -description: Add a digital avatar (HeyGen or Akool) to your Conversational AI agent. +description: Add a digital avatar to your Conversational AI agent. --- # Avatar Integration -You can attach a digital avatar to your voice agent so that users see a visual representation of the AI speaking. Two avatar providers are supported: +You can attach a digital avatar to your voice agent so that users see a visual representation of the AI speaking. + +Avatars are currently supported only with the cascading ASR + LLM + TTS pipeline. MLLM sessions handle audio end-to-end and do not support avatars at this time. 
| Provider | Class | Required TTS Sample Rate | |---|---|---| -| HeyGen | `HeyGenAvatar` | 24000 Hz | +| LiveAvatar | `LiveAvatarAvatar` | 24000 Hz | +| HeyGen (deprecated alias) | `HeyGenAvatar` | 24000 Hz | | Akool | `AkoolAvatar` | 16000 Hz | +| Anam | `AnamAvatar` | None | +| Generic | `GenericAvatar` | None | + +## Token Model + +The agent and avatar join the same RTC channel with separate UIDs. The agent token is scoped to `agent_uid`; `avatar.params.agora_token` is scoped to the avatar `agora_uid`. + +When using `AgentSession.start()`, `agora_token` is optional for LiveAvatar, HeyGen, and Generic avatars. If omitted, AgentKit generates it with the same ConvoAI token path as the agent, using the avatar UID. You can still pass `agora_token` explicitly. ## Sample Rate Constraint -Each avatar vendor requires a specific TTS sample rate. The SDK validates this when you call `with_avatar()` — if the TTS sample rate does not match, a `ValueError` is raised immediately: +Each avatar vendor requires a specific TTS sample rate. The SDK validates this when you add TTS or avatar configuration and again when the session starts. If the TTS sample rate does not match, a `ValueError` is raised: ``` ValueError: Avatar requires TTS sample rate of 24000 Hz, but TTS is configured with 16000 Hz. Please update your TTS sample_rate to 24000. ``` -This validation happens at build time (when chaining methods), not at runtime when the session starts. Python raises this as a `ValueError` — there is no compile-time check as in statically typed languages. +Python raises this as a `ValueError` — there is no compile-time check as in statically typed languages. 
-Additionally, if the TTS `sample_rate` is not explicitly set (returns `None`), the SDK issues a warning: +Additionally, if the TTS sample rate is not explicitly available, the SDK issues a warning through the session warning callback: ``` -UserWarning: Avatar requires TTS sample rate of 24000 Hz, but TTS sample_rate is not explicitly set. Please ensure your TTS provider is configured for 24000 Hz. +Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz. ``` ## HeyGen Avatar (24 kHz) @@ -58,7 +69,7 @@ agent = ( api_key='your-heygen-key', quality='medium', agora_uid='2', - avatar_name='your-avatar-name', + avatar_id='your-avatar-id', )) ) @@ -68,6 +79,21 @@ session.say('Hello! I am your visual assistant.') session.stop() ``` +## Generic Avatar + +`GenericAvatar` supports custom avatar providers. `agora_appid`, `agora_channel`, and `agora_token` are optional when using `AgentSession.start()`. 
+ +```python +from agora_agent.agentkit.vendors import GenericAvatar + +agent = agent.with_avatar(GenericAvatar( + api_key='your-avatar-provider-key', + api_base_url='https://avatar-provider.example.com', + avatar_id='avatar-123', + agora_uid='2', +)) +``` + ## Akool Avatar (16 kHz) Akool requires a TTS vendor configured at 16000 Hz: @@ -143,15 +169,14 @@ If you call `with_avatar()` before `with_tts()`, the sample rate check is deferr | `api_key` | `str` | Yes | HeyGen API key | | `quality` | `str` | Yes | Avatar quality: `low`, `medium`, or `high` | | `agora_uid` | `str` | Yes | Agora UID for the avatar video stream | -| `avatar_name` | `str` | No | Avatar name | -| `voice_id` | `str` | No | Voice ID | -| `language` | `str` | No | Language code | -| `version` | `str` | No | API version (`v1` or `v2`) | +| `agora_token` | `str` | No | Avatar token, generated at session start when omitted | +| `avatar_id` | `str` | No | Avatar ID | +| `disable_idle_timeout` | `bool` | No | Disable idle timeout | +| `activity_idle_timeout` | `int` | No | Idle timeout in seconds | ## Akool Options | Parameter | Type | Required | Description | |---|---|---|---| | `api_key` | `str` | Yes | Akool API key | -| `agora_uid` | `str` | Yes | Agora UID for the avatar video stream | | `avatar_id` | `str` | No | Avatar ID | diff --git a/docs/guides/mllm-flow.md b/docs/guides/mllm-flow.md index 15d7c3e..392206c 100644 --- a/docs/guides/mllm-flow.md +++ b/docs/guides/mllm-flow.md @@ -1,29 +1,28 @@ --- sidebar_position: 2 title: MLLM Flow (Multimodal) -description: Use OpenAI Realtime or Gemini Live for end-to-end audio processing. +description: Use OpenAI Realtime, Gemini Live, Vertex AI, or xAI for end-to-end audio processing. --- # MLLM Flow (Multimodal) The MLLM (Multimodal LLM) flow uses a single model to handle both audio input and output — no separate STT or TTS step. This gives the model direct access to voice tone, pacing, and emotion. 
-Two MLLM vendors are supported: +MLLM vendors supported by AgentKit: - **OpenAI Realtime** — `gpt-4o-realtime-preview` and related models - **Gemini Live** — direct Google AI API access for audio-native Gemini models +- **Vertex AI** — Gemini Live through Google Cloud Vertex AI +- **xAI Grok** — xAI Realtime API ## Enable MLLM Mode -Call `agent.with_mllm(vendor)` to enable MLLM mode. The builder sets `mllm.enable = True` automatically. +Call `agent.with_mllm(vendor)` to enable MLLM mode. The builder sets `mllm.enable = True` automatically. MLLM sessions do not require TTS, STT, or LLM vendors. Avatars are currently supported only with the cascading ASR + LLM + TTS pipeline. ```python from agora_agent.agentkit import Agent -agent = Agent( - name='realtime-agent', - instructions='You are a voice assistant.', -) +agent = Agent(name='realtime-agent') ``` ## OpenAI Realtime @@ -42,10 +41,7 @@ client = Agora( ) agent = ( - Agent( - name='realtime-agent', - instructions='You are a helpful voice assistant.', - ) + Agent(name='realtime-agent') .with_mllm(OpenAIRealtime( api_key='your-openai-key', model='gpt-4o-realtime-preview', @@ -74,10 +70,7 @@ async def main(): ) agent = ( - Agent( - name='realtime-agent', - instructions='You are a helpful voice assistant.', - ) + Agent(name='realtime-agent') .with_mllm(OpenAIRealtime( api_key='your-openai-key', model='gpt-4o-realtime-preview', @@ -107,10 +100,7 @@ client = Agora( ) agent = ( - Agent( - name='gemini-agent', - instructions='You are a helpful multilingual assistant.', - ) + Agent(name='gemini-agent') .with_mllm(GeminiLive( api_key='your-google-ai-api-key', model='gemini-live-2.5-flash', @@ -123,6 +113,33 @@ agent_id = session.start() session.stop() ``` +## xAI Grok + +```python +from agora_agent import Agora, Area +from agora_agent.agentkit import Agent +from agora_agent.agentkit.vendors import XaiGrok + +client = Agora(area=Area.US, app_id='your-app-id', app_certificate='your-app-certificate') + +agent = ( + 
Agent(name='xai-agent') + .with_mllm(XaiGrok( + api_key='your-xai-key', + voice='eve', + language='en', + sample_rate=24000, + output_modalities=['audio', 'text'], + )) +) + +session = agent.create_session(client, channel='xai-room', agent_uid='1', remote_uids=['100']) +agent_id = session.start() +session.stop() +``` + +For xAI turn detection, use `mllm.turn_detection` with `agora_vad` or `server_vad`. + ## OpenAI Realtime with Custom Options ```python @@ -145,10 +162,10 @@ mllm = OpenAIRealtime( |---|---|---| | Latency | Lower — single model, no pipeline | Higher — three models in sequence | | Voice control | Model-dependent | Full vendor choice for TTS | -| Vendor flexibility | Limited (OpenAI Realtime or Gemini Live) | Mix and match 4 LLMs, 12 TTS, 10 STT | +| Vendor flexibility | Limited to supported MLLM providers (OpenAI Realtime, Gemini Live, Vertex AI, xAI Grok) | Mix and match LLM, TTS, and STT vendors | | Audio understanding | Model hears tone, pacing, emotion | STT produces text only | ## Next Steps - For the cascading pipeline, see [Cascading Flow](./cascading-flow.md) -- To add a visual avatar, see [Avatars](./avatars.md) +- To add a visual avatar, use the cascading pipeline and see [Avatars](./avatars.md) diff --git a/docs/index.md b/docs/index.md index 9063734..c8d9d12 100644 --- a/docs/index.md +++ b/docs/index.md @@ -17,12 +17,12 @@ The Agora Conversational AI Python SDK lets you build voice-powered AI agents on **Cascading flow** uses ASR -> LLM -> TTS and supports the broadest set of vendor combinations. -**MLLM flow** uses a multimodal model such as OpenAI Realtime or Gemini Live for end-to-end audio. +**MLLM flow** uses a multimodal model such as OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok for end-to-end audio. ## Choose a starting point - Use [Quick Start](./getting-started/quick-start.md) if you want the recommended preset-based path with no vendor keys. 
-- Use [MLLM Flow](./guides/mllm-flow.md) if you want realtime end-to-end audio with OpenAI Realtime or Gemini Live. +- Use [MLLM Flow](./guides/mllm-flow.md) if you want realtime end-to-end audio with OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok. - Use [Cascading Flow](./guides/cascading-flow.md) if you want separate ASR, LLM, and TTS vendors. ## SDK layers @@ -45,8 +45,8 @@ The Agora Conversational AI Python SDK lets you build voice-powered AI agents on | [AgentSession](./concepts/session.md) | Manage the agent lifecycle | | [Vendors](./concepts/vendors.md) | Browse all LLM, TTS, STT, MLLM, and Avatar providers | | [Cascading Flow](./guides/cascading-flow.md) | Build an ASR -> LLM -> TTS pipeline | -| [MLLM Flow](./guides/mllm-flow.md) | Use OpenAI Realtime or Gemini Live for end-to-end audio | -| [Avatars](./guides/avatars.md) | Add a digital avatar with HeyGen or Akool | +| [MLLM Flow](./guides/mllm-flow.md) | Use OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok for end-to-end audio | +| [Avatars](./guides/avatars.md) | Add a digital avatar with LiveAvatar, Akool, Anam, or Generic Avatar | | [Regional Routing](./guides/regional-routing.md) | Route requests to the nearest region | | [Error Handling](./guides/error-handling.md) | Handle API errors with ApiError | | [Pagination](./guides/pagination.md) | Iterate over paginated list endpoints | @@ -56,3 +56,4 @@ The Agora Conversational AI Python SDK lets you build voice-powered AI agents on | [Agent Reference](./reference/agent.md) | Full `Agent` builder API | | [Session Reference](./reference/session.md) | Full `AgentSession` / `AsyncAgentSession` API | | [Vendor Reference](./reference/vendors.md) | Constructor options for all vendor classes | +| [Error Reference](./reference/errors.md) | v2.7 status codes and error reason values | diff --git a/docs/reference/agent.md b/docs/reference/agent.md index 0e7f5e8..1e88b8b 100644 --- a/docs/reference/agent.md +++ b/docs/reference/agent.md @@ -83,7 +83,7 @@ 
agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) ### `with_mllm(vendor: BaseMLLM) -> Agent` -Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. +Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. ```python @@ -93,7 +93,9 @@ agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) ### `with_avatar(vendor: BaseAvatar) -> Agent` -Set the avatar vendor. Raises `ValueError` if TTS sample rate does not match the avatar's `required_sample_rate`. +Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + +Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. ```python @@ -107,6 +109,22 @@ agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', ago Override cascading-flow turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection. Use `with_interruption()` for interruption behavior and MLLM vendor `turn_detection` for MLLM turn detection. +Pause-state detection is configured under semantic end-of-speech: + +```python +agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, +}) +``` + ### `with_interruption(config: InterruptionConfig) -> Agent` Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. @@ -131,13 +149,19 @@ Set SAL (Selective Attention Locking) configuration. Set advanced features (e.g. `{'enable_rtm': True}`). 
+When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + ### `with_tools(enabled: bool = True) -> Agent` Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. ### `with_parameters(parameters: SessionParams) -> Agent` -Set session parameters (silence config, farewell config, data channel, etc.). +Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + +### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + +Set `parameters.audio_scenario` without replacing existing session parameters. ### `with_failure_message(message: str) -> Agent` @@ -145,7 +169,7 @@ Set the message spoken via TTS when the LLM call fails. ### `with_max_history(max_history: int) -> Agent` -Set the maximum conversation history length. +Set the maximum conversation history length for the standard ASR + LLM + TTS pipeline. The v2.7 MLLM core type does not expose `max_history`. ### `with_geofence(geofence: GeofenceConfig) -> Agent` diff --git a/docs/reference/errors.md b/docs/reference/errors.md new file mode 100644 index 0000000..d91f335 --- /dev/null +++ b/docs/reference/errors.md @@ -0,0 +1,29 @@ +--- +sidebar_position: 5 +title: Error Responses +description: Error handling notes for Conversational AI API responses. +--- + +# Error Responses + +API v2.7 updates the error status codes and `reason` values surfaced through the generated client. 
+ +## Status Codes + +In addition to existing validation and task errors, integrations should handle: + +- `401` — authentication failed +- `429` — rate limit exceeded +- `500` — internal server error + +## Reason Migration + +If your application branches on `AgentErrorResponse.reason`, update handlers for the v2.7 reasons: + +| Previous | v2.7 replacements | +|---|---| +| `InvalidRequest` | `InvalidRequestBody`, `MissingRequiredField`, `InvalidFieldValue` | + +New reasons include `ServiceNotEnabled`, `AccountSuspended`, and `ResourceAllocationFailed`. + +Prefer treating unknown reasons as retryable only when the HTTP status and operation are safe to retry. diff --git a/docs/reference/session.md b/docs/reference/session.md index 955e08f..f5ce663 100644 --- a/docs/reference/session.md +++ b/docs/reference/session.md @@ -56,14 +56,14 @@ AgentSession( ### `start()` -Start the agent session. Generates an RTC token if not provided, validates avatar/TTS config, and calls the Agora API. +Start the agent session. Generates an RTC token if not provided, validates avatar/TTS config for cascading sessions, and calls the Agora API. MLLM sessions do not require TTS; an enabled avatar is rejected when MLLM is configured (a disabled avatar is allowed). | | Sync (`AgentSession`) | Async (`AsyncAgentSession`) | |---|---|---| | **Signature** | `start() -> str` | `async start() -> str` | | **Returns** | Agent ID | Agent ID | | **Raises** | `RuntimeError` if not in `idle`, `stopped`, or `error` state | Same | -| **Raises** | `ValueError` if avatar/TTS sample rate mismatch | Same | +| **Raises** | `ValueError` if avatar/TTS sample rate mismatch or an enabled avatar is used with MLLM | Same | ```python @@ -154,6 +154,16 @@ session.update(properties) await session.update(properties) ``` +### `think(text, ...)` + +Inject a custom text instruction into the running agent. + +In API v2.7, omitting `on_listening_action` uses the server default `interrupt`. 
Pass `on_listening_action='inject'` explicitly to preserve the pre-v2.7 behavior. + +```python +session.think('Summarize the last answer', on_listening_action='inject') +``` + ### `get_history()` Retrieve the conversation history. @@ -190,6 +200,22 @@ info = session.get_info() info = await session.get_info() ``` +### `get_turns(page_index=None, page_size=None)` + +Retrieve paginated turn analytics for a completed or running session. In v2.7, the API defaults to page 1 and up to 50 turns per page. Responses include `agent_id`, `name`, `channel`, `total_turn_count`, `pagination`, and `turns`. + +```python +page = session.get_turns(page_index=1, page_size=50) +``` + +### `get_all_turns(page_size=None)` + +Fetch all turn pages and return a single `GetTurnsAgentsResponse` with the combined `turns` list. + +```python +all_turns = session.get_all_turns(page_size=50) +``` + ### `on(event, handler)` Register an event handler. This method is synchronous on both `AgentSession` and `AsyncAgentSession`. diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index 798a7f3..ad1b779 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -10,13 +10,15 @@ All vendor classes are available from `agora_agent.agentkit.vendors`: ```python -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT, OpenAIRealtime, GeminiLive, HeyGenAvatar +from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT, OpenAIRealtime, XaiGrok, GenericAvatar ``` --- ## LLM Vendors +`greeting_configs` accepts either a dict or `LlmGreetingConfigs`. In v2.7, `greeting_configs.interruptable=False` makes the greeting uninterruptible; `True` follows the global `interruption` settings. + ### `OpenAI` | Parameter | Type | Required | Default | Description | @@ -364,8 +366,6 @@ Fixed sample rate: 24000 Hz. 
| `url` | `str` | No | `None` | Custom WebSocket URL | | `greeting_message` | `str` | No | `None` | Greeting message | | `failure_message` | `str` | No | `None` | Message played when the model call fails | -| `max_history` | `int` | No | `None` | Maximum conversation history length | -| `predefined_tools` | `List[str]` | No | `None` | Predefined tools (e.g., `["_publish_message"]`) | | `input_modalities` | `List[str]` | No | `None` | Input modalities | | `output_modalities` | `List[str]` | No | `None` | Output modalities | | `messages` | `List[Dict]` | No | `None` | Conversation messages | @@ -383,8 +383,6 @@ Fixed sample rate: 24000 Hz. | `voice` | `str` | No | `None` | Voice name | | `greeting_message` | `str` | No | `None` | Greeting message | | `failure_message` | `str` | No | `None` | Message played when the model call fails | -| `max_history` | `int` | No | `None` | Maximum conversation history length | -| `predefined_tools` | `List[str]` | No | `None` | Predefined tools (e.g., `["_publish_message"]`) | | `input_modalities` | `List[str]` | No | `None` | Input modalities | | `output_modalities` | `List[str]` | No | `None` | Output modalities | | `messages` | `List[Dict]` | No | `None` | Conversation messages | @@ -403,18 +401,39 @@ Fixed sample rate: 24000 Hz. 
| `voice` | `str` | No | `None` | Voice name (e.g., `Aoede`, `Charon`) | | `greeting_message` | `str` | No | `None` | Greeting message | | `failure_message` | `str` | No | `None` | Message played when the model call fails | -| `max_history` | `int` | No | `None` | Maximum conversation history length | -| `predefined_tools` | `List[str]` | No | `None` | Predefined tools (e.g., `["_publish_message"]`) | | `input_modalities` | `List[str]` | No | `None` | Input modalities | | `output_modalities` | `List[str]` | No | `None` | Output modalities | | `messages` | `List[Dict]` | No | `None` | Conversation messages | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | | `turn_detection` | `MllmTurnDetectionConfig` | No | `None` | MLLM turn detection configuration; overrides top-level `turn_detection` | +### `XaiGrok` + +xAI Grok MLLM vendor (`mllm.vendor`: `"xai"`). Matches the [xAI Grok](https://docs.agora.io/en/conversational-ai/models/mllm/xai) product docs and the TypeScript SDK. + +> `XaiRealtime` is deprecated and remains only as a backward-compatible alias. 
+ +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `api_key` | `str` | Yes | — | xAI API key | +| `url` | `str` | No | `wss://api.x.ai/v1/realtime` | xAI realtime WebSocket URL | +| `voice` | `str` | No | `None` | Voice identifier, for example `eve` or `rex` | +| `language` | `str` | No | `None` | Language code, for example `en` | +| `sample_rate` | `int` | No | `None` | Audio sample rate in Hz | +| `greeting_message` | `str` | No | `None` | Greeting message | +| `failure_message` | `str` | No | `None` | Message played when the model call fails | +| `input_modalities` | `List[str]` | No | `None` | Input modalities | +| `output_modalities` | `List[str]` | No | `None` | Output modalities | +| `messages` | `List[Dict]` | No | `None` | Conversation messages | +| `params` | `Dict[str, Any]` | No | `None` | Additional xAI parameters | +| `turn_detection` | `MllmTurnDetectionConfig` | No | `None` | Supports `agora_vad` and `server_vad` for xAI | + --- ## Avatar Vendors +Avatar vendors are currently supported only with the cascading ASR + LLM + TTS pipeline. + ### `HeyGenAvatar` Required TTS sample rate: **24000 Hz** @@ -424,7 +443,7 @@ Required TTS sample rate: **24000 Hz** | `api_key` | `str` | Yes | — | HeyGen API key | | `quality` | `str` | Yes | — | Avatar quality: `low`, `medium`, or `high` | | `agora_uid` | `str` | Yes | — | Agora UID for avatar video stream | -| `agora_token` | `str` | No | `None` | RTC token for avatar authentication | +| `agora_token` | `str` | No | `None` | Avatar token. When omitted, `AgentSession.start()` generates one for `agora_uid` using the same token path as the agent. 
| | `avatar_id` | `str` | No | `None` | HeyGen avatar ID | | `enable` | `bool` | No | `True` | Enable or disable the avatar | | `disable_idle_timeout` | `bool` | No | `None` | Disable the idle timeout | @@ -437,5 +456,33 @@ Required TTS sample rate: **16000 Hz** | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | Akool API key | -| `agora_uid` | `str` | Yes | — | Agora UID for avatar video stream | | `avatar_id` | `str` | No | `None` | Avatar ID | + +### `LiveAvatarAvatar` + +Required TTS sample rate: **24000 Hz** + +Same options as `HeyGenAvatar`, but serializes `vendor: "liveavatar"`. `agora_token` is optional and generated by `AgentSession.start()` when omitted. + +### `AnamAvatar` + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `api_key` | `str` | Yes | — | Anam API key | +| `persona_id` | `str` | No | `None` | Persona ID | +| `enable` | `bool` | No | `True` | Enable or disable the avatar | + +### `GenericAvatar` + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `api_key` | `str` | Yes | — | Generic avatar provider API key | +| `agora_uid` | `str` | Yes | — | Avatar RTC UID. Must differ from the agent UID. | +| `api_base_url` | `str` | Yes | — | Avatar provider API base URL | +| `avatar_id` | `str` | Yes | — | Avatar ID | +| `agora_token` | `str` | No | `None` | Optional avatar token. Generated by `AgentSession.start()` when omitted. | +| `agora_appid` | `str` | No | `None` | Optional; filled from the session App ID when omitted. | +| `agora_channel` | `str` | No | `None` | Optional; filled from the session channel when omitted. | +| `enable` | `bool` | No | `True` | Enable or disable the avatar | + +Avatar tokens are separate from the agent join token but generated with the same `generate_convo_ai_token` path, using the avatar's `agora_uid` as the account. 
From eaec58eb2edfe03b1311a32dd137a867edf5d096 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Thu, 21 May 2026 15:16:38 -0400 Subject: [PATCH 08/28] refactor(agentkit): align deprecated vendor aliases with canonical names --- src/agora_agent/__init__.py | 4 - src/agora_agent/agentkit/vendors/__init__.py | 3 +- src/agora_agent/agentkit/vendors/avatar.py | 82 ++++++++++---------- src/agora_agent/agentkit/vendors/mllm.py | 25 ------ 4 files changed, 44 insertions(+), 70 deletions(-) diff --git a/src/agora_agent/__init__.py b/src/agora_agent/__init__.py index b6296da..791c156 100644 --- a/src/agora_agent/__init__.py +++ b/src/agora_agent/__init__.py @@ -16,7 +16,6 @@ AgentSessionOptions, GenericAvatar, XaiGrok, - XaiRealtime, generate_rtc_token, GenerateTokenOptions, ) @@ -32,8 +31,6 @@ "GenericAvatar": ".agentkit", "XaiGrok": ".agentkit", "GenerateTokenOptions": ".agentkit", - "Pool": ".core.domain", - "XaiRealtime": ".agentkit", "__version__": ".version", "agentkit": ".agentkit", "agents": ".agents", @@ -78,7 +75,6 @@ def __dir__(): "XaiGrok", "GenerateTokenOptions", "Pool", - "XaiRealtime", "__version__", "agentkit", "agents", diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py index 689eab1..8e2042e 100644 --- a/src/agora_agent/agentkit/vendors/__init__.py +++ b/src/agora_agent/agentkit/vendors/__init__.py @@ -13,7 +13,7 @@ ) from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI -from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok from .stt import ( AmazonSTT, AresSTT, @@ -83,7 +83,6 @@ "GeminiLive", "VertexAI", "XaiGrok", - "XaiRealtime", "HeyGenAvatar", "LiveAvatarAvatar", "AkoolAvatar", diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py index 00cad8f..50bdd08 100644 --- 
a/src/agora_agent/agentkit/vendors/avatar.py +++ b/src/agora_agent/agentkit/vendors/avatar.py @@ -5,19 +5,19 @@ from .base import BaseAvatar -HEYGEN_SAMPLE_RATE = 24000 LIVEAVATAR_SAMPLE_RATE = 24000 +HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE AKOOL_SAMPLE_RATE = 16000 -class HeyGenAvatarOptions(BaseModel): +class LiveAvatarAvatarOptions(BaseModel): model_config = ConfigDict(extra="forbid") - api_key: str = Field(..., description="HeyGen API key") + api_key: str = Field(..., description="LiveAvatar API key") quality: str = Field(..., description="Avatar quality: low, medium, or high") agora_uid: str = Field(..., description="Agora UID for the avatar stream") agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") - avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") @@ -31,20 +31,14 @@ def validate_quality(cls, v: str) -> str: raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") return v -class HeyGenAvatar(BaseAvatar): - """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" +class LiveAvatarAvatar(BaseAvatar): def __init__(self, **kwargs: Any): - warnings.warn( - "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", - DeprecationWarning, - stacklevel=2, - ) - self.options = HeyGenAvatarOptions(**kwargs) + self.options = LiveAvatarAvatarOptions(**kwargs) @property def required_sample_rate(self) -> int: - return HEYGEN_SAMPLE_RATE + return LIVEAVATAR_SAMPLE_RATE def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { @@ -65,71 +59,79 @@ def to_config(self) -> Dict[str, Any]: params = {**self.options.additional_params, **params} enable = self.options.enable if self.options.enable is not None else True - return {"enable": enable, "vendor": "heygen", "params": params} + return {"enable": enable, "vendor": "liveavatar", "params": params} -class AkoolAvatarOptions(BaseModel): - model_config = ConfigDict(extra="forbid") +class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" - api_key: str = Field(..., description="Akool API key") - avatar_id: Optional[str] = Field(default=None, description="Avatar ID") - enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") - additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") -class AkoolAvatar(BaseAvatar): +class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + def __init__(self, **kwargs: Any): - self.options = AkoolAvatarOptions(**kwargs) + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) @property def required_sample_rate(self) -> int: - return AKOOL_SAMPLE_RATE + return HEYGEN_SAMPLE_RATE def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, } + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token if self.options.avatar_id is not None: params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout if self.options.additional_params is not None: params = {**self.options.additional_params, **params} enable = self.options.enable if self.options.enable is not None else True - return {"enable": enable, "vendor": "akool", "params": params} + return {"enable": enable, "vendor": "heygen", "params": params} -class LiveAvatarAvatarOptions(HeyGenAvatarOptions): - pass +class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") -class LiveAvatarAvatar(BaseAvatar): +class AkoolAvatar(BaseAvatar): def __init__(self, **kwargs: Any): - self.options = LiveAvatarAvatarOptions(**kwargs) + self.options = AkoolAvatarOptions(**kwargs) @property def 
required_sample_rate(self) -> int: - return LIVEAVATAR_SAMPLE_RATE + return AKOOL_SAMPLE_RATE def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "api_key": self.options.api_key, - "quality": self.options.quality, - "agora_uid": self.options.agora_uid, } - if self.options.agora_token is not None: - params["agora_token"] = self.options.agora_token if self.options.avatar_id is not None: params["avatar_id"] = self.options.avatar_id - if self.options.disable_idle_timeout is not None: - params["disable_idle_timeout"] = self.options.disable_idle_timeout - if self.options.activity_idle_timeout is not None: - params["activity_idle_timeout"] = self.options.activity_idle_timeout if self.options.additional_params is not None: params = {**self.options.additional_params, **params} enable = self.options.enable if self.options.enable is not None else True - return {"enable": enable, "vendor": "liveavatar", "params": params} + return {"enable": enable, "vendor": "akool", "params": params} class GenericAvatarOptions(BaseModel): @@ -145,6 +147,7 @@ class GenericAvatarOptions(BaseModel): enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + class GenericAvatar(BaseAvatar): def __init__(self, **kwargs: Any): self.options = GenericAvatarOptions(**kwargs) @@ -178,10 +181,11 @@ class AnamAvatarOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="Anam API key") - persona_id: Optional[str] = Field(default=None, description="Persona ID") + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + class AnamAvatar(BaseAvatar): def __init__(self, 
**kwargs: Any): self.options = AnamAvatarOptions(**kwargs) diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index cd6cd07..b58f040 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -1,4 +1,3 @@ -import warnings from typing import Any, Dict, List, Optional from pydantic import BaseModel, ConfigDict, Field @@ -119,30 +118,6 @@ def to_config(self) -> Dict[str, Any]: return config -class XaiRealtimeOptions(XaiGrokOptions): - """Deprecated: use :class:`XaiGrokOptions` instead.""" - - def __init__(self, **data: Any): - warnings.warn( - "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", - DeprecationWarning, - stacklevel=2, - ) - super().__init__(**data) - - -class XaiRealtime(XaiGrok): - """Deprecated: use :class:`XaiGrok` instead.""" - - def __init__(self, **kwargs: Any): - warnings.warn( - "XaiRealtime is deprecated; use XaiGrok instead.", - DeprecationWarning, - stacklevel=2, - ) - super().__init__(**kwargs) - - class VertexAIOptions(BaseModel): model_config = ConfigDict(extra="forbid") From 20245632afd066efe5a453665b29c5ba0e13e4f8 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Thu, 21 May 2026 15:17:27 -0400 Subject: [PATCH 09/28] feat(agentkit): export type aliases and avatar token helpers --- src/agora_agent/agentkit/__init__.py | 49 ++++++++++++++++------- src/agora_agent/agentkit/agent.py | 22 +++++++++- src/agora_agent/agentkit/agent_session.py | 8 +++- src/agora_agent/agentkit/avatar_types.py | 23 +++++++++-- src/agora_agent/agentkit/constants.py | 10 +++++ 5 files changed, 90 insertions(+), 22 deletions(-) diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py index 5ceda66..e9ab221 100644 --- a/src/agora_agent/agentkit/__init__.py +++ b/src/agora_agent/agentkit/__init__.py @@ -2,6 +2,7 @@ Agent, AgentConfig, AgentConfigUpdate, + AsrConfig, ConversationHistory, ConversationRole, ConversationSessionTurn, @@ 
-62,23 +63,23 @@ SessionListResponse, SessionSummary, SpeakPriority, + ThinkOnListeningAction, + ThinkOnSpeakingAction, + ThinkOnThinkingAction, + ThinkResponse, ) -from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession -from ..agent_management.types.agent_think_agent_management_response import ( - AgentThinkAgentManagementResponse as AgentThinkResponse, -) -from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( - AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, -) -from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( - AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, -) -from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( - AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, +# Deprecated think type aliases (prefer ThinkOn* names). 
+from .agent import ( + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + ThinkResponse as AgentThinkResponse, ) +from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession from .avatar_types import ( is_akool_avatar, is_anam_avatar, + is_avatar_token_managed, is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, @@ -94,6 +95,13 @@ GeofenceArea, GeofenceExcludeArea, FillerWordsSelectionRule, + ThinkOnListeningActionIgnore, + ThinkOnListeningActionInject, + ThinkOnListeningActionInterrupt, + ThinkOnSpeakingActionIgnore, + ThinkOnSpeakingActionInterrupt, + ThinkOnThinkingActionIgnore, + ThinkOnThinkingActionInterrupt, TurnDetectionTypeValues, ) from .token import ( @@ -158,7 +166,6 @@ SpeechmaticsSTT, VertexAI, XaiGrok, - XaiRealtime, LiveAvatarAvatar, ) @@ -172,6 +179,7 @@ "LlmConfig", "LlmStyle", "SttConfig", + "AsrConfig", "SttVendor", "TtsConfig", "MllmConfig", @@ -230,6 +238,13 @@ "GeofenceExcludeArea", "FillerWordsSelectionRule", "TurnDetectionTypeValues", + "ThinkOnListeningActionInject", + "ThinkOnListeningActionInterrupt", + "ThinkOnListeningActionIgnore", + "ThinkOnThinkingActionInterrupt", + "ThinkOnThinkingActionIgnore", + "ThinkOnSpeakingActionInterrupt", + "ThinkOnSpeakingActionIgnore", # LLM sub-types "LlmGreetingConfigs", "LlmGreetingConfigsMode", @@ -246,10 +261,16 @@ "ConversationTurns", "ConversationSessionTurn", "SpeakPriority", + "ThinkResponse", + "ThinkOnListeningAction", + "ThinkOnThinkingAction", + "ThinkOnSpeakingAction", "AgentThinkResponse", "AgentThinkRequestOnListeningAction", "AgentThinkRequestOnThinkingAction", "AgentThinkRequestOnSpeakingAction", + "is_avatar_token_managed", + "is_rtc_avatar", "AgentPresets", "DeepgramPresetModels", "OpenAIPresetModels", @@ -303,7 +324,6 @@ "GeminiLive", "VertexAI", "XaiGrok", - "XaiRealtime", "HeyGenAvatar", "LiveAvatarAvatar", 
"AkoolAvatar", @@ -314,7 +334,6 @@ "is_akool_avatar", "is_anam_avatar", "is_generic_avatar", - "is_rtc_avatar", "validate_avatar_config", "validate_tts_sample_rate", ] diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 86a958e..14933a2 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -66,13 +66,25 @@ from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule from ..types.tts import Tts -from .token import generate_convo_ai_token, _validate_expires_in +from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, +) +from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, +) +from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, +) +from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, +) from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS # Top-level aliases LlmConfig = StartAgentsRequestPropertiesLlm LlmStyle = StartAgentsRequestPropertiesLlmStyle SttConfig = StartAgentsRequestPropertiesAsr +AsrConfig = SttConfig SttVendor = StartAgentsRequestPropertiesAsrVendor TtsConfig = Tts MllmConfig = StartAgentsRequestPropertiesMllm @@ -159,6 +171,14 @@ class SessionParamsInput(typing_extensions.TypedDict, total=False): FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig FillerWordsContentSelectionRule = 
StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule +# Think type aliases and response +ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction +ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction +ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction +ThinkResponse = AgentThinkAgentManagementResponse + +from .token import generate_convo_ai_token, _validate_expires_in + class Agent: """A reusable agent definition. diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index e41a399..269619e 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -20,10 +20,10 @@ from .avatar_types import ( is_akool_avatar, is_anam_avatar, + is_avatar_token_managed, is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, - is_rtc_avatar, validate_avatar_config, validate_tts_sample_rate, ) @@ -242,7 +242,11 @@ def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) - if not params.get("agora_channel"): params["agora_channel"] = self._channel - if not is_rtc_avatar(avatar): + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) return diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py index a04809c..aea9da1 100644 --- a/src/agora_agent/agentkit/avatar_types.py +++ b/src/agora_agent/agentkit/avatar_types.py @@ -1,3 +1,4 @@ +import warnings import typing @@ -21,11 +22,25 @@ def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: return config.get("vendor") == "generic" +def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + """Return True when AgentKit manages the avatar RTC publisher identity.""" + return ( + 
is_heygen_avatar(config) + or is_live_avatar_avatar(config) + or is_generic_avatar(config) + ) + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: - params = config.get("params", {}) - return isinstance(params, dict) and bool(params.get("agora_uid")) and ( - is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + """Deprecated: use :func:`is_avatar_token_managed` for vendor gating.""" + warnings.warn( + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + "and keep agora_uid checks in session enrichment.", + DeprecationWarning, + stacklevel=2, ) + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) def validate_avatar_config( @@ -95,7 +110,7 @@ def validate_tts_sample_rate( """Validates that TTS sample rate is compatible with the avatar vendor. Different avatar vendors have specific sample rate requirements: - - HeyGen: ONLY supports 24,000 Hz + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz - Akool: ONLY supports 16,000 Hz Parameters diff --git a/src/agora_agent/agentkit/constants.py b/src/agora_agent/agentkit/constants.py index f86e4d3..c0a852e 100644 --- a/src/agora_agent/agentkit/constants.py +++ b/src/agora_agent/agentkit/constants.py @@ -58,3 +58,13 @@ class TurnDetectionTypeValues: AGORA_VAD = "agora_vad" SERVER_VAD = "server_vad" SEMANTIC_VAD = "semantic_vad" + + +# Think action value constants (match Fern wire values) +ThinkOnListeningActionInject = "inject" +ThinkOnListeningActionInterrupt = "interrupt" +ThinkOnListeningActionIgnore = "ignore" +ThinkOnThinkingActionInterrupt = "interrupt" +ThinkOnThinkingActionIgnore = "ignore" +ThinkOnSpeakingActionInterrupt = "interrupt" +ThinkOnSpeakingActionIgnore = "ignore" From 657c35d003daa6e3be457365a582a6080b3b61a9 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Thu, 21 May 2026 16:12:45 -0400 Subject: [PATCH 10/28] updated tests --- 
tests/custom/test_agentkit_agent.py | 298 -------------------- tests/custom/test_agentkit_session.py | 383 -------------------------- tests/custom/test_agentkit_vendors.py | 122 -------- 3 files changed, 803 deletions(-) delete mode 100644 tests/custom/test_agentkit_agent.py delete mode 100644 tests/custom/test_agentkit_session.py delete mode 100644 tests/custom/test_agentkit_vendors.py diff --git a/tests/custom/test_agentkit_agent.py b/tests/custom/test_agentkit_agent.py deleted file mode 100644 index 9719b04..0000000 --- a/tests/custom/test_agentkit_agent.py +++ /dev/null @@ -1,298 +0,0 @@ -from agora_agent.agentkit import ( - Agent, - AvatarConfig, - AvatarVendor, - LlmConfig, - LlmStyle, - MllmConfig, - MllmVendor, - SttConfig, - SttVendor, - TtsConfig, -) -import pytest - -from agora_agent.agentkit.vendors import ( - AkoolAvatar, - ElevenLabsTTS, - LiveAvatarAvatar, - OpenAI, - OpenAIRealtime, -) - - -def _parameter(config, key): - parameters = config["parameters"] - if isinstance(parameters, dict): - return parameters[key] - return getattr(parameters, key) - - -class _CopyOnlyModel: - def __init__(self, **values): - self.values = values - - def copy(self, update=None): - return _CopyOnlyModel(**{**self.values, **(update or {})}) - - -def test_generated_core_aliases_are_public(): - assert LlmConfig is not None - assert LlmStyle is not None - assert SttConfig is not None - assert SttVendor is not None - assert TtsConfig is not None - assert MllmConfig is not None - assert MllmVendor is not None - assert AvatarConfig is not None - assert AvatarVendor is not None - - -def test_model_copy_helper_supports_pydantic_v1_copy_api(): - copied = Agent._copy_model_update(_CopyOnlyModel(enable_rtm=True), {"data_channel": "rtm"}) # noqa: SLF001 - - assert copied.values == {"enable_rtm": True, "data_channel": "rtm"} - - -def test_with_audio_scenario_sets_session_parameter(): - agent = Agent(name="test").with_audio_scenario("chorus") - - assert _parameter(agent.config, 
"audio_scenario") == "chorus" - - -def test_with_audio_scenario_preserves_existing_parameters(): - agent = Agent(name="test", parameters={"enable_metrics": True}).with_audio_scenario( - "chorus" - ) - - assert _parameter(agent.config, "enable_metrics") is True - assert _parameter(agent.config, "audio_scenario") == "chorus" - - -def test_enable_rtm_defaults_data_channel_to_rtm(): - properties = Agent(name="test", advanced_features={"enable_rtm": True}).to_properties( - channel="room", - agent_uid="1", - remote_uids=["100"], - token="token", - skip_vendor_validation=True, - ) - - assert properties.parameters.data_channel == "rtm" - - -def test_enable_rtm_preserves_explicit_data_channel(): - properties = Agent( - name="test", - advanced_features={"enable_rtm": True}, - parameters={"data_channel": "datastream"}, - ).to_properties( - channel="room", - agent_uid="1", - remote_uids=["100"], - token="token", - skip_vendor_validation=True, - ) - - assert properties.parameters.data_channel == "datastream" - - -def test_agent_level_llm_fields_override_vendor_defaults(): - agent = ( - Agent(name="test") - .with_llm( - OpenAI( - api_key="llm-key", - greeting_message="vendor greeting", - failure_message="vendor failure", - max_history=1, - ) - ) - .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) - .with_greeting("agent greeting") - .with_failure_message("agent failure") - .with_max_history(2) - ) - - properties = agent.to_properties( - channel="room", - agent_uid="1", - remote_uids=["100"], - token="token", - ) - - assert properties.llm.greeting_message == "agent greeting" - assert properties.llm.failure_message == "agent failure" - assert properties.llm.max_history == 2 - - -def test_avatar_sample_rate_validation_works_when_tts_added_after_avatar(): - agent = Agent(name="test").with_avatar( - LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2") - ) - - with pytest.raises(ValueError, match="24000"): - agent.with_tts( - 
ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) - ) - - -def test_avatar_sample_rate_validation_uses_wrapper_sample_rate(): - agent = ( - Agent(name="test") - .with_avatar(AkoolAvatar(api_key="avatar-key")) - .with_tts( - ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) - ) - ) - - assert agent.tts_sample_rate == 16000 - - -def test_with_mllm_removes_deprecated_advanced_features_enable_mllm(): - properties = ( - Agent( - name="test", - advanced_features={"enable_mllm": True, "enable_rtm": True}, - greeting="hello from agent", - failure_message="try again", - max_history=5, - ) - .with_mllm(OpenAIRealtime(api_key="openai-key")) - .to_properties( - channel="room", - agent_uid="1", - remote_uids=["100"], - token="rtc-token", - ) - ) - - assert properties.mllm is not None - assert properties.mllm.enable is True - assert properties.mllm.greeting_message == "hello from agent" - assert properties.mllm.failure_message == "try again" - mllm_dump = properties.mllm.model_dump(exclude_none=True) - assert "max_history" not in mllm_dump - assert properties.advanced_features is not None - af_dump = properties.advanced_features.model_dump(exclude_none=True) - assert "enable_mllm" not in af_dump - assert af_dump.get("enable_rtm") is True - - -def test_to_properties_rejects_mllm_with_enabled_avatar(): - agent = ( - Agent(name="test") - .with_mllm(OpenAIRealtime(api_key="mllm-key")) - .with_avatar( - LiveAvatarAvatar( - api_key="avatar-key", - quality="medium", - agora_uid="2", - agora_token="avatar-token", - ) - ) - ) - - with pytest.raises(ValueError, match="cascading"): - agent.to_properties( - channel="room", - agent_uid="1", - remote_uids=["100"], - token="rtc-token", - ) - - -def test_to_properties_mllm_with_avatar_fires_before_token_generation(): - """The guard must fire before the token-generation step so callers get a - clear, actionable error even when app_id/app_certificate are empty. 
- """ - agent = ( - Agent(name="test") - .with_mllm(OpenAIRealtime(api_key="mllm-key")) - .with_avatar( - LiveAvatarAvatar( - api_key="avatar-key", - quality="medium", - agora_uid="2", - agora_token="avatar-token", - ) - ) - ) - - with pytest.raises(ValueError, match="cascading"): - agent.to_properties( - channel="room", - agent_uid="1", - remote_uids=["100"], - app_id="", - app_certificate="", - ) - - -def test_to_properties_rejects_mllm_with_default_enabled_avatar(): - """Avatar with no `enable` field should be treated as enabled.""" - agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) - agent._avatar = { # noqa: SLF001 - "vendor": "liveavatar", - "params": { - "api_key": "avatar-key", - "quality": "high", - "agora_uid": "200", - "agora_token": "avatar-token", - }, - } - - with pytest.raises(ValueError, match="cascading"): - agent.to_properties( - channel="room", - agent_uid="1", - remote_uids=["100"], - token="rtc-token", - ) - - -def test_to_properties_allows_mllm_with_disabled_avatar_and_no_tts(): - properties = ( - Agent(name="test") - .with_mllm(OpenAIRealtime(api_key="mllm-key")) - .with_avatar( - LiveAvatarAvatar( - api_key="avatar-key", - quality="medium", - agora_uid="2", - agora_token="avatar-token", - enable=False, - ) - ) - .to_properties( - channel="room", - agent_uid="1", - remote_uids=["100"], - token="rtc-token", - ) - ) - - assert properties.mllm is not None and properties.mllm.enable is True - assert properties.tts is None - assert properties.llm is None - assert properties.asr is None - assert properties.avatar is not None and properties.avatar.enable is False - - -def test_to_properties_mllm_without_tts_or_llm_succeeds(): - properties = ( - Agent(name="test") - .with_mllm(OpenAIRealtime(api_key="mllm-key")) - .to_properties( - channel="room", - agent_uid="1", - remote_uids=["100"], - token="rtc-token", - ) - ) - - assert properties.mllm is not None and properties.mllm.enable is True - assert properties.tts is None - 
assert properties.llm is None - assert properties.asr is None - assert properties.avatar is None diff --git a/tests/custom/test_agentkit_session.py b/tests/custom/test_agentkit_session.py deleted file mode 100644 index 198fcd0..0000000 --- a/tests/custom/test_agentkit_session.py +++ /dev/null @@ -1,383 +0,0 @@ -from types import SimpleNamespace - -import pytest - -from agora_agent.agentkit import Agent, AgentSession -from agora_agent.agentkit.vendors import ( - ElevenLabsTTS, - GenericAvatar, - LiveAvatarAvatar, - OpenAI, - OpenAIRealtime, - RimeTTS, -) -from agora_agent.agents.types.get_turns_agents_response import GetTurnsAgentsResponse - - -APP_ID = "0" * 32 -APP_CERTIFICATE = "1" * 32 - - -class _Agents: - def __init__(self): - self.calls = [] - self.start_calls = [] - - def start(self, app_id, name, properties, preset=None, pipeline_id=None, request_options=None): - self.start_calls.append((app_id, name, properties, preset, pipeline_id, request_options)) - return SimpleNamespace(agent_id="agent-1") - - def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): - self.calls.append((app_id, agent_id, page_index, page_size, request_options)) - is_last_page = page_index != 1 - return GetTurnsAgentsResponse( - agent_id=agent_id, - channel="room", - total_turn_count=2, - pagination={ - "page_index": page_index or 1, - "total_pages": 2, - "is_last_page": is_last_page, - }, - turns=[{"turn_id": float(page_index or 1)}], - ) - - -class _Client: - auth_mode = "basic" - app_id = APP_ID - app_certificate = APP_CERTIFICATE - - def __init__(self): - self.agents = _Agents() - self.agent_management = object() - - -def _session(agent, warn=None): - return AgentSession( - client=_Client(), - agent=agent, - app_id=APP_ID, - app_certificate=APP_CERTIFICATE, - name="test", - channel="room", - agent_uid="1", - remote_uids=["100"], - warn=warn, - ) - - -def test_generic_avatar_enrichment_adds_session_context_and_token(): - agent = 
Agent(name="test").with_avatar( - GenericAvatar( - api_key="avatar-key", - api_base_url="https://avatar.example.com", - avatar_id="avatar-1", - agora_uid="2", - ) - ) - session = _session(agent) - - properties = session._build_start_properties({ # noqa: SLF001 - "app_id": APP_ID, - "app_certificate": APP_CERTIFICATE, - }) - - params = properties["avatar"]["params"] - assert params["agora_appid"] == APP_ID - assert params["agora_channel"] == "room" - assert params["agora_token"] - assert params["agora_token"] != properties["token"] - - -def test_generic_avatar_empty_session_fields_are_filled(): - agent = Agent(name="test").with_avatar( - GenericAvatar( - api_key="avatar-key", - api_base_url="https://avatar.example.com", - avatar_id="avatar-1", - agora_uid="2", - agora_appid="", - agora_channel="", - agora_token="", - ) - ) - session = _session(agent) - - properties = session._build_start_properties({ # noqa: SLF001 - "app_id": APP_ID, - "app_certificate": APP_CERTIFICATE, - }) - - params = properties["avatar"]["params"] - assert params["agora_appid"] == APP_ID - assert params["agora_channel"] == "room" - assert params["agora_token"] - - -def test_avatar_uid_matching_agent_uid_warns(): - warnings = [] - agent = Agent(name="test").with_avatar( - GenericAvatar( - api_key="avatar-key", - api_base_url="https://avatar.example.com", - avatar_id="avatar-1", - agora_uid="1", - ) - ) - session = _session(agent, warn=warnings.append) - - session._build_start_properties({ # noqa: SLF001 - "app_id": APP_ID, - "app_certificate": APP_CERTIFICATE, - }) - - assert any("matches agent_rtc_uid" in warning for warning in warnings) - - -def test_session_start_properties_agent_level_llm_fields_override_vendor_defaults(): - agent = ( - Agent(name="test") - .with_llm( - OpenAI( - api_key="llm-key", - greeting_message="vendor greeting", - failure_message="vendor failure", - max_history=1, - ) - ) - .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) - 
.with_greeting("agent greeting") - .with_failure_message("agent failure") - .with_max_history(2) - ) - session = _session(agent) - - properties = session._build_start_properties({ # noqa: SLF001 - "app_id": APP_ID, - "app_certificate": APP_CERTIFICATE, - }) - - assert properties["llm"]["greeting_message"] == "agent greeting" - assert properties["llm"]["failure_message"] == "agent failure" - assert properties["llm"]["max_history"] == 2 - - -def test_session_start_properties_applies_mllm_agent_level_defaults(): - agent = ( - Agent(name="test") - .with_mllm(OpenAIRealtime(api_key="mllm-key")) - .with_greeting("agent greeting") - .with_failure_message("agent failure") - .with_max_history(2) - ) - session = _session(agent) - - properties = session._build_start_properties({ # noqa: SLF001 - "app_id": APP_ID, - "app_certificate": APP_CERTIFICATE, - }) - - assert properties["mllm"]["greeting_message"] == "agent greeting" - assert properties["mllm"]["failure_message"] == "agent failure" - assert "max_history" not in properties["mllm"] - - -def test_session_start_properties_preserves_mllm_vendor_defaults(): - agent = ( - Agent(name="test") - .with_mllm( - OpenAIRealtime( - api_key="mllm-key", - greeting_message="vendor greeting", - failure_message="vendor failure", - ) - ) - .with_greeting("agent greeting") - .with_failure_message("agent failure") - .with_max_history(2) - ) - session = _session(agent) - - properties = session._build_start_properties({ # noqa: SLF001 - "app_id": APP_ID, - "app_certificate": APP_CERTIFICATE, - }) - - assert properties["mllm"]["greeting_message"] == "vendor greeting" - assert properties["mllm"]["failure_message"] == "vendor failure" - assert "max_history" not in properties["mllm"] - - -def test_session_start_allows_mllm_without_tts(): - agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) - session = _session(agent) - - assert session.start() == "agent-1" - - -def test_session_start_rejects_mllm_with_enabled_avatar(): - 
agent = ( - Agent(name="test") - .with_mllm(OpenAIRealtime(api_key="mllm-key")) - .with_avatar( - LiveAvatarAvatar( - api_key="avatar-key", - quality="medium", - agora_uid="2", - agora_token="avatar-token", - ) - ) - ) - session = _session(agent) - - with pytest.raises(ValueError, match="cascading"): - session.start() - assert session._client.agents.start_calls == [] # noqa: SLF001 - - -def test_session_start_allows_mllm_with_disabled_avatar(): - agent = ( - Agent(name="test") - .with_mllm(OpenAIRealtime(api_key="mllm-key")) - .with_avatar( - LiveAvatarAvatar( - api_key="avatar-key", - quality="medium", - agora_uid="2", - agora_token="avatar-token", - enable=False, - ) - ) - ) - session = _session(agent) - - assert session.start() == "agent-1" - - -def test_avatar_sample_rate_validation_uses_serialized_vendor_keys(): - warnings = [] - agent = ( - Agent(name="test") - .with_avatar(LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2")) - .with_tts(RimeTTS(key="tts-key", speaker="speaker", sampling_rate=24000)) - ) - session = _session(agent, warn=warnings.append) - - session._validate_avatar_config() # noqa: SLF001 - - assert warnings == [] - - -def test_avatar_user_token_is_not_overwritten(): - agent = Agent(name="test").with_avatar( - LiveAvatarAvatar( - api_key="live-key", - quality="medium", - agora_uid="2", - agora_token="user-token", - ) - ) - session = _session(agent) - - properties = session._build_start_properties({ # noqa: SLF001 - "app_id": APP_ID, - "app_certificate": APP_CERTIFICATE, - }) - - assert properties["avatar"]["params"]["agora_token"] == "user-token" - - -def test_get_turns_forwards_pagination_args(): - session = _session(Agent(name="test")) - session._agent_id = "agent-id" # noqa: SLF001 - - session.get_turns(page_index=3, page_size=25) - - assert session._client.agents.calls[-1][:4] == (APP_ID, "agent-id", 3, 25) # noqa: SLF001 - - -def test_get_all_turns_aggregates_pages(): - session = _session(Agent(name="test")) - 
session._agent_id = "agent-id" # noqa: SLF001 - - response = session.get_all_turns(page_size=1) - - assert [turn.turn_id for turn in response.turns] == [1.0, 2.0] - assert response.pagination.page_index == 2 - - -def test_get_all_turns_raises_when_pagination_does_not_advance(): - class _StuckAgents: - def __init__(self): - self.calls = 0 - - def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): - self.calls += 1 - return GetTurnsAgentsResponse( - agent_id=agent_id, - channel="room", - total_turn_count=2, - pagination={"page_index": 1, "is_last_page": False}, - turns=[{"turn_id": 1.0}], - ) - - class _StuckClient: - auth_mode = "basic" - app_id = APP_ID - app_certificate = APP_CERTIFICATE - - def __init__(self): - self.agents = _StuckAgents() - self.agent_management = object() - - session = AgentSession( - client=_StuckClient(), - agent=Agent(name="test"), - app_id=APP_ID, - app_certificate=APP_CERTIFICATE, - name="test", - channel="room", - agent_uid="1", - remote_uids=["100"], - ) - session._agent_id = "agent-id" # noqa: SLF001 - - with pytest.raises(RuntimeError, match="did not advance"): - session.get_all_turns(page_size=1) - - -def test_get_all_turns_raises_when_pagination_metadata_missing(): - class _NoMetaAgents: - def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): - return GetTurnsAgentsResponse( - agent_id=agent_id, - channel="room", - total_turn_count=1, - pagination={"is_last_page": False}, - turns=[{"turn_id": 1.0}], - ) - - class _NoMetaClient: - auth_mode = "basic" - app_id = APP_ID - app_certificate = APP_CERTIFICATE - - def __init__(self): - self.agents = _NoMetaAgents() - self.agent_management = object() - - session = AgentSession( - client=_NoMetaClient(), - agent=Agent(name="test"), - app_id=APP_ID, - app_certificate=APP_CERTIFICATE, - name="test", - channel="room", - agent_uid="1", - remote_uids=["100"], - ) - session._agent_id = "agent-id" # noqa: SLF001 - - with 
pytest.raises(RuntimeError, match="cannot continue"): - session.get_all_turns(page_size=1) diff --git a/tests/custom/test_agentkit_vendors.py b/tests/custom/test_agentkit_vendors.py deleted file mode 100644 index 8473821..0000000 --- a/tests/custom/test_agentkit_vendors.py +++ /dev/null @@ -1,122 +0,0 @@ -import pytest -from pydantic import ValidationError - -from agora_agent.agentkit import LlmGreetingConfigs -import warnings - -from agora_agent.agentkit.vendors import GenericAvatar, OpenAI, OpenAIRealtime, XaiGrok, XaiRealtime - - -def test_xai_grok_serializes_v27_shape_without_style(): - config = XaiGrok( - api_key="xai-key", - voice="eve", - language="en", - sample_rate=24000, - output_modalities=["audio", "text"], - params={"temperature": 0.2}, - ).to_config() - - assert config["vendor"] == "xai" - assert config["url"] == "wss://api.x.ai/v1/realtime" - assert config["api_key"] == "xai-key" - assert config["params"] == { - "temperature": 0.2, - "voice": "eve", - "language": "en", - "sample_rate": 24000, - } - assert config["output_modalities"] == ["audio", "text"] - assert "style" not in config - - -def test_xai_grok_emits_params_even_when_empty(): - assert XaiGrok(api_key="xai-key").to_config()["params"] == {} - - -def test_xai_realtime_deprecated_alias_emits_same_vendor(): - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always", DeprecationWarning) - config = XaiRealtime(api_key="xai-key").to_config() - assert len(caught) == 1 - assert issubclass(caught[0].category, DeprecationWarning) - assert config["vendor"] == "xai" - - -def test_mllm_rejects_fields_not_in_core_contract(): - with pytest.raises(ValidationError): - OpenAIRealtime(api_key="openai-key", predefined_tools=["_publish_message"]) - - with pytest.raises(ValidationError): - XaiGrok(api_key="xai-key", max_history=10) - - -def test_generic_avatar_omits_session_enriched_fields_when_unset(): - config = GenericAvatar( - api_key="avatar-key", - 
api_base_url="https://avatar.example.com", - avatar_id="avatar-1", - agora_uid="2", - ).to_config() - - assert config == { - "enable": True, - "vendor": "generic", - "params": { - "api_key": "avatar-key", - "api_base_url": "https://avatar.example.com", - "avatar_id": "avatar-1", - "agora_uid": "2", - }, - } - - -def test_vertex_ai_explicit_fields_override_additional_params(): - from agora_agent.agentkit.vendors import VertexAI - - config = VertexAI( - model="explicit-model", - project_id="explicit-project", - location="explicit-region", - adc_credentials_string="{}", - additional_params={ - "model": "should-be-overridden", - "project_id": "should-be-overridden", - "location": "should-be-overridden", - "adc_credentials_string": "should-be-overridden", - "extra_key": "kept", - }, - ).to_config() - - assert config["vendor"] == "vertexai" - assert config["params"]["model"] == "explicit-model" - assert config["params"]["project_id"] == "explicit-project" - assert config["params"]["location"] == "explicit-region" - assert config["params"]["adc_credentials_string"] == "{}" - assert config["params"]["extra_key"] == "kept" - - -def test_gemini_live_explicit_fields_override_additional_params(): - from agora_agent.agentkit.vendors import GeminiLive - - config = GeminiLive( - api_key="key", - model="explicit-model", - additional_params={ - "model": "should-be-overridden", - "extra_key": "kept", - }, - ).to_config() - - assert config["params"]["model"] == "explicit-model" - assert config["params"]["extra_key"] == "kept" - - -def test_llm_greeting_configs_interruptable_serializes(): - config = OpenAI( - api_key="openai-key", - greeting_configs=LlmGreetingConfigs(mode="single_first", interruptable=False), - ).to_config() - - assert config["greeting_configs"]["mode"] == "single_first" - assert config["greeting_configs"]["interruptable"] is False From 972dd5bdafc09b3981ab2ce4e0d02beae165c626 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Thu, 21 May 2026 16:13:35 -0400 
Subject: [PATCH 11/28] updated docs --- docs/reference/agent.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/reference/agent.md b/docs/reference/agent.md index 1e88b8b..3163f9c 100644 --- a/docs/reference/agent.md +++ b/docs/reference/agent.md @@ -264,3 +264,18 @@ to_properties( | `rtc` | `Optional[RtcConfig]` | RTC configuration | | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | | `config` | `Dict[str, Any]` | Full configuration dict | + +## Type aliases + +Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + +Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. + +## Cross-SDK discovery map + +| Concept | Python | TypeScript | Go | +|---|---|---|---| +| STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | +| xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | +| Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | +| Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | From c81907826353a569018352ce7f86c658d391729e Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Thu, 21 May 2026 16:13:52 -0400 Subject: [PATCH 12/28] updated changelog --- changelog.md | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/changelog.md b/changelog.md index 31784c8..be38287 100644 --- a/changelog.md +++ b/changelog.md @@ -4,6 +4,22 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/). +## [Unreleased] + +### Added + +- **Alias parity** — `AsrConfig` (= `SttConfig`), `is_avatar_token_managed`, think type aliases (`ThinkOnListeningAction`, etc.), think value constants, and cross-SDK discovery table in `docs/reference/agent.md`. + +### Changed + +- **Avatar token gating** — Session enrichment uses `is_avatar_token_managed` (vendor-only); UID checks remain in session logic. + +### Deprecated (legacy naming — use replacements) + +- **`HeyGenAvatar`** — Use `LiveAvatarAvatar`. Emits `DeprecationWarning`. +- **`is_rtc_avatar`** — Use `is_avatar_token_managed` for vendor gating. Emits `DeprecationWarning`. +- **`AgentThinkRequestOn*`** / **`AgentThinkResponse`** — Use `ThinkOn*` / `ThinkResponse`. + ## [v1.5.0] — 2026-05-20 ### Added @@ -17,7 +33,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Changed -- **`XaiGrok` is the primary xAI MLLM class** — Matches the product name ([xAI Grok](https://docs.agora.io/en/conversational-ai/models/mllm/xai)) and the TypeScript/Go SDKs. `XaiRealtime` remains as a deprecated backward-compatible alias. +- **`XaiGrok` is the primary xAI MLLM class** — Matches the product name ([xAI Grok](https://docs.agora.io/en/conversational-ai/models/mllm/xai)) and the TypeScript/Go SDKs. - **Package version** — Bumped to `v1.5.0` to match the Fern-generated SDK headers. - **RTM data channel default** — When `advanced_features.enable_rtm=True`, AgentKit now defaults `parameters.data_channel` to `"rtm"` unless the caller explicitly sets a data channel. - **Agent-level LLM overrides** — In the standard ASR + LLM + TTS pipeline, agent-level `greeting`, `failure_message`, and `max_history` now override vendor defaults, matching the TypeScript SDK. In MLLM mode, agent-level `greeting` and `failure_message` fill only missing fields. @@ -29,7 +45,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). 
### Migration notes -- **`XaiRealtime` → `XaiGrok`** — Import and use `XaiGrok` for xAI MLLM (`mllm.vendor`: `"xai"`). `XaiRealtime` still works but emits `DeprecationWarning`. Future xAI cascading vendors will use `XaiSTT` and `XaiTTS`, not `XaiRealtime`. - **`think()` default** — The server default for `on_listening_action` changed from `inject` to `interrupt` in API v2.7. Pass `on_listening_action="inject"` explicitly to preserve the old behavior. - **Turn analytics pagination** — Sessions with more than 50 turns must request additional pages via `get_turns(page_index=..., page_size=...)` or use `get_all_turns()`. - **Error reasons** — API v2.7 adds status codes `401`, `429`, and `500`; `InvalidRequest` is split into `InvalidRequestBody`, `MissingRequiredField`, and `InvalidFieldValue`, with new reasons such as `ServiceNotEnabled`, `AccountSuspended`, and `ResourceAllocationFailed`. @@ -113,7 +128,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). - `MurfTTS` vendor - ### Fixed - `MiniMaxTTS`: added required `group_id`, `url`, and correctly nested `voice_setting.voice_id` — previously missing, requiring users to bypass the SDK entirely From 7c2d9d99e5ccfde353fe6a597dd42cbb5803fa55 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Thu, 21 May 2026 17:40:26 -0400 Subject: [PATCH 13/28] feat(agentkit): align session options and token uid handling --- docs/concepts/vendors.md | 2 - docs/reference/vendors.md | 4 +- src/agora_agent/agentkit/__init__.py | 38 ++++++++++++++ src/agora_agent/agentkit/agent.py | 63 ++++++++++++++++++++++- src/agora_agent/agentkit/agent_session.py | 46 ++++++++++++++--- src/agora_agent/agentkit/constants.py | 40 ++++++++++++++ src/agora_agent/agentkit/token.py | 23 +++++++-- src/agora_agent/pool_client.py | 6 +-- tests/custom/test_avatar_token.py | 2 +- 9 files changed, 201 insertions(+), 23 deletions(-) diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 81f585b..7ef72b4 100644 --- 
a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -99,8 +99,6 @@ Used with `agent.with_mllm()` for the [MLLM flow](../guides/mllm-flow.md). These | `VertexAI` | Vertex AI (Gemini Live) | `model`, `project_id`, `location`, `adc_credentials_string`; optional `turn_detection` | | `XaiGrok` | xAI Grok (`mllm.vendor`: `xai`) | `api_key`; optional `voice`, `language`, `sample_rate`, `turn_detection` | -`XaiRealtime` is deprecated (use `XaiGrok`). Future xAI STT/TTS wrappers will be `XaiSTT` / `XaiTTS`, not `XaiRealtime`. - ```python from agora_agent.agentkit.vendors import OpenAIRealtime diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index ad1b779..0b606dd 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -411,8 +411,6 @@ Fixed sample rate: 24000 Hz. xAI Grok MLLM vendor (`mllm.vendor`: `"xai"`). Matches the [xAI Grok](https://docs.agora.io/en/conversational-ai/models/mllm/xai) product docs and the TypeScript SDK. -> `XaiRealtime` is deprecated and remains only as a backward-compatible alias. - | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | xAI API key | @@ -485,4 +483,4 @@ Same options as `HeyGenAvatar`, but serializes `vendor: "liveavatar"`. `agora_to | `agora_channel` | `str` | No | `None` | Optional; filled from the session channel when omitted. | | `enable` | `bool` | No | `True` | Enable or disable the avatar | -Avatar tokens are separate from the agent join token but generated with the same `generate_convo_ai_token` path, using the avatar's `agora_uid` as the account. +Avatar tokens are separate from the agent join token but generated with the same `generate_convo_ai_token` path, using the avatar's `agora_uid` as `uid`. 
diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py index e9ab221..a03043b 100644 --- a/src/agora_agent/agentkit/__init__.py +++ b/src/agora_agent/agentkit/__init__.py @@ -62,6 +62,11 @@ SessionInfo, SessionListResponse, SessionSummary, + SessionStatus, + SayOptions, + ThinkOptions, + GetTurnsOptions, + SessionOptions, SpeakPriority, ThinkOnListeningAction, ThinkOnSpeakingAction, @@ -95,6 +100,20 @@ GeofenceArea, GeofenceExcludeArea, FillerWordsSelectionRule, + InterruptionDisabledStrategy, + InterruptionDisabledStrategyAppend, + InterruptionDisabledStrategyIgnore, + InterruptionMode as InterruptionModeValues, + InterruptionModeKeywords, + InterruptionModeStartOfSpeech, + MllmTurnDetectionMode as MllmTurnDetectionModeValues, + MllmTurnDetectionModeAgoraVad, + MllmTurnDetectionModeSemanticVad, + MllmTurnDetectionModeServerVad, + SpeakPriority as SpeakPriorityValues, + SpeakPriorityAppend, + SpeakPriorityIgnore, + SpeakPriorityInterrupt, ThinkOnListeningActionIgnore, ThinkOnListeningActionInject, ThinkOnListeningActionInterrupt, @@ -238,6 +257,20 @@ "GeofenceExcludeArea", "FillerWordsSelectionRule", "TurnDetectionTypeValues", + "InterruptionModeValues", + "InterruptionModeStartOfSpeech", + "InterruptionModeKeywords", + "InterruptionDisabledStrategy", + "InterruptionDisabledStrategyAppend", + "InterruptionDisabledStrategyIgnore", + "SpeakPriorityValues", + "SpeakPriorityInterrupt", + "SpeakPriorityAppend", + "SpeakPriorityIgnore", + "MllmTurnDetectionModeValues", + "MllmTurnDetectionModeAgoraVad", + "MllmTurnDetectionModeServerVad", + "MllmTurnDetectionModeSemanticVad", "ThinkOnListeningActionInject", "ThinkOnListeningActionInterrupt", "ThinkOnListeningActionIgnore", @@ -255,6 +288,11 @@ "SessionInfo", "SessionListResponse", "SessionSummary", + "SessionStatus", + "SayOptions", + "ThinkOptions", + "GetTurnsOptions", + "SessionOptions", "ConversationHistory", "ConversationTurn", "ConversationRole", diff --git 
a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 14933a2..a820291 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -20,6 +20,7 @@ from ..agents.types.get_agents_response import GetAgentsResponse from ..agents.types.list_agents_response import ListAgentsResponse from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem +from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole @@ -139,6 +140,7 @@ SessionInfo = GetAgentsResponse SessionListResponse = ListAgentsResponse SessionSummary = ListAgentsResponseDataListItem +SessionStatus = ListAgentsResponseDataListItemStatus ConversationHistory = GetHistoryAgentsResponse ConversationTurn = GetHistoryAgentsResponseContentsItem ConversationRole = GetHistoryAgentsResponseContentsItemRole @@ -156,6 +158,39 @@ class SessionParamsInput(typing_extensions.TypedDict, total=False): enable_error_message: bool audio_scenario: ParametersAudioScenario + +class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + on_speaking_action: AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + +class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + +class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + +class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + 
channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + # LLM sub-type aliases LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode @@ -177,7 +212,15 @@ class SessionParamsInput(typing_extensions.TypedDict, total=False): ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction ThinkResponse = AgentThinkAgentManagementResponse -from .token import generate_convo_ai_token, _validate_expires_in +from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + +def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value class Agent: @@ -216,6 +259,7 @@ def __init__( labels: typing.Optional[typing.Dict[str, str]] = None, rtc: typing.Optional[RtcConfig] = None, filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, ): self._name = name self._instructions = instructions @@ -238,6 +282,7 @@ def __init__( self._labels = labels self._rtc = rtc self._filler_words = filler_words + self._greeting_configs = greeting_configs def with_llm(self, vendor: BaseLLM) -> "Agent": new_agent = self._clone() @@ -334,6 +379,12 @@ def with_greeting(self, greeting: str) -> "Agent": new_agent._greeting = greeting return new_agent + def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": + """Returns a new Agent with greeting playback configuration.""" + new_agent = self._clone() + new_agent._greeting_configs = configs + return new_agent + def with_name(self, name: str) -> "Agent": new_agent = self._clone() new_agent._name = 
name @@ -512,6 +563,10 @@ def instructions(self) -> typing.Optional[str]: def greeting(self) -> typing.Optional[str]: return self._greeting + @property + def greeting_configs(self) -> typing.Optional[LlmGreetingConfigs]: + return self._greeting_configs + @property def failure_message(self) -> typing.Optional[str]: return self._failure_message @@ -574,6 +629,7 @@ def config(self) -> typing.Dict[str, typing.Any]: "labels": self._labels, "rtc": self._rtc, "filler_words": self._filler_words, + "greeting_configs": self._greeting_configs, } def create_session( @@ -697,7 +753,7 @@ def to_properties( app_id=app_id, app_certificate=app_certificate, channel_name=channel, - account=agent_uid, + uid=_parse_numeric_uid(agent_uid, "agent_uid"), **token_kwargs, ) @@ -765,6 +821,8 @@ def to_properties( llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] if self._greeting is not None: llm_config["greeting_message"] = self._greeting + if self._greeting_configs is not None: + llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs) if self._failure_message is not None: llm_config["failure_message"] = self._failure_message if self._max_history is not None: @@ -800,4 +858,5 @@ def _clone(self) -> "Agent": new_agent._labels = self._labels new_agent._rtc = self._rtc new_agent._filler_words = self._filler_words + new_agent._greeting_configs = self._greeting_configs return new_agent diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index 269619e..5590c0d 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -28,7 +28,7 @@ validate_tts_sample_rate, ) from .presets import resolve_session_presets -from .token import generate_convo_ai_token +from .token import generate_convo_ai_token, _parse_numeric_uid class _AgentSessionRequiredOptions(typing.TypedDict, total=True): @@ -169,7 +169,7 @@ def _convo_ai_headers(self) -> 
typing.Optional[typing.Dict[str, str]]: app_id=app_id, app_certificate=app_certificate, channel_name=self._channel, - account=self._agent_uid, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), ) return {"Authorization": f"agora token={token}"} @@ -263,7 +263,7 @@ def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) - app_id=self._app_id, app_certificate=self._app_certificate, channel_name=self._channel, - account=str(params["agora_uid"]), + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), **token_kwargs, ) @@ -323,6 +323,8 @@ def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> t llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] if self._agent.greeting is not None: llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) if self._agent.failure_message is not None: llm["failure_message"] = self._agent.failure_message if self._agent.max_history is not None: @@ -525,6 +527,8 @@ def say( text: str, priority: typing.Optional[str] = None, interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, ) -> None: """Send a message to be spoken by the agent. @@ -543,6 +547,8 @@ def say( raise RuntimeError("No agent ID available") kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) if priority is not None: kwargs["priority"] = priority if interruptable is not None: @@ -572,6 +578,7 @@ def think( on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, interruptable: typing.Optional[bool] = None, metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, ) -> AgentThinkResponse: """Inject a custom text instruction into the current session pipeline. 
@@ -585,6 +592,8 @@ def think( raise RuntimeError("No agent ID available") kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) if on_listening_action is not None: kwargs["on_listening_action"] = on_listening_action if on_thinking_action is not None: @@ -646,17 +655,25 @@ def get_turns( *, page_index: typing.Optional[int] = None, page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, ) -> GetTurnsAgentsResponse: """Get turn-by-turn analytics and timing details for this session.""" if not self._agent_id: raise RuntimeError("No agent ID available") + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + return self._client.agents.get_turns( self._app_id, self._agent_id, - page_index=page_index, - page_size=page_size, request_options=self._request_options(), + **kwargs, ) def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: @@ -833,6 +850,8 @@ async def say( text: str, priority: typing.Optional[str] = None, interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, ) -> None: """Send a message to be spoken by the agent. 
@@ -851,6 +870,8 @@ async def say( raise RuntimeError("No agent ID available") kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) if priority is not None: kwargs["priority"] = priority if interruptable is not None: @@ -880,6 +901,7 @@ async def think( on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, interruptable: typing.Optional[bool] = None, metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, ) -> AgentThinkResponse: """Inject a custom text instruction into the current session pipeline. @@ -893,6 +915,8 @@ async def think( raise RuntimeError("No agent ID available") kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) if on_listening_action is not None: kwargs["on_listening_action"] = on_listening_action if on_thinking_action is not None: @@ -954,17 +978,25 @@ async def get_turns( *, page_index: typing.Optional[int] = None, page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, ) -> GetTurnsAgentsResponse: """Get turn-by-turn analytics and timing details for this session.""" if not self._agent_id: raise RuntimeError("No agent ID available") + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + return await self._client.agents.get_turns( self._app_id, self._agent_id, - page_index=page_index, - page_size=page_size, request_options=self._request_options(), + **kwargs, ) async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: diff --git a/src/agora_agent/agentkit/constants.py b/src/agora_agent/agentkit/constants.py index c0a852e..eb186da 100644 --- a/src/agora_agent/agentkit/constants.py +++ b/src/agora_agent/agentkit/constants.py 
@@ -60,6 +60,46 @@ class TurnDetectionTypeValues: SEMANTIC_VAD = "semantic_vad" +class InterruptionMode: + START_OF_SPEECH = "start_of_speech" + KEYWORDS = "keywords" + + +InterruptionModeStartOfSpeech = InterruptionMode.START_OF_SPEECH +InterruptionModeKeywords = InterruptionMode.KEYWORDS + + +class InterruptionDisabledStrategy: + APPEND = "append" + IGNORE = "ignore" + + +InterruptionDisabledStrategyAppend = InterruptionDisabledStrategy.APPEND +InterruptionDisabledStrategyIgnore = InterruptionDisabledStrategy.IGNORE + + +class SpeakPriority: + INTERRUPT = "INTERRUPT" + APPEND = "APPEND" + IGNORE = "IGNORE" + + +SpeakPriorityInterrupt = SpeakPriority.INTERRUPT +SpeakPriorityAppend = SpeakPriority.APPEND +SpeakPriorityIgnore = SpeakPriority.IGNORE + + +class MllmTurnDetectionMode: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + +MllmTurnDetectionModeAgoraVad = MllmTurnDetectionMode.AGORA_VAD +MllmTurnDetectionModeServerVad = MllmTurnDetectionMode.SERVER_VAD +MllmTurnDetectionModeSemanticVad = MllmTurnDetectionMode.SEMANTIC_VAD + + # Think action value constants (match Fern wire values) ThinkOnListeningActionInject = "inject" ThinkOnListeningActionInterrupt = "interrupt" diff --git a/src/agora_agent/agentkit/token.py b/src/agora_agent/agentkit/token.py index 338e9db..9dd1aaa 100644 --- a/src/agora_agent/agentkit/token.py +++ b/src/agora_agent/agentkit/token.py @@ -85,7 +85,7 @@ class GenerateConvoAITokenOptions(typing.TypedDict, total=False): app_id: str app_certificate: str channel_name: str - account: str + uid: int token_expire: int privilege_expire: int @@ -242,7 +242,7 @@ def generate_convo_ai_token( app_id: str, app_certificate: str, channel_name: str, - account: str, + uid: int, token_expire: int = DEFAULT_EXPIRY_SECONDS, privilege_expire: int = 0, ) -> str: @@ -262,8 +262,8 @@ def generate_convo_ai_token( Agora App Certificate. channel_name : str The channel the agent will join (must match the start request). 
- account : str - String account identity — pass the agent UID as a string (e.g. "1001"). + uid : int + Numeric ConvoAI participant UID. Use the RTC UID for a user, agent, or avatar. token_expire : int Seconds until the token expires (default 86400). privilege_expire : int @@ -281,7 +281,7 @@ def generate_convo_ai_token( app_id, app_certificate, channel_name, - account, + _uid_to_account(uid), ROLE_PUBLISHER, token_expire, privilege_expire, @@ -290,6 +290,7 @@ def generate_convo_ai_token( pass priv_expire = privilege_expire if privilege_expire != 0 else token_expire + account = _uid_to_account(uid) rtc_privileges: typing.Dict[int, int] = { 1: priv_expire, # kPrivilegeJoinChannel @@ -310,3 +311,15 @@ def generate_convo_ai_token( token_expire, [(1, rtc_svc), (2, rtm_svc)], ) + + +def _uid_to_account(uid: int) -> str: + if not isinstance(uid, int) or isinstance(uid, bool): + raise TypeError("uid must be an int") + return str(uid) + + +def _parse_numeric_uid(uid: str, label: str) -> int: + if not uid.isdigit(): + raise ValueError(f"{label} must be a numeric RTC UID when auto-generating a ConvoAI token") + return int(uid) diff --git a/src/agora_agent/pool_client.py b/src/agora_agent/pool_client.py index 652b3d2..bd54fff 100644 --- a/src/agora_agent/pool_client.py +++ b/src/agora_agent/pool_client.py @@ -195,7 +195,7 @@ class Agora(BaseAgora): from agora_agent import Agora, Area from agora_agent.agentkit.token import generate_convo_ai_token - raw_token = generate_convo_ai_token(app_id="...", app_certificate="...", channel_name="...", account="1") + raw_token = generate_convo_ai_token(app_id="...", app_certificate="...", channel_name="...", uid=1) client = Agora( area=Area.US, app_id="YOUR_APP_ID", @@ -323,7 +323,7 @@ def stop_agent(self, agent_id: str) -> None: app_id=self.app_id, app_certificate=self.app_certificate, channel_name="stop", - account=agent_id, + uid=0, ) request_options = {"additional_headers": {"Authorization": f"agora token={token}"}} @@ -534,7 +534,7 @@ 
async def stop_agent(self, agent_id: str) -> None: app_id=self.app_id, app_certificate=self.app_certificate, channel_name="stop", - account=agent_id, + uid=0, ) request_options = {"additional_headers": {"Authorization": f"agora token={token}"}} diff --git a/tests/custom/test_avatar_token.py b/tests/custom/test_avatar_token.py index efe6ac0..fa73fc0 100644 --- a/tests/custom/test_avatar_token.py +++ b/tests/custom/test_avatar_token.py @@ -6,7 +6,7 @@ def test_avatar_tokens_use_convo_ai_token_path_with_avatar_uid(): app_id="0" * 32, app_certificate="1" * 32, channel_name="room", - account="avatar-uid", + uid=123, ) assert token.startswith("007") From b7f0c36cbb91b40ed1c432d795adf4b95a525b8e Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Thu, 21 May 2026 18:11:22 -0400 Subject: [PATCH 14/28] feat(agentkit): release v2.0.0 updates --- changelog.md | 24 +++++++----------------- pyproject.toml | 2 +- src/agora_agent/core/client_wrapper.py | 4 ++-- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/changelog.md b/changelog.md index be38287..1f0848d 100644 --- a/changelog.md +++ b/changelog.md @@ -4,26 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/). -## [Unreleased] +## [v2.0.0] — 2026-05-21 ### Added - **Alias parity** — `AsrConfig` (= `SttConfig`), `is_avatar_token_managed`, think type aliases (`ThinkOnListeningAction`, etc.), think value constants, and cross-SDK discovery table in `docs/reference/agent.md`. - -### Changed - -- **Avatar token gating** — Session enrichment uses `is_avatar_token_managed` (vendor-only); UID checks remain in session logic. - -### Deprecated (legacy naming — use replacements) - -- **`HeyGenAvatar`** — Use `LiveAvatarAvatar`. Emits `DeprecationWarning`. -- **`is_rtc_avatar`** — Use `is_avatar_token_managed` for vendor gating. Emits `DeprecationWarning`. 
-- **`AgentThinkRequestOn*`** / **`AgentThinkResponse`** — Use `ThinkOn*` / `ThinkResponse`. - -## [v1.5.0] — 2026-05-20 - -### Added - - **`XaiGrok`** — New MLLM wrapper for xAI Grok (`mllm.vendor`: `"xai"`), including Realtime API URL, voice, language, sample rate, modalities, messages, and MLLM turn detection support. - **`GenericAvatar`** — New generic avatar wrapper (`vendor: "generic"`) for custom avatar providers. - **Avatar token enrichment** — `AgentSession.start()` now fills missing generic avatar `agora_appid` and `agora_channel` from the session and generates missing avatar `agora_token` values for HeyGen, LiveAvatar, and Generic avatars using each avatar's `agora_uid`. @@ -33,8 +18,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Changed +- **ConvoAI token options** — `generate_convo_ai_token()` now accepts an integer `uid` and handles the internal token string conversion for users, agents, and avatars. +- **Avatar token generation** — Removed the dedicated `generate_avatar_rtc_token()` wrapper; avatar RTC tokens use the existing ConvoAI token helper. +- **Avatar token gating** — Session enrichment uses `is_avatar_token_managed` (vendor-only); UID checks remain in session logic. - **`XaiGrok` is the primary xAI MLLM class** — Matches the product name ([xAI Grok](https://docs.agora.io/en/conversational-ai/models/mllm/xai)) and the TypeScript/Go SDKs. -- **Package version** — Bumped to `v1.5.0` to match the Fern-generated SDK headers. +- **Package version** — Bumped to `v2.0.0` to match the Fern-generated SDK headers. - **RTM data channel default** — When `advanced_features.enable_rtm=True`, AgentKit now defaults `parameters.data_channel` to `"rtm"` unless the caller explicitly sets a data channel. - **Agent-level LLM overrides** — In the standard ASR + LLM + TTS pipeline, agent-level `greeting`, `failure_message`, and `max_history` now override vendor defaults, matching the TypeScript SDK. 
In MLLM mode, agent-level `greeting` and `failure_message` fill only missing fields. - **MLLM core alignment** — MLLM wrappers no longer expose or emit unsupported `predefined_tools` or `max_history` fields because they are not present in the generated v2.7 core `mllm` type. @@ -45,6 +33,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Migration notes +- **Deprecated aliases** — Use `LiveAvatarAvatar` instead of `HeyGenAvatar`, `is_avatar_token_managed` instead of `is_rtc_avatar`, and `ThinkOn*` / `ThinkResponse` instead of `AgentThinkRequestOn*` / `AgentThinkResponse`. + - **`think()` default** — The server default for `on_listening_action` changed from `inject` to `interrupt` in API v2.7. Pass `on_listening_action="inject"` explicitly to preserve the old behavior. - **Turn analytics pagination** — Sessions with more than 50 turns must request additional pages via `get_turns(page_index=..., page_size=...)` or use `get_all_turns()`. - **Error reasons** — API v2.7 adds status codes `401`, `429`, and `500`; `InvalidRequest` is split into `InvalidRequestBody`, `MissingRequiredField`, and `InvalidFieldValue`, with new reasons such as `ServiceNotEnabled`, `AccountSuspended`, and `ResourceAllocationFailed`. 
diff --git a/pyproject.toml b/pyproject.toml index 4369ff3..716428a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "agora-agent-server-sdk" [tool.poetry] name = "agora-agent-server-sdk" -version = "v1.5.0" +version = "v2.0.0" description = "" readme = "README.md" authors = [] diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index ed2ed34..e4d10bc 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agent-server-sdk/v1.5.0", + "User-Agent": "agora-agent-server-sdk/v2.0.0", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agent-server-sdk", - "X-Fern-SDK-Version": "v1.5.0", + "X-Fern-SDK-Version": "v2.0.0", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header From 7465fadafa0f1e62051d99b42d0eeda85f31eeee Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Thu, 21 May 2026 22:01:48 -0400 Subject: [PATCH 15/28] fix(agentkit): resolve Python session typing issues --- src/agora_agent/agentkit/agent_session.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index 5590c0d..fb8e548 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -16,7 +16,7 @@ ) from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties -from .agent import Agent +from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions from .avatar_types import ( is_akool_avatar, is_anam_avatar, @@ -348,6 +348,12 @@ def _response_turns(response: typing.Any) -> typing.List[typing.Any]: turns = 
response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) return list(turns or []) + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + @classmethod def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: data = cls._dump_model(first_response) @@ -685,7 +691,7 @@ def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAg """ response = self.get_turns(page_index=1, page_size=page_size) all_turns = self._response_turns(response) - pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + pagination = self._response_pagination(response) current_page = self._page_value(pagination, "page_index") or 1 while pagination is not None and self._page_value(pagination, "is_last_page") is False: total_pages = self._page_value(pagination, "total_pages") @@ -700,7 +706,7 @@ def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAg next_page = current_page + 1 response = self.get_turns(page_index=next_page, page_size=page_size) all_turns.extend(self._response_turns(response)) - pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + pagination = self._response_pagination(response) returned_index = self._page_value(pagination, "page_index") if pagination else None if returned_index is not None: if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: @@ -1008,7 +1014,7 @@ async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetT """ response = await self.get_turns(page_index=1, page_size=page_size) all_turns = self._response_turns(response) - pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + pagination = 
self._response_pagination(response) current_page = self._page_value(pagination, "page_index") or 1 while pagination is not None and self._page_value(pagination, "is_last_page") is False: total_pages = self._page_value(pagination, "total_pages") @@ -1023,7 +1029,7 @@ async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetT next_page = current_page + 1 response = await self.get_turns(page_index=next_page, page_size=page_size) all_turns.extend(self._response_turns(response)) - pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + pagination = self._response_pagination(response) returned_index = self._page_value(pagination, "page_index") if pagination else None if returned_index is not None: if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: From 4323b470a852758cbdb41a0f640c9164cd243311 Mon Sep 17 00:00:00 2001 From: chenyuguo Date: Wed, 27 May 2026 15:22:07 +0800 Subject: [PATCH 16/28] rename python package to agora-agents --- .github/workflows/ci.yml | 52 +++++++++++++++++++- README.md | 8 ++- compat/agora-agent-server-sdk/README.md | 11 +++++ compat/agora-agent-server-sdk/pyproject.toml | 39 +++++++++++++++ pyproject.toml | 4 +- src/agora_agent/core/client_wrapper.py | 4 +- src/agora_agent/version.py | 2 +- 7 files changed, 112 insertions(+), 8 deletions(-) create mode 100644 compat/agora-agent-server-sdk/README.md create mode 100644 compat/agora-agent-server-sdk/pyproject.toml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f46ffcf..7ff5650 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,8 +51,58 @@ jobs: curl -sSL https://install.python-poetry.org | python - -y --version 1.5.1 - name: Install dependencies run: poetry install - - name: Publish to pypi + + - name: Verify package versions match release tag + run: | + TAG_VERSION="${GITHUB_REF_NAME#v}" + ROOT_VERSION="$(python - <<'PY' +from pathlib import 
Path +import tomllib +data = tomllib.loads(Path("pyproject.toml").read_text()) +print(data["tool"]["poetry"]["version"].lstrip("v")) +PY +)" + COMPAT_VERSION="$(python - <<'PY' +from pathlib import Path +import tomllib +data = tomllib.loads(Path("compat/agora-agent-server-sdk/pyproject.toml").read_text()) +print(data["tool"]["poetry"]["version"].lstrip("v")) +PY +)" + COMPAT_DEP_VERSION="$(python - <<'PY' +from pathlib import Path +import tomllib +data = tomllib.loads(Path("compat/agora-agent-server-sdk/pyproject.toml").read_text()) +print(data["tool"]["poetry"]["dependencies"]["agora-agents"]) +PY +)" + + if [ "$ROOT_VERSION" != "$TAG_VERSION" ]; then + echo "Root package version ($ROOT_VERSION) does not match tag version ($TAG_VERSION)." + exit 1 + fi + + if [ "$COMPAT_VERSION" != "$TAG_VERSION" ]; then + echo "Compat package version ($COMPAT_VERSION) does not match tag version ($TAG_VERSION)." + exit 1 + fi + + if [ "$COMPAT_DEP_VERSION" != ">=${TAG_VERSION},<3.0.0" ]; then + echo "Compat package dependency on agora-agents ($COMPAT_DEP_VERSION) does not match >=${TAG_VERSION},<3.0.0." 
+ exit 1 + fi + + - name: Publish primary package to pypi + run: | + poetry config repositories.remote https://upload.pypi.org/legacy/ + poetry --no-interaction -v publish --build --repository remote --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD" + env: + PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} + PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + + - name: Publish compatibility package to pypi run: | + cd compat/agora-agent-server-sdk poetry config repositories.remote https://upload.pypi.org/legacy/ poetry --no-interaction -v publish --build --repository remote --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD" env: diff --git a/README.md b/README.md index 4a9f9d6..82e55e1 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Agoraio Python Library [![fern shield](https://img.shields.io/badge/%F0%9F%8C%BF-Built%20with%20Fern-brightgreen)](https://buildwithfern.com?utm_source=github&utm_medium=github&utm_campaign=readme&utm_source=https%3A%2F%2Fgithub.com%2FAgoraIO-Conversational-AI%2Fagent-server-sdk-python) -[![pypi](https://img.shields.io/pypi/v/agora-agent-server-sdk)](https://pypi.python.org/pypi/agora-agent-server-sdk) +[![pypi](https://img.shields.io/pypi/v/agora-agents)](https://pypi.python.org/pypi/agora-agents) The Agora Conversational AI SDK provides convenient access to the Agora Conversational AI APIs, enabling you to build voice-powered AI agents with support for both cascading flows (ASR -> LLM -> TTS) @@ -37,7 +37,7 @@ and multimodal flows (MLLM) for real-time audio processing. ## Installation ```sh -pip install agora-agent-server-sdk +pip install agora-agents ``` ## Quick Start @@ -216,6 +216,10 @@ API reference documentation is available [here](https://docs.agora.io/en/convers A full reference for this library is available [here](https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python/blob/HEAD/./reference.md). 
+## Package Rename Compatibility + +The published package name is now `agora-agents`, while the Python import path remains `agora_agent` for compatibility. The legacy PyPI distribution name `agora-agent-server-sdk` is maintained as a compatibility package in [compat/agora-agent-server-sdk](./compat/agora-agent-server-sdk), and the tag-based release workflow publishes both distributions together. + ## MLLM Flow (Multimodal) For real-time audio processing using OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. MLLM mode does not require separate TTS, STT, or LLM vendors. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details. diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md new file mode 100644 index 0000000..1388836 --- /dev/null +++ b/compat/agora-agent-server-sdk/README.md @@ -0,0 +1,11 @@ +# agora-agent-server-sdk + +This package has been renamed to `agora-agents`. + +New projects should install: + +```sh +pip install agora-agents +``` + +This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml new file mode 100644 index 0000000..8efbe53 --- /dev/null +++ b/compat/agora-agent-server-sdk/pyproject.toml @@ -0,0 +1,39 @@ +[project] +name = "agora-agent-server-sdk" + +[tool.poetry] +name = "agora-agent-server-sdk" +version = "v2.0.0" +description = "Compatibility shim for the renamed agora-agents package." 
+readme = "README.md" +authors = [] +keywords = [] + +classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" +] + +[tool.poetry.urls] +Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + +[tool.poetry.dependencies] +python = "^3.8" +agora-agents = ">=2.0.0,<3.0.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/pyproject.toml b/pyproject.toml index 716428a..36a9552 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,8 @@ [project] -name = "agora-agent-server-sdk" +name = "agora-agents" [tool.poetry] -name = "agora-agent-server-sdk" +name = "agora-agents" version = "v2.0.0" description = "" readme = "README.md" diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index e4d10bc..c44e886 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,9 +26,9 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agent-server-sdk/v2.0.0", + "User-Agent": "agora-agents/v2.0.0", "X-Fern-Language": "Python", - "X-Fern-SDK-Name": "agora-agent-server-sdk", + "X-Fern-SDK-Name": "agora-agents", "X-Fern-SDK-Version": "v2.0.0", **(self.get_custom_headers() or {}), } diff --git a/src/agora_agent/version.py b/src/agora_agent/version.py index 160096b..6f7b967 100644 --- 
a/src/agora_agent/version.py +++ b/src/agora_agent/version.py @@ -1,3 +1,3 @@ from importlib import metadata -__version__ = metadata.version("agora-agent-server-sdk") +__version__ = metadata.version("agora-agents") From d29165c4ddd8296af703a4e9ed848516f563dd1b Mon Sep 17 00:00:00 2001 From: chenyuguo Date: Wed, 27 May 2026 17:24:50 +0800 Subject: [PATCH 17/28] make python compat package publishable --- compat/agora-agent-server-sdk/README.md | 2 ++ compat/agora-agent-server-sdk/pyproject.toml | 3 +++ .../src/agora_agent_server_sdk_compat/__init__.py | 1 + 3 files changed, 6 insertions(+) create mode 100644 compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md index 1388836..cff3cfe 100644 --- a/compat/agora-agent-server-sdk/README.md +++ b/compat/agora-agent-server-sdk/README.md @@ -9,3 +9,5 @@ pip install agora-agents ``` This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + +It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml index 8efbe53..ac93128 100644 --- a/compat/agora-agent-server-sdk/pyproject.toml +++ b/compat/agora-agent-server-sdk/pyproject.toml @@ -26,6 +26,9 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", "Typing :: Typed" ] +packages = [ + { include = "agora_agent_server_sdk_compat", from = "src"} +] [tool.poetry.urls] Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py new file mode 100644 index 0000000..55522c6 --- /dev/null +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py @@ -0,0 +1 @@ +"""Compatibility package for the renamed agora-agents distribution.""" From fae1249a20c53761a2eb5515a1bf92ca666760d1 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 27 May 2026 16:58:18 -0400 Subject: [PATCH 18/28] Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. --- compat/agora-agent-server-sdk/README.md | 7 +++++-- .../src/agora_agent_server_sdk_compat/__init__.py | 14 +++++++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md index cff3cfe..e43d1d8 100644 --- a/compat/agora-agent-server-sdk/README.md +++ b/compat/agora-agent-server-sdk/README.md @@ -8,6 +8,9 @@ New projects should install: pip install agora-agents ``` -This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. 
+This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: -It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. +```python +from agora_agent import Agora, Area +from agora_agent_server_sdk_compat import Agora, Area +``` diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py index 55522c6..6283244 100644 --- a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py @@ -1 +1,13 @@ -"""Compatibility package for the renamed agora-agents distribution.""" +"""Compatibility re-exports for the renamed agora-agents package.""" + +import agora_agent as _agora_agent + +__all__ = getattr(_agora_agent, "__all__", []) + + +def __getattr__(name: str): + return getattr(_agora_agent, name) + + +def __dir__(): + return dir(_agora_agent) From 4d32368cb89e01a1fca410e69ddbb2756cb215e1 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 27 May 2026 16:58:35 -0400 Subject: [PATCH 19/28] Add compat-build CI job and harden dual-package PyPI publish Build and verify the compat wheel re-exports, gate publish on compat-build, simplify version checks with poetry version, wait for primary package on PyPI, and retry compat publish on failure. --- .github/workflows/ci.yml | 80 ++++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7ff5650..81f51c3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,8 +35,38 @@ jobs: - name: Test run: poetry run pytest -rP . 
+ compat-build: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Set up python + uses: actions/setup-python@v4 + with: + python-version: 3.8 + - name: Bootstrap poetry + run: | + curl -sSL https://install.python-poetry.org | python - -y --version 1.5.1 + - name: Install primary package + run: poetry install + - name: Build and verify compatibility package + run: | + cd compat/agora-agent-server-sdk + poetry build + cd ../.. + poetry run pip install compat/agora-agent-server-sdk/dist/*.whl + poetry run python - <<'PY' + import agora_agent + from agora_agent_server_sdk_compat import Agora, Area, __version__ + + assert Agora is agora_agent.Agora + assert Area is agora_agent.Area + assert __version__ == agora_agent.__version__ + print("Compat shim re-exports verified.") + PY + publish: - needs: [compile, test] + needs: [compile, test, compat-build] if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') runs-on: ubuntu-latest steps: @@ -55,27 +85,9 @@ jobs: - name: Verify package versions match release tag run: | TAG_VERSION="${GITHUB_REF_NAME#v}" - ROOT_VERSION="$(python - <<'PY' -from pathlib import Path -import tomllib -data = tomllib.loads(Path("pyproject.toml").read_text()) -print(data["tool"]["poetry"]["version"].lstrip("v")) -PY -)" - COMPAT_VERSION="$(python - <<'PY' -from pathlib import Path -import tomllib -data = tomllib.loads(Path("compat/agora-agent-server-sdk/pyproject.toml").read_text()) -print(data["tool"]["poetry"]["version"].lstrip("v")) -PY -)" - COMPAT_DEP_VERSION="$(python - <<'PY' -from pathlib import Path -import tomllib -data = tomllib.loads(Path("compat/agora-agent-server-sdk/pyproject.toml").read_text()) -print(data["tool"]["poetry"]["dependencies"]["agora-agents"]) -PY -)" + ROOT_VERSION="$(poetry version -s | sed 's/^v//')" + COMPAT_VERSION="$(cd compat/agora-agent-server-sdk && poetry version -s | sed 's/^v//')" + COMPAT_DEP_VERSION="$(python -c "import re, sys; from pathlib 
import Path; text = Path('compat/agora-agent-server-sdk/pyproject.toml').read_text(); match = re.search(r'^agora-agents\s*=\s*\"([^\"]+)\"', text, re.M); sys.exit('agora-agents dependency not found in compat pyproject.toml') if not match else None; print(match.group(1))")" if [ "$ROOT_VERSION" != "$TAG_VERSION" ]; then echo "Root package version ($ROOT_VERSION) does not match tag version ($TAG_VERSION)." @@ -100,11 +112,33 @@ PY PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + - name: Wait for primary package on PyPI + run: | + TAG_VERSION="${GITHUB_REF_NAME#v}" + PACKAGE="agora-agents" + for attempt in $(seq 1 12); do + if pip index versions "$PACKAGE" 2>/dev/null | grep -q "$TAG_VERSION"; then + echo "$PACKAGE==$TAG_VERSION is available on PyPI." + exit 0 + fi + echo "Waiting for $PACKAGE==$TAG_VERSION on PyPI (attempt $attempt/12)..." + sleep 10 + done + echo "Timed out waiting for $PACKAGE==$TAG_VERSION on PyPI." + exit 1 + - name: Publish compatibility package to pypi run: | cd compat/agora-agent-server-sdk poetry config repositories.remote https://upload.pypi.org/legacy/ - poetry --no-interaction -v publish --build --repository remote --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD" + for attempt in $(seq 1 3); do + if poetry --no-interaction -v publish --build --repository remote --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD"; then + exit 0 + fi + echo "Compat publish failed (attempt $attempt/3). Retrying in 15s..." + sleep 15 + done + exit 1 env: PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} From fc9d93c3026a6109d8a5e8b386418592f8d121c5 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 27 May 2026 16:58:47 -0400 Subject: [PATCH 20/28] Document agora-agents PyPI install name and migration notes Update installation guide, root README, and changelog for the renamed distribution while keeping the agora_agent import path. 
--- README.md | 2 +- changelog.md | 2 ++ docs/getting-started/installation.md | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 82e55e1..fff69d9 100644 --- a/README.md +++ b/README.md @@ -218,7 +218,7 @@ A full reference for this library is available [here](https://github.com/AgoraIO ## Package Rename Compatibility -The published package name is now `agora-agents`, while the Python import path remains `agora_agent` for compatibility. The legacy PyPI distribution name `agora-agent-server-sdk` is maintained as a compatibility package in [compat/agora-agent-server-sdk](./compat/agora-agent-server-sdk), and the tag-based release workflow publishes both distributions together. +The published package name is now `agora-agents`, while the Python import path remains `agora_agent` for compatibility. The legacy PyPI distribution name `agora-agent-server-sdk` is maintained as a compatibility package in [compat/agora-agent-server-sdk](./compat/agora-agent-server-sdk). It re-exports the public API from `agora-agents`, and the tag-based release workflow publishes both distributions together. ## MLLM Flow (Multimodal) diff --git a/changelog.md b/changelog.md index 1f0848d..a3bf495 100644 --- a/changelog.md +++ b/changelog.md @@ -23,6 +23,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). - **Avatar token gating** — Session enrichment uses `is_avatar_token_managed` (vendor-only); UID checks remain in session logic. - **`XaiGrok` is the primary xAI MLLM class** — Matches the product name ([xAI Grok](https://docs.agora.io/en/conversational-ai/models/mllm/xai)) and the TypeScript/Go SDKs. - **Package version** — Bumped to `v2.0.0` to match the Fern-generated SDK headers. +- **PyPI distribution rename** — The published package name is now `agora-agents` (formerly `agora-agent-server-sdk`). The Python import path remains `agora_agent`. 
- **RTM data channel default** — When `advanced_features.enable_rtm=True`, AgentKit now defaults `parameters.data_channel` to `"rtm"` unless the caller explicitly sets a data channel. - **Agent-level LLM overrides** — In the standard ASR + LLM + TTS pipeline, agent-level `greeting`, `failure_message`, and `max_history` now override vendor defaults, matching the TypeScript SDK. In MLLM mode, agent-level `greeting` and `failure_message` fill only missing fields. - **MLLM core alignment** — MLLM wrappers no longer expose or emit unsupported `predefined_tools` or `max_history` fields because they are not present in the generated v2.7 core `mllm` type. @@ -33,6 +34,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Migration notes +- **PyPI package rename** — Install `agora-agents` instead of `agora-agent-server-sdk` (`pip install agora-agents` or `poetry add agora-agents`). The import path is unchanged (`from agora_agent import ...`). The legacy PyPI distribution name remains available as a compatibility shim that re-exports the public API from `agora-agents`. - **Deprecated aliases** — Use `LiveAvatarAvatar` instead of `HeyGenAvatar`, `is_avatar_token_managed` instead of `is_rtc_avatar`, and `ThinkOn*` / `ThinkResponse` instead of `AgentThinkRequestOn*` / `AgentThinkResponse`. - **`think()` default** — The server default for `on_listening_action` changed from `inject` to `interrupt` in API v2.7. Pass `on_listening_action="inject"` explicitly to preserve the old behavior. diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index c14bdb2..f6f1750 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -13,13 +13,13 @@ description: Install the Agora Conversational AI Python SDK. 
## Install with pip ```sh -pip install agora-agent-sdk +pip install agora-agents ``` ## Install with Poetry ```sh -poetry add agora-agent-sdk +poetry add agora-agents ``` ## Dependencies From 44c21c14a14aa7ad469a18ce86024ff14ca2bf9b Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 27 May 2026 17:28:14 -0400 Subject: [PATCH 21/28] Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. --- src/agora_agent/__init__.py | 56 +++++++++++++---------- src/agora_agent/agentkit/agent.py | 3 +- src/agora_agent/agentkit/agent_session.py | 8 +--- tests/custom/test_root_exports.py | 29 ++++++++++++ 4 files changed, 64 insertions(+), 32 deletions(-) create mode 100644 tests/custom/test_root_exports.py diff --git a/src/agora_agent/__init__.py b/src/agora_agent/__init__.py index 791c156..e1a4942 100644 --- a/src/agora_agent/__init__.py +++ b/src/agora_agent/__init__.py @@ -2,9 +2,13 @@ # isort: skip_file +"""Agora Conversational AI Python SDK.""" + import typing from importlib import import_module +from . import agentkit as _agentkit + if typing.TYPE_CHECKING: from . 
import agents, agentkit, core, phone_numbers, telephony from .core.domain import Area, Pool, create_pool @@ -20,6 +24,7 @@ GenerateTokenOptions, ) from .agentkit.agent_session import AsyncAgentSession + _dynamic_imports: typing.Dict[str, str] = { "Agora": ".pool_client", "Agent": ".agentkit", @@ -41,29 +46,7 @@ "telephony": ".telephony", } - -def __getattr__(attr_name: str) -> typing.Any: - module_name = _dynamic_imports.get(attr_name) - if module_name is None: - raise AttributeError(f"No {attr_name} found in _dynamic_imports for module name -> {__name__}") - try: - module = import_module(module_name, __package__) - if module_name == f".{attr_name}": - return module - else: - return getattr(module, attr_name) - except ImportError as e: - raise ImportError(f"Failed to import {attr_name} from {module_name}: {e}") from e - except AttributeError as e: - raise AttributeError(f"Failed to get {attr_name} from {module_name}: {e}") from e - - -def __dir__(): - lazy_attrs = list(_dynamic_imports.keys()) - return sorted(lazy_attrs) - - -__all__ = [ +_ROOT_ALL = ( "Agora", "Agent", "AgentSession", @@ -83,4 +66,29 @@ def __dir__(): "generate_rtc_token", "phone_numbers", "telephony", -] +) + +__all__ = sorted({*_ROOT_ALL, *_agentkit.__all__}) + + +def __getattr__(attr_name: str) -> typing.Any: + module_name = _dynamic_imports.get(attr_name) + if module_name is not None: + try: + module = import_module(module_name, __package__) + if module_name == f".{attr_name}": + return module + return getattr(module, attr_name) + except ImportError as e: + raise ImportError(f"Failed to import {attr_name} from {module_name}: {e}") from e + except AttributeError as e: + raise AttributeError(f"Failed to get {attr_name} from {module_name}: {e}") from e + + if attr_name in _agentkit.__all__: + return getattr(_agentkit, attr_name) + + raise AttributeError(f"module {__name__!r} has no attribute {attr_name!r}") + + +def __dir__() -> typing.List[str]: + return list(__all__) diff --git 
a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index a820291..f84862c 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -231,8 +231,7 @@ class Agent: Examples -------- - >>> from agora_agent.agentkit import Agent - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT >>> >>> agent = Agent(instructions="You are a helpful voice assistant.") >>> agent = ( diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index fb8e548..a749d1e 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -412,12 +412,10 @@ class AgentSession(_AgentSessionBase): Examples -------- - >>> from agora_agent import Agora, Area - >>> from agora_agent.agentkit import Agent + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS >>> >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) >>> agent_id = session.start() @@ -735,12 +733,10 @@ class AsyncAgentSession(_AgentSessionBase): Examples -------- - >>> from agora_agent import AsyncAgora, Area - >>> from agora_agent.agentkit import Agent + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS >>> >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") >>> agent = Agent(name="assistant", instructions="You are helpful.") - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS >>> agent = agent.with_llm(OpenAI(api_key="...", 
model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) >>> agent_id = await session.start() diff --git a/tests/custom/test_root_exports.py b/tests/custom/test_root_exports.py new file mode 100644 index 0000000..9b2f508 --- /dev/null +++ b/tests/custom/test_root_exports.py @@ -0,0 +1,29 @@ +import pytest + +import agora_agent +import agora_agent.agentkit as agentkit + + +def test_root_exports_match_agentkit_for_common_symbols() -> None: + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + assert getattr(agora_agent, name) is getattr(agentkit, name) + + +def test_root_exports_fern_client_symbols() -> None: + assert agora_agent.Agora is not None + assert agora_agent.Area is not None + assert agora_agent.AsyncAgora is not None + + +def test_unknown_root_export_raises_attribute_error() -> None: + with pytest.raises(AttributeError): + _ = agora_agent.NotARealExportName + + +def test_dir_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in dir(agora_agent) + + +def test_all_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in agora_agent.__all__ + assert "OpenAI" in agora_agent.__all__ From 87fc44880ecd4ce016ce4f1751cc531b8b6efb13 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 27 May 2026 17:29:04 -0400 Subject: [PATCH 22/28] Update docs to import from agora_agent package root Replace split agentkit import paths across guides, concepts, and reference pages. Correct architecture to three layers and consolidate installation layout. 
--- docs/concepts/agent.md | 10 +++----- docs/concepts/architecture.md | 35 ++++++++++++-------------- docs/concepts/session.md | 4 +-- docs/concepts/vendors.md | 14 +++++------ docs/getting-started/authentication.md | 3 +-- docs/getting-started/installation.md | 22 ++++++++++------ docs/getting-started/quick-start.md | 3 +-- docs/guides/agent-builder-features.md | 31 ++++++++++++++--------- docs/guides/avatars.md | 8 +++--- docs/guides/byok.md | 4 +-- docs/guides/cascading-flow.md | 14 +++-------- docs/guides/mllm-flow.md | 20 +++++---------- docs/reference/agent.md | 21 +++++----------- docs/reference/session.md | 4 +-- docs/reference/vendors.md | 14 +++++------ 15 files changed, 92 insertions(+), 115 deletions(-) diff --git a/docs/concepts/agent.md b/docs/concepts/agent.md index 0a8b159..dd9d3ed 100644 --- a/docs/concepts/agent.md +++ b/docs/concepts/agent.md @@ -12,7 +12,7 @@ The `Agent` class is a fluent builder for configuring AI agent properties. It co ```python -from agora_agent.agentkit import Agent +from agora_agent import Agent agent = Agent( name='support-assistant', @@ -75,8 +75,8 @@ Each `with_*` method returns a **new** `Agent` instance — the original is unch ```python -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import Agent +from agora_agent import OpenAI, ElevenLabsTTS, DeepgramSTT agent = ( Agent(name='my-agent', instructions='You are a helpful assistant.') @@ -92,9 +92,7 @@ Because each `with_*` call returns a new `Agent`, you can build a base configura ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import Agent, Agora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT client = Agora(area=Area.US, app_id='your-app-id', app_certificate='your-app-certificate') diff --git a/docs/concepts/architecture.md b/docs/concepts/architecture.md 
index 42dece9..4bdc208 100644 --- a/docs/concepts/architecture.md +++ b/docs/concepts/architecture.md @@ -6,9 +6,9 @@ description: How the Python SDK layers are structured and when to use each. # Architecture -## Two-Layer Design +## Three-Layer Design -The Python SDK has two layers: +The Python SDK has three layers: ``` +--------------------------------------------------+ @@ -83,23 +83,20 @@ await session.stop() The `Agent` builder class is the same for both — it does not make HTTP calls, so it has no async variant. -## Import Paths +## Import paths - ```python -# Top-level client and types -from agora_agent import Agora, AsyncAgora, Area, Pool - -# Agentkit layer -from agora_agent.agentkit import Agent, AgentSession -from agora_agent.agentkit.agent_session import AsyncAgentSession - -# Vendor classes -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT - -# Token helpers -from agora_agent.agentkit.token import generate_rtc_token - -# Also available from top-level -from agora_agent import Agent, AgentSession, AsyncAgentSession, generate_rtc_token +from agora_agent import ( + Agent, + AgentSession, + AsyncAgentSession, + Agora, + AsyncAgora, + Area, + Pool, + OpenAI, + ElevenLabsTTS, + DeepgramSTT, + generate_rtc_token, +) ``` diff --git a/docs/concepts/session.md b/docs/concepts/session.md index 4c9f17d..a7f3150 100644 --- a/docs/concepts/session.md +++ b/docs/concepts/session.md @@ -35,9 +35,7 @@ Use `Agent.create_session()` to create a session: ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import Agent, Agora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT client = Agora(area=Area.US, app_id='your-app-id', app_certificate='your-app-certificate') diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 7ef72b4..0ab396b 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -8,11 
+8,11 @@ description: Typed vendor classes for LLM, TTS, STT, MLLM, and Avatar providers. The SDK provides typed vendor classes for every supported provider. Each vendor class validates its configuration with Pydantic and produces the correct API payload automatically. -All vendor classes are available from `agora_agent.agentkit.vendors`: +All vendor classes are imported from `agora_agent`. ```python -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT +from agora_agent import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT ``` ## LLM Vendors @@ -28,7 +28,7 @@ Used with `agent.with_llm()` for the cascading flow (ASR → LLM → TTS). ```python -from agora_agent.agentkit.vendors import OpenAI +from agora_agent import OpenAI llm = OpenAI(api_key='your-openai-key', model='gpt-4o-mini') ``` @@ -55,7 +55,7 @@ Used with `agent.with_tts()`. Each TTS vendor produces audio at a specific sampl ```python -from agora_agent.agentkit.vendors import ElevenLabsTTS +from agora_agent import ElevenLabsTTS tts = ElevenLabsTTS( key='your-elevenlabs-key', @@ -83,7 +83,7 @@ Used with `agent.with_stt()`. ```python -from agora_agent.agentkit.vendors import DeepgramSTT +from agora_agent import DeepgramSTT stt = DeepgramSTT(api_key='your-deepgram-key', language='en-US', model='nova-2') ``` @@ -101,7 +101,7 @@ Used with `agent.with_mllm()` for the [MLLM flow](../guides/mllm-flow.md). These ```python -from agora_agent.agentkit.vendors import OpenAIRealtime +from agora_agent import OpenAIRealtime mllm = OpenAIRealtime(api_key='your-openai-key', model='gpt-4o-realtime-preview') ``` @@ -120,7 +120,7 @@ Used with `agent.with_avatar()` in the cascading ASR + LLM + TTS pipeline. 
Some ```python -from agora_agent.agentkit.vendors import HeyGenAvatar +from agora_agent import HeyGenAvatar avatar = HeyGenAvatar(api_key='your-heygen-key', quality='medium', agora_uid='2') ``` diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md index b508647..997a854 100644 --- a/docs/getting-started/authentication.md +++ b/docs/getting-started/authentication.md @@ -13,8 +13,7 @@ Create `Agora` or `AsyncAgora` with `app_id` and `app_certificate`, then let `Ag ## Recommended: app credentials ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent, AgentPresets +from agora_agent import Agent, Agora, Area, AgentPresets client = Agora( area=Area.US, diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index f6f1750..54801e5 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -22,15 +22,13 @@ pip install agora-agents poetry add agora-agents ``` -## Dependencies +## Imports -The following packages are installed automatically: +```python +from agora_agent import Agent, Agora, Area, AgentPresets, DeepgramSTT, OpenAI +``` -| Package | Purpose | -|---|---| -| `httpx` (>= 0.21.2) | HTTP client for sync and async requests | -| `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | -| `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | +The package installs as `agora-agents` and imports as `agora_agent`. ## Sync vs. Async @@ -47,4 +45,12 @@ from agora_agent import Agora, Area from agora_agent import AsyncAgora, AsyncAgentSession, Area ``` -Both clients share the same constructor parameters and capabilities. See [Authentication](./authentication.md) for setup details. 
+## Dependencies + +| Package | Purpose | +| ------------------------------ | ------------------------------------------------------ | +| `httpx` (>= 0.21.2) | HTTP client for sync and async requests | +| `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | +| `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + +See [Authentication](./authentication.md) for setup details. diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md index 53ca81d..217a097 100644 --- a/docs/getting-started/quick-start.md +++ b/docs/getting-started/quick-start.md @@ -16,8 +16,7 @@ This guide uses the recommended onboarding path: ## Sync example ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent, AgentPresets +from agora_agent import Agent, Agora, Area, AgentPresets def main() -> None: diff --git a/docs/guides/agent-builder-features.md b/docs/guides/agent-builder-features.md index 16efad6..3b55b49 100644 --- a/docs/guides/agent-builder-features.md +++ b/docs/guides/agent-builder-features.md @@ -30,9 +30,17 @@ For string values with a finite set of options (e.g. `data_channel`, `sal_mode`, SAL helps the agent focus on the primary speaker and suppress background noise. Enable it via `advanced_features` and configure with `with_sal`: ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent, AdvancedFeatures, SalConfig, SalModeValues -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import ( + Agent, + Agora, + Area, + AdvancedFeatures, + SalConfig, + SalModeValues, + OpenAI, + ElevenLabsTTS, + DeepgramSTT, +) agent = ( Agent( @@ -57,8 +65,7 @@ Use `SalModeValues.LOCKING` or `SalModeValues.RECOGNITION` for type safety. 
Enable MLLM, RTM, SAL, or tools: ```python -from agora_agent.agentkit import Agent, AdvancedFeatures -from agora_agent.agentkit.vendors import OpenAIRealtime +from agora_agent import Agent, AdvancedFeatures, OpenAIRealtime # MLLM mode (see mllm-flow guide) agent = Agent().with_mllm(OpenAIRealtime(api_key='...')) @@ -75,7 +82,7 @@ agent = Agent().with_tools() Configure silence handling, farewell behavior, and data channel: ```python -from agora_agent.agentkit import ( +from agora_agent import ( Agent, SessionParams, SilenceConfig, @@ -134,7 +141,7 @@ agent = ( Restrict which geographic regions the backend can use: ```python -from agora_agent.agentkit import Agent, GeofenceConfig, GeofenceArea, GeofenceExcludeArea +from agora_agent import Agent, GeofenceConfig, GeofenceArea, GeofenceExcludeArea agent = ( Agent() @@ -179,7 +186,7 @@ agent = ( Configure RTC media encryption: ```python -from agora_agent.agentkit import Agent, RtcConfig +from agora_agent import Agent, RtcConfig agent = ( Agent() @@ -198,7 +205,7 @@ agent = ( Play filler words while waiting for the LLM response: ```python -from agora_agent.agentkit import ( +from agora_agent import ( Agent, FillerWordsConfig, FillerWordsTrigger, @@ -235,7 +242,7 @@ agent = ( Read back configuration via properties: ```python -from agora_agent.agentkit import Agent, GeofenceConfig, GeofenceArea +from agora_agent import Agent, GeofenceConfig, GeofenceArea agent = ( Agent(max_history=20) @@ -260,7 +267,7 @@ agent.config # Full read-only snapshot ```python from agora_agent import Agora, Area -from agora_agent.agentkit import ( +from agora_agent import ( Agent, AdvancedFeatures, SessionParams, @@ -277,7 +284,7 @@ from agora_agent.agentkit import ( DataChannel, FillerWordsSelectionRule, ) -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import OpenAI, ElevenLabsTTS, DeepgramSTT client = Agora( area=Area.US, diff --git a/docs/guides/avatars.md b/docs/guides/avatars.md index 
1f1200a..ca50966 100644 --- a/docs/guides/avatars.md +++ b/docs/guides/avatars.md @@ -45,9 +45,7 @@ Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. L HeyGen requires a TTS vendor configured at 24000 Hz: ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT, HeyGenAvatar +from agora_agent import Agent, Agora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT, HeyGenAvatar client = Agora( area=Area.US, @@ -84,7 +82,7 @@ session.stop() `GenericAvatar` supports custom avatar providers. `agora_appid`, `agora_channel`, and `agora_token` are optional when using `AgentSession.start()`. ```python -from agora_agent.agentkit.vendors import GenericAvatar +from agora_agent import GenericAvatar agent = agent.with_avatar(GenericAvatar( api_key='your-avatar-provider-key', @@ -99,7 +97,7 @@ agent = agent.with_avatar(GenericAvatar( Akool requires a TTS vendor configured at 16000 Hz: ```python -from agora_agent.agentkit.vendors import ElevenLabsTTS, AkoolAvatar +from agora_agent import ElevenLabsTTS, AkoolAvatar agent = ( Agent(name='akool-agent', instructions='You are a helpful assistant.') diff --git a/docs/guides/byok.md b/docs/guides/byok.md index 53fee78..ffbb91b 100644 --- a/docs/guides/byok.md +++ b/docs/guides/byok.md @@ -20,9 +20,7 @@ Typical reasons: ```python import os -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import DeepgramSTT, ElevenLabsTTS, OpenAI +from agora_agent import Agent, Agora, Area, DeepgramSTT, ElevenLabsTTS, OpenAI def main() -> None: diff --git a/docs/guides/cascading-flow.md b/docs/guides/cascading-flow.md index 6b4d59e..43ff2af 100644 --- a/docs/guides/cascading-flow.md +++ b/docs/guides/cascading-flow.md @@ -17,9 +17,7 @@ User audio → STT → LLM → TTS → Agent audio ### Sync ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import 
Agent -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import Agent, Agora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT client = Agora( area=Area.US, @@ -45,9 +43,7 @@ session.stop() ```python import asyncio -from agora_agent import AsyncAgora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import Agent, AsyncAgora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT async def main(): client = AsyncAgora( @@ -77,9 +73,7 @@ asyncio.run(main()) This combination keeps everything within the Azure ecosystem: ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import AzureOpenAI, MicrosoftTTS, MicrosoftSTT +from agora_agent import Agent, Agora, Area, AzureOpenAI, MicrosoftTTS, MicrosoftSTT client = Agora( area=Area.EU, @@ -118,7 +112,7 @@ session.stop() All LLM vendors support optional parameters for fine-tuning: ```python -from agora_agent.agentkit.vendors import OpenAI +from agora_agent import OpenAI llm = OpenAI( api_key='your-openai-key', diff --git a/docs/guides/mllm-flow.md b/docs/guides/mllm-flow.md index 392206c..328514b 100644 --- a/docs/guides/mllm-flow.md +++ b/docs/guides/mllm-flow.md @@ -20,7 +20,7 @@ MLLM vendors supported by AgentKit: Call `agent.with_mllm(vendor)` to enable MLLM mode. The builder sets `mllm.enable = True` automatically. MLLM sessions do not require TTS, STT, or LLM vendors. Avatars are currently supported only with the cascading ASR + LLM + TTS pipeline. 
```python -from agora_agent.agentkit import Agent +from agora_agent import Agent agent = Agent(name='realtime-agent') ``` @@ -30,9 +30,7 @@ agent = Agent(name='realtime-agent') ### Sync ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAIRealtime +from agora_agent import Agent, Agora, Area, OpenAIRealtime client = Agora( area=Area.US, @@ -58,9 +56,7 @@ session.stop() ```python import asyncio -from agora_agent import AsyncAgora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAIRealtime +from agora_agent import Agent, AsyncAgora, Area, OpenAIRealtime async def main(): client = AsyncAgora( @@ -89,9 +85,7 @@ asyncio.run(main()) Gemini Live uses a Google AI API key: ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import GeminiLive +from agora_agent import Agent, Agora, Area, GeminiLive client = Agora( area=Area.AP, @@ -116,9 +110,7 @@ session.stop() ## xAI Grok ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import XaiGrok +from agora_agent import Agent, Agora, Area, XaiGrok client = Agora(area=Area.US, app_id='your-app-id', app_certificate='your-app-certificate') @@ -143,7 +135,7 @@ For xAI turn detection, use `mllm.turn_detection` with `agora_vad` or `server_va ## OpenAI Realtime with Custom Options ```python -from agora_agent.agentkit.vendors import OpenAIRealtime +from agora_agent import OpenAIRealtime mllm = OpenAIRealtime( api_key='your-openai-key', diff --git a/docs/reference/agent.md b/docs/reference/agent.md index 3163f9c..93770e9 100644 --- a/docs/reference/agent.md +++ b/docs/reference/agent.md @@ -6,7 +6,7 @@ description: Full API reference for the Python Agent builder class. 
# Agent Reference -**Import:** `from agora_agent.agentkit import Agent` or `from agora_agent import Agent` +**Import:** `from agora_agent import Agent` ## Constructor @@ -57,7 +57,7 @@ Set the LLM vendor for cascading flow. ```python -from agora_agent.agentkit.vendors import OpenAI +from agora_agent import OpenAI agent = Agent().with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) ``` @@ -67,7 +67,7 @@ Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. ```python -from agora_agent.agentkit.vendors import ElevenLabsTTS +from agora_agent import ElevenLabsTTS agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) ``` @@ -77,7 +77,7 @@ Set the STT (ASR) vendor. ```python -from agora_agent.agentkit.vendors import DeepgramSTT +from agora_agent import DeepgramSTT agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) ``` @@ -87,7 +87,7 @@ Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically set ```python -from agora_agent.agentkit.vendors import OpenAIRealtime +from agora_agent import OpenAIRealtime agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) ``` @@ -99,7 +99,7 @@ Raises `ValueError` if the TTS sample rate does not match the avatar's `required ```python -from agora_agent.agentkit.vendors import HeyGenAvatar +from agora_agent import HeyGenAvatar agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) ``` @@ -270,12 +270,3 @@ to_properties( Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. 
- -## Cross-SDK discovery map - -| Concept | Python | TypeScript | Go | -|---|---|---|---| -| STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | -| xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | -| Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | -| Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | diff --git a/docs/reference/session.md b/docs/reference/session.md index f5ce663..63402f6 100644 --- a/docs/reference/session.md +++ b/docs/reference/session.md @@ -9,8 +9,8 @@ description: Full API reference for the Python AgentSession class. **Import:** ```python -from agora_agent.agentkit import AgentSession -from agora_agent.agentkit.agent_session import AsyncAgentSession +from agora_agent import AgentSession +from agora_agent import AsyncAgentSession # or from top-level: from agora_agent import AgentSession, AsyncAgentSession ``` diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index 0b606dd..eb8290f 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -6,11 +6,11 @@ description: Constructor options for all LLM, TTS, STT, MLLM, and Avatar vendor # Vendor Reference -All vendor classes are available from `agora_agent.agentkit.vendors`: +All vendor classes are available from `agora_agent`: ```python -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT, OpenAIRealtime, XaiGrok, GenericAvatar +from agora_agent import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT, OpenAIRealtime, XaiGrok, GenericAvatar ``` --- @@ -41,7 +41,7 @@ from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramTTS, Dee ```python -from agora_agent.agentkit.vendors import OpenAI +from agora_agent import OpenAI llm = OpenAI(api_key='your-key', model='gpt-4o-mini', temperature=0.7) ``` @@ -69,7 
+69,7 @@ llm = OpenAI(api_key='your-key', model='gpt-4o-mini', temperature=0.7) ```python -from agora_agent.agentkit.vendors import AzureOpenAI +from agora_agent import AzureOpenAI llm = AzureOpenAI( api_key='your-azure-key', @@ -99,7 +99,7 @@ llm = AzureOpenAI( ```python -from agora_agent.agentkit.vendors import Anthropic +from agora_agent import Anthropic llm = Anthropic(api_key='your-anthropic-key', model='claude-3-5-sonnet-20241022') ``` @@ -126,7 +126,7 @@ llm = Anthropic(api_key='your-anthropic-key', model='claude-3-5-sonnet-20241022' ```python -from agora_agent.agentkit.vendors import Gemini +from agora_agent import Gemini llm = Gemini(api_key='your-google-key', model='gemini-2.0-flash-exp') ``` @@ -409,7 +409,7 @@ Fixed sample rate: 24000 Hz. ### `XaiGrok` -xAI Grok MLLM vendor (`mllm.vendor`: `"xai"`). Matches the [xAI Grok](https://docs.agora.io/en/conversational-ai/models/mllm/xai) product docs and the TypeScript SDK. +xAI Grok MLLM vendor (`mllm.vendor`: `"xai"`). Matches the [xAI Grok](https://docs.agora.io/en/conversational-ai/models/mllm/xai) product docs. | Parameter | Type | Required | Default | Description | |---|---|---|---|---| From 3ee455879cbf211612502c18836119f4a27a5cc8 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 27 May 2026 17:29:15 -0400 Subject: [PATCH 23/28] Clean up README and changelog for agora-agents rename Fix README title, table of contents, and duplicate MLLM section. Consolidate quick-start imports to the root path. Correct the v2.0.0 type alias entry. 
--- README.md | 69 +++++++--------------------------------------------- changelog.md | 2 +- 2 files changed, 10 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index fff69d9..4fa990f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Agoraio Python Library +# Agora Conversational AI Python SDK [![fern shield](https://img.shields.io/badge/%F0%9F%8C%BF-Built%20with%20Fern-brightgreen)](https://buildwithfern.com?utm_source=github&utm_medium=github&utm_campaign=readme&utm_source=https%3A%2F%2Fgithub.com%2FAgoraIO-Conversational-AI%2Fagent-server-sdk-python) [![pypi](https://img.shields.io/pypi/v/agora-agents)](https://pypi.python.org/pypi/agora-agents) @@ -13,12 +13,11 @@ and multimodal flows (MLLM) for real-time audio processing. - [Requirements](#requirements) - [Installation](#installation) - [Quick Start](#quick-start) -- [Byok](#byok) -- [Mllm Realtime Multimodal](#mllm-realtime-multimodal) +- [BYOK](#byok) +- [MLLM (Realtime / Multimodal)](#mllm-realtime--multimodal) - [Documentation](#documentation) - [Reference](#reference) -- [Mllm Flow Multimodal](#mllm-flow-multimodal) -- [Mllm Flow Multimodal](#mllm-flow-multimodal) +- [Package Rename Compatibility](#package-rename-compatibility) - [Usage](#usage) - [Async Client](#async-client) - [Exception Handling](#exception-handling) @@ -48,9 +47,10 @@ The recommended onboarding path is a server-side builder flow: define the agent import os import time -from agora_agent import Agora, Area -from agora_agent.agentkit import ( +from agora_agent import ( Agent, + Agora, + Area, DataChannel, DeepgramSTT, GenericAvatar, @@ -195,7 +195,7 @@ If you want to bring your own vendor credentials instead of using Agora-managed Use `with_mllm()` for OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok. No STT, LLM, or TTS vendor is needed when MLLM mode is enabled. 
```python -from agora_agent.agentkit import Agent, OpenAIRealtime +from agora_agent import Agent, OpenAIRealtime agent = Agent(name="realtime-assistant").with_mllm( OpenAIRealtime( @@ -218,58 +218,7 @@ A full reference for this library is available [here](https://github.com/AgoraIO ## Package Rename Compatibility -The published package name is now `agora-agents`, while the Python import path remains `agora_agent` for compatibility. The legacy PyPI distribution name `agora-agent-server-sdk` is maintained as a compatibility package in [compat/agora-agent-server-sdk](./compat/agora-agent-server-sdk). It re-exports the public API from `agora-agents`, and the tag-based release workflow publishes both distributions together. - -## MLLM Flow (Multimodal) - -For real-time audio processing using OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. MLLM mode does not require separate TTS, STT, or LLM vendors. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details. 
- -```python -from agora_agent import Agora -from agora_agent.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesMllm, - StartAgentsRequestPropertiesMllmVendor, - StartAgentsRequestPropertiesTurnDetection, - StartAgentsRequestPropertiesTurnDetectionType, -) - -client = Agora( - customer_id="YOUR_CUSTOMER_ID", - customer_secret="YOUR_CUSTOMER_SECRET", -) - -client.agents.start( - appid="your_app_id", - name="mllm_agent", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="your_token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - mllm=StartAgentsRequestPropertiesMllm( - enable=True, - url="wss://api.openai.com/v1/realtime", - api_key="", - vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, - params={ - "model": "gpt-4o-realtime-preview", - "voice": "alloy", - }, - input_modalities=["audio"], - output_modalities=["text", "audio"], - greeting_message="Hello! I'm ready to chat in real-time.", - ), - turn_detection=StartAgentsRequestPropertiesTurnDetection( - type=StartAgentsRequestPropertiesTurnDetectionType.SERVER_VAD, - threshold=0.5, - silence_duration_ms=500, - ), - ), -) -``` - +The published package name is now `agora-agents`. The import path is still `agora_agent`. The legacy PyPI name `agora-agent-server-sdk` remains available as a compatibility package in [compat/agora-agent-server-sdk](./compat/agora-agent-server-sdk). ## MLLM Flow (Multimodal) diff --git a/changelog.md b/changelog.md index a3bf495..1174850 100644 --- a/changelog.md +++ b/changelog.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Added -- **Alias parity** — `AsrConfig` (= `SttConfig`), `is_avatar_token_managed`, think type aliases (`ThinkOnListeningAction`, etc.), think value constants, and cross-SDK discovery table in `docs/reference/agent.md`. 
+- **Type aliases** — `AsrConfig` (= `SttConfig`), `is_avatar_token_managed`, think type aliases (`ThinkOnListeningAction`, etc.), and think value constants. - **`XaiGrok`** — New MLLM wrapper for xAI Grok (`mllm.vendor`: `"xai"`), including Realtime API URL, voice, language, sample rate, modalities, messages, and MLLM turn detection support. - **`GenericAvatar`** — New generic avatar wrapper (`vendor: "generic"`) for custom avatar providers. - **Avatar token enrichment** — `AgentSession.start()` now fills missing generic avatar `agora_appid` and `agora_channel` from the session and generates missing avatar `agora_token` values for HeyGen, LiveAvatar, and Generic avatars using each avatar's `agora_uid`. From 923cf954808f9f366a734d6706310eab21a43e50 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 27 May 2026 18:06:06 -0400 Subject: [PATCH 24/28] Prioritize app credentials and builder in Python docs Rewrite getting-started auth and quick-start for app credentials with the builder API. De-emphasize presets and align index, BYOK, and README with the recommended onboarding path. --- README.md | 2 +- docs/concepts/architecture.md | 2 +- docs/getting-started/authentication.md | 74 +++++++++++++++++--------- docs/getting-started/installation.md | 2 +- docs/getting-started/quick-start.md | 40 +++++++------- docs/guides/byok.md | 10 ++-- docs/index.md | 9 ++-- 7 files changed, 82 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 4fa990f..4d56fee 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ pip install agora-agents ## Quick Start -The recommended onboarding path is a server-side builder flow: define the agent once, configure preset-backed providers in the builder, and let AgentKit infer the reseller `preset` values when the session starts. 
+The recommended onboarding path is a server-side builder flow: define the agent once, configure vendors on the builder, and let AgentKit infer Agora-managed configuration when credentials are omitted. ```python import os diff --git a/docs/concepts/architecture.md b/docs/concepts/architecture.md index 4bdc208..f2c058c 100644 --- a/docs/concepts/architecture.md +++ b/docs/concepts/architecture.md @@ -22,7 +22,7 @@ The Python SDK has three layers: +--------------------------------------------------+ ``` -### Agentkit Layer (`agora_agent.agentkit`) +### AgentKit Layer (`agora_agent.agentkit`) This is the primary developer-facing API. It provides: diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md index 997a854..31dcc56 100644 --- a/docs/getting-started/authentication.md +++ b/docs/getting-started/authentication.md @@ -1,19 +1,17 @@ --- sidebar_position: 2 title: Authentication -description: Configure the Python SDK with the recommended app-credentials flow and understand the supported auth modes. +description: Configure the Python SDK with app credentials and understand other supported auth modes. --- # Authentication -The recommended production path is app credentials mode. +Create `Agora` or `AsyncAgora` with `app_id` and `app_certificate` only. The SDK mints a fresh ConvoAI REST token for each API call and generates the RTC join token when the session starts. -Create `Agora` or `AsyncAgora` with `app_id` and `app_certificate`, then let `AgentSession` generate the ConvoAI REST auth token and the RTC join token automatically. 
- -## Recommended: app credentials +## App credentials ```python -from agora_agent import Agent, Agora, Area, AgentPresets +from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI, MiniMaxTTS client = Agora( area=Area.US, @@ -21,40 +19,68 @@ client = Agora( app_certificate="your-app-certificate", ) -agent = Agent(instructions="Be concise.") +agent = ( + Agent(instructions="Be concise.") + .with_stt(DeepgramSTT(model="nova-3")) + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) +) session = agent.create_session( client, channel="room-123", agent_uid="1", remote_uids=["100"], - preset=[ - AgentPresets.asr.deepgram_nova_3, - AgentPresets.llm.openai_gpt_5_mini, - AgentPresets.tts.openai_tts_1, - ], ) ``` -## Why this is the default +## Why app credentials + +- Fresh short-lived tokens per API call instead of reusing long-lived credentials +- No Customer ID / Customer Secret in request headers +- No manual REST or RTC token provisioning in application code + +## Inspecting auth mode -- The SDK handles ConvoAI REST auth and RTC join token generation for you. -- Your onboarding code stays focused on agent behavior instead of auth plumbing. -- Your quick start code stays vendor-key free when you use presets. +```python +print(client.auth_mode) # "app-credentials" +``` -## Other supported modes +## Other auth modes -The SDK also supports app-credentials mode and Basic Auth, but they are intentionally not the default onboarding path. +The SDK also supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. These are not recommended for new applications. -- App credentials are useful when your backend wants the SDK to mint ConvoAI REST tokens automatically. -- Basic Auth is supported for legacy integrations and account-level workflows. +### Token auth (`auth_token`) -## Inspecting the resolved auth mode +Pass a pre-minted Agora REST token on the client. 
You must also supply the RTC join token on `create_session(..., token=...)`. ```python -print(client.auth_mode) # "app-credentials" +client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + auth_token="your-rest-auth-token", +) + +session = agent.create_session( + client, + channel="room-123", + agent_uid="1", + remote_uids=["100"], + token="your-rtc-join-token", +) ``` -## Other supported modes +### Basic Auth (`customer_id` + `customer_secret`) + +Uses HTTP Basic Auth with Customer ID and Secret from Agora Console. Avoid for new integrations — the same credentials are sent on every request instead of minting fresh tokens. -`auth_token` and Basic Auth are still supported for advanced or legacy cases, but they are not the default onboarding path. +```python +client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + customer_id="your-customer-id", + customer_secret="your-customer-secret", +) +``` diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index 54801e5..04b48da 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -25,7 +25,7 @@ poetry add agora-agents ## Imports ```python -from agora_agent import Agent, Agora, Area, AgentPresets, DeepgramSTT, OpenAI +from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI ``` The package installs as `agora-agents` and imports as `agora_agent`. diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md index 217a097..af0ef3c 100644 --- a/docs/getting-started/quick-start.md +++ b/docs/getting-started/quick-start.md @@ -1,7 +1,7 @@ --- sidebar_position: 3 title: Quick Start -description: Build and run your first Agora Conversational AI agent in Python with app credentials and presets. +description: Build and run your first Agora Conversational AI agent in Python with app credentials and the builder API. 
--- # Quick Start @@ -9,14 +9,14 @@ description: Build and run your first Agora Conversational AI agent in Python wi This guide uses the recommended onboarding path: - `app_id`, `app_certificate`, and `area` on `Agora` or `AsyncAgora` -- `preset` for Agora-managed ASR, LLM, and TTS +- the `Agent` builder with `.with_stt()`, `.with_llm()`, and `.with_tts()` - automatic ConvoAI REST auth and RTC join token generation -- no vendor API keys in application code +- no vendor API keys when using supported Agora-managed models ## Sync example ```python -from agora_agent import Agent, Agora, Area, AgentPresets +from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI, MiniMaxTTS def main() -> None: @@ -26,12 +26,16 @@ def main() -> None: app_certificate="your-app-certificate", ) - # Agent-level behavior lives here. Vendor selection comes from presets below. - agent = Agent( - name="support-assistant", - instructions="You are a concise support voice assistant.", - greeting="Hello! How can I help you today?", - max_history=10, + agent = ( + Agent( + name="support-assistant", + instructions="You are a concise support voice assistant.", + greeting="Hello! How can I help you today?", + max_history=10, + ) + .with_stt(DeepgramSTT(model="nova-3", language="en")) + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) ) session = agent.create_session( @@ -40,11 +44,6 @@ def main() -> None: agent_uid="1", remote_uids=["100"], idle_timeout=120, - preset=[ - AgentPresets.asr.deepgram_nova_3, - AgentPresets.llm.openai_gpt_5_mini, - AgentPresets.tts.openai_tts_1, - ], ) agent_session_id = session.start() @@ -62,22 +61,21 @@ if __name__ == "__main__": 1. `Agora` runs in app-credentials mode when you pass `app_id` and `app_certificate` only. 2. `Agent` holds reusable behavior such as instructions, greeting, and history settings. -3. `preset` tells Agora which managed ASR, LLM, and TTS vendors to run. -4. 
`session.start()` lets the SDK generate the required auth tokens automatically. -5. `session.start()` returns the unique agent session ID. +3. Vendor classes on the builder select the ASR, LLM, and TTS stack. AgentKit infers Agora-managed configuration when credentials are omitted for supported models. +4. `session.start()` generates the required auth tokens and returns the unique agent session ID. ## Async applications -For `asyncio` services, switch to `AsyncAgora` and `await` the session methods. The preset and token-auth flow stays the same. +For `asyncio` services, switch to `AsyncAgora` and `await` the session methods. The builder and app-credentials flow stay the same. ## When to use BYOK instead -Use presets when you want the fastest path to a working agent. +Use the builder without vendor API keys when you want the fastest path with Agora-managed models. Use BYOK when you need to: - supply your own vendor API keys -- use models outside the preset catalog +- use models outside the Agora-managed catalog - point at custom vendor endpoints - manage vendor-specific parameters directly diff --git a/docs/guides/byok.md b/docs/guides/byok.md index ffbb91b..15b7684 100644 --- a/docs/guides/byok.md +++ b/docs/guides/byok.md @@ -6,11 +6,11 @@ description: Bring your own vendor credentials and use custom vendor configurati # BYOK -Use BYOK when you want to provide vendor credentials yourself instead of relying on Agora-managed presets. +Use BYOK when you want to provide vendor credentials yourself instead of relying on Agora-managed models via the builder. 
Typical reasons: -- you need a vendor model that is not part of the preset catalog +- you need a vendor model outside the Agora-managed catalog - you want to point to a custom endpoint - you want direct control over vendor-specific parameters - your organization manages vendor billing separately from Agora @@ -79,7 +79,7 @@ if __name__ == "__main__": main() ``` -## Presets vs BYOK +## Builder-managed vs BYOK -- Presets: fastest path, no vendor keys in app code -- BYOK: most control, your keys and your vendor configuration +- Builder without vendor keys: fastest path with Agora-managed models +- BYOK: your keys and full vendor control diff --git a/docs/index.md b/docs/index.md index c8d9d12..1935d5f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -21,7 +21,7 @@ The Agora Conversational AI Python SDK lets you build voice-powered AI agents on ## Choose a starting point -- Use [Quick Start](./getting-started/quick-start.md) if you want the recommended preset-based path with no vendor keys. +- Use [Quick Start](./getting-started/quick-start.md) if you want the recommended builder-based path with app credentials. - Use [MLLM Flow](./guides/mllm-flow.md) if you want realtime end-to-end audio with OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok. - Use [Cascading Flow](./guides/cascading-flow.md) if you want separate ASR, LLM, and TTS vendors. 
@@ -29,7 +29,7 @@ The Agora Conversational AI Python SDK lets you build voice-powered AI agents on | Layer | What it does | When to use | |---|---|---| -| **Agentkit** (`Agent`, `AgentSession`, vendors, presets) | High-level builder pattern, lifecycle, typed vendors | Most use cases | +| **AgentKit** (`Agent`, `AgentSession`, vendor classes) | High-level builder pattern, lifecycle, typed vendors | Most use cases | | **Fern-generated core** (`client.agents`, `client.telephony`) | Direct REST client mapping every API endpoint | Advanced use cases | ## Documentation @@ -37,8 +37,9 @@ The Agora Conversational AI Python SDK lets you build voice-powered AI agents on | Section | What you will learn | |---|---| | [Installation](./getting-started/installation.md) | Install the SDK and prerequisites | -| [Authentication](./getting-started/authentication.md) | Token auth for REST and RTC joins | -| [Quick Start](./getting-started/quick-start.md) | Recommended preset-based onboarding flow | +| [Authentication](./getting-started/authentication.md) | App credentials and other auth modes | +| [Quick Start](./getting-started/quick-start.md) | Recommended builder-based onboarding flow | +| [Agent Builder Features](./guides/agent-builder-features.md) | Turn detection, SAL, filler words, and advanced agent options | | [BYOK](./guides/byok.md) | Bring your own vendor credentials and config | | [Architecture](./concepts/architecture.md) | Understand the SDK layers and client types | | [Agent](./concepts/agent.md) | Configure agents with the fluent builder | From d475306bd42279984bcf4934b900003e8e02c4eb Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 27 May 2026 20:40:49 -0400 Subject: [PATCH 25/28] Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. 
--- .fernignore | 5 +++++ README.md | 7 ++----- compat/agora-agent-server-sdk/README.md | 2 ++ docs/getting-started/installation.md | 13 ++++++++++++- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/.fernignore b/.fernignore index 1d34ba0..dc9b28b 100644 --- a/.fernignore +++ b/.fernignore @@ -9,6 +9,11 @@ src/agora_agent/agentkit/ # Documentation - managed manually, not generated by Fern docs/ +README.md + +# Compatibility shim and CI/release workflows are managed manually +compat/ +.github/workflows/ # Dependency manifests/lockfiles are managed manually pyproject.toml diff --git a/README.md b/README.md index 4d56fee..1ce5bf8 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,6 @@ and multimodal flows (MLLM) for real-time audio processing. - [MLLM (Realtime / Multimodal)](#mllm-realtime--multimodal) - [Documentation](#documentation) - [Reference](#reference) -- [Package Rename Compatibility](#package-rename-compatibility) - [Usage](#usage) - [Async Client](#async-client) - [Exception Handling](#exception-handling) @@ -184,6 +183,8 @@ agent = Agent( ) ``` +Migrating from `agora-agent-server-sdk` on PyPI? Use `pip install agora-agents`; imports stay `agora_agent` — see [changelog migration notes](./changelog.md#migration-notes) or [installation guide](./docs/getting-started/installation.md#migrating-from-a-previous-package-name). + ## BYOK If you want to bring your own vendor credentials instead of using Agora-managed presets, use the BYOK guide: @@ -216,10 +217,6 @@ API reference documentation is available [here](https://docs.agora.io/en/convers A full reference for this library is available [here](https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python/blob/HEAD/./reference.md). -## Package Rename Compatibility - -The published package name is now `agora-agents`. The import path is still `agora_agent`. 
The legacy PyPI name `agora-agent-server-sdk` remains available as a compatibility package in [compat/agora-agent-server-sdk](./compat/agora-agent-server-sdk). - ## MLLM Flow (Multimodal) For real-time audio processing using OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. MLLM mode does not require separate TTS, STT, or LLM vendors. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details. diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md index e43d1d8..1da36aa 100644 --- a/compat/agora-agent-server-sdk/README.md +++ b/compat/agora-agent-server-sdk/README.md @@ -14,3 +14,5 @@ This compatibility package re-exports the public API from `agora-agents` to supp from agora_agent import Agora, Area from agora_agent_server_sdk_compat import Agora, Area ``` + +Maintainers: dual-publish steps live in the repository release workflow, not in the root README. diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index 04b48da..8fca9ab 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -53,4 +53,15 @@ from agora_agent import AsyncAgora, AsyncAgentSession, Area | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | -See [Authentication](./authentication.md) for setup details. +## Next steps + +- [Authentication](./authentication.md) — configure your credentials +- [Quick Start](./quick-start.md) — build your first conversational agent + +## Migrating from a previous package name + +The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. 
+ +The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + +For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). From c93555763ffd63267a737b3e430217a890f203db Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 27 May 2026 20:59:13 -0400 Subject: [PATCH 26/28] Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. --- README.md | 197 +------------------------ docs/getting-started/authentication.md | 39 +---- docs/guides/low-level-api.md | 182 ++++------------------- 3 files changed, 34 insertions(+), 384 deletions(-) diff --git a/README.md b/README.md index 1ce5bf8..3f788cc 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,6 @@ and multimodal flows (MLLM) for real-time audio processing. - [MLLM (Realtime / Multimodal)](#mllm-realtime--multimodal) - [Documentation](#documentation) - [Reference](#reference) -- [Usage](#usage) -- [Async Client](#async-client) - [Exception Handling](#exception-handling) - [Pagination](#pagination) - [Advanced](#advanced) @@ -148,7 +146,7 @@ def start_conversation() -> str: ### Why no token or vendor key in the example? -`Agora` generates the required ConvoAI REST auth and RTC join tokens automatically when you provide `app_id` and `app_certificate`. AgentKit then inspects the builder-provided vendor configs and infers the matching supported `preset` values for reseller-backed models, so you do not pass vendor API keys in this flow. +`Agora` generates the required ConvoAI REST auth and RTC join tokens automatically when you provide `app_id` and `app_certificate`. 
AgentKit inspects the builder-provided vendor configs and infers Agora-managed configuration for supported models, so you do not pass vendor API keys in this flow. ### BYOK version of the same builder flow @@ -187,7 +185,7 @@ Migrating from `agora-agent-server-sdk` on PyPI? Use `pip install agora-agents`; ## BYOK -If you want to bring your own vendor credentials instead of using Agora-managed presets, use the BYOK guide: +If you want to bring your own vendor credentials instead of using Agora-managed models, use the BYOK guide: - [BYOK Guide](./docs/guides/byok.md) @@ -217,187 +215,6 @@ API reference documentation is available [here](https://docs.agora.io/en/convers A full reference for this library is available [here](https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python/blob/HEAD/./reference.md). -## MLLM Flow (Multimodal) - -For real-time audio processing using OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. MLLM mode does not require separate TTS, STT, or LLM vendors. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details. 
- -```python -from agora_agent import Agora -from agora_agent.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesMllm, - StartAgentsRequestPropertiesMllmVendor, - StartAgentsRequestPropertiesTurnDetection, - StartAgentsRequestPropertiesTurnDetectionType, -) - -client = Agora( - customer_id="YOUR_CUSTOMER_ID", - customer_secret="YOUR_CUSTOMER_SECRET", -) - -client.agents.start( - appid="your_app_id", - name="mllm_agent", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="your_token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - mllm=StartAgentsRequestPropertiesMllm( - enable=True, - url="wss://api.openai.com/v1/realtime", - api_key="", - vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, - params={ - "model": "gpt-4o-realtime-preview", - "voice": "alloy", - }, - input_modalities=["audio"], - output_modalities=["text", "audio"], - greeting_message="Hello! I'm ready to chat in real-time.", - ), - turn_detection=StartAgentsRequestPropertiesTurnDetection( - type=StartAgentsRequestPropertiesTurnDetectionType.SERVER_VAD, - threshold=0.5, - silence_duration_ms=500, - ), - ), -) -``` - - -## Usage - -Instantiate and use the client with the following: - -```python -from agora_agent import Agora, MicrosoftTtsParams, Tts_Microsoft -from agora_agent.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, - StartAgentsRequestPropertiesTurnDetection, - StartAgentsRequestPropertiesTurnDetectionConfig, - StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, -) - -client = Agora( - authorization="YOUR_AUTHORIZATION", - username="YOUR_USERNAME", - password="YOUR_PASSWORD", -) -client.agents.start( - appid="appid", - name="unique_name", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - 
language="en-US", - ), - tts=Tts_Microsoft( - params=MicrosoftTtsParams( - key="key", - region="region", - voice_name="voice_name", - ), - ), - llm=StartAgentsRequestPropertiesLlm( - url="https://api.openai.com/v1/chat/completions", - api_key="", - system_messages=[ - {"role": "system", "content": "You are a helpful chatbot."} - ], - params={"model": "gpt-4o-mini"}, - max_history=32, - greeting_message="Hello, how can I assist you today?", - failure_message="Please hold on a second.", - ), - turn_detection=StartAgentsRequestPropertiesTurnDetection( - config=StartAgentsRequestPropertiesTurnDetectionConfig( - end_of_speech=StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech( - mode="semantic", - ), - ), - ), - ), -) -``` - -## Async Client - -The SDK also exports an `async` client so that you can make non-blocking calls to our API. Note that if you are constructing an Async httpx client class to pass into this client, use `httpx.AsyncClient()` instead of `httpx.Client()` (e.g. for the `httpx_client` parameter of this client). 
- -```python -import asyncio - -from agora_agent import AsyncAgora, MicrosoftTtsParams, Tts_Microsoft -from agora_agent.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, - StartAgentsRequestPropertiesTurnDetection, - StartAgentsRequestPropertiesTurnDetectionConfig, - StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, -) - -client = AsyncAgora( - authorization="YOUR_AUTHORIZATION", - username="YOUR_USERNAME", - password="YOUR_PASSWORD", -) - - -async def main() -> None: - await client.agents.start( - appid="appid", - name="unique_name", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), - tts=Tts_Microsoft( - params=MicrosoftTtsParams( - key="key", - region="region", - voice_name="voice_name", - ), - ), - llm=StartAgentsRequestPropertiesLlm( - url="https://api.openai.com/v1/chat/completions", - api_key="", - system_messages=[ - {"role": "system", "content": "You are a helpful chatbot."} - ], - params={"model": "gpt-4o-mini"}, - max_history=32, - greeting_message="Hello, how can I assist you today?", - failure_message="Please hold on a second.", - ), - turn_detection=StartAgentsRequestPropertiesTurnDetection( - config=StartAgentsRequestPropertiesTurnDetectionConfig( - end_of_speech=StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech( - mode="semantic", - ), - ), - ), - ), - ) - - -asyncio.run(main()) -``` - ## Exception Handling When the API returns a non-success status code (4xx or 5xx response), a subclass of the following error @@ -418,15 +235,15 @@ except ApiError as e: Paginated requests will return a `SyncPager` or `AsyncPager`, which can be used as generators for the underlying object. 
```python -from agora_agent import Agora +from agora_agent import Agora, Area client = Agora( - authorization="YOUR_AUTHORIZATION", - username="YOUR_USERNAME", - password="YOUR_PASSWORD", + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", ) response = client.agents.list( - appid="appid", + appid=client.app_id, ) for item in response: yield item diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md index 31dcc56..74c62cd 100644 --- a/docs/getting-started/authentication.md +++ b/docs/getting-started/authentication.md @@ -46,41 +46,6 @@ session = agent.create_session( print(client.auth_mode) # "app-credentials" ``` -## Other auth modes +## Legacy auth modes -The SDK also supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. These are not recommended for new applications. - -### Token auth (`auth_token`) - -Pass a pre-minted Agora REST token on the client. You must also supply the RTC join token on `create_session(..., token=...)`. - -```python -client = Agora( - area=Area.US, - app_id="your-app-id", - app_certificate="your-app-certificate", - auth_token="your-rest-auth-token", -) - -session = agent.create_session( - client, - channel="room-123", - agent_uid="1", - remote_uids=["100"], - token="your-rtc-join-token", -) -``` - -### Basic Auth (`customer_id` + `customer_secret`) - -Uses HTTP Basic Auth with Customer ID and Secret from Agora Console. Avoid for new integrations — the same credentials are sent on every request instead of minting fresh tokens. - -```python -client = Agora( - area=Area.US, - app_id="your-app-id", - app_certificate="your-app-certificate", - customer_id="your-customer-id", - customer_secret="your-customer-secret", -) -``` +The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. 
Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. diff --git a/docs/guides/low-level-api.md b/docs/guides/low-level-api.md index 6677b45..47397b7 100644 --- a/docs/guides/low-level-api.md +++ b/docs/guides/low-level-api.md @@ -1,187 +1,55 @@ --- sidebar_position: 10 title: Low-Level API -description: Direct client.agents.start() usage without the builder pattern. +description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. --- # Low-Level API -For full control over request payloads you can call the generated clients directly and pass raw types such as `StartAgentsRequestProperties`, `Tts_Elevenlabs`, and `StartAgentsRequestPropertiesAsr`. Use this when you need vendor or options not exposed by the agentkit, or when integrating with generated types from the API spec. +Use the `Agent` builder and `AgentSession` for conversational agent starts. That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. -## Raw telephony and phone-number APIs - -AgentKit focuses on realtime agent session helpers. Telephony call status, call hangup, and phone-number management are exposed through the generated low-level clients: - -- `client.telephony` for call status and hangup operations -- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations +Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. 
-## Cascading flow (ASR → LLM → TTS) +## Client setup ```python from agora_agent import Agora, Area -from agora_agent.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, -) -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams -from agora_agent.types.tts import Tts_Elevenlabs client = Agora( area=Area.US, - app_id="YOUR_APP_ID", - app_certificate="YOUR_APP_CERTIFICATE", - auth_token="your-rest-auth-token", -) -client.agents.start( - client.app_id, - name="unique_name", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - vendor="deepgram", - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, - ), - tts=Tts_Elevenlabs( - params=ElevenLabsTtsParams( - key="YOUR_ELEVENLABS_API_KEY", - model_id="eleven_flash_v2_5", - voice_id="pNInz6obpgDQGcFmaJgB", - sample_rate=24000, - ), - ), - llm=StartAgentsRequestPropertiesLlm( - url="https://api.openai.com/v1/chat/completions", - api_key="", - system_messages=[ - {"role": "system", "content": "You are a helpful chatbot."} - ], - params={"model": "gpt-4o-mini"}, - max_history=32, - greeting_message="Hello, how can I assist you today?", - failure_message="Please hold on a second.", - ), - ), + app_id="your-app-id", + app_certificate="your-app-certificate", ) ``` -## Async (low-level) +## Raw telephony and phone-number APIs -```python -import asyncio -from agora_agent import Area, AsyncAgora -from agora_agent.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, -) -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams -from agora_agent.types.tts import Tts_Elevenlabs +AgentKit focuses on realtime agent session helpers. 
Use generated clients for operational APIs: -client = AsyncAgora( - area=Area.US, - app_id="YOUR_APP_ID", - app_certificate="YOUR_APP_CERTIFICATE", - auth_token="your-rest-auth-token", +- `client.telephony` for call status and hangup operations +- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + +```python +calls = client.telephony.list( + appid=client.app_id, + type="sip", ) -async def main() -> None: - await client.agents.start( - client.app_id, - name="unique_name", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - vendor="deepgram", - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, - ), - tts=Tts_Elevenlabs( - params=ElevenLabsTtsParams( - key="YOUR_ELEVENLABS_API_KEY", - model_id="eleven_flash_v2_5", - voice_id="pNInz6obpgDQGcFmaJgB", - sample_rate=24000, - ), - ), - llm=StartAgentsRequestPropertiesLlm( - url="https://api.openai.com/v1/chat/completions", - api_key="", - system_messages=[ - {"role": "system", "content": "You are a helpful chatbot."} - ], - params={"model": "gpt-4o-mini"}, - max_history=32, - greeting_message="Hello, how can I assist you today?", - failure_message="Please hold on a second.", - ), - ), - ) - -asyncio.run(main()) +for call in calls: + print(call.id, call.state) ``` -## MLLM flow (multimodal) +## Direct agent APIs -For real-time audio with OpenAI Realtime or Google Gemini Live, use the MLLM flow instead of the cascading ASR → LLM → TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview). +`client.agents` exposes the generated REST surface for advanced integrations. Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. 
-```python -from agora_agent import Agora, Area -from agora_agent.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesMllm, - StartAgentsRequestPropertiesMllmVendor, - StartAgentsRequestPropertiesTts, - StartAgentsRequestPropertiesTtsVendor, - StartAgentsRequestPropertiesLlm, -) +If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: -client = Agora( - area=Area.US, - app_id="YOUR_APP_ID", - app_certificate="YOUR_APP_CERTIFICATE", - auth_token="your-rest-auth-token", -) - -client.agents.start( - client.app_id, - name="mllm_agent", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="your_token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - mllm=StartAgentsRequestPropertiesMllm( - enable=True, - url="wss://api.openai.com/v1/realtime", - api_key="", - vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, - params={ - "model": "gpt-4o-realtime-preview", - "voice": "alloy", - }, - input_modalities=["audio"], - output_modalities=["text", "audio"], - greeting_message="Hello! I'm ready to chat in real-time.", - turn_detection={ - "mode": "server_vad", - "server_vad_config": { - "idle_timeout_ms": 5000, - }, - }, - ), - ), +```python +info = session.raw.get( + appid=session.app_id, + agent_id=session.id, ) ``` -For more on the agentkit-based MLLM flow, see [MLLM Flow](./mllm-flow.md). +You must pass `appid` and `agent_id` manually when using generated raw methods. From 98ecb4d33d18faf03adcec167385c3a50dcaa08f Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 27 May 2026 22:01:32 -0400 Subject: [PATCH 27/28] Add Groq, Vertex AI, Bedrock, Dify, and Custom LLM vendor helpers. Introduce named LLM vendor classes with correct request serialization, export them from the package root, and add tests covering each provider's config shape. 
--- src/agora_agent/agentkit/__init__.py | 10 ++ src/agora_agent/agentkit/vendors/__init__.py | 7 +- src/agora_agent/agentkit/vendors/llm.py | 136 +++++++++++++++++++ src/agora_agent/types/open_ai_tts_params.py | 2 +- tests/custom/test_llm_vendors.py | 60 ++++++++ 5 files changed, 213 insertions(+), 2 deletions(-) create mode 100644 tests/custom/test_llm_vendors.py diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py index a03043b..712d0dd 100644 --- a/src/agora_agent/agentkit/__init__.py +++ b/src/agora_agent/agentkit/__init__.py @@ -142,6 +142,7 @@ ) from .vendors import ( AkoolAvatar, + AmazonBedrock, AmazonSTT, AmazonTTS, AnamAvatar, @@ -156,16 +157,19 @@ BaseTTS, CartesiaSampleRate, CartesiaTTS, + CustomLLM, DeepgramSTT, DeepgramTTS, ElevenLabsSampleRate, ElevenLabsTTS, + Dify, FishAudioTTS, Gemini, GeminiLive, GenericAvatar, GoogleSTT, GoogleTTS, + Groq, HeyGenAvatar, HumeAITTS, MicrosoftSampleRate, @@ -184,6 +188,7 @@ SarvamTTS, SpeechmaticsSTT, VertexAI, + VertexAILLM, XaiGrok, LiveAvatarAvatar, ) @@ -336,6 +341,11 @@ "AzureOpenAI", "Anthropic", "Gemini", + "Groq", + "VertexAILLM", + "AmazonBedrock", + "Dify", + "CustomLLM", "ElevenLabsTTS", "MicrosoftTTS", "OpenAITTS", diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py index 8e2042e..0cd901e 100644 --- a/src/agora_agent/agentkit/vendors/__init__.py +++ b/src/agora_agent/agentkit/vendors/__init__.py @@ -12,7 +12,7 @@ SampleRate, ) from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar -from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI +from .llm import AmazonBedrock, Anthropic, AzureOpenAI, CustomLLM, Dify, Gemini, Groq, OpenAI, VertexAILLM from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok from .stt import ( AmazonSTT, @@ -57,6 +57,11 @@ "AzureOpenAI", "Anthropic", "Gemini", + "Groq", + "VertexAILLM", + "AmazonBedrock", + "Dify", + "CustomLLM", "ElevenLabsTTS", 
"MicrosoftTTS", "OpenAITTS", diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index 6f74b43..b521867 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -311,3 +311,139 @@ def to_config(self) -> Dict[str, Any]: config["max_history"] = self.options.max_history return config + + +class GroqOptions(OpenAIOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Groq API key") + model: str = Field(default="llama-3.3-70b-versatile", description="Model name") + base_url: Optional[str] = Field(default=None, description="Custom Groq-compatible endpoint") + + +class Groq(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = GroqOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config = OpenAI(**_dump_optional_model(self.options)).to_config() + config["url"] = self.options.base_url or "https://api.groq.com/openai/v1/chat/completions" + return config + + +class CustomLLMOptions(OpenAIOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Custom LLM API key") + base_url: str = Field(..., description="OpenAI-compatible chat completions endpoint") + + +class CustomLLM(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = CustomLLMOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config = OpenAI(**_dump_optional_model(self.options)).to_config() + config["vendor"] = self.options.vendor or "custom" + return config + + +class VertexAILLMOptions(GeminiOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Vertex AI access token or API key") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location") + + +class VertexAILLM(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAILLMOptions(**kwargs) + + def to_config(self) -> Dict[str, 
Any]: + options = _dump_optional_model(self.options) + options.pop("project_id", None) + options.pop("location", None) + config = Gemini(**options).to_config() + params = dict(config["params"]) + params["project_id"] = self.options.project_id + params["location"] = self.options.location + config["params"] = params + return config + + +class AmazonBedrockOptions(AnthropicOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Amazon Bedrock API key or gateway token") + url: str = Field(..., description="Amazon Bedrock proxy or runtime endpoint") + + +class AmazonBedrock(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AmazonBedrockOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + return Anthropic(**_dump_optional_model(self.options)).to_config() + + +class DifyOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Dify API key") + url: str = Field(..., description="Dify workflow or chat endpoint") + user: Optional[str] = Field(default=None, description="Dify user identifier") + conversation_id: Optional[str] = Field(default=None, description="Dify conversation ID") + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0) + + +class Dify(BaseLLM): + def __init__(self, **kwargs: Any): 
+ self.options = DifyOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.user is not None: + params["user"] = self.options.user + if self.options.conversation_id is not None: + params["conversation_id"] = self.options.conversation_id + + config: Dict[str, Any] = { + "url": self.options.url, + "api_key": self.options.api_key, + "params": params, + "style": "dify", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + return config diff --git a/src/agora_agent/types/open_ai_tts_params.py b/src/agora_agent/types/open_ai_tts_params.py index f042c1f..3839646 100644 --- a/src/agora_agent/types/open_ai_tts_params.py +++ b/src/agora_agent/types/open_ai_tts_params.py @@ -14,7 +14,7 @@ class OpenAiTtsParams(UncheckedBaseModel): api_key: typing.Optional[str] = pydantic.Field(default=None) """ - OpenAI API key. 
Optional for preset-backed OpenAI TTS usage. + OpenAI API key. Optional for Agora-managed OpenAI TTS usage. """ voice: str = pydantic.Field() diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py new file mode 100644 index 0000000..faca9bf --- /dev/null +++ b/tests/custom/test_llm_vendors.py @@ -0,0 +1,60 @@ +from agora_agent import AmazonBedrock, CustomLLM, Dify, Groq, VertexAILLM + + +def test_groq_serializes_as_openai_compatible() -> None: + config = Groq(api_key="groq-key", model="llama-3.3-70b-versatile").to_config() + + assert config["url"] == "https://api.groq.com/openai/v1/chat/completions" + assert config["api_key"] == "groq-key" + assert config["style"] == "openai" + assert config["params"]["model"] == "llama-3.3-70b-versatile" + + +def test_custom_llm_marks_request_as_custom() -> None: + config = CustomLLM(api_key="key", model="model", base_url="https://llm.example.com/chat").to_config() + + assert config["url"] == "https://llm.example.com/chat" + assert config["api_key"] == "key" + assert config["vendor"] == "custom" + assert config["style"] == "openai" + + +def test_vertex_ai_llm_includes_project_routing() -> None: + config = VertexAILLM( + api_key="vertex-token", + model="gemini-2.0-flash", + project_id="project", + location="us-central1", + ).to_config() + + assert config["api_key"] == "vertex-token" + assert config["style"] == "gemini" + assert config["params"]["model"] == "gemini-2.0-flash" + assert config["params"]["project_id"] == "project" + assert config["params"]["location"] == "us-central1" + + +def test_amazon_bedrock_serializes_as_anthropic_style() -> None: + config = AmazonBedrock( + api_key="bedrock-key", + url="https://bedrock.example.com/messages", + model="anthropic.claude-3-5-sonnet-20241022-v2:0", + ).to_config() + + assert config["api_key"] == "bedrock-key" + assert config["style"] == "anthropic" + assert config["params"]["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" + + +def 
test_dify_serializes_conversation_fields() -> None: + config = Dify( + api_key="dify-key", + url="https://api.dify.ai/v1/chat-messages", + user="user-1", + conversation_id="conversation-1", + ).to_config() + + assert config["api_key"] == "dify-key" + assert config["style"] == "dify" + assert config["params"]["user"] == "user-1" + assert config["params"]["conversation_id"] == "conversation-1" From a5097b8d9f7583c2aaa781ae462741df92bc9842 Mon Sep 17 00:00:00 2001 From: digitallysavvy Date: Wed, 27 May 2026 22:03:05 -0400 Subject: [PATCH 28/28] Document new LLM vendors and tighten onboarding docs. Add Groq, Vertex AI, Bedrock, Dify, and Custom LLM to vendor references, simplify README and index navigation, and align quick-start and terminology with Agora-managed model language. --- README.md | 43 ++++++++--------------------- docs/concepts/session.md | 6 ++-- docs/concepts/vendors.md | 5 ++++ docs/getting-started/quick-start.md | 6 ++-- docs/guides/byok.md | 2 +- docs/guides/regional-routing.md | 4 +-- docs/index.md | 18 ++++++------ docs/reference/client.md | 2 +- docs/reference/vendors.md | 12 ++++++++ 9 files changed, 48 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 3f788cc..983932b 100644 --- a/README.md +++ b/README.md @@ -7,38 +7,19 @@ The Agora Conversational AI SDK provides convenient access to the Agora Conversa enabling you to build voice-powered AI agents with support for both cascading flows (ASR -> LLM -> TTS) and multimodal flows (MLLM) for real-time audio processing. 
+## Install -## Table of Contents - -- [Requirements](#requirements) -- [Installation](#installation) -- [Quick Start](#quick-start) -- [BYOK](#byok) -- [MLLM (Realtime / Multimodal)](#mllm-realtime--multimodal) -- [Documentation](#documentation) -- [Reference](#reference) -- [Exception Handling](#exception-handling) -- [Pagination](#pagination) -- [Advanced](#advanced) - - [Access Raw Response Data](#access-raw-response-data) - - [Retries](#retries) - - [Timeouts](#timeouts) - - [Custom Client](#custom-client) -- [Contributing](#contributing) +```sh +pip install agora-agents +``` ## Requirements - Python 3.8+ -## Installation - -```sh -pip install agora-agents -``` - ## Quick Start -The recommended onboarding path is a server-side builder flow: define the agent once, configure vendors on the builder, and let AgentKit infer Agora-managed configuration when credentials are omitted. +Start with the `Agent` builder: create a client with app credentials, choose your ASR, LLM, and TTS providers, then start a session. Omit vendor API keys for supported Agora-managed models, or provide keys when you want BYOK. ```python import os @@ -146,11 +127,11 @@ def start_conversation() -> str: ### Why no token or vendor key in the example? -`Agora` generates the required ConvoAI REST auth and RTC join tokens automatically when you provide `app_id` and `app_certificate`. AgentKit inspects the builder-provided vendor configs and infers Agora-managed configuration for supported models, so you do not pass vendor API keys in this flow. +`Agora` generates the required ConvoAI REST auth and RTC join tokens automatically when you provide `app_id` and `app_certificate`. For supported Agora-managed models, leave vendor API keys unset; provide keys when you want BYOK. -### BYOK version of the same builder flow +### BYOK version -Use the same `Agent` builder shape, but provide credentials explicitly when you want vendor-managed billing and routing instead of Agora-managed presets. 
+Use the same `Agent` builder shape, but provide credentials explicitly when you want vendor-managed billing and routing instead of Agora-managed models. ```python agent = Agent( @@ -232,7 +213,7 @@ except ApiError as e: ## Pagination -Paginated requests will return a `SyncPager` or `AsyncPager`, which can be used as generators for the underlying object. +Paginated requests return a `SyncPager` or `AsyncPager`, which you can iterate over like a generator to get each returned item. ```python from agora_agent import Agora, Area @@ -276,15 +257,15 @@ client = Agora( ) response = client.agents.with_raw_response.start(...) print(response.headers) # access the response headers -print(response.data) # access the underlying object +print(response.data) # access the returned object pager = client.agents.list(...) print(pager.response) # access the typed response for the first page for item in pager: - print(item) # access the underlying object(s) + print(item) # access the returned object(s) for page in pager.iter_pages(): print(page.response) # access the typed response for each page for item in page: - print(item) # access the underlying object(s) + print(item) # access the returned object(s) ``` ### Retries diff --git a/docs/concepts/session.md b/docs/concepts/session.md index a7f3150..e4883f2 100644 --- a/docs/concepts/session.md +++ b/docs/concepts/session.md @@ -145,13 +145,13 @@ session.off('started', on_started) | `session.app_id` | `str` | The Agora App ID | | `session.raw` | `AgentsClient` | Direct access to the Fern-generated agents client | -## `session.raw` — Escape Hatch +## Direct API access with `session.raw` -If the agentkit does not yet expose a method for a new API endpoint, use `session.raw` to access the underlying Fern-generated `AgentsClient` (sync) or `AsyncAgentsClient` (async) directly: +If AgentKit does not yet expose a method for a new API endpoint, use `session.raw` to access the generated `AgentsClient` (sync) or `AsyncAgentsClient` (async) directly: 
```python -# Access any Fern-generated method +# Access any generated REST method response = session.raw.list(session.app_id) ``` diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 0ab396b..8d58cd1 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -25,6 +25,11 @@ Used with `agent.with_llm()` for the cascading flow (ASR → LLM → TTS). | `AzureOpenAI` | Azure OpenAI | `api_key`, `endpoint`, `deployment_name` | | `Anthropic` | Anthropic | `api_key` | | `Gemini` | Google Gemini | `api_key` | +| `Groq` | Groq | `api_key` | +| `VertexAILLM` | Google Vertex AI | `api_key`, `project_id`, `location` | +| `AmazonBedrock` | Amazon Bedrock | `api_key`, `url`, `model` | +| `Dify` | Dify | `api_key`, `url` | +| `CustomLLM` | OpenAI-compatible LLM | `api_key`, `base_url`, `model` | ```python diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md index af0ef3c..472ac57 100644 --- a/docs/getting-started/quick-start.md +++ b/docs/getting-started/quick-start.md @@ -6,7 +6,7 @@ description: Build and run your first Agora Conversational AI agent in Python wi # Quick Start -This guide uses the recommended onboarding path: +This guide starts with the standard AgentKit path: - `app_id`, `app_certificate`, and `area` on `Agora` or `AsyncAgora` - the `Agent` builder with `.with_stt()`, `.with_llm()`, and `.with_tts()` @@ -61,7 +61,7 @@ if __name__ == "__main__": 1. `Agora` runs in app-credentials mode when you pass `app_id` and `app_certificate` only. 2. `Agent` holds reusable behavior such as instructions, greeting, and history settings. -3. Vendor classes on the builder select the ASR, LLM, and TTS stack. AgentKit infers Agora-managed configuration when credentials are omitted for supported models. +3. Vendor classes on the builder select the ASR, LLM, and TTS stack. Leave vendor credentials unset for supported Agora-managed models, or provide keys when you want BYOK. 4. 
`session.start()` generates the required auth tokens and returns the unique agent session ID. ## Async applications @@ -70,7 +70,7 @@ For `asyncio` services, switch to `AsyncAgora` and `await` the session methods. ## When to use BYOK instead -Use the builder without vendor API keys when you want the fastest path with Agora-managed models. +Use the builder without vendor API keys when you are using supported Agora-managed models. Use BYOK when you need to: diff --git a/docs/guides/byok.md b/docs/guides/byok.md index 15b7684..3b03ebe 100644 --- a/docs/guides/byok.md +++ b/docs/guides/byok.md @@ -81,5 +81,5 @@ if __name__ == "__main__": ## Builder-managed vs BYOK -- Builder without vendor keys: fastest path with Agora-managed models +- Builder without vendor keys: supported Agora-managed models - BYOK: your keys and full vendor control diff --git a/docs/guides/regional-routing.md b/docs/guides/regional-routing.md index 4c4ffcb..00584d0 100644 --- a/docs/guides/regional-routing.md +++ b/docs/guides/regional-routing.md @@ -108,7 +108,7 @@ print(client.get_current_url()) ## Accessing the Pool Directly -For advanced use, access the underlying `Pool` object: +For advanced use, access the `Pool` object: ```python pool = client.pool @@ -127,4 +127,4 @@ area = pool.get_area() | `next_region()` | sync | sync | Cycle to next region prefix | | `select_best_domain()` | sync | **`async` (requires `await`)** | DNS-based domain selection | | `get_current_url()` | sync | sync | Get the current base URL | -| `pool` (property) | sync | sync | Access the underlying `Pool` object | +| `pool` (property) | sync | sync | Access the `Pool` object | diff --git a/docs/index.md b/docs/index.md index 1935d5f..f6d2b82 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,18 +19,18 @@ The Agora Conversational AI Python SDK lets you build voice-powered AI agents on **MLLM flow** uses a multimodal model such as OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok for end-to-end audio. 
-## Choose a starting point +## Start here -- Use [Quick Start](./getting-started/quick-start.md) if you want the recommended builder-based path with app credentials. -- Use [MLLM Flow](./guides/mllm-flow.md) if you want realtime end-to-end audio with OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok. -- Use [Cascading Flow](./guides/cascading-flow.md) if you want separate ASR, LLM, and TTS vendors. +- Start with [Quick Start](./getting-started/quick-start.md). It shows the baseline app-credentials setup and starts a cascading ASR -> LLM -> TTS agent. +- Use [MLLM Flow](./guides/mllm-flow.md) when your agent uses one realtime multimodal model, such as OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok. +- Use [Cascading Flow](./guides/cascading-flow.md) for more examples of the default ASR -> LLM -> TTS flow, including provider-specific configuration. -## SDK layers +## How the SDK is organized | Layer | What it does | When to use | |---|---|---| | **AgentKit** (`Agent`, `AgentSession`, vendor classes) | High-level builder pattern, lifecycle, typed vendors | Most use cases | -| **Fern-generated core** (`client.agents`, `client.telephony`) | Direct REST client mapping every API endpoint | Advanced use cases | +| **Generated REST clients** (`client.agents`, `client.telephony`) | Typed access to REST APIs not covered by AgentKit | Advanced use cases | ## Documentation @@ -38,10 +38,10 @@ The Agora Conversational AI Python SDK lets you build voice-powered AI agents on |---|---| | [Installation](./getting-started/installation.md) | Install the SDK and prerequisites | | [Authentication](./getting-started/authentication.md) | App credentials and other auth modes | -| [Quick Start](./getting-started/quick-start.md) | Recommended builder-based onboarding flow | +| [Quick Start](./getting-started/quick-start.md) | App credentials and AgentKit | | [Agent Builder Features](./guides/agent-builder-features.md) | Turn detection, SAL, filler words, and advanced agent options 
| | [BYOK](./guides/byok.md) | Bring your own vendor credentials and config | -| [Architecture](./concepts/architecture.md) | Understand the SDK layers and client types | +| [Architecture](./concepts/architecture.md) | SDK structure and generated REST clients | | [Agent](./concepts/agent.md) | Configure agents with the fluent builder | | [AgentSession](./concepts/session.md) | Manage the agent lifecycle | | [Vendors](./concepts/vendors.md) | Browse all LLM, TTS, STT, MLLM, and Avatar providers | @@ -52,7 +52,7 @@ The Agora Conversational AI Python SDK lets you build voice-powered AI agents on | [Error Handling](./guides/error-handling.md) | Handle API errors with ApiError | | [Pagination](./guides/pagination.md) | Iterate over paginated list endpoints | | [Advanced](./guides/advanced.md) | Raw response, retries, timeouts, custom httpx client | -| [Low-Level API](./guides/low-level-api.md) | Direct `client.agents.start()` usage | +| [Low-Level API](./guides/low-level-api.md) | Generated REST APIs | | [Client Reference](./reference/client.md) | Full `Agora` / `AsyncAgora` API | | [Agent Reference](./reference/agent.md) | Full `Agent` builder API | | [Session Reference](./reference/session.md) | Full `AgentSession` / `AsyncAgentSession` API | diff --git a/docs/reference/client.md b/docs/reference/client.md index 4182214..9c1a38c 100644 --- a/docs/reference/client.md +++ b/docs/reference/client.md @@ -100,7 +100,7 @@ url = client.get_current_url() ### `pool` (property) -Access the underlying `Pool` object for advanced domain management. +Access the `Pool` object for advanced domain management. 
```python diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index eb8290f..7395eea 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -131,6 +131,18 @@ from agora_agent import Gemini llm = Gemini(api_key='your-google-key', model='gemini-2.0-flash-exp') ``` +### Other LLM vendors + +The SDK also includes named helpers for the remaining Agora-supported LLM providers. These helpers choose the correct request format internally. + +| Class | Provider | Key parameters | +|---|---|---| +| `Groq` | Groq | `api_key`, `model`, `base_url?` | +| `VertexAILLM` | Google Vertex AI | `api_key`, `model`, `project_id`, `location`, `url?` | +| `AmazonBedrock` | Amazon Bedrock | `api_key`, `url`, `model` | +| `Dify` | Dify | `api_key`, `url`, `user?`, `conversation_id?` | +| `CustomLLM` | OpenAI-compatible LLM | `api_key`, `model`, `base_url` | + --- ## TTS Vendors