diff --git a/.fern/replay.lock b/.fern/replay.lock new file mode 100644 index 0000000..536b6e4 --- /dev/null +++ b/.fern/replay.lock @@ -0,0 +1,10 @@ +# DO NOT EDIT MANUALLY - Managed by Fern Replay +version: "1.0" +generations: + - commit_sha: a217c8ecfd919345831eebaca8295e292d65ebcf + tree_hash: 707f496ae7e028b80fc9a2adc1e5d69468f170b3 + timestamp: 2026-05-20T20:38:02.180Z + cli_version: unknown + generator_versions: {} +current_generation: a217c8ecfd919345831eebaca8295e292d65ebcf +patches: [] diff --git a/.fernignore b/.fernignore index d85561f..dc9b28b 100644 --- a/.fernignore +++ b/.fernignore @@ -9,8 +9,16 @@ src/agora_agent/agentkit/ # Documentation - managed manually, not generated by Fern docs/ +README.md + +# Compatibility shim and CI/release workflows are managed manually +compat/ +.github/workflows/ # Dependency manifests/lockfiles are managed manually pyproject.toml poetry.lock requirements.txt +.fern/replay.lock +.fern/replay.yml +.gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..74928d6 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +.fern/replay.lock linguist-generated=true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d7dc922..81f51c3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,8 +35,38 @@ jobs: - name: Test run: poetry run pytest -rP . + compat-build: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Set up python + uses: actions/setup-python@v4 + with: + python-version: 3.8 + - name: Bootstrap poetry + run: | + curl -sSL https://install.python-poetry.org | python - -y --version 1.5.1 + - name: Install primary package + run: poetry install + - name: Build and verify compatibility package + run: | + cd compat/agora-agent-server-sdk + poetry build + cd ../.. 
+ poetry run pip install compat/agora-agent-server-sdk/dist/*.whl + poetry run python - <<'PY' + import agora_agent + from agora_agent_server_sdk_compat import Agora, Area, __version__ + + assert Agora is agora_agent.Agora + assert Area is agora_agent.Area + assert __version__ == agora_agent.__version__ + print("Compat shim re-exports verified.") + PY + publish: - needs: [compile, test] + needs: [compile, test, compat-build] if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') runs-on: ubuntu-latest steps: @@ -51,9 +81,64 @@ jobs: curl -sSL https://install.python-poetry.org | python - -y --version 1.5.1 - name: Install dependencies run: poetry install - - name: Publish to pypi + + - name: Verify package versions match release tag + run: | + TAG_VERSION="${GITHUB_REF_NAME#v}" + ROOT_VERSION="$(poetry version -s | sed 's/^v//')" + COMPAT_VERSION="$(cd compat/agora-agent-server-sdk && poetry version -s | sed 's/^v//')" + COMPAT_DEP_VERSION="$(python -c "import re, sys; from pathlib import Path; text = Path('compat/agora-agent-server-sdk/pyproject.toml').read_text(); match = re.search(r'^agora-agents\s*=\s*\"([^\"]+)\"', text, re.M); sys.exit('agora-agents dependency not found in compat pyproject.toml') if not match else None; print(match.group(1))")" + + if [ "$ROOT_VERSION" != "$TAG_VERSION" ]; then + echo "Root package version ($ROOT_VERSION) does not match tag version ($TAG_VERSION)." + exit 1 + fi + + if [ "$COMPAT_VERSION" != "$TAG_VERSION" ]; then + echo "Compat package version ($COMPAT_VERSION) does not match tag version ($TAG_VERSION)." + exit 1 + fi + + if [ "$COMPAT_DEP_VERSION" != ">=${TAG_VERSION},<3.0.0" ]; then + echo "Compat package dependency on agora-agents ($COMPAT_DEP_VERSION) does not match >=${TAG_VERSION},<3.0.0." 
+ exit 1 + fi + + - name: Publish primary package to pypi + run: | + poetry config repositories.remote https://upload.pypi.org/legacy/ + poetry --no-interaction -v publish --build --repository remote --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD" + env: + PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} + PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + + - name: Wait for primary package on PyPI + run: | + TAG_VERSION="${GITHUB_REF_NAME#v}" + PACKAGE="agora-agents" + for attempt in $(seq 1 12); do + if pip index versions "$PACKAGE" 2>/dev/null | grep -q "$TAG_VERSION"; then + echo "$PACKAGE==$TAG_VERSION is available on PyPI." + exit 0 + fi + echo "Waiting for $PACKAGE==$TAG_VERSION on PyPI (attempt $attempt/12)..." + sleep 10 + done + echo "Timed out waiting for $PACKAGE==$TAG_VERSION on PyPI." + exit 1 + + - name: Publish compatibility package to pypi run: | + cd compat/agora-agent-server-sdk poetry config repositories.remote https://upload.pypi.org/legacy/ - poetry --no-interaction -v publish --build --repository remote --username "__token__" --password "$PYPI_API_TOKEN" + for attempt in $(seq 1 3); do + if poetry --no-interaction -v publish --build --repository remote --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD"; then + exit 0 + fi + echo "Compat publish failed (attempt $attempt/3). Retrying in 15s..." 
+ sleep 15 + done + exit 1 env: - PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} + PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index 2c4b0e7..0000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: release - -on: - push: - tags: - - "v*" - workflow_dispatch: - inputs: - tag: - description: "Existing tag to release, for example v1.4.0" - required: true - type: string - -permissions: - contents: write - -jobs: - github-release: - runs-on: ubuntu-latest - env: - RELEASE_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.tag || github.ref_name }} - steps: - - name: Checkout repo - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Extract release notes - run: | - TAG="${RELEASE_TAG}" - git rev-parse "$TAG" >/dev/null - awk -v tag="$TAG" ' - index($0, "## [" tag "]") == 1 { found = 1; next } - found && /^## \[/ { exit } - found { print } - ' changelog.md > release_notes.md - - if [ ! 
-s release_notes.md ]; then - echo "Release $TAG" > release_notes.md - fi - - - name: Create or update GitHub release - env: - GH_TOKEN: ${{ github.token }} - run: | - TAG="${RELEASE_TAG}" - if gh release view "$TAG" >/dev/null 2>&1; then - gh release edit "$TAG" --title "$TAG" --notes-file release_notes.md - else - gh release create "$TAG" --title "$TAG" --notes-file release_notes.md - fi diff --git a/README.md b/README.md index f0ecfe5..983932b 100644 --- a/README.md +++ b/README.md @@ -1,59 +1,40 @@ -# Agoraio Python Library +# Agora Conversational AI Python SDK [![fern shield](https://img.shields.io/badge/%F0%9F%8C%BF-Built%20with%20Fern-brightgreen)](https://buildwithfern.com?utm_source=github&utm_medium=github&utm_campaign=readme&utm_source=https%3A%2F%2Fgithub.com%2FAgoraIO-Conversational-AI%2Fagent-server-sdk-python) -[![pypi](https://img.shields.io/pypi/v/agora-agent-server-sdk)](https://pypi.python.org/pypi/agora-agent-server-sdk) +[![pypi](https://img.shields.io/pypi/v/agora-agents)](https://pypi.python.org/pypi/agora-agents) The Agora Conversational AI SDK provides convenient access to the Agora Conversational AI APIs, enabling you to build voice-powered AI agents with support for both cascading flows (ASR -> LLM -> TTS) and multimodal flows (MLLM) for real-time audio processing. 
+## Install -## Table of Contents - -- [Requirements](#requirements) -- [Installation](#installation) -- [Quick Start](#quick-start) -- [Byok](#byok) -- [Mllm Realtime Multimodal](#mllm-realtime-multimodal) -- [Documentation](#documentation) -- [Reference](#reference) -- [Mllm Flow Multimodal](#mllm-flow-multimodal) -- [Usage](#usage) -- [Async Client](#async-client) -- [Exception Handling](#exception-handling) -- [Pagination](#pagination) -- [Advanced](#advanced) - - [Access Raw Response Data](#access-raw-response-data) - - [Retries](#retries) - - [Timeouts](#timeouts) - - [Custom Client](#custom-client) -- [Contributing](#contributing) +```sh +pip install agora-agents +``` ## Requirements - Python 3.8+ -## Installation - -```sh -pip install agora-agent-server-sdk -``` - ## Quick Start -The recommended onboarding path is a server-side builder flow: define the agent once, configure preset-backed providers in the builder, and let AgentKit infer the reseller `preset` values when the session starts. +Start with the `Agent` builder: create a client with app credentials, choose your ASR, LLM, and TTS providers, then start a session. Omit vendor API keys for supported Agora-managed models, or provide keys when you want BYOK. ```python import os import time -from agora_agent import Agora, Area -from agora_agent.agentkit import ( +from agora_agent import ( Agent, + Agora, + Area, DataChannel, DeepgramSTT, + GenericAvatar, MiniMaxTTS, OpenAI, + XaiGrok, expires_in_hours, ) @@ -146,11 +127,11 @@ def start_conversation() -> str: ### Why no token or vendor key in the example? -`Agora` generates the required ConvoAI REST auth and RTC join tokens automatically when you provide `app_id` and `app_certificate`. AgentKit then inspects the builder-provided vendor configs and infers the matching supported `preset` values for reseller-backed models, so you do not pass vendor API keys in this flow. 
+`Agora` generates the required ConvoAI REST auth and RTC join tokens automatically when you provide `app_id` and `app_certificate`. For supported Agora-managed models, leave vendor API keys unset; provide keys when you want BYOK. -### BYOK version of the same builder flow +### BYOK version -Use the same `Agent` builder shape, but provide credentials explicitly when you want vendor-managed billing and routing instead of Agora-managed presets. +Use the same `Agent` builder shape, but provide credentials explicitly when you want vendor-managed billing and routing instead of Agora-managed models. ```python agent = Agent( @@ -181,18 +162,20 @@ agent = Agent( ) ``` +Migrating from `agora-agent-server-sdk` on PyPI? Install `agora-agents` instead (`pip install agora-agents`); the import path stays `agora_agent`. See the [changelog migration notes](./changelog.md#migration-notes) or the [installation guide](./docs/getting-started/installation.md#migrating-from-a-previous-package-name). + ## BYOK -If you want to bring your own vendor credentials instead of using Agora-managed presets, use the BYOK guide: +If you want to bring your own vendor credentials instead of using Agora-managed models, use the BYOK guide: - [BYOK Guide](./docs/guides/byok.md) ## MLLM (Realtime / Multimodal) -Use `with_mllm()` for OpenAI Realtime or Gemini Live. No STT, LLM, or TTS vendor is needed when MLLM mode is enabled. +Use `with_mllm()` for OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok. No STT, LLM, or TTS vendor is needed when MLLM mode is enabled. ```python -from agora_agent.agentkit import Agent, OpenAIRealtime +from agora_agent import Agent, OpenAIRealtime agent = Agent(name="realtime-assistant").with_mllm( OpenAIRealtime( @@ -213,181 +196,6 @@ API reference documentation is available [here](https://docs.agora.io/en/convers A full reference for this library is available [here](https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python/blob/HEAD/./reference.md).
-## MLLM Flow (Multimodal) - -For real-time audio processing using OpenAI's Realtime API or Google Gemini Live, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details. - -```python -from agora-agent-server-sdk import Agora -from agora-agent-server-sdk.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesAdvancedFeatures, - StartAgentsRequestPropertiesMllm, - StartAgentsRequestPropertiesMllmVendor, - StartAgentsRequestPropertiesTts, - StartAgentsRequestPropertiesTtsVendor, - StartAgentsRequestPropertiesLlm, - StartAgentsRequestPropertiesTurnDetection, - StartAgentsRequestPropertiesTurnDetectionType, -) - -client = Agora( - customer_id="YOUR_CUSTOMER_ID", - customer_secret="YOUR_CUSTOMER_SECRET", -) - -client.agents.start( - appid="your_app_id", - name="mllm_agent", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="your_token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - advanced_features=StartAgentsRequestPropertiesAdvancedFeatures( - enable_mllm=True, - ), - mllm=StartAgentsRequestPropertiesMllm( - url="wss://api.openai.com/v1/realtime", - api_key="", - vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, - params={ - "model": "gpt-4o-realtime-preview", - "voice": "alloy", - }, - input_modalities=["audio"], - output_modalities=["text", "audio"], - greeting_message="Hello! 
I'm ready to chat in real-time.", - ), - turn_detection=StartAgentsRequestPropertiesTurnDetection( - type=StartAgentsRequestPropertiesTurnDetectionType.SERVER_VAD, - threshold=0.5, - silence_duration_ms=500, - ), - # TTS and LLM are still required but not used when MLLM is enabled - tts=StartAgentsRequestPropertiesTts( - vendor=StartAgentsRequestPropertiesTtsVendor.MICROSOFT, - params={}, - ), - llm=StartAgentsRequestPropertiesLlm( - url="https://api.openai.com/v1/chat/completions", - ), - ), -) -``` - - -## Usage - -Instantiate and use the client with the following: - -```python -from agora_agent import Agora, MicrosoftTtsParams, Tts_Microsoft -from agora_agent.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, -) - -client = Agora( - authorization="YOUR_AUTHORIZATION", - username="YOUR_USERNAME", - password="YOUR_PASSWORD", -) -client.agents.start( - appid="appid", - name="unique_name", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), - tts=Tts_Microsoft( - params=MicrosoftTtsParams( - key="key", - region="region", - voice_name="voice_name", - ), - ), - llm=StartAgentsRequestPropertiesLlm( - url="https://api.openai.com/v1/chat/completions", - api_key="", - system_messages=[ - {"role": "system", "content": "You are a helpful chatbot."} - ], - params={"model": "gpt-4o-mini"}, - max_history=32, - greeting_message="Hello, how can I assist you today?", - failure_message="Please hold on a second.", - ), - ), -) -``` - -## Async Client - -The SDK also exports an `async` client so that you can make non-blocking calls to our API. Note that if you are constructing an Async httpx client class to pass into this client, use `httpx.AsyncClient()` instead of `httpx.Client()` (e.g. for the `httpx_client` parameter of this client). 
- -```python -import asyncio - -from agora_agent import AsyncAgora, MicrosoftTtsParams, Tts_Microsoft -from agora_agent.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, -) - -client = AsyncAgora( - authorization="YOUR_AUTHORIZATION", - username="YOUR_USERNAME", - password="YOUR_PASSWORD", -) - - -async def main() -> None: - await client.agents.start( - appid="appid", - name="unique_name", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), - tts=Tts_Microsoft( - params=MicrosoftTtsParams( - key="key", - region="region", - voice_name="voice_name", - ), - ), - llm=StartAgentsRequestPropertiesLlm( - url="https://api.openai.com/v1/chat/completions", - api_key="", - system_messages=[ - {"role": "system", "content": "You are a helpful chatbot."} - ], - params={"model": "gpt-4o-mini"}, - max_history=32, - greeting_message="Hello, how can I assist you today?", - failure_message="Please hold on a second.", - ), - ), - ) - - -asyncio.run(main()) -``` - ## Exception Handling When the API returns a non-success status code (4xx or 5xx response), a subclass of the following error @@ -405,18 +213,18 @@ except ApiError as e: ## Pagination -Paginated requests will return a `SyncPager` or `AsyncPager`, which can be used as generators for the underlying object. +Paginated requests will return a `SyncPager` or `AsyncPager`, which can be used as generators for the returned object. 
```python -from agora_agent import Agora +from agora_agent import Agora, Area client = Agora( - authorization="YOUR_AUTHORIZATION", - username="YOUR_USERNAME", - password="YOUR_PASSWORD", + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", ) response = client.agents.list( - appid="appid", + appid=client.app_id, ) for item in response: yield item @@ -449,15 +257,15 @@ client = Agora( ) response = client.agents.with_raw_response.start(...) print(response.headers) # access the response headers -print(response.data) # access the underlying object +print(response.data) # access the returned object pager = client.agents.list(...) print(pager.response) # access the typed response for the first page for item in pager: - print(item) # access the underlying object(s) + print(item) # access the returned object(s) for page in pager.iter_pages(): print(page.response) # access the typed response for each page for item in page: - print(item) # access the underlying object(s) + print(item) # access the returned object(s) ``` ### Retries diff --git a/changelog.md b/changelog.md index dad1528..1174850 100644 --- a/changelog.md +++ b/changelog.md @@ -4,6 +4,44 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/). +## [v2.0.0] — 2026-05-21 + +### Added + +- **Type aliases** — `AsrConfig` (= `SttConfig`), `is_avatar_token_managed`, think type aliases (`ThinkOnListeningAction`, etc.), and think value constants. +- **`XaiGrok`** — New MLLM wrapper for xAI Grok (`mllm.vendor`: `"xai"`), including Realtime API URL, voice, language, sample rate, modalities, messages, and MLLM turn detection support. +- **`GenericAvatar`** — New generic avatar wrapper (`vendor: "generic"`) for custom avatar providers. 
+- **Avatar token enrichment** — `AgentSession.start()` now fills missing generic avatar `agora_appid` and `agora_channel` from the session and generates missing avatar `agora_token` values for HeyGen, LiveAvatar, and Generic avatars using each avatar's `agora_uid`. +- **Turn pagination** — `AgentSession.get_turns()` and `AsyncAgentSession.get_turns()` now accept `page_index` and `page_size`. New `get_all_turns()` helpers fetch and combine all pages. +- **Greeting interruption control** — LLM vendor `greeting_configs` now accepts the typed `LlmGreetingConfigs` shape, including v2.7 `interruptable`. +- **Type alias parity** — Added public aliases for v2.7 generated types such as `LlmConfig`, `TtsConfig`, `SttConfig`, `MllmConfig`, `AvatarConfig`, `AgentConfigUpdate`, `ConversationTurns`, `ConversationHistory`, `SessionInfo`, `Labels`, `SpeakPriority`, and `FillerWordsContentSelectionRule`. + +### Changed + +- **ConvoAI token options** — `generate_convo_ai_token()` now accepts an integer `uid` and handles the internal token string conversion for users, agents, and avatars. +- **Avatar token generation** — Removed the dedicated `generate_avatar_rtc_token()` wrapper; avatar RTC tokens use the existing ConvoAI token helper. +- **Avatar token gating** — Session enrichment uses `is_avatar_token_managed` (vendor-only); UID checks remain in session logic. +- **`XaiGrok` is the primary xAI MLLM class** — Matches the product name ([xAI Grok](https://docs.agora.io/en/conversational-ai/models/mllm/xai)) and the TypeScript/Go SDKs. +- **Package version** — Bumped to `v2.0.0` to match the Fern-generated SDK headers. +- **PyPI distribution rename** — The published package name is now `agora-agents` (formerly `agora-agent-server-sdk`). The Python import path remains `agora_agent`. +- **RTM data channel default** — When `advanced_features.enable_rtm=True`, AgentKit now defaults `parameters.data_channel` to `"rtm"` unless the caller explicitly sets a data channel. 
+- **Agent-level LLM overrides** — In the standard ASR + LLM + TTS pipeline, agent-level `greeting`, `failure_message`, and `max_history` now override vendor defaults, matching the TypeScript SDK. In MLLM mode, agent-level `greeting` and `failure_message` fill only missing fields. +- **MLLM core alignment** — MLLM wrappers no longer expose or emit unsupported `predefined_tools` or `max_history` fields because they are not present in the generated v2.7 core `mllm` type. +- **MLLM without TTS** — MLLM sessions no longer require separate TTS, STT, or LLM vendor configuration. +- **Avatar pipeline support** — Avatar vendors are now explicitly limited to the cascading ASR + LLM + TTS pipeline. Combining `with_avatar()` with `with_mllm()` is rejected at `Agent.to_properties()` and `AgentSession.start()` (matching the TypeScript SDK), with a disabled avatar (`enable=False`) still permitted alongside MLLM. +- **VertexAI parity** — `VertexAI.to_config()` now spreads `additional_params` first so explicit `model`, `project_id`, `location`, and `adc_credentials_string` fields always win, matching the TypeScript and Gemini Live wrappers. +- **Pagination guard parity** — `AgentSession.get_all_turns()` and `AsyncAgentSession.get_all_turns()` now raise `RuntimeError` if the server's pagination metadata is missing (`page_index`/`total_pages`/`is_last_page`) or if the next page does not advance, matching the TypeScript SDK. + +### Migration notes + +- **PyPI package rename** — Install `agora-agents` instead of `agora-agent-server-sdk` (`pip install agora-agents` or `poetry add agora-agents`). The import path is unchanged (`from agora_agent import ...`). The legacy PyPI distribution name remains available as a compatibility shim that re-exports the public API from `agora-agents`. 
+- **Deprecated aliases** — Use `LiveAvatarAvatar` instead of `HeyGenAvatar`, `is_avatar_token_managed` instead of `is_rtc_avatar`, and `ThinkOn*` / `ThinkResponse` instead of `AgentThinkRequestOn*` / `AgentThinkResponse`. + +- **`think()` default** — The server default for `on_listening_action` changed from `inject` to `interrupt` in API v2.7. Pass `on_listening_action="inject"` explicitly to preserve the old behavior. +- **Turn analytics pagination** — Sessions with more than 50 turns must request additional pages via `get_turns(page_index=..., page_size=...)` or use `get_all_turns()`. +- **Error reasons** — API v2.7 adds status codes `401`, `429`, and `500`; `InvalidRequest` is split into `InvalidRequestBody`, `MissingRequiredField`, and `InvalidFieldValue`, with new reasons such as `ServiceNotEnabled`, `AccountSuspended`, and `ResourceAllocationFailed`. +- **Event `112`** — Webhook event `112 turns finished` can be used as an alternative batch delivery path for post-session turn data. + ## [v1.4.1] — 2026-05-18 ### Fixed @@ -58,8 +96,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Fixed - **`AresSTT`** — Removed redundant `language` key from the `params` dict. Language is now emitted only at the top level. `params` is only included when `additional_params` is provided. -- **`OpenAIRealtime` / `VertexAI` (MLLM)** — Agent-level `greeting`, `failure_message`, and `max_history` overrides are now correctly applied when the agent is in MLLM mode. Previously these values were silently dropped. -- **`VertexAI` (MLLM)** — `messages` is now correctly placed inside `params` (required by the Gemini Live API). Previously it was emitted at the top level and silently ignored. +- **`OpenAIRealtime` / `VertexAI` (MLLM)** — Agent-level `greeting` and `failure_message` defaults are now correctly applied when missing in MLLM mode. Previously these values were silently dropped. 
+- **`VertexAI` (MLLM)** — `messages` is emitted at the MLLM top level, matching the generated core SDK contract. ### Changed @@ -72,8 +110,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). - **`OpenAITTS`** — New optional parameters: `response_format` (str, e.g. `"pcm"`) and `speed` (float). - **`CartesiaTTS`** — `voice_id` user-facing field is preserved; voice is serialized to the required nested object format automatically. - **`RimeTTS`** — New optional parameters: `lang` (str), `sampling_rate` (int, serialized as `samplingRate`), `speed_alpha` (float, serialized as `speedAlpha`). -- **`OpenAIRealtime`** — New optional parameters: `predefined_tools` (List[str]), `failure_message` (str), `max_history` (int). -- **`VertexAI` (MLLM)** — New optional parameters: `predefined_tools` (List[str]), `failure_message` (str), `max_history` (int). +- **`OpenAIRealtime`** — New optional parameter: `failure_message` (str). +- **`VertexAI` (MLLM)** — New optional parameter: `failure_message` (str). - **`HeyGenAvatar`** — New fields: `agora_token` (str, optional), `avatar_id` (str, optional), `enable` (bool, optional, default `True`), `disable_idle_timeout` (bool, optional), `activity_idle_timeout` (int, optional). ## [v1.1.0] — 2026-03-17 @@ -82,7 +120,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). - `MurfTTS` vendor - ### Fixed - `MiniMaxTTS`: added required `group_id`, `url`, and correctly nested `voice_setting.voice_id` — previously missing, requiring users to bypass the SDK entirely diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md new file mode 100644 index 0000000..1da36aa --- /dev/null +++ b/compat/agora-agent-server-sdk/README.md @@ -0,0 +1,18 @@ +# agora-agent-server-sdk + +This package has been renamed to `agora-agents`. 
+ +New projects should install: + +```sh +pip install agora-agents +``` + +This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + +```python +from agora_agent import Agora, Area +from agora_agent_server_sdk_compat import Agora, Area +``` + +Maintainers: the dual-publish steps live in the `publish` job of `.github/workflows/ci.yml`, not in the root README. diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml new file mode 100644 index 0000000..ac93128 --- /dev/null +++ b/compat/agora-agent-server-sdk/pyproject.toml @@ -0,0 +1,42 @@ +[project] +name = "agora-agent-server-sdk" + +[tool.poetry] +name = "agora-agent-server-sdk" +version = "v2.0.0" +description = "Compatibility shim for the renamed agora-agents package." +readme = "README.md" +authors = [] +keywords = [] + +classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" +] +packages = [ + { include = "agora_agent_server_sdk_compat", from = "src"} +] + +[tool.poetry.urls] +Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + +[tool.poetry.dependencies] +python = "^3.8" +agora-agents = ">=2.0.0,<3.0.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git
a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py new file mode 100644 index 0000000..6283244 --- /dev/null +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py @@ -0,0 +1,13 @@ +"""Compatibility re-exports for the renamed agora-agents package.""" + +import agora_agent as _agora_agent + +__all__ = getattr(_agora_agent, "__all__", []) + + +def __getattr__(name: str): + return getattr(_agora_agent, name) + + +def __dir__(): + return dir(_agora_agent) diff --git a/docs/concepts/agent.md b/docs/concepts/agent.md index 0a8b159..dd9d3ed 100644 --- a/docs/concepts/agent.md +++ b/docs/concepts/agent.md @@ -12,7 +12,7 @@ The `Agent` class is a fluent builder for configuring AI agent properties. It co ```python -from agora_agent.agentkit import Agent +from agora_agent import Agent agent = Agent( name='support-assistant', @@ -75,8 +75,8 @@ Each `with_*` method returns a **new** `Agent` instance — the original is unch ```python -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import Agent +from agora_agent import OpenAI, ElevenLabsTTS, DeepgramSTT agent = ( Agent(name='my-agent', instructions='You are a helpful assistant.') @@ -92,9 +92,7 @@ Because each `with_*` call returns a new `Agent`, you can build a base configura ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import Agent, Agora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT client = Agora(area=Area.US, app_id='your-app-id', app_certificate='your-app-certificate') diff --git a/docs/concepts/architecture.md b/docs/concepts/architecture.md index 42dece9..f2c058c 100644 --- a/docs/concepts/architecture.md +++ b/docs/concepts/architecture.md @@ -6,9 +6,9 @@ description: How the 
Python SDK layers are structured and when to use each. # Architecture -## Two-Layer Design +## Three-Layer Design -The Python SDK has two layers: +The Python SDK has three layers: ``` +--------------------------------------------------+ @@ -22,7 +22,7 @@ The Python SDK has two layers: +--------------------------------------------------+ ``` -### Agentkit Layer (`agora_agent.agentkit`) +### AgentKit Layer (`agora_agent.agentkit`) This is the primary developer-facing API. It provides: @@ -83,23 +83,20 @@ await session.stop() The `Agent` builder class is the same for both — it does not make HTTP calls, so it has no async variant. -## Import Paths +## Import paths - ```python -# Top-level client and types -from agora_agent import Agora, AsyncAgora, Area, Pool - -# Agentkit layer -from agora_agent.agentkit import Agent, AgentSession -from agora_agent.agentkit.agent_session import AsyncAgentSession - -# Vendor classes -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT - -# Token helpers -from agora_agent.agentkit.token import generate_rtc_token - -# Also available from top-level -from agora_agent import Agent, AgentSession, AsyncAgentSession, generate_rtc_token +from agora_agent import ( + Agent, + AgentSession, + AsyncAgentSession, + Agora, + AsyncAgora, + Area, + Pool, + OpenAI, + ElevenLabsTTS, + DeepgramSTT, + generate_rtc_token, +) ``` diff --git a/docs/concepts/session.md b/docs/concepts/session.md index 4c9f17d..e4883f2 100644 --- a/docs/concepts/session.md +++ b/docs/concepts/session.md @@ -35,9 +35,7 @@ Use `Agent.create_session()` to create a session: ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import Agent, Agora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT client = Agora(area=Area.US, app_id='your-app-id', app_certificate='your-app-certificate') @@ -147,13 +145,13 @@ session.off('started', on_started) 
| `session.app_id` | `str` | The Agora App ID | | `session.raw` | `AgentsClient` | Direct access to the Fern-generated agents client | -## `session.raw` — Escape Hatch +## Direct API access with `session.raw` -If the agentkit does not yet expose a method for a new API endpoint, use `session.raw` to access the underlying Fern-generated `AgentsClient` (sync) or `AsyncAgentsClient` (async) directly: +If AgentKit does not yet expose a method for a new API endpoint, use `session.raw` to access the generated `AgentsClient` (sync) or `AsyncAgentsClient` (async) directly: ```python -# Access any Fern-generated method +# Access any generated REST method response = session.raw.list(session.app_id) ``` diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 7f22d8e..8d58cd1 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -8,11 +8,11 @@ description: Typed vendor classes for LLM, TTS, STT, MLLM, and Avatar providers. The SDK provides typed vendor classes for every supported provider. Each vendor class validates its configuration with Pydantic and produces the correct API payload automatically. -All vendor classes are available from `agora_agent.agentkit.vendors`: +All vendor classes are imported from `agora_agent`. ```python -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT +from agora_agent import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT ``` ## LLM Vendors @@ -25,10 +25,15 @@ Used with `agent.with_llm()` for the cascading flow (ASR → LLM → TTS). 
| `AzureOpenAI` | Azure OpenAI | `api_key`, `endpoint`, `deployment_name` | | `Anthropic` | Anthropic | `api_key` | | `Gemini` | Google Gemini | `api_key` | +| `Groq` | Groq | `api_key` | +| `VertexAILLM` | Google Vertex AI | `api_key`, `project_id`, `location` | +| `AmazonBedrock` | Amazon Bedrock | `api_key`, `url`, `model` | +| `Dify` | Dify | `api_key`, `url` | +| `CustomLLM` | OpenAI-compatible LLM | `api_key`, `base_url`, `model` | ```python -from agora_agent.agentkit.vendors import OpenAI +from agora_agent import OpenAI llm = OpenAI(api_key='your-openai-key', model='gpt-4o-mini') ``` @@ -55,7 +60,7 @@ Used with `agent.with_tts()`. Each TTS vendor produces audio at a specific sampl ```python -from agora_agent.agentkit.vendors import ElevenLabsTTS +from agora_agent import ElevenLabsTTS tts = ElevenLabsTTS( key='your-elevenlabs-key', @@ -83,7 +88,7 @@ Used with `agent.with_stt()`. ```python -from agora_agent.agentkit.vendors import DeepgramSTT +from agora_agent import DeepgramSTT stt = DeepgramSTT(api_key='your-deepgram-key', language='en-US', model='nova-2') ``` @@ -97,26 +102,30 @@ Used with `agent.with_mllm()` for the [MLLM flow](../guides/mllm-flow.md). These | `OpenAIRealtime` | OpenAI Realtime | `api_key`; optional `turn_detection` | | `GeminiLive` | Google Gemini Live API | `api_key`, `model`; optional `turn_detection` | | `VertexAI` | Vertex AI (Gemini Live) | `model`, `project_id`, `location`, `adc_credentials_string`; optional `turn_detection` | +| `XaiGrok` | xAI Grok (`mllm.vendor`: `xai`) | `api_key`; optional `voice`, `language`, `sample_rate`, `turn_detection` | ```python -from agora_agent.agentkit.vendors import OpenAIRealtime +from agora_agent import OpenAIRealtime mllm = OpenAIRealtime(api_key='your-openai-key', model='gpt-4o-realtime-preview') ``` ## Avatar Vendors -Used with `agent.with_avatar()`. Avatars require specific TTS sample rates — see [Avatar Integration](../guides/avatars.md). 
+Used with `agent.with_avatar()` in the cascading ASR + LLM + TTS pipeline. Some avatars require specific TTS sample rates — see [Avatar Integration](../guides/avatars.md). | Class | Provider | Required Parameters | Required TTS Sample Rate | |---|---|---|---| -| `HeyGenAvatar` | HeyGen | `api_key`, `quality`, `agora_uid` | 24000 Hz | -| `AkoolAvatar` | Akool | `api_key`, `agora_uid` | 16000 Hz | +| `HeyGenAvatar` | HeyGen (deprecated alias) | `api_key`, `quality`, `agora_uid` | 24000 Hz | +| `LiveAvatarAvatar` | LiveAvatar | `api_key`, `quality`, `agora_uid` | 24000 Hz | +| `AkoolAvatar` | Akool | `api_key` | 16000 Hz | +| `AnamAvatar` | Anam | `api_key` | None | +| `GenericAvatar` | Generic Avatar | `api_key`, `api_base_url`, `avatar_id`, `agora_uid` | None | ```python -from agora_agent.agentkit.vendors import HeyGenAvatar +from agora_agent import HeyGenAvatar avatar = HeyGenAvatar(api_key='your-heygen-key', quality='medium', agora_uid='2') ``` diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md index b508647..74c62cd 100644 --- a/docs/getting-started/authentication.md +++ b/docs/getting-started/authentication.md @@ -1,20 +1,17 @@ --- sidebar_position: 2 title: Authentication -description: Configure the Python SDK with the recommended app-credentials flow and understand the supported auth modes. +description: Configure the Python SDK with app credentials and understand other supported auth modes. --- # Authentication -The recommended production path is app credentials mode. +Create `Agora` or `AsyncAgora` with `app_id` and `app_certificate` only. The SDK mints a fresh ConvoAI REST token for each API call and generates the RTC join token when the session starts. -Create `Agora` or `AsyncAgora` with `app_id` and `app_certificate`, then let `AgentSession` generate the ConvoAI REST auth token and the RTC join token automatically. 
- -## Recommended: app credentials +## App credentials ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent, AgentPresets +from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI, MiniMaxTTS client = Agora( area=Area.US, @@ -22,40 +19,33 @@ client = Agora( app_certificate="your-app-certificate", ) -agent = Agent(instructions="Be concise.") +agent = ( + Agent(instructions="Be concise.") + .with_stt(DeepgramSTT(model="nova-3")) + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) +) session = agent.create_session( client, channel="room-123", agent_uid="1", remote_uids=["100"], - preset=[ - AgentPresets.asr.deepgram_nova_3, - AgentPresets.llm.openai_gpt_5_mini, - AgentPresets.tts.openai_tts_1, - ], ) ``` -## Why this is the default - -- The SDK handles ConvoAI REST auth and RTC join token generation for you. -- Your onboarding code stays focused on agent behavior instead of auth plumbing. -- Your quick start code stays vendor-key free when you use presets. - -## Other supported modes - -The SDK also supports app-credentials mode and Basic Auth, but they are intentionally not the default onboarding path. +## Why app credentials -- App credentials are useful when your backend wants the SDK to mint ConvoAI REST tokens automatically. -- Basic Auth is supported for legacy integrations and account-level workflows. +- Fresh short-lived tokens per API call instead of reusing long-lived credentials +- No Customer ID / Customer Secret in request headers +- No manual REST or RTC token provisioning in application code -## Inspecting the resolved auth mode +## Inspecting auth mode ```python print(client.auth_mode) # "app-credentials" ``` -## Other supported modes +## Legacy auth modes -`auth_token` and Basic Auth are still supported for advanced or legacy cases, but they are not the default onboarding path. 
+The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index c14bdb2..8fca9ab 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -13,24 +13,22 @@ description: Install the Agora Conversational AI Python SDK. ## Install with pip ```sh -pip install agora-agent-sdk +pip install agora-agents ``` ## Install with Poetry ```sh -poetry add agora-agent-sdk +poetry add agora-agents ``` -## Dependencies +## Imports -The following packages are installed automatically: +```python +from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI +``` -| Package | Purpose | -|---|---| -| `httpx` (>= 0.21.2) | HTTP client for sync and async requests | -| `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | -| `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | +The package installs as `agora-agents` and imports as `agora_agent`. ## Sync vs. Async @@ -47,4 +45,23 @@ from agora_agent import Agora, Area from agora_agent import AsyncAgora, AsyncAgentSession, Area ``` -Both clients share the same constructor parameters and capabilities. See [Authentication](./authentication.md) for setup details. 
+## Dependencies + +| Package | Purpose | +| ------------------------------ | ------------------------------------------------------ | +| `httpx` (>= 0.21.2) | HTTP client for sync and async requests | +| `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | +| `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + +## Next steps + +- [Authentication](./authentication.md) — configure your credentials +- [Quick Start](./quick-start.md) — build your first conversational agent + +## Migrating from a previous package name + +The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + +The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + +For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md index 53ca81d..472ac57 100644 --- a/docs/getting-started/quick-start.md +++ b/docs/getting-started/quick-start.md @@ -1,23 +1,22 @@ --- sidebar_position: 3 title: Quick Start -description: Build and run your first Agora Conversational AI agent in Python with app credentials and presets. +description: Build and run your first Agora Conversational AI agent in Python with app credentials and the builder API. 
--- # Quick Start -This guide uses the recommended onboarding path: +This guide starts with the standard AgentKit path: - `app_id`, `app_certificate`, and `area` on `Agora` or `AsyncAgora` -- `preset` for Agora-managed ASR, LLM, and TTS +- the `Agent` builder with `.with_stt()`, `.with_llm()`, and `.with_tts()` - automatic ConvoAI REST auth and RTC join token generation -- no vendor API keys in application code +- no vendor API keys when using supported Agora-managed models ## Sync example ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent, AgentPresets +from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI, MiniMaxTTS def main() -> None: @@ -27,12 +26,16 @@ def main() -> None: app_certificate="your-app-certificate", ) - # Agent-level behavior lives here. Vendor selection comes from presets below. - agent = Agent( - name="support-assistant", - instructions="You are a concise support voice assistant.", - greeting="Hello! How can I help you today?", - max_history=10, + agent = ( + Agent( + name="support-assistant", + instructions="You are a concise support voice assistant.", + greeting="Hello! How can I help you today?", + max_history=10, + ) + .with_stt(DeepgramSTT(model="nova-3", language="en")) + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) ) session = agent.create_session( @@ -41,11 +44,6 @@ def main() -> None: agent_uid="1", remote_uids=["100"], idle_timeout=120, - preset=[ - AgentPresets.asr.deepgram_nova_3, - AgentPresets.llm.openai_gpt_5_mini, - AgentPresets.tts.openai_tts_1, - ], ) agent_session_id = session.start() @@ -63,22 +61,21 @@ if __name__ == "__main__": 1. `Agora` runs in app-credentials mode when you pass `app_id` and `app_certificate` only. 2. `Agent` holds reusable behavior such as instructions, greeting, and history settings. -3. `preset` tells Agora which managed ASR, LLM, and TTS vendors to run. -4. 
`session.start()` lets the SDK generate the required auth tokens automatically. -5. `session.start()` returns the unique agent session ID. +3. Vendor classes on the builder select the ASR, LLM, and TTS stack. Leave vendor credentials unset for supported Agora-managed models, or provide keys when you want BYOK. +4. `session.start()` generates the required auth tokens and returns the unique agent session ID. ## Async applications -For `asyncio` services, switch to `AsyncAgora` and `await` the session methods. The preset and token-auth flow stays the same. +For `asyncio` services, switch to `AsyncAgora` and `await` the session methods. The builder and app-credentials flow stay the same. ## When to use BYOK instead -Use presets when you want the fastest path to a working agent. +Use the builder without vendor API keys when you are using supported Agora-managed models. Use BYOK when you need to: - supply your own vendor API keys -- use models outside the preset catalog +- use models outside the Agora-managed catalog - point at custom vendor endpoints - manage vendor-specific parameters directly diff --git a/docs/guides/agent-builder-features.md b/docs/guides/agent-builder-features.md index 16efad6..3b55b49 100644 --- a/docs/guides/agent-builder-features.md +++ b/docs/guides/agent-builder-features.md @@ -30,9 +30,17 @@ For string values with a finite set of options (e.g. `data_channel`, `sal_mode`, SAL helps the agent focus on the primary speaker and suppress background noise. 
Enable it via `advanced_features` and configure with `with_sal`: ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent, AdvancedFeatures, SalConfig, SalModeValues -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import ( + Agent, + Agora, + Area, + AdvancedFeatures, + SalConfig, + SalModeValues, + OpenAI, + ElevenLabsTTS, + DeepgramSTT, +) agent = ( Agent( @@ -57,8 +65,7 @@ Use `SalModeValues.LOCKING` or `SalModeValues.RECOGNITION` for type safety. Enable MLLM, RTM, SAL, or tools: ```python -from agora_agent.agentkit import Agent, AdvancedFeatures -from agora_agent.agentkit.vendors import OpenAIRealtime +from agora_agent import Agent, AdvancedFeatures, OpenAIRealtime # MLLM mode (see mllm-flow guide) agent = Agent().with_mllm(OpenAIRealtime(api_key='...')) @@ -75,7 +82,7 @@ agent = Agent().with_tools() Configure silence handling, farewell behavior, and data channel: ```python -from agora_agent.agentkit import ( +from agora_agent import ( Agent, SessionParams, SilenceConfig, @@ -134,7 +141,7 @@ agent = ( Restrict which geographic regions the backend can use: ```python -from agora_agent.agentkit import Agent, GeofenceConfig, GeofenceArea, GeofenceExcludeArea +from agora_agent import Agent, GeofenceConfig, GeofenceArea, GeofenceExcludeArea agent = ( Agent() @@ -179,7 +186,7 @@ agent = ( Configure RTC media encryption: ```python -from agora_agent.agentkit import Agent, RtcConfig +from agora_agent import Agent, RtcConfig agent = ( Agent() @@ -198,7 +205,7 @@ agent = ( Play filler words while waiting for the LLM response: ```python -from agora_agent.agentkit import ( +from agora_agent import ( Agent, FillerWordsConfig, FillerWordsTrigger, @@ -235,7 +242,7 @@ agent = ( Read back configuration via properties: ```python -from agora_agent.agentkit import Agent, GeofenceConfig, GeofenceArea +from agora_agent import Agent, GeofenceConfig, GeofenceArea agent = ( Agent(max_history=20) @@ -260,7 +267,7 
@@ agent.config # Full read-only snapshot ```python from agora_agent import Agora, Area -from agora_agent.agentkit import ( +from agora_agent import ( Agent, AdvancedFeatures, SessionParams, @@ -277,7 +284,7 @@ from agora_agent.agentkit import ( DataChannel, FillerWordsSelectionRule, ) -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import OpenAI, ElevenLabsTTS, DeepgramSTT client = Agora( area=Area.US, diff --git a/docs/guides/avatars.md b/docs/guides/avatars.md index e015bb7..ca50966 100644 --- a/docs/guides/avatars.md +++ b/docs/guides/avatars.md @@ -1,32 +1,43 @@ --- sidebar_position: 3 title: Avatar Integration -description: Add a digital avatar (HeyGen or Akool) to your Conversational AI agent. +description: Add a digital avatar to your Conversational AI agent. --- # Avatar Integration -You can attach a digital avatar to your voice agent so that users see a visual representation of the AI speaking. Two avatar providers are supported: +You can attach a digital avatar to your voice agent so that users see a visual representation of the AI speaking. + +Avatars are currently supported only with the cascading ASR + LLM + TTS pipeline. MLLM sessions handle audio end-to-end and do not support avatars at this time. | Provider | Class | Required TTS Sample Rate | |---|---|---| -| HeyGen | `HeyGenAvatar` | 24000 Hz | +| LiveAvatar | `LiveAvatarAvatar` | 24000 Hz | +| HeyGen (deprecated alias) | `HeyGenAvatar` | 24000 Hz | | Akool | `AkoolAvatar` | 16000 Hz | +| Anam | `AnamAvatar` | None | +| Generic | `GenericAvatar` | None | + +## Token Model + +The agent and avatar join the same RTC channel with separate UIDs. The agent token is scoped to `agent_uid`; `avatar.params.agora_token` is scoped to the avatar `agora_uid`. + +When using `AgentSession.start()`, `agora_token` is optional for LiveAvatar, HeyGen, and Generic avatars. 
If omitted, AgentKit generates it with the same ConvoAI token path as the agent, using the avatar UID. You can still pass `agora_token` explicitly. ## Sample Rate Constraint -Each avatar vendor requires a specific TTS sample rate. The SDK validates this when you call `with_avatar()` — if the TTS sample rate does not match, a `ValueError` is raised immediately: +Some avatar vendors require a specific TTS sample rate. The SDK validates this when you add TTS or avatar configuration and again when the session starts. If the TTS sample rate does not match, a `ValueError` is raised: ``` ValueError: Avatar requires TTS sample rate of 24000 Hz, but TTS is configured with 16000 Hz. Please update your TTS sample_rate to 24000. ``` -This validation happens at build time (when chaining methods), not at runtime when the session starts. Python raises this as a `ValueError` — there is no compile-time check as in statically typed languages. +Python raises this as a `ValueError` — there is no compile-time check as in statically typed languages. -Additionally, if the TTS `sample_rate` is not explicitly set (returns `None`), the SDK issues a warning: +Additionally, if the TTS sample rate is not explicitly available, the SDK issues a warning through the session warning callback: ``` -UserWarning: Avatar requires TTS sample rate of 24000 Hz, but TTS sample_rate is not explicitly set. Please ensure your TTS provider is configured for 24000 Hz. +Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz. 
``` ## HeyGen Avatar (24 kHz) @@ -34,9 +45,7 @@ UserWarning: Avatar requires TTS sample rate of 24000 Hz, but TTS sample_rate is HeyGen requires a TTS vendor configured at 24000 Hz: ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT, HeyGenAvatar +from agora_agent import Agent, Agora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT, HeyGenAvatar client = Agora( area=Area.US, @@ -58,7 +67,7 @@ agent = ( api_key='your-heygen-key', quality='medium', agora_uid='2', - avatar_name='your-avatar-name', + avatar_id='your-avatar-id', )) ) @@ -68,12 +77,27 @@ session.say('Hello! I am your visual assistant.') session.stop() ``` +## Generic Avatar + +`GenericAvatar` supports custom avatar providers. `agora_appid`, `agora_channel`, and `agora_token` are optional when using `AgentSession.start()`. + +```python +from agora_agent import GenericAvatar + +agent = agent.with_avatar(GenericAvatar( + api_key='your-avatar-provider-key', + api_base_url='https://avatar-provider.example.com', + avatar_id='avatar-123', + agora_uid='2', +)) +``` + ## Akool Avatar (16 kHz) Akool requires a TTS vendor configured at 16000 Hz: ```python -from agora_agent.agentkit.vendors import ElevenLabsTTS, AkoolAvatar +from agora_agent import ElevenLabsTTS, AkoolAvatar agent = ( Agent(name='akool-agent', instructions='You are a helpful assistant.') @@ -143,15 +167,14 @@ If you call `with_avatar()` before `with_tts()`, the sample rate check is deferr | `api_key` | `str` | Yes | HeyGen API key | | `quality` | `str` | Yes | Avatar quality: `low`, `medium`, or `high` | | `agora_uid` | `str` | Yes | Agora UID for the avatar video stream | -| `avatar_name` | `str` | No | Avatar name | -| `voice_id` | `str` | No | Voice ID | -| `language` | `str` | No | Language code | -| `version` | `str` | No | API version (`v1` or `v2`) | +| `agora_token` | `str` | No | Avatar token, generated at session start when omitted | 
+| `avatar_id` | `str` | No | Avatar ID | +| `disable_idle_timeout` | `bool` | No | Disable idle timeout | +| `activity_idle_timeout` | `int` | No | Idle timeout in seconds | ## Akool Options | Parameter | Type | Required | Description | |---|---|---|---| | `api_key` | `str` | Yes | Akool API key | -| `agora_uid` | `str` | Yes | Agora UID for the avatar video stream | | `avatar_id` | `str` | No | Avatar ID | diff --git a/docs/guides/byok.md b/docs/guides/byok.md index 53fee78..3b03ebe 100644 --- a/docs/guides/byok.md +++ b/docs/guides/byok.md @@ -6,11 +6,11 @@ description: Bring your own vendor credentials and use custom vendor configurati # BYOK -Use BYOK when you want to provide vendor credentials yourself instead of relying on Agora-managed presets. +Use BYOK when you want to provide vendor credentials yourself instead of relying on Agora-managed models via the builder. Typical reasons: -- you need a vendor model that is not part of the preset catalog +- you need a vendor model outside the Agora-managed catalog - you want to point to a custom endpoint - you want direct control over vendor-specific parameters - your organization manages vendor billing separately from Agora @@ -20,9 +20,7 @@ Typical reasons: ```python import os -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import DeepgramSTT, ElevenLabsTTS, OpenAI +from agora_agent import Agent, Agora, Area, DeepgramSTT, ElevenLabsTTS, OpenAI def main() -> None: @@ -81,7 +79,7 @@ if __name__ == "__main__": main() ``` -## Presets vs BYOK +## Builder-managed vs BYOK -- Presets: fastest path, no vendor keys in app code -- BYOK: most control, your keys and your vendor configuration +- Builder without vendor keys: supported Agora-managed models +- BYOK: your keys and full vendor control diff --git a/docs/guides/cascading-flow.md b/docs/guides/cascading-flow.md index 6b4d59e..43ff2af 100644 --- a/docs/guides/cascading-flow.md +++ 
b/docs/guides/cascading-flow.md @@ -17,9 +17,7 @@ User audio → STT → LLM → TTS → Agent audio ### Sync ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import Agent, Agora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT client = Agora( area=Area.US, @@ -45,9 +43,7 @@ session.stop() ```python import asyncio -from agora_agent import AsyncAgora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent import Agent, AsyncAgora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT async def main(): client = AsyncAgora( @@ -77,9 +73,7 @@ asyncio.run(main()) This combination keeps everything within the Azure ecosystem: ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import AzureOpenAI, MicrosoftTTS, MicrosoftSTT +from agora_agent import Agent, Agora, Area, AzureOpenAI, MicrosoftTTS, MicrosoftSTT client = Agora( area=Area.EU, @@ -118,7 +112,7 @@ session.stop() All LLM vendors support optional parameters for fine-tuning: ```python -from agora_agent.agentkit.vendors import OpenAI +from agora_agent import OpenAI llm = OpenAI( api_key='your-openai-key', diff --git a/docs/guides/low-level-api.md b/docs/guides/low-level-api.md index 6677b45..47397b7 100644 --- a/docs/guides/low-level-api.md +++ b/docs/guides/low-level-api.md @@ -1,187 +1,55 @@ --- sidebar_position: 10 title: Low-Level API -description: Direct client.agents.start() usage without the builder pattern. +description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. --- # Low-Level API -For full control over request payloads you can call the generated clients directly and pass raw types such as `StartAgentsRequestProperties`, `Tts_Elevenlabs`, and `StartAgentsRequestPropertiesAsr`. 
Use this when you need vendor or options not exposed by the agentkit, or when integrating with generated types from the API spec. +Use the `Agent` builder and `AgentSession` for conversational agent starts. That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. -## Raw telephony and phone-number APIs - -AgentKit focuses on realtime agent session helpers. Telephony call status, call hangup, and phone-number management are exposed through the generated low-level clients: - -- `client.telephony` for call status and hangup operations -- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations +Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. -## Cascading flow (ASR → LLM → TTS) +## Client setup ```python from agora_agent import Agora, Area -from agora_agent.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, -) -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams -from agora_agent.types.tts import Tts_Elevenlabs client = Agora( area=Area.US, - app_id="YOUR_APP_ID", - app_certificate="YOUR_APP_CERTIFICATE", - auth_token="your-rest-auth-token", -) -client.agents.start( - client.app_id, - name="unique_name", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - vendor="deepgram", - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, - ), - tts=Tts_Elevenlabs( - params=ElevenLabsTtsParams( - key="YOUR_ELEVENLABS_API_KEY", - model_id="eleven_flash_v2_5", - voice_id="pNInz6obpgDQGcFmaJgB", - sample_rate=24000, - ), - ), - llm=StartAgentsRequestPropertiesLlm( - 
url="https://api.openai.com/v1/chat/completions", - api_key="", - system_messages=[ - {"role": "system", "content": "You are a helpful chatbot."} - ], - params={"model": "gpt-4o-mini"}, - max_history=32, - greeting_message="Hello, how can I assist you today?", - failure_message="Please hold on a second.", - ), - ), + app_id="your-app-id", + app_certificate="your-app-certificate", ) ``` -## Async (low-level) +## Raw telephony and phone-number APIs -```python -import asyncio -from agora_agent import Area, AsyncAgora -from agora_agent.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, -) -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams -from agora_agent.types.tts import Tts_Elevenlabs +AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: -client = AsyncAgora( - area=Area.US, - app_id="YOUR_APP_ID", - app_certificate="YOUR_APP_CERTIFICATE", - auth_token="your-rest-auth-token", +- `client.telephony` for call status and hangup operations +- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + +```python +calls = client.telephony.list( + appid=client.app_id, + type="sip", ) -async def main() -> None: - await client.agents.start( - client.app_id, - name="unique_name", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - vendor="deepgram", - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, - ), - tts=Tts_Elevenlabs( - params=ElevenLabsTtsParams( - key="YOUR_ELEVENLABS_API_KEY", - model_id="eleven_flash_v2_5", - voice_id="pNInz6obpgDQGcFmaJgB", - sample_rate=24000, - ), - ), - llm=StartAgentsRequestPropertiesLlm( - url="https://api.openai.com/v1/chat/completions", - api_key="", - system_messages=[ - {"role": "system", "content": "You are a 
helpful chatbot."} - ], - params={"model": "gpt-4o-mini"}, - max_history=32, - greeting_message="Hello, how can I assist you today?", - failure_message="Please hold on a second.", - ), - ), - ) - -asyncio.run(main()) +for call in calls: + print(call.id, call.state) ``` -## MLLM flow (multimodal) +## Direct agent APIs -For real-time audio with OpenAI Realtime or Google Gemini Live, use the MLLM flow instead of the cascading ASR → LLM → TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview). +`client.agents` exposes the generated REST surface for advanced integrations. Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. -```python -from agora_agent import Agora, Area -from agora_agent.agents import ( - StartAgentsRequestProperties, - StartAgentsRequestPropertiesMllm, - StartAgentsRequestPropertiesMllmVendor, - StartAgentsRequestPropertiesTts, - StartAgentsRequestPropertiesTtsVendor, - StartAgentsRequestPropertiesLlm, -) +If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: -client = Agora( - area=Area.US, - app_id="YOUR_APP_ID", - app_certificate="YOUR_APP_CERTIFICATE", - auth_token="your-rest-auth-token", -) - -client.agents.start( - client.app_id, - name="mllm_agent", - properties=StartAgentsRequestProperties( - channel="channel_name", - token="your_token", - agent_rtc_uid="1001", - remote_rtc_uids=["1002"], - idle_timeout=120, - mllm=StartAgentsRequestPropertiesMllm( - enable=True, - url="wss://api.openai.com/v1/realtime", - api_key="", - vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, - params={ - "model": "gpt-4o-realtime-preview", - "voice": "alloy", - }, - input_modalities=["audio"], - output_modalities=["text", "audio"], - greeting_message="Hello! 
I'm ready to chat in real-time.", - turn_detection={ - "mode": "server_vad", - "server_vad_config": { - "idle_timeout_ms": 5000, - }, - }, - ), - ), +```python +info = session.raw.get( + appid=session.app_id, + agent_id=session.id, ) ``` -For more on the agentkit-based MLLM flow, see [MLLM Flow](./mllm-flow.md). +You must pass `appid` and `agent_id` manually when using generated raw methods. diff --git a/docs/guides/mllm-flow.md b/docs/guides/mllm-flow.md index 15d7c3e..328514b 100644 --- a/docs/guides/mllm-flow.md +++ b/docs/guides/mllm-flow.md @@ -1,29 +1,28 @@ --- sidebar_position: 2 title: MLLM Flow (Multimodal) -description: Use OpenAI Realtime or Gemini Live for end-to-end audio processing. +description: Use OpenAI Realtime, Gemini Live, Vertex AI, or xAI for end-to-end audio processing. --- # MLLM Flow (Multimodal) The MLLM (Multimodal LLM) flow uses a single model to handle both audio input and output — no separate STT or TTS step. This gives the model direct access to voice tone, pacing, and emotion. -Two MLLM vendors are supported: +MLLM vendors supported by AgentKit: - **OpenAI Realtime** — `gpt-4o-realtime-preview` and related models - **Gemini Live** — direct Google AI API access for audio-native Gemini models +- **Vertex AI** — Gemini Live through Google Cloud Vertex AI +- **xAI Grok** — xAI Realtime API ## Enable MLLM Mode -Call `agent.with_mllm(vendor)` to enable MLLM mode. The builder sets `mllm.enable = True` automatically. +Call `agent.with_mllm(vendor)` to enable MLLM mode. The builder sets `mllm.enable = True` automatically. MLLM sessions do not require TTS, STT, or LLM vendors. Avatars are currently supported only with the cascading ASR + LLM + TTS pipeline. 
```python -from agora_agent.agentkit import Agent +from agora_agent import Agent -agent = Agent( - name='realtime-agent', - instructions='You are a voice assistant.', -) +agent = Agent(name='realtime-agent') ``` ## OpenAI Realtime @@ -31,9 +30,7 @@ agent = Agent( ### Sync ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAIRealtime +from agora_agent import Agent, Agora, Area, OpenAIRealtime client = Agora( area=Area.US, @@ -42,10 +39,7 @@ client = Agora( ) agent = ( - Agent( - name='realtime-agent', - instructions='You are a helpful voice assistant.', - ) + Agent(name='realtime-agent') .with_mllm(OpenAIRealtime( api_key='your-openai-key', model='gpt-4o-realtime-preview', @@ -62,9 +56,7 @@ session.stop() ```python import asyncio -from agora_agent import AsyncAgora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import OpenAIRealtime +from agora_agent import Agent, AsyncAgora, Area, OpenAIRealtime async def main(): client = AsyncAgora( @@ -74,10 +66,7 @@ async def main(): ) agent = ( - Agent( - name='realtime-agent', - instructions='You are a helpful voice assistant.', - ) + Agent(name='realtime-agent') .with_mllm(OpenAIRealtime( api_key='your-openai-key', model='gpt-4o-realtime-preview', @@ -96,9 +85,7 @@ asyncio.run(main()) Gemini Live uses a Google AI API key: ```python -from agora_agent import Agora, Area -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import GeminiLive +from agora_agent import Agent, Agora, Area, GeminiLive client = Agora( area=Area.AP, @@ -107,10 +94,7 @@ client = Agora( ) agent = ( - Agent( - name='gemini-agent', - instructions='You are a helpful multilingual assistant.', - ) + Agent(name='gemini-agent') .with_mllm(GeminiLive( api_key='your-google-ai-api-key', model='gemini-live-2.5-flash', @@ -123,10 +107,35 @@ agent_id = session.start() session.stop() ``` +## xAI Grok + +```python +from agora_agent 
import Agent, Agora, Area, XaiGrok + +client = Agora(area=Area.US, app_id='your-app-id', app_certificate='your-app-certificate') + +agent = ( + Agent(name='xai-agent') + .with_mllm(XaiGrok( + api_key='your-xai-key', + voice='eve', + language='en', + sample_rate=24000, + output_modalities=['audio', 'text'], + )) +) + +session = agent.create_session(client, channel='xai-room', agent_uid='1', remote_uids=['100']) +agent_id = session.start() +session.stop() +``` + +For xAI turn detection, use `mllm.turn_detection` with `agora_vad` or `server_vad`. + ## OpenAI Realtime with Custom Options ```python -from agora_agent.agentkit.vendors import OpenAIRealtime +from agora_agent import OpenAIRealtime mllm = OpenAIRealtime( api_key='your-openai-key', @@ -145,10 +154,10 @@ mllm = OpenAIRealtime( |---|---|---| | Latency | Lower — single model, no pipeline | Higher — three models in sequence | | Voice control | Model-dependent | Full vendor choice for TTS | -| Vendor flexibility | Limited (OpenAI Realtime or Gemini Live) | Mix and match 4 LLMs, 12 TTS, 10 STT | +| Vendor flexibility | Limited to supported MLLM providers (OpenAI Realtime, Gemini Live, Vertex AI, xAI Grok) | Mix and match LLM, TTS, and STT vendors | | Audio understanding | Model hears tone, pacing, emotion | STT produces text only | ## Next Steps - For the cascading pipeline, see [Cascading Flow](./cascading-flow.md) -- To add a visual avatar, see [Avatars](./avatars.md) +- To add a visual avatar, use the cascading pipeline and see [Avatars](./avatars.md) diff --git a/docs/guides/regional-routing.md b/docs/guides/regional-routing.md index 4c4ffcb..00584d0 100644 --- a/docs/guides/regional-routing.md +++ b/docs/guides/regional-routing.md @@ -108,7 +108,7 @@ print(client.get_current_url()) ## Accessing the Pool Directly -For advanced use, access the underlying `Pool` object: +For advanced use, access the `Pool` object: ```python pool = client.pool @@ -127,4 +127,4 @@ area = pool.get_area() | `next_region()` | sync | 
sync | Cycle to next region prefix | | `select_best_domain()` | sync | **`async` (requires `await`)** | DNS-based domain selection | | `get_current_url()` | sync | sync | Get the current base URL | -| `pool` (property) | sync | sync | Access the underlying `Pool` object | +| `pool` (property) | sync | sync | Access the `Pool` object | diff --git a/docs/index.md b/docs/index.md index 9063734..f6d2b82 100644 --- a/docs/index.md +++ b/docs/index.md @@ -17,42 +17,44 @@ The Agora Conversational AI Python SDK lets you build voice-powered AI agents on **Cascading flow** uses ASR -> LLM -> TTS and supports the broadest set of vendor combinations. -**MLLM flow** uses a multimodal model such as OpenAI Realtime or Gemini Live for end-to-end audio. +**MLLM flow** uses a multimodal model such as OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok for end-to-end audio. -## Choose a starting point +## Start here -- Use [Quick Start](./getting-started/quick-start.md) if you want the recommended preset-based path with no vendor keys. -- Use [MLLM Flow](./guides/mllm-flow.md) if you want realtime end-to-end audio with OpenAI Realtime or Gemini Live. -- Use [Cascading Flow](./guides/cascading-flow.md) if you want separate ASR, LLM, and TTS vendors. +- Start with [Quick Start](./getting-started/quick-start.md). It shows the baseline app-credentials setup and starts a cascading ASR -> LLM -> TTS agent. +- Use [MLLM Flow](./guides/mllm-flow.md) when your agent uses one realtime multimodal model, such as OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok. +- Use [Cascading Flow](./guides/cascading-flow.md) for more examples of the default ASR -> LLM -> TTS flow, including provider-specific configuration. 
-## SDK layers +## How the SDK is organized | Layer | What it does | When to use | |---|---|---| -| **Agentkit** (`Agent`, `AgentSession`, vendors, presets) | High-level builder pattern, lifecycle, typed vendors | Most use cases | -| **Fern-generated core** (`client.agents`, `client.telephony`) | Direct REST client mapping every API endpoint | Advanced use cases | +| **AgentKit** (`Agent`, `AgentSession`, vendor classes) | High-level builder pattern, lifecycle, typed vendors | Most use cases | +| **Generated REST clients** (`client.agents`, `client.telephony`) | Typed access to REST APIs not covered by AgentKit | Advanced use cases | ## Documentation | Section | What you will learn | |---|---| | [Installation](./getting-started/installation.md) | Install the SDK and prerequisites | -| [Authentication](./getting-started/authentication.md) | Token auth for REST and RTC joins | -| [Quick Start](./getting-started/quick-start.md) | Recommended preset-based onboarding flow | +| [Authentication](./getting-started/authentication.md) | App credentials and other auth modes | +| [Quick Start](./getting-started/quick-start.md) | App credentials and AgentKit | +| [Agent Builder Features](./guides/agent-builder-features.md) | Turn detection, SAL, filler words, and advanced agent options | | [BYOK](./guides/byok.md) | Bring your own vendor credentials and config | -| [Architecture](./concepts/architecture.md) | Understand the SDK layers and client types | +| [Architecture](./concepts/architecture.md) | SDK structure and generated REST clients | | [Agent](./concepts/agent.md) | Configure agents with the fluent builder | | [AgentSession](./concepts/session.md) | Manage the agent lifecycle | | [Vendors](./concepts/vendors.md) | Browse all LLM, TTS, STT, MLLM, and Avatar providers | | [Cascading Flow](./guides/cascading-flow.md) | Build an ASR -> LLM -> TTS pipeline | -| [MLLM Flow](./guides/mllm-flow.md) | Use OpenAI Realtime or Gemini Live for end-to-end audio | -| 
[Avatars](./guides/avatars.md) | Add a digital avatar with HeyGen or Akool | +| [MLLM Flow](./guides/mllm-flow.md) | Use OpenAI Realtime, Gemini Live, Vertex AI, or xAI Grok for end-to-end audio | +| [Avatars](./guides/avatars.md) | Add a digital avatar with LiveAvatar, Akool, Anam, or Generic Avatar | | [Regional Routing](./guides/regional-routing.md) | Route requests to the nearest region | | [Error Handling](./guides/error-handling.md) | Handle API errors with ApiError | | [Pagination](./guides/pagination.md) | Iterate over paginated list endpoints | | [Advanced](./guides/advanced.md) | Raw response, retries, timeouts, custom httpx client | -| [Low-Level API](./guides/low-level-api.md) | Direct `client.agents.start()` usage | +| [Low-Level API](./guides/low-level-api.md) | Generated REST APIs | | [Client Reference](./reference/client.md) | Full `Agora` / `AsyncAgora` API | | [Agent Reference](./reference/agent.md) | Full `Agent` builder API | | [Session Reference](./reference/session.md) | Full `AgentSession` / `AsyncAgentSession` API | | [Vendor Reference](./reference/vendors.md) | Constructor options for all vendor classes | +| [Error Reference](./reference/errors.md) | v2.7 status codes and error reason values | diff --git a/docs/reference/agent.md b/docs/reference/agent.md index 0e7f5e8..93770e9 100644 --- a/docs/reference/agent.md +++ b/docs/reference/agent.md @@ -6,7 +6,7 @@ description: Full API reference for the Python Agent builder class. # Agent Reference -**Import:** `from agora_agent.agentkit import Agent` or `from agora_agent import Agent` +**Import:** `from agora_agent import Agent` ## Constructor @@ -57,7 +57,7 @@ Set the LLM vendor for cascading flow. ```python -from agora_agent.agentkit.vendors import OpenAI +from agora_agent import OpenAI agent = Agent().with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) ``` @@ -67,7 +67,7 @@ Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. 
```python -from agora_agent.agentkit.vendors import ElevenLabsTTS +from agora_agent import ElevenLabsTTS agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) ``` @@ -77,27 +77,29 @@ Set the STT (ASR) vendor. ```python -from agora_agent.agentkit.vendors import DeepgramSTT +from agora_agent import DeepgramSTT agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) ``` ### `with_mllm(vendor: BaseMLLM) -> Agent` -Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. +Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. ```python -from agora_agent.agentkit.vendors import OpenAIRealtime +from agora_agent import OpenAIRealtime agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) ``` ### `with_avatar(vendor: BaseAvatar) -> Agent` -Set the avatar vendor. Raises `ValueError` if TTS sample rate does not match the avatar's `required_sample_rate`. +Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + +Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. ```python -from agora_agent.agentkit.vendors import HeyGenAvatar +from agora_agent import HeyGenAvatar agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) ``` @@ -107,6 +109,22 @@ agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', ago Override cascading-flow turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection. 
Use `with_interruption()` for interruption behavior and MLLM vendor `turn_detection` for MLLM turn detection. +Pause-state detection is configured under semantic end-of-speech: + +```python +agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, +}) +``` + ### `with_interruption(config: InterruptionConfig) -> Agent` Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. @@ -131,13 +149,19 @@ Set SAL (Selective Attention Locking) configuration. Set advanced features (e.g. `{'enable_rtm': True}`). +When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + ### `with_tools(enabled: bool = True) -> Agent` Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. ### `with_parameters(parameters: SessionParams) -> Agent` -Set session parameters (silence config, farewell config, data channel, etc.). +Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + +### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + +Set `parameters.audio_scenario` without replacing existing session parameters. ### `with_failure_message(message: str) -> Agent` @@ -145,7 +169,7 @@ Set the message spoken via TTS when the LLM call fails. ### `with_max_history(max_history: int) -> Agent` -Set the maximum conversation history length. +Set the maximum conversation history length for the standard ASR + LLM + TTS pipeline. The v2.7 MLLM core type does not expose `max_history`. 
### `with_geofence(geofence: GeofenceConfig) -> Agent` @@ -240,3 +264,9 @@ to_properties( | `rtc` | `Optional[RtcConfig]` | RTC configuration | | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | | `config` | `Dict[str, Any]` | Full configuration dict | + +## Type aliases + +Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + +Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. diff --git a/docs/reference/client.md b/docs/reference/client.md index 4182214..9c1a38c 100644 --- a/docs/reference/client.md +++ b/docs/reference/client.md @@ -100,7 +100,7 @@ url = client.get_current_url() ### `pool` (property) -Access the underlying `Pool` object for advanced domain management. +Access the `Pool` object for advanced domain management. ```python diff --git a/docs/reference/errors.md b/docs/reference/errors.md new file mode 100644 index 0000000..d91f335 --- /dev/null +++ b/docs/reference/errors.md @@ -0,0 +1,29 @@ +--- +sidebar_position: 5 +title: Error Responses +description: Error handling notes for Conversational AI API responses. +--- + +# Error Responses + +API v2.7 updates the error status codes and `reason` values surfaced through the generated client. 
+ +## Status Codes + +In addition to existing validation and task errors, integrations should handle: + +- `401` — authentication failed +- `429` — rate limit exceeded +- `500` — internal server error + +## Reason Migration + +If your application branches on `AgentErrorResponse.reason`, update handlers for the v2.7 reasons: + +| Previous | v2.7 replacements | +|---|---| +| `InvalidRequest` | `InvalidRequestBody`, `MissingRequiredField`, `InvalidFieldValue` | + +New reasons include `ServiceNotEnabled`, `AccountSuspended`, and `ResourceAllocationFailed`. + +Prefer treating unknown reasons as retryable only when the HTTP status and operation are safe to retry. diff --git a/docs/reference/session.md b/docs/reference/session.md index 955e08f..63402f6 100644 --- a/docs/reference/session.md +++ b/docs/reference/session.md @@ -9,8 +9,8 @@ description: Full API reference for the Python AgentSession class. **Import:** ```python -from agora_agent.agentkit import AgentSession -from agora_agent.agentkit.agent_session import AsyncAgentSession +from agora_agent import AgentSession +from agora_agent import AsyncAgentSession # or from top-level: from agora_agent import AgentSession, AsyncAgentSession ``` @@ -56,14 +56,14 @@ AgentSession( ### `start()` -Start the agent session. Generates an RTC token if not provided, validates avatar/TTS config, and calls the Agora API. +Start the agent session. Generates an RTC token if not provided, validates avatar/TTS config for cascading sessions, and calls the Agora API. MLLM sessions do not require TTS; an enabled avatar is rejected when MLLM is configured (a disabled avatar is allowed). 
| | Sync (`AgentSession`) | Async (`AsyncAgentSession`) | |---|---|---| | **Signature** | `start() -> str` | `async start() -> str` | | **Returns** | Agent ID | Agent ID | | **Raises** | `RuntimeError` if not in `idle`, `stopped`, or `error` state | Same | -| **Raises** | `ValueError` if avatar/TTS sample rate mismatch | Same | +| **Raises** | `ValueError` if avatar/TTS sample rate mismatch or an enabled avatar is used with MLLM | Same | ```python @@ -154,6 +154,16 @@ session.update(properties) await session.update(properties) ``` +### `think(text, ...)` + +Inject a custom text instruction into the running agent. + +In API v2.7, omitting `on_listening_action` uses the server default `interrupt`. Pass `on_listening_action='inject'` explicitly to preserve the pre-v2.7 behavior. + +```python +session.think('Summarize the last answer', on_listening_action='inject') +``` + ### `get_history()` Retrieve the conversation history. @@ -190,6 +200,22 @@ info = session.get_info() info = await session.get_info() ``` +### `get_turns(page_index=None, page_size=None)` + +Retrieve paginated turn analytics for a completed or running session. In v2.7, the API defaults to page 1 and up to 50 turns per page. Responses include `agent_id`, `name`, `channel`, `total_turn_count`, `pagination`, and `turns`. + +```python +page = session.get_turns(page_index=1, page_size=50) +``` + +### `get_all_turns(page_size=None)` + +Fetch all turn pages and return a single `GetTurnsAgentsResponse` with the combined `turns` list. + +```python +all_turns = session.get_all_turns(page_size=50) +``` + ### `on(event, handler)` Register an event handler. This method is synchronous on both `AgentSession` and `AsyncAgentSession`. 
diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index 798a7f3..7395eea 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -6,17 +6,19 @@ description: Constructor options for all LLM, TTS, STT, MLLM, and Avatar vendor # Vendor Reference -All vendor classes are available from `agora_agent.agentkit.vendors`: +All vendor classes are available from `agora_agent`: ```python -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT, OpenAIRealtime, GeminiLive, HeyGenAvatar +from agora_agent import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT, OpenAIRealtime, XaiGrok, GenericAvatar ``` --- ## LLM Vendors +`greeting_configs` accepts either a dict or `LlmGreetingConfigs`. In v2.7, `greeting_configs.interruptable=False` makes the greeting uninterruptible; `True` follows the global `interruption` settings. + ### `OpenAI` | Parameter | Type | Required | Default | Description | @@ -39,7 +41,7 @@ from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramTTS, Dee ```python -from agora_agent.agentkit.vendors import OpenAI +from agora_agent import OpenAI llm = OpenAI(api_key='your-key', model='gpt-4o-mini', temperature=0.7) ``` @@ -67,7 +69,7 @@ llm = OpenAI(api_key='your-key', model='gpt-4o-mini', temperature=0.7) ```python -from agora_agent.agentkit.vendors import AzureOpenAI +from agora_agent import AzureOpenAI llm = AzureOpenAI( api_key='your-azure-key', @@ -97,7 +99,7 @@ llm = AzureOpenAI( ```python -from agora_agent.agentkit.vendors import Anthropic +from agora_agent import Anthropic llm = Anthropic(api_key='your-anthropic-key', model='claude-3-5-sonnet-20241022') ``` @@ -124,11 +126,23 @@ llm = Anthropic(api_key='your-anthropic-key', model='claude-3-5-sonnet-20241022' ```python -from agora_agent.agentkit.vendors import Gemini +from agora_agent import Gemini llm = Gemini(api_key='your-google-key', model='gemini-2.0-flash-exp') ``` +### Other LLM vendors + +The SDK also includes named 
helpers for the remaining Agora-supported LLM providers. These helpers choose the correct request format internally. + +| Class | Provider | Key parameters | +|---|---|---| +| `Groq` | Groq | `api_key`, `model`, `base_url?` | +| `VertexAILLM` | Google Vertex AI | `api_key`, `model`, `project_id`, `location`, `url?` | +| `AmazonBedrock` | Amazon Bedrock | `api_key`, `url`, `model` | +| `Dify` | Dify | `api_key`, `url`, `user?`, `conversation_id?` | +| `CustomLLM` | OpenAI-compatible LLM | `api_key`, `model`, `base_url` | + --- ## TTS Vendors @@ -364,8 +378,6 @@ Fixed sample rate: 24000 Hz. | `url` | `str` | No | `None` | Custom WebSocket URL | | `greeting_message` | `str` | No | `None` | Greeting message | | `failure_message` | `str` | No | `None` | Message played when the model call fails | -| `max_history` | `int` | No | `None` | Maximum conversation history length | -| `predefined_tools` | `List[str]` | No | `None` | Predefined tools (e.g., `["_publish_message"]`) | | `input_modalities` | `List[str]` | No | `None` | Input modalities | | `output_modalities` | `List[str]` | No | `None` | Output modalities | | `messages` | `List[Dict]` | No | `None` | Conversation messages | @@ -383,8 +395,6 @@ Fixed sample rate: 24000 Hz. | `voice` | `str` | No | `None` | Voice name | | `greeting_message` | `str` | No | `None` | Greeting message | | `failure_message` | `str` | No | `None` | Message played when the model call fails | -| `max_history` | `int` | No | `None` | Maximum conversation history length | -| `predefined_tools` | `List[str]` | No | `None` | Predefined tools (e.g., `["_publish_message"]`) | | `input_modalities` | `List[str]` | No | `None` | Input modalities | | `output_modalities` | `List[str]` | No | `None` | Output modalities | | `messages` | `List[Dict]` | No | `None` | Conversation messages | @@ -403,18 +413,37 @@ Fixed sample rate: 24000 Hz. 
| `voice` | `str` | No | `None` | Voice name (e.g., `Aoede`, `Charon`) | | `greeting_message` | `str` | No | `None` | Greeting message | | `failure_message` | `str` | No | `None` | Message played when the model call fails | -| `max_history` | `int` | No | `None` | Maximum conversation history length | -| `predefined_tools` | `List[str]` | No | `None` | Predefined tools (e.g., `["_publish_message"]`) | | `input_modalities` | `List[str]` | No | `None` | Input modalities | | `output_modalities` | `List[str]` | No | `None` | Output modalities | | `messages` | `List[Dict]` | No | `None` | Conversation messages | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | | `turn_detection` | `MllmTurnDetectionConfig` | No | `None` | MLLM turn detection configuration; overrides top-level `turn_detection` | +### `XaiGrok` + +xAI Grok MLLM vendor (`mllm.vendor`: `"xai"`). Matches the [xAI Grok](https://docs.agora.io/en/conversational-ai/models/mllm/xai) product docs. + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `api_key` | `str` | Yes | — | xAI API key | +| `url` | `str` | No | `wss://api.x.ai/v1/realtime` | xAI realtime WebSocket URL | +| `voice` | `str` | No | `None` | Voice identifier, for example `eve` or `rex` | +| `language` | `str` | No | `None` | Language code, for example `en` | +| `sample_rate` | `int` | No | `None` | Audio sample rate in Hz | +| `greeting_message` | `str` | No | `None` | Greeting message | +| `failure_message` | `str` | No | `None` | Message played when the model call fails | +| `input_modalities` | `List[str]` | No | `None` | Input modalities | +| `output_modalities` | `List[str]` | No | `None` | Output modalities | +| `messages` | `List[Dict]` | No | `None` | Conversation messages | +| `params` | `Dict[str, Any]` | No | `None` | Additional xAI parameters | +| `turn_detection` | `MllmTurnDetectionConfig` | No | `None` | Supports `agora_vad` and `server_vad` for xAI | + --- ## Avatar 
Vendors +Avatar vendors are currently supported only with the cascading ASR + LLM + TTS pipeline. + ### `HeyGenAvatar` Required TTS sample rate: **24000 Hz** @@ -424,7 +453,7 @@ Required TTS sample rate: **24000 Hz** | `api_key` | `str` | Yes | — | HeyGen API key | | `quality` | `str` | Yes | — | Avatar quality: `low`, `medium`, or `high` | | `agora_uid` | `str` | Yes | — | Agora UID for avatar video stream | -| `agora_token` | `str` | No | `None` | RTC token for avatar authentication | +| `agora_token` | `str` | No | `None` | Avatar token. When omitted, `AgentSession.start()` generates one for `agora_uid` using the same token path as the agent. | | `avatar_id` | `str` | No | `None` | HeyGen avatar ID | | `enable` | `bool` | No | `True` | Enable or disable the avatar | | `disable_idle_timeout` | `bool` | No | `None` | Disable the idle timeout | @@ -437,5 +466,33 @@ Required TTS sample rate: **16000 Hz** | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | Akool API key | -| `agora_uid` | `str` | Yes | — | Agora UID for avatar video stream | | `avatar_id` | `str` | No | `None` | Avatar ID | + +### `LiveAvatarAvatar` + +Required TTS sample rate: **24000 Hz** + +Same options as `HeyGenAvatar`, but serializes `vendor: "liveavatar"`. `agora_token` is optional and generated by `AgentSession.start()` when omitted. + +### `AnamAvatar` + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `api_key` | `str` | Yes | — | Anam API key | +| `persona_id` | `str` | No | `None` | Persona ID | +| `enable` | `bool` | No | `True` | Enable or disable the avatar | + +### `GenericAvatar` + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `api_key` | `str` | Yes | — | Generic avatar provider API key | +| `agora_uid` | `str` | Yes | — | Avatar RTC UID. Must differ from the agent UID. 
| +| `api_base_url` | `str` | Yes | — | Avatar provider API base URL | +| `avatar_id` | `str` | Yes | — | Avatar ID | +| `agora_token` | `str` | No | `None` | Optional avatar token. Generated by `AgentSession.start()` when omitted. | +| `agora_appid` | `str` | No | `None` | Optional; filled from the session App ID when omitted. | +| `agora_channel` | `str` | No | `None` | Optional; filled from the session channel when omitted. | +| `enable` | `bool` | No | `True` | Enable or disable the avatar | + +Avatar tokens are separate from the agent join token but generated with the same `generate_convo_ai_token` path, using the avatar's `agora_uid` as `uid`. diff --git a/pyproject.toml b/pyproject.toml index 07f0004..36a9552 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,9 @@ [project] -name = "agora-agent-server-sdk" +name = "agora-agents" [tool.poetry] -name = "agora-agent-server-sdk" -version = "v1.4.1" +name = "agora-agents" +version = "v2.0.0" description = "" readme = "README.md" authors = [] diff --git a/reference.md b/reference.md index 356b016..55a516e 100644 --- a/reference.md +++ b/reference.md @@ -32,6 +32,9 @@ from agora_agent.agents import ( StartAgentsRequestProperties, StartAgentsRequestPropertiesAsr, StartAgentsRequestPropertiesLlm, + StartAgentsRequestPropertiesTurnDetection, + StartAgentsRequestPropertiesTurnDetectionConfig, + StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, ) client = Agora( @@ -69,6 +72,13 @@ client.agents.start( greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", ), + turn_detection=StartAgentsRequestPropertiesTurnDetection( + config=StartAgentsRequestPropertiesTurnDetectionConfig( + end_of_speech=StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech( + mode="semantic", + ), + ), + ), ), ) @@ -242,7 +252,6 @@ The agent state to filter by. Only one state can be specified per query: - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. 
- `STOPPED` (4): The agent has exited. -- `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. @@ -516,6 +525,22 @@ client.agents.get_turns(
+**page_index:** `typing.Optional[int]` — The page number. Starts from 1. + +
+
+ +
+
+ +**page_size:** `typing.Optional[int]` — The number of dialogue turns returned per page. + +
+
+ +
+
+ **request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration.
@@ -540,7 +565,7 @@ client.agents.get_turns(
-Stop the specified conversational agent instance. +Stop the specified conversational agent instance. The API responds after request parameters are validated, and the stop operation is processed asynchronously after the response is returned.
@@ -1015,6 +1040,7 @@ client.agent_management.agent_think( The action to take when the agent is in a listening state: - `inject`: Inject the custom text instruction into the current turn without interrupting it. +- `interrupt`: Immediately interrupt the current flow and initiate a new round of dialogue. - `ignore`: Ignore the request. diff --git a/src/agora_agent/__init__.py b/src/agora_agent/__init__.py index 84a5d9f..e1a4942 100644 --- a/src/agora_agent/__init__.py +++ b/src/agora_agent/__init__.py @@ -2,9 +2,13 @@ # isort: skip_file +"""Agora Conversational AI Python SDK.""" + import typing from importlib import import_module +from . import agentkit as _agentkit + if typing.TYPE_CHECKING: from . import agents, agentkit, core, phone_numbers, telephony from .core.domain import Area, Pool, create_pool @@ -14,10 +18,13 @@ Agent, AgentSession, AgentSessionOptions, + GenericAvatar, + XaiGrok, generate_rtc_token, GenerateTokenOptions, ) from .agentkit.agent_session import AsyncAgentSession + _dynamic_imports: typing.Dict[str, str] = { "Agora": ".pool_client", "Agent": ".agentkit", @@ -26,8 +33,9 @@ "Area": ".core.domain", "AsyncAgentSession": ".agentkit.agent_session", "AsyncAgora": ".pool_client", + "GenericAvatar": ".agentkit", + "XaiGrok": ".agentkit", "GenerateTokenOptions": ".agentkit", - "Pool": ".core.domain", "__version__": ".version", "agentkit": ".agentkit", "agents": ".agents", @@ -38,29 +46,7 @@ "telephony": ".telephony", } - -def __getattr__(attr_name: str) -> typing.Any: - module_name = _dynamic_imports.get(attr_name) - if module_name is None: - raise AttributeError(f"No {attr_name} found in _dynamic_imports for module name -> {__name__}") - try: - module = import_module(module_name, __package__) - if module_name == f".{attr_name}": - return module - else: - return getattr(module, attr_name) - except ImportError as e: - raise ImportError(f"Failed to import {attr_name} from {module_name}: {e}") from e - except AttributeError as e: - raise 
AttributeError(f"Failed to get {attr_name} from {module_name}: {e}") from e - - -def __dir__(): - lazy_attrs = list(_dynamic_imports.keys()) - return sorted(lazy_attrs) - - -__all__ = [ +_ROOT_ALL = ( "Agora", "Agent", "AgentSession", @@ -68,6 +54,8 @@ def __dir__(): "Area", "AsyncAgentSession", "AsyncAgora", + "GenericAvatar", + "XaiGrok", "GenerateTokenOptions", "Pool", "__version__", @@ -78,4 +66,29 @@ def __dir__(): "generate_rtc_token", "phone_numbers", "telephony", -] +) + +__all__ = sorted({*_ROOT_ALL, *_agentkit.__all__}) + + +def __getattr__(attr_name: str) -> typing.Any: + module_name = _dynamic_imports.get(attr_name) + if module_name is not None: + try: + module = import_module(module_name, __package__) + if module_name == f".{attr_name}": + return module + return getattr(module, attr_name) + except ImportError as e: + raise ImportError(f"Failed to import {attr_name} from {module_name}: {e}") from e + except AttributeError as e: + raise AttributeError(f"Failed to get {attr_name} from {module_name}: {e}") from e + + if attr_name in _agentkit.__all__: + return getattr(_agentkit, attr_name) + + raise AttributeError(f"module {__name__!r} has no attribute {attr_name!r}") + + +def __dir__() -> typing.List[str]: + return list(__all__) diff --git a/src/agora_agent/agent_management/client.py b/src/agora_agent/agent_management/client.py index 71b3c62..6140649 100644 --- a/src/agora_agent/agent_management/client.py +++ b/src/agora_agent/agent_management/client.py @@ -72,6 +72,7 @@ def agent_think( on_listening_action : typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] The action to take when the agent is in a listening state: - `inject`: Inject the custom text instruction into the current turn without interrupting it. + - `interrupt`: Immediately interrupt the current flow and initiate a new round of dialogue. - `ignore`: Ignore the request. 
on_thinking_action : typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] @@ -186,6 +187,7 @@ async def agent_think( on_listening_action : typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] The action to take when the agent is in a listening state: - `inject`: Inject the custom text instruction into the current turn without interrupting it. + - `interrupt`: Immediately interrupt the current flow and initiate a new round of dialogue. - `ignore`: Ignore the request. on_thinking_action : typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] diff --git a/src/agora_agent/agent_management/raw_client.py b/src/agora_agent/agent_management/raw_client.py index 03a0838..4af36fe 100644 --- a/src/agora_agent/agent_management/raw_client.py +++ b/src/agora_agent/agent_management/raw_client.py @@ -65,6 +65,7 @@ def agent_think( on_listening_action : typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] The action to take when the agent is in a listening state: - `inject`: Inject the custom text instruction into the current turn without interrupting it. + - `interrupt`: Immediately interrupt the current flow and initiate a new round of dialogue. - `ignore`: Ignore the request. on_thinking_action : typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] @@ -167,6 +168,7 @@ async def agent_think( on_listening_action : typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] The action to take when the agent is in a listening state: - `inject`: Inject the custom text instruction into the current turn without interrupting it. + - `interrupt`: Immediately interrupt the current flow and initiate a new round of dialogue. - `ignore`: Ignore the request. 
on_thinking_action : typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] diff --git a/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_listening_action.py b/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_listening_action.py index 117d8cc..3812a6e 100644 --- a/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_listening_action.py +++ b/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_listening_action.py @@ -2,4 +2,6 @@ import typing -AgentThinkAgentManagementRequestOnListeningAction = typing.Union[typing.Literal["inject", "ignore"], typing.Any] +AgentThinkAgentManagementRequestOnListeningAction = typing.Union[ + typing.Literal["inject", "interrupt", "ignore"], typing.Any +] diff --git a/src/agora_agent/agent_management/types/agent_think_agent_management_response.py b/src/agora_agent/agent_management/types/agent_think_agent_management_response.py index 7e512eb..0c1f689 100644 --- a/src/agora_agent/agent_management/types/agent_think_agent_management_response.py +++ b/src/agora_agent/agent_management/types/agent_think_agent_management_response.py @@ -20,7 +20,7 @@ class AgentThinkAgentManagementResponse(UncheckedBaseModel): start_ts: typing.Optional[int] = pydantic.Field(default=None) """ - Timestamp indicating when the agent was created. + Unix timestamp in seconds when the think request was processed. """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/agent_management/types/agent_think_request_on_listening_action.py b/src/agora_agent/agent_management/types/agent_think_request_on_listening_action.py deleted file mode 100644 index 54cca4c..0000000 --- a/src/agora_agent/agent_management/types/agent_think_request_on_listening_action.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -AgentThinkRequestOnListeningAction = typing.Union[typing.Literal["inject", "ignore"], typing.Any] diff --git a/src/agora_agent/agent_management/types/agent_think_request_on_speaking_action.py b/src/agora_agent/agent_management/types/agent_think_request_on_speaking_action.py deleted file mode 100644 index 8329197..0000000 --- a/src/agora_agent/agent_management/types/agent_think_request_on_speaking_action.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -AgentThinkRequestOnSpeakingAction = typing.Union[typing.Literal["interrupt", "ignore"], typing.Any] diff --git a/src/agora_agent/agent_management/types/agent_think_request_on_thinking_action.py b/src/agora_agent/agent_management/types/agent_think_request_on_thinking_action.py deleted file mode 100644 index ee50877..0000000 --- a/src/agora_agent/agent_management/types/agent_think_request_on_thinking_action.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -AgentThinkRequestOnThinkingAction = typing.Union[typing.Literal["interrupt", "ignore"], typing.Any] diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py index 1942bce..712d0dd 100644 --- a/src/agora_agent/agentkit/__init__.py +++ b/src/agora_agent/agentkit/__init__.py @@ -1,13 +1,31 @@ from .agent import ( Agent, + AgentConfig, + AgentConfigUpdate, + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, RtcConfig, + SttConfig, + SttVendor, + TtsConfig, FillerWordsConfig, FillerWordsTrigger, FillerWordsTriggerFixedTimeConfig, FillerWordsContent, FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, TurnDetectionConfig, TurnDetectionNestedConfig, StartOfSpeechConfig, @@ -37,28 +55,40 @@ InterruptionMode, MllmTurnDetectionConfig, MllmTurnDetectionMode, + Labels, LlmGreetingConfigs, LlmGreetingConfigsMode, McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SessionStatus, + SayOptions, + ThinkOptions, + GetTurnsOptions, + SessionOptions, + SpeakPriority, + ThinkOnListeningAction, + ThinkOnSpeakingAction, + ThinkOnThinkingAction, + ThinkResponse, ) -from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession -from ..agent_management.types.agent_think_agent_management_response import ( - AgentThinkAgentManagementResponse as AgentThinkResponse, -) -from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( - AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, -) -from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( - AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, -) -from 
..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( - AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, +# Deprecated think type aliases (prefer ThinkOn* names). +from .agent import ( + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + ThinkResponse as AgentThinkResponse, ) +from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession from .avatar_types import ( is_akool_avatar, is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, + is_rtc_avatar, validate_avatar_config, validate_tts_sample_rate, ) @@ -70,6 +100,27 @@ GeofenceArea, GeofenceExcludeArea, FillerWordsSelectionRule, + InterruptionDisabledStrategy, + InterruptionDisabledStrategyAppend, + InterruptionDisabledStrategyIgnore, + InterruptionMode as InterruptionModeValues, + InterruptionModeKeywords, + InterruptionModeStartOfSpeech, + MllmTurnDetectionMode as MllmTurnDetectionModeValues, + MllmTurnDetectionModeAgoraVad, + MllmTurnDetectionModeSemanticVad, + MllmTurnDetectionModeServerVad, + SpeakPriority as SpeakPriorityValues, + SpeakPriorityAppend, + SpeakPriorityIgnore, + SpeakPriorityInterrupt, + ThinkOnListeningActionIgnore, + ThinkOnListeningActionInject, + ThinkOnListeningActionInterrupt, + ThinkOnSpeakingActionIgnore, + ThinkOnSpeakingActionInterrupt, + ThinkOnThinkingActionIgnore, + ThinkOnThinkingActionInterrupt, TurnDetectionTypeValues, ) from .token import ( @@ -91,6 +142,7 @@ ) from .vendors import ( AkoolAvatar, + AmazonBedrock, AmazonSTT, AmazonTTS, AnamAvatar, @@ -105,15 +157,19 @@ BaseTTS, CartesiaSampleRate, CartesiaTTS, + CustomLLM, DeepgramSTT, DeepgramTTS, ElevenLabsSampleRate, ElevenLabsTTS, + Dify, FishAudioTTS, Gemini, GeminiLive, + GenericAvatar, GoogleSTT, GoogleTTS, + Groq, HeyGenAvatar, HumeAITTS, 
MicrosoftSampleRate, @@ -132,14 +188,28 @@ SarvamTTS, SpeechmaticsSTT, VertexAI, + VertexAILLM, + XaiGrok, LiveAvatarAvatar, ) __all__ = [ "Agent", + "AgentConfig", + "AgentConfigUpdate", # Return type of Agent.to_properties() "StartAgentsRequestProperties", # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", "GeofenceConfig", "RtcConfig", "FillerWordsConfig", @@ -147,6 +217,7 @@ "FillerWordsTriggerFixedTimeConfig", "FillerWordsContent", "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", # Turn detection types "TurnDetectionConfig", "TurnDetectionNestedConfig", @@ -181,6 +252,7 @@ "InterruptionMode", "MllmTurnDetectionConfig", "MllmTurnDetectionMode", + "Labels", # Type-safe constants "DataChannel", "AudioScenario", @@ -190,6 +262,27 @@ "GeofenceExcludeArea", "FillerWordsSelectionRule", "TurnDetectionTypeValues", + "InterruptionModeValues", + "InterruptionModeStartOfSpeech", + "InterruptionModeKeywords", + "InterruptionDisabledStrategy", + "InterruptionDisabledStrategyAppend", + "InterruptionDisabledStrategyIgnore", + "SpeakPriorityValues", + "SpeakPriorityInterrupt", + "SpeakPriorityAppend", + "SpeakPriorityIgnore", + "MllmTurnDetectionModeValues", + "MllmTurnDetectionModeAgoraVad", + "MllmTurnDetectionModeServerVad", + "MllmTurnDetectionModeSemanticVad", + "ThinkOnListeningActionInject", + "ThinkOnListeningActionInterrupt", + "ThinkOnListeningActionIgnore", + "ThinkOnThinkingActionInterrupt", + "ThinkOnThinkingActionIgnore", + "ThinkOnSpeakingActionInterrupt", + "ThinkOnSpeakingActionIgnore", # LLM sub-types "LlmGreetingConfigs", "LlmGreetingConfigsMode", @@ -197,10 +290,30 @@ "AgentSession", "AsyncAgentSession", "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "SessionStatus", + "SayOptions", + "ThinkOptions", + "GetTurnsOptions", + "SessionOptions", + "ConversationHistory", + 
"ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "ThinkResponse", + "ThinkOnListeningAction", + "ThinkOnThinkingAction", + "ThinkOnSpeakingAction", "AgentThinkResponse", "AgentThinkRequestOnListeningAction", "AgentThinkRequestOnThinkingAction", "AgentThinkRequestOnSpeakingAction", + "is_avatar_token_managed", + "is_rtc_avatar", "AgentPresets", "DeepgramPresetModels", "OpenAIPresetModels", @@ -228,6 +341,11 @@ "AzureOpenAI", "Anthropic", "Gemini", + "Groq", + "VertexAILLM", + "AmazonBedrock", + "Dify", + "CustomLLM", "ElevenLabsTTS", "MicrosoftTTS", "OpenAITTS", @@ -253,14 +371,17 @@ "OpenAIRealtime", "GeminiLive", "VertexAI", + "XaiGrok", "HeyGenAvatar", "LiveAvatarAvatar", "AkoolAvatar", "AnamAvatar", + "GenericAvatar", "is_heygen_avatar", "is_live_avatar_avatar", "is_akool_avatar", "is_anam_avatar", + "is_generic_avatar", "validate_avatar_config", "validate_tts_sample_rate", ] diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 70a1bdd..f84862c 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -8,6 +8,25 @@ from .agent_session import AgentSession, AsyncAgentSession from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties +from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr +from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor +from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar +from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor +from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm +from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle +from ..agents.types.start_agents_request_properties_mllm 
import StartAgentsRequestPropertiesMllm +from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor +from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties +from ..agents.types.get_agents_response import GetAgentsResponse +from ..agents.types.list_agents_response import ListAgentsResponse +from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem +from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus +from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse +from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem +from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse +from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem +from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech @@ -46,10 +65,33 @@ from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent from ..agents.types.start_agents_request_properties_filler_words_content_static_config import 
StartAgentsRequestPropertiesFillerWordsContentStaticConfig -from .token import generate_convo_ai_token, _validate_expires_in +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule +from ..types.tts import Tts +from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, +) +from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, +) +from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, +) +from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, +) from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS # Top-level aliases +LlmConfig = StartAgentsRequestPropertiesLlm +LlmStyle = StartAgentsRequestPropertiesLlmStyle +SttConfig = StartAgentsRequestPropertiesAsr +AsrConfig = SttConfig +SttVendor = StartAgentsRequestPropertiesAsrVendor +TtsConfig = Tts +MllmConfig = StartAgentsRequestPropertiesMllm +MllmVendor = StartAgentsRequestPropertiesMllmVendor +AvatarConfig = StartAgentsRequestPropertiesAvatar +AvatarVendor = StartAgentsRequestPropertiesAvatarVendor TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection SalConfig = StartAgentsRequestPropertiesSal SalMode = StartAgentsRequestPropertiesSalSalMode @@ -93,6 +135,19 @@ InterruptionMode = StartAgentsRequestPropertiesInterruptionMode MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode +AgentConfig = StartAgentsRequestProperties +AgentConfigUpdate = UpdateAgentsRequestProperties +SessionInfo = GetAgentsResponse +SessionListResponse = ListAgentsResponse 
+SessionSummary = ListAgentsResponseDataListItem +SessionStatus = ListAgentsResponseDataListItemStatus +ConversationHistory = GetHistoryAgentsResponse +ConversationTurn = GetHistoryAgentsResponseContentsItem +ConversationRole = GetHistoryAgentsResponseContentsItemRole +ConversationTurns = GetTurnsAgentsResponse +ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem +SpeakPriority = SpeakAgentsRequestPriority +Labels = typing.Dict[str, str] class SessionParamsInput(typing_extensions.TypedDict, total=False): @@ -103,6 +158,39 @@ class SessionParamsInput(typing_extensions.TypedDict, total=False): enable_error_message: bool audio_scenario: ParametersAudioScenario + +class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + on_speaking_action: AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + +class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + +class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + +class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + # LLM sub-type aliases LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode @@ -116,6 +204,23 @@ class SessionParamsInput(typing_extensions.TypedDict, total=False): FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig FillerWordsContent = 
StartAgentsRequestPropertiesFillerWordsContent FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig +FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +# Think type aliases and response +ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction +ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction +ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction +ThinkResponse = AgentThinkAgentManagementResponse + +from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + +def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value class Agent: @@ -126,8 +231,7 @@ class Agent: Examples -------- - >>> from agora_agent.agentkit import Agent - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT >>> >>> agent = Agent(instructions="You are a helpful voice assistant.") >>> agent = ( @@ -154,6 +258,7 @@ def __init__( labels: typing.Optional[typing.Dict[str, str]] = None, rtc: typing.Optional[RtcConfig] = None, filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, ): self._name = name self._instructions = instructions @@ -176,6 +281,7 @@ def __init__( self._labels = labels self._rtc = rtc self._filler_words = filler_words + self._greeting_configs = greeting_configs def with_llm(self, vendor: BaseLLM) -> "Agent": new_agent = self._clone() @@ -183,9 +289,20 @@ def with_llm(self, vendor: BaseLLM) -> "Agent": return new_agent def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate 
is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) new_agent = self._clone() new_agent._tts = vendor.to_config() - new_agent._tts_sample_rate = vendor.sample_rate + new_agent._tts_sample_rate = sample_rate return new_agent def with_stt(self, vendor: BaseSTT) -> "Agent": @@ -194,6 +311,9 @@ def with_stt(self, vendor: BaseSTT) -> "Agent": return new_agent def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. new_agent = self._clone() new_agent._mllm = vendor.to_config() if isinstance(new_agent._mllm, dict): @@ -202,7 +322,10 @@ def with_mllm(self, vendor: BaseMLLM) -> "Agent": advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): - advanced_features_model = new_agent._advanced_features.model_copy(update={"enable_mllm": None}) + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) if ( advanced_features_model.enable_rtm is None and advanced_features_model.enable_sal is None @@ -214,6 +337,10 @@ def with_mllm(self, vendor: BaseMLLM) -> "Agent": return new_agent def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. 
The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. required_sample_rate = vendor.required_sample_rate if ( required_sample_rate not in (None, 0) @@ -251,6 +378,12 @@ def with_greeting(self, greeting: str) -> "Agent": new_agent._greeting = greeting return new_agent + def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": + """Returns a new Agent with greeting playback configuration.""" + new_agent = self._clone() + new_agent._greeting_configs = configs + return new_agent + def with_name(self, name: str) -> "Agent": new_agent = self._clone() new_agent._name = name @@ -282,7 +415,10 @@ def with_tools(self, enabled: bool = True) -> "Agent": {**new_agent._advanced_features, "enable_tools": enabled}, ) else: - new_agent._advanced_features = new_agent._advanced_features.model_copy(update={"enable_tools": enabled}) + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) return new_agent def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": @@ -294,6 +430,23 @@ def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsI new_agent._parameters = parameters return new_agent + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + 
return new_agent + def with_failure_message(self, message: str) -> "Agent": """Returns a new Agent with the specified failure message. @@ -342,6 +495,33 @@ def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": new_agent._filler_words = filler_words return new_agent + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + @property def name(self) -> typing.Optional[str]: return self._name @@ -354,6 +534,10 @@ def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: return self._tts + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + @property def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: return self._stt @@ -378,6 +562,10 @@ def instructions(self) -> typing.Optional[str]: def greeting(self) -> 
typing.Optional[str]: return self._greeting + @property + def greeting_configs(self) -> typing.Optional[LlmGreetingConfigs]: + return self._greeting_configs + @property def failure_message(self) -> typing.Optional[str]: return self._failure_message @@ -440,6 +628,7 @@ def config(self) -> typing.Dict[str, typing.Any]: "labels": self._labels, "rtc": self._rtc, "filler_words": self._filler_words, + "greeting_configs": self._greeting_configs, } def create_session( @@ -536,6 +725,20 @@ def to_properties( expires_in: typing.Optional[int] = None, skip_vendor_validation: bool = False, ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + if token is None: if app_id is None or app_certificate is None: raise ValueError("Either token or app_id+app_certificate must be provided") @@ -549,13 +752,10 @@ def to_properties( app_id=app_id, app_certificate=app_certificate, channel_name=channel, - account=agent_uid, + uid=_parse_numeric_uid(agent_uid, "agent_uid"), **token_kwargs, ) - mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True - is_mllm_mode = bool(mllm_flag or self._mllm is not None) - base_kwargs: typing.Dict[str, typing.Any] = { "channel": channel, "token": token, @@ -579,11 +779,12 @@ def to_properties( base_kwargs["avatar"] = self._avatar if self._advanced_features is not None: base_kwargs["advanced_features"] = self._advanced_features - if self._parameters is not None: - if isinstance(self._parameters, dict): - base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**self._parameters) + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) else: - base_kwargs["parameters"] = self._parameters + base_kwargs["parameters"] = parameters if self._geofence is not None: base_kwargs["geofence"] = self._geofence if self._labels is not None: @@ -596,12 +797,10 @@ def to_properties( if is_mllm_mode: if self._mllm is not None: mllm_config = dict(self._mllm) - if self._greeting: + if self._greeting is not None: mllm_config.setdefault("greeting_message", self._greeting) - if self._failure_message: + if self._failure_message is not None: mllm_config.setdefault("failure_message", self._failure_message) - if self._max_history is not None: - mllm_config.setdefault("max_history", self._max_history) base_kwargs["mllm"] = mllm_config return StartAgentsRequestProperties(**base_kwargs) @@ -617,14 +816,16 @@ def to_properties( llm_config = dict(self._llm) # Agent-level fields take priority over the vendor's defaults. 
# This matches the TS SDK where agent-level values override vendor config. - if self._instructions: + if self._instructions is not None: llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] - if self._greeting: - llm_config.setdefault("greeting_message", self._greeting) - if self._failure_message: - llm_config.setdefault("failure_message", self._failure_message) + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._greeting_configs is not None: + llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs) + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message if self._max_history is not None: - llm_config.setdefault("max_history", self._max_history) + llm_config["max_history"] = self._max_history base_kwargs["llm"] = llm_config base_kwargs["tts"] = self._tts @@ -656,4 +857,5 @@ def _clone(self) -> "Agent": new_agent._labels = self._labels new_agent._rtc = self._rtc new_agent._filler_words = self._filler_words + new_agent._greeting_configs = self._greeting_configs return new_agent diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index 2408659..a749d1e 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -14,18 +14,21 @@ from ..agent_management.types.agent_think_agent_management_response import ( AgentThinkAgentManagementResponse as AgentThinkResponse, ) +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties -from .agent import Agent +from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions from .avatar_types import ( is_akool_avatar, is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, validate_avatar_config, validate_tts_sample_rate, ) from .presets import 
resolve_session_presets -from .token import generate_convo_ai_token +from .token import generate_convo_ai_token, _parse_numeric_uid class _AgentSessionRequiredOptions(typing.TypedDict, total=True): @@ -166,7 +169,7 @@ def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: app_id=app_id, app_certificate=app_certificate, channel_name=self._channel, - account=self._agent_uid, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), ) return {"Authorization": f"agora token={token}"} @@ -182,17 +185,29 @@ def _validate_avatar_config(self) -> None: tts = self._agent.tts if not avatar or avatar.get("enable", True) is False: return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) if ( is_heygen_avatar(avatar) or is_live_avatar_avatar(avatar) or is_akool_avatar(avatar) or is_anam_avatar(avatar) + or is_generic_avatar(avatar) ): validate_avatar_config(avatar) tts_params = tts.get("params") if isinstance(tts, dict) else None - sample_rate = tts_params.get("sample_rate") if isinstance(tts_params, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) if isinstance(sample_rate, int): validate_tts_sample_rate(avatar, sample_rate) elif is_heygen_avatar(avatar): @@ -211,6 +226,54 @@ def _validate_avatar_config(self) -> None: "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
) + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + @staticmethod def _dump_model(value: typing.Any) -> typing.Any: if hasattr(value, "model_dump"): @@ -238,12 +301,17 @@ def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> t **token_opts, ) properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) if self._is_mllm_mode(): if self._agent.mllm is not None: - mllm = dict(self._agent.mllm) - if self._agent.greeting: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) properties["mllm"] = mllm return properties @@ -251,20 +319,49 @@ def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> t properties["tts"] = self._dump_model(self._agent.tts) if self._agent.llm is not None: llm = dict(self._agent.llm) - if self._agent.instructions: + if self._agent.instructions is not None: llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] - if self._agent.greeting: - llm.setdefault("greeting_message", self._agent.greeting) - if self._agent.failure_message: - llm.setdefault("failure_message", self._agent.failure_message) + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message if self._agent.max_history is not None: - llm.setdefault("max_history", self._agent.max_history) + llm["max_history"] = self._agent.max_history properties["llm"] = llm if self._agent.stt is not None: properties["asr"] = self._dump_model(self._agent.stt) return properties + @staticmethod + 
def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + # ------------------------------------------------------------------ # Event handling # ------------------------------------------------------------------ @@ -315,12 +412,10 @@ class AgentSession(_AgentSessionBase): Examples -------- - >>> from agora_agent import Agora, Area - >>> from agora_agent.agentkit import Agent + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS >>> >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) >>> agent_id = session.start() @@ -436,6 +531,8 @@ def say( text: str, priority: typing.Optional[str] = None, interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, ) -> None: """Send a 
message to be spoken by the agent. @@ -454,6 +551,8 @@ def say( raise RuntimeError("No agent ID available") kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) if priority is not None: kwargs["priority"] = priority if interruptable is not None: @@ -483,14 +582,22 @@ def think( on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, interruptable: typing.Optional[bool] = None, metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, ) -> AgentThinkResponse: - """Inject a custom text instruction into the current session pipeline.""" + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. + """ if self._status != "running": raise RuntimeError(f"Cannot think in {self._status} state") if not self._agent_id: raise RuntimeError("No agent ID available") kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) if on_listening_action is not None: kwargs["on_listening_action"] = on_listening_action if on_thinking_action is not None: @@ -547,15 +654,76 @@ def get_info(self) -> typing.Any: self._app_id, self._agent_id, request_options=self._request_options() ) - def get_turns(self) -> typing.Any: + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: """Get turn-by-turn analytics and timing details for this session.""" if not self._agent_id: raise RuntimeError("No agent ID available") + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + 
kwargs["page_size"] = page_size + return self._client.agents.get_turns( - self._app_id, self._agent_id, request_options=self._request_options() + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, ) + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." 
+ ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + class AsyncAgentSession(_AgentSessionBase): """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. @@ -565,12 +733,10 @@ class AsyncAgentSession(_AgentSessionBase): Examples -------- - >>> from agora_agent import AsyncAgora, Area - >>> from agora_agent.agentkit import Agent + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS >>> >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") >>> agent = Agent(name="assistant", instructions="You are helpful.") - >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) >>> agent_id = await session.start() @@ -686,6 +852,8 @@ async def say( text: str, priority: typing.Optional[str] = None, interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, ) -> None: """Send a message to be spoken by the agent. 
@@ -704,6 +872,8 @@ async def say( raise RuntimeError("No agent ID available") kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) if priority is not None: kwargs["priority"] = priority if interruptable is not None: @@ -733,14 +903,22 @@ async def think( on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, interruptable: typing.Optional[bool] = None, metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, ) -> AgentThinkResponse: - """Inject a custom text instruction into the current session pipeline.""" + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. + """ if self._status != "running": raise RuntimeError(f"Cannot think in {self._status} state") if not self._agent_id: raise RuntimeError("No agent ID available") kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) if on_listening_action is not None: kwargs["on_listening_action"] = on_listening_action if on_thinking_action is not None: @@ -797,11 +975,72 @@ async def get_info(self) -> typing.Any: self._app_id, self._agent_id, request_options=self._request_options() ) - async def get_turns(self) -> typing.Any: + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: """Get turn-by-turn analytics and timing details for this session.""" if not self._agent_id: raise RuntimeError("No agent ID available") + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + 
kwargs["page_size"] = page_size + return await self._client.agents.get_turns( - self._app_id, self._agent_id, request_options=self._request_options() + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." 
+ ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py index 9e132a9..aea9da1 100644 --- a/src/agora_agent/agentkit/avatar_types.py +++ b/src/agora_agent/agentkit/avatar_types.py @@ -1,3 +1,4 @@ +import warnings import typing @@ -17,7 +18,35 @@ def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: return config.get("vendor") == "anam" -def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: +def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + +def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + """Return True when AgentKit manages the avatar RTC publisher identity.""" + return ( + is_heygen_avatar(config) + or is_live_avatar_avatar(config) + or is_generic_avatar(config) + ) + + +def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + """Deprecated: use :func:`is_avatar_token_managed` for vendor gating.""" + warnings.warn( + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + "and keep agora_uid checks in session enrichment.", + DeprecationWarning, + stacklevel=2, + ) + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + +def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, +) -> None: """Validates avatar configuration at runtime. 
Parameters @@ -45,6 +74,8 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: f"Invalid quality for {label}: {params.get('quality')}. " f"Must be one of: {', '.join(valid_qualities)}" ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") elif is_akool_avatar(config): params = config.get("params", {}) if not params.get("api_key"): @@ -53,6 +84,23 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: params = config.get("params", {}) if not params.get("api_key"): raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") def validate_tts_sample_rate( @@ -62,7 +110,7 @@ def validate_tts_sample_rate( """Validates that TTS sample rate is compatible with the avatar vendor. 
Different avatar vendors have specific sample rate requirements: - - HeyGen: ONLY supports 24,000 Hz + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz - Akool: ONLY supports 16,000 Hz Parameters diff --git a/src/agora_agent/agentkit/constants.py b/src/agora_agent/agentkit/constants.py index f86e4d3..eb186da 100644 --- a/src/agora_agent/agentkit/constants.py +++ b/src/agora_agent/agentkit/constants.py @@ -58,3 +58,53 @@ class TurnDetectionTypeValues: AGORA_VAD = "agora_vad" SERVER_VAD = "server_vad" SEMANTIC_VAD = "semantic_vad" + + +class InterruptionMode: + START_OF_SPEECH = "start_of_speech" + KEYWORDS = "keywords" + + +InterruptionModeStartOfSpeech = InterruptionMode.START_OF_SPEECH +InterruptionModeKeywords = InterruptionMode.KEYWORDS + + +class InterruptionDisabledStrategy: + APPEND = "append" + IGNORE = "ignore" + + +InterruptionDisabledStrategyAppend = InterruptionDisabledStrategy.APPEND +InterruptionDisabledStrategyIgnore = InterruptionDisabledStrategy.IGNORE + + +class SpeakPriority: + INTERRUPT = "INTERRUPT" + APPEND = "APPEND" + IGNORE = "IGNORE" + + +SpeakPriorityInterrupt = SpeakPriority.INTERRUPT +SpeakPriorityAppend = SpeakPriority.APPEND +SpeakPriorityIgnore = SpeakPriority.IGNORE + + +class MllmTurnDetectionMode: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + +MllmTurnDetectionModeAgoraVad = MllmTurnDetectionMode.AGORA_VAD +MllmTurnDetectionModeServerVad = MllmTurnDetectionMode.SERVER_VAD +MllmTurnDetectionModeSemanticVad = MllmTurnDetectionMode.SEMANTIC_VAD + + +# Think action value constants (match Fern wire values) +ThinkOnListeningActionInject = "inject" +ThinkOnListeningActionInterrupt = "interrupt" +ThinkOnListeningActionIgnore = "ignore" +ThinkOnThinkingActionInterrupt = "interrupt" +ThinkOnThinkingActionIgnore = "ignore" +ThinkOnSpeakingActionInterrupt = "interrupt" +ThinkOnSpeakingActionIgnore = "ignore" diff --git a/src/agora_agent/agentkit/token.py b/src/agora_agent/agentkit/token.py index 
338e9db..9dd1aaa 100644 --- a/src/agora_agent/agentkit/token.py +++ b/src/agora_agent/agentkit/token.py @@ -85,7 +85,7 @@ class GenerateConvoAITokenOptions(typing.TypedDict, total=False): app_id: str app_certificate: str channel_name: str - account: str + uid: int token_expire: int privilege_expire: int @@ -242,7 +242,7 @@ def generate_convo_ai_token( app_id: str, app_certificate: str, channel_name: str, - account: str, + uid: int, token_expire: int = DEFAULT_EXPIRY_SECONDS, privilege_expire: int = 0, ) -> str: @@ -262,8 +262,8 @@ def generate_convo_ai_token( Agora App Certificate. channel_name : str The channel the agent will join (must match the start request). - account : str - String account identity — pass the agent UID as a string (e.g. "1001"). + uid : int + Numeric ConvoAI participant UID. Use the RTC UID for a user, agent, or avatar. token_expire : int Seconds until the token expires (default 86400). privilege_expire : int @@ -281,7 +281,7 @@ def generate_convo_ai_token( app_id, app_certificate, channel_name, - account, + _uid_to_account(uid), ROLE_PUBLISHER, token_expire, privilege_expire, @@ -290,6 +290,7 @@ def generate_convo_ai_token( pass priv_expire = privilege_expire if privilege_expire != 0 else token_expire + account = _uid_to_account(uid) rtc_privileges: typing.Dict[int, int] = { 1: priv_expire, # kPrivilegeJoinChannel @@ -310,3 +311,15 @@ def generate_convo_ai_token( token_expire, [(1, rtc_svc), (2, rtm_svc)], ) + + +def _uid_to_account(uid: int) -> str: + if not isinstance(uid, int) or isinstance(uid, bool): + raise TypeError("uid must be an int") + return str(uid) + + +def _parse_numeric_uid(uid: str, label: str) -> int: + if not uid.isdigit(): + raise ValueError(f"{label} must be a numeric RTC UID when auto-generating a ConvoAI token") + return int(uid) diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py index 0320843..0cd901e 100644 --- a/src/agora_agent/agentkit/vendors/__init__.py +++ 
b/src/agora_agent/agentkit/vendors/__init__.py @@ -11,9 +11,9 @@ OpenAISampleRate, SampleRate, ) -from .avatar import AkoolAvatar, AnamAvatar, HeyGenAvatar, LiveAvatarAvatar -from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI -from .mllm import GeminiLive, OpenAIRealtime, VertexAI +from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar +from .llm import AmazonBedrock, Anthropic, AzureOpenAI, CustomLLM, Dify, Gemini, Groq, OpenAI, VertexAILLM +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok from .stt import ( AmazonSTT, AresSTT, @@ -57,6 +57,11 @@ "AzureOpenAI", "Anthropic", "Gemini", + "Groq", + "VertexAILLM", + "AmazonBedrock", + "Dify", + "CustomLLM", "ElevenLabsTTS", "MicrosoftTTS", "OpenAITTS", @@ -82,8 +87,10 @@ "OpenAIRealtime", "GeminiLive", "VertexAI", + "XaiGrok", "HeyGenAvatar", "LiveAvatarAvatar", "AkoolAvatar", "AnamAvatar", + "GenericAvatar", ] diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py index b83a356..50bdd08 100644 --- a/src/agora_agent/agentkit/vendors/avatar.py +++ b/src/agora_agent/agentkit/vendors/avatar.py @@ -5,19 +5,19 @@ from .base import BaseAvatar -HEYGEN_SAMPLE_RATE = 24000 LIVEAVATAR_SAMPLE_RATE = 24000 +HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE AKOOL_SAMPLE_RATE = 16000 -class HeyGenAvatarOptions(BaseModel): +class LiveAvatarAvatarOptions(BaseModel): model_config = ConfigDict(extra="forbid") - api_key: str = Field(..., description="HeyGen API key") + api_key: str = Field(..., description="LiveAvatar API key") quality: str = Field(..., description="Avatar quality: low, medium, or high") agora_uid: str = Field(..., description="Agora UID for the avatar stream") agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") - avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") enable: 
Optional[bool] = Field(default=None, description="Enable avatar (default: true)") disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") @@ -31,6 +31,41 @@ def validate_quality(cls, v: str) -> str: raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") return v + +class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + +class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + class HeyGenAvatar(BaseAvatar): """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" @@ -76,6 +111,7 @@ class AkoolAvatarOptions(BaseModel): enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + class AkoolAvatar(BaseAvatar): def __init__(self, **kwargs: Any): self.options = AkoolAvatarOptions(**kwargs) @@ -98,48 +134,58 @@ def to_config(self) -> Dict[str, Any]: return {"enable": enable, "vendor": "akool", "params": params} -class LiveAvatarAvatarOptions(HeyGenAvatarOptions): - pass +class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") -class LiveAvatarAvatar(BaseAvatar): + +class GenericAvatar(BaseAvatar): def __init__(self, **kwargs: Any): - self.options = LiveAvatarAvatarOptions(**kwargs) + self.options = GenericAvatarOptions(**kwargs) @property def required_sample_rate(self) -> int: - return LIVEAVATAR_SAMPLE_RATE + return 0 def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "api_key": self.options.api_key, - "quality": self.options.quality, + "api_base_url": 
self.options.api_base_url, + "avatar_id": self.options.avatar_id, "agora_uid": self.options.agora_uid, } + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid if self.options.agora_token is not None: params["agora_token"] = self.options.agora_token - if self.options.avatar_id is not None: - params["avatar_id"] = self.options.avatar_id - if self.options.disable_idle_timeout is not None: - params["disable_idle_timeout"] = self.options.disable_idle_timeout - if self.options.activity_idle_timeout is not None: - params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel if self.options.additional_params is not None: params = {**self.options.additional_params, **params} enable = self.options.enable if self.options.enable is not None else True - return {"enable": enable, "vendor": "liveavatar", "params": params} + return {"enable": enable, "vendor": "generic", "params": params} class AnamAvatarOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="Anam API key") - persona_id: Optional[str] = Field(default=None, description="Persona ID") + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + class AnamAvatar(BaseAvatar): def __init__(self, **kwargs: Any): self.options = AnamAvatarOptions(**kwargs) diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index 7465c9f..b521867 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -1,9 +1,14 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from pydantic import 
BaseModel, ConfigDict, Field +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, +) from .base import BaseLLM +LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Ensure each MCP server has transport set (API requires it). Default to streamable_http.""" @@ -15,6 +20,14 @@ def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]] result.append(item) return result + +def _dump_optional_model(value: Any) -> Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + class OpenAIOptions(BaseModel): model_config = ConfigDict(extra="forbid") @@ -31,7 +44,7 @@ class OpenAIOptions(BaseModel): params: Optional[Dict[str, Any]] = Field(default=None) headers: Optional[Dict[str, str]] = Field(default=None) output_modalities: Optional[List[str]] = Field(default=None) - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) vendor: Optional[str] = Field(default=None) mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) @@ -74,7 +87,7 @@ def to_config(self) -> Dict[str, Any]: if self.options.output_modalities is not None: config["output_modalities"] = self.options.output_modalities if self.options.greeting_configs is not None: - config["greeting_configs"] = self.options.greeting_configs + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) if self.options.template_variables is not None: config["template_variables"] = self.options.template_variables if self.options.vendor is not None: @@ -104,7 +117,7 @@ class AzureOpenAIOptions(BaseModel): params: 
Optional[Dict[str, Any]] = Field(default=None) headers: Optional[Dict[str, str]] = Field(default=None) output_modalities: Optional[List[str]] = Field(default=None) - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) vendor: Optional[str] = Field(default=None) mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) @@ -150,7 +163,7 @@ def to_config(self) -> Dict[str, Any]: if self.options.output_modalities is not None: config["output_modalities"] = self.options.output_modalities if self.options.greeting_configs is not None: - config["greeting_configs"] = self.options.greeting_configs + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) if self.options.template_variables is not None: config["template_variables"] = self.options.template_variables if self.options.mcp_servers is not None: @@ -177,7 +190,7 @@ class AnthropicOptions(BaseModel): params: Optional[Dict[str, Any]] = Field(default=None) headers: Optional[Dict[str, str]] = Field(default=None) output_modalities: Optional[List[str]] = Field(default=None) - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) vendor: Optional[str] = Field(default=None) mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) @@ -216,7 +229,7 @@ def to_config(self) -> Dict[str, Any]: if self.options.output_modalities is not None: config["output_modalities"] = self.options.output_modalities if self.options.greeting_configs is not None: - config["greeting_configs"] = self.options.greeting_configs + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) if self.options.template_variables is not None: config["template_variables"] = self.options.template_variables 
if self.options.vendor is not None: @@ -246,7 +259,7 @@ class GeminiOptions(BaseModel): params: Optional[Dict[str, Any]] = Field(default=None) headers: Optional[Dict[str, str]] = Field(default=None) output_modalities: Optional[List[str]] = Field(default=None) - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) vendor: Optional[str] = Field(default=None) mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) @@ -287,7 +300,7 @@ def to_config(self) -> Dict[str, Any]: if self.options.output_modalities is not None: config["output_modalities"] = self.options.output_modalities if self.options.greeting_configs is not None: - config["greeting_configs"] = self.options.greeting_configs + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) if self.options.template_variables is not None: config["template_variables"] = self.options.template_variables if self.options.vendor is not None: @@ -298,3 +311,139 @@ def to_config(self) -> Dict[str, Any]: config["max_history"] = self.options.max_history return config + + +class GroqOptions(OpenAIOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Groq API key") + model: str = Field(default="llama-3.3-70b-versatile", description="Model name") + base_url: Optional[str] = Field(default=None, description="Custom Groq-compatible endpoint") + + +class Groq(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = GroqOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config = OpenAI(**_dump_optional_model(self.options)).to_config() + config["url"] = self.options.base_url or "https://api.groq.com/openai/v1/chat/completions" + return config + + +class CustomLLMOptions(OpenAIOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Custom LLM API key") + 
base_url: str = Field(..., description="OpenAI-compatible chat completions endpoint") + + +class CustomLLM(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = CustomLLMOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config = OpenAI(**_dump_optional_model(self.options)).to_config() + config["vendor"] = self.options.vendor or "custom" + return config + + +class VertexAILLMOptions(GeminiOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Vertex AI access token or API key") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location") + + +class VertexAILLM(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAILLMOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + options = _dump_optional_model(self.options) + options.pop("project_id", None) + options.pop("location", None) + config = Gemini(**options).to_config() + params = dict(config["params"]) + params["project_id"] = self.options.project_id + params["location"] = self.options.location + config["params"] = params + return config + + +class AmazonBedrockOptions(AnthropicOptions): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Amazon Bedrock API key or gateway token") + url: str = Field(..., description="Amazon Bedrock proxy or runtime endpoint") + + +class AmazonBedrock(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AmazonBedrockOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + return Anthropic(**_dump_optional_model(self.options)).to_config() + + +class DifyOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Dify API key") + url: str = Field(..., description="Dify workflow or chat endpoint") + user: Optional[str] = Field(default=None, description="Dify user identifier") + conversation_id: Optional[str] = 
Field(default=None, description="Dify conversation ID") + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0) + + +class Dify(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = DifyOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.user is not None: + params["user"] = self.options.user + if self.options.conversation_id is not None: + params["conversation_id"] = self.options.conversation_id + + config: Dict[str, Any] = { + "url": self.options.url, + "api_key": self.options.api_key, + "params": params, + "style": "dify", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = 
_dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + return config diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index 5f6f940..b58f040 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -22,9 +22,7 @@ class OpenAIRealtimeOptions(BaseModel): messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") failure_message: Optional[str] = Field(default=None, description="Message played on failure") - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") class OpenAIRealtime(BaseMLLM): def __init__(self, **kwargs: Any): @@ -53,12 +51,67 @@ def to_config(self) -> Dict[str, Any]: config["output_modalities"] = self.options.output_modalities if self.options.messages is not None: config["messages"] = self.options.messages - if self.options.predefined_tools is not None: - config["predefined_tools"] = self.options.predefined_tools if self.options.failure_message is not None: config["failure_message"] = self.options.failure_message - if self.options.max_history is not None: - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + 
config["turn_detection"] = self.options.turn_detection + + return config + + +# xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name +# is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + +class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + +class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + 
config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message if self.options.turn_detection is not None: config["turn_detection"] = self.options.turn_detection @@ -81,28 +134,24 @@ class VertexAIOptions(BaseModel): messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") failure_message: Optional[str] = Field(default=None, description="Message played on failure") - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") class VertexAI(BaseMLLM): def __init__(self, **kwargs: Any): self.options = VertexAIOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { - "model": self.options.model, - "project_id": self.options.project_id, - "location": self.options.location, - "adc_credentials_string": self.options.adc_credentials_string, - } - + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. 
+ params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string if self.options.instructions is not None: params["instructions"] = self.options.instructions if self.options.voice is not None: params["voice"] = self.options.voice - if self.options.additional_params is not None: - params.update(self.options.additional_params) config: Dict[str, Any] = { "vendor": "vertexai", @@ -119,12 +168,8 @@ def to_config(self) -> Dict[str, Any]: config["output_modalities"] = self.options.output_modalities if self.options.messages is not None: config["messages"] = self.options.messages - if self.options.predefined_tools is not None: - config["predefined_tools"] = self.options.predefined_tools if self.options.failure_message is not None: config["failure_message"] = self.options.failure_message - if self.options.max_history is not None: - config["max_history"] = self.options.max_history if self.options.turn_detection is not None: config["turn_detection"] = self.options.turn_detection @@ -145,9 +190,7 @@ class GeminiLiveOptions(BaseModel): messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") failure_message: Optional[str] = Field(default=None, description="Message played on failure") - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") class GeminiLive(BaseMLLM): def __init__(self, **kwargs: Any): @@ -179,12 +222,8 @@ def to_config(self) -> Dict[str, Any]: 
config["output_modalities"] = self.options.output_modalities if self.options.messages is not None: config["messages"] = self.options.messages - if self.options.predefined_tools is not None: - config["predefined_tools"] = self.options.predefined_tools if self.options.failure_message is not None: config["failure_message"] = self.options.failure_message - if self.options.max_history is not None: - config["max_history"] = self.options.max_history if self.options.turn_detection is not None: config["turn_detection"] = self.options.turn_detection diff --git a/src/agora_agent/agents/client.py b/src/agora_agent/agents/client.py index 6c4a48c..3f6af4c 100644 --- a/src/agora_agent/agents/client.py +++ b/src/agora_agent/agents/client.py @@ -89,6 +89,9 @@ def start( StartAgentsRequestProperties, StartAgentsRequestPropertiesAsr, StartAgentsRequestPropertiesLlm, + StartAgentsRequestPropertiesTurnDetection, + StartAgentsRequestPropertiesTurnDetectionConfig, + StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, ) client = Agora( @@ -126,6 +129,13 @@ def start( greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", ), + turn_detection=StartAgentsRequestPropertiesTurnDetection( + config=StartAgentsRequestPropertiesTurnDetectionConfig( + end_of_speech=StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech( + mode="semantic", + ), + ), + ), ), ) """ @@ -175,7 +185,6 @@ def list( - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. 
limit : typing.Optional[int] @@ -302,7 +311,13 @@ def get_history( return _response.data def get_turns( - self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None + self, + appid: str, + agent_id: str, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + request_options: typing.Optional[RequestOptions] = None, ) -> GetTurnsAgentsResponse: """ Query conversation turn information for a conversational AI agent session. @@ -319,6 +334,12 @@ def get_turns( agent_id : str The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + page_index : typing.Optional[int] + The page number. Starts from 1. + + page_size : typing.Optional[int] + The number of dialogue turns returned per page. + request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -341,12 +362,14 @@ def get_turns( agent_id="agentId", ) """ - _response = self._raw_client.get_turns(appid, agent_id, request_options=request_options) + _response = self._raw_client.get_turns( + appid, agent_id, page_index=page_index, page_size=page_size, request_options=request_options + ) return _response.data def stop(self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None) -> None: """ - Stop the specified conversational agent instance. + Stop the specified conversational agent instance. The API responds after request parameters are validated, and the stop operation is processed asynchronously after the response is returned. 
Parameters ---------- @@ -623,6 +646,9 @@ async def start( StartAgentsRequestProperties, StartAgentsRequestPropertiesAsr, StartAgentsRequestPropertiesLlm, + StartAgentsRequestPropertiesTurnDetection, + StartAgentsRequestPropertiesTurnDetectionConfig, + StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, ) client = AsyncAgora( @@ -663,6 +689,13 @@ async def main() -> None: greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", ), + turn_detection=StartAgentsRequestPropertiesTurnDetection( + config=StartAgentsRequestPropertiesTurnDetectionConfig( + end_of_speech=StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech( + mode="semantic", + ), + ), + ), ), ) @@ -715,7 +748,6 @@ async def list( - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. limit : typing.Optional[int] @@ -867,7 +899,13 @@ async def main() -> None: return _response.data async def get_turns( - self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None + self, + appid: str, + agent_id: str, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + request_options: typing.Optional[RequestOptions] = None, ) -> GetTurnsAgentsResponse: """ Query conversation turn information for a conversational AI agent session. @@ -884,6 +922,12 @@ async def get_turns( agent_id : str The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + page_index : typing.Optional[int] + The page number. Starts from 1. + + page_size : typing.Optional[int] + The number of dialogue turns returned per page. + request_options : typing.Optional[RequestOptions] Request-specific configuration. 
@@ -914,12 +958,14 @@ async def main() -> None: asyncio.run(main()) """ - _response = await self._raw_client.get_turns(appid, agent_id, request_options=request_options) + _response = await self._raw_client.get_turns( + appid, agent_id, page_index=page_index, page_size=page_size, request_options=request_options + ) return _response.data async def stop(self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None) -> None: """ - Stop the specified conversational agent instance. + Stop the specified conversational agent instance. The API responds after request parameters are validated, and the stop operation is processed asynchronously after the response is returned. Parameters ---------- diff --git a/src/agora_agent/agents/raw_client.py b/src/agora_agent/agents/raw_client.py index c6eb03f..c0dfa01 100644 --- a/src/agora_agent/agents/raw_client.py +++ b/src/agora_agent/agents/raw_client.py @@ -144,7 +144,6 @@ def list( - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. limit : typing.Optional[int] @@ -293,7 +292,13 @@ def get_history( raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json) def get_turns( - self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None + self, + appid: str, + agent_id: str, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + request_options: typing.Optional[RequestOptions] = None, ) -> HttpResponse[GetTurnsAgentsResponse]: """ Query conversation turn information for a conversational AI agent session. @@ -310,6 +315,12 @@ def get_turns( agent_id : str The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + page_index : typing.Optional[int] + The page number. Starts from 1. 
+ + page_size : typing.Optional[int] + The number of dialogue turns returned per page. + request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -321,6 +332,10 @@ def get_turns( _response = self._client_wrapper.httpx_client.request( f"v2/projects/{jsonable_encoder(appid)}/agents/{jsonable_encoder(agent_id)}/turns", method="GET", + params={ + "page_index": page_index, + "page_size": page_size, + }, request_options=request_options, ) try: @@ -342,7 +357,7 @@ def stop( self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None ) -> HttpResponse[None]: """ - Stop the specified conversational agent instance. + Stop the specified conversational agent instance. The API responds after request parameters are validated, and the stop operation is processed asynchronously after the response is returned. Parameters ---------- @@ -670,7 +685,6 @@ async def list( - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. limit : typing.Optional[int] @@ -822,7 +836,13 @@ async def get_history( raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json) async def get_turns( - self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None + self, + appid: str, + agent_id: str, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + request_options: typing.Optional[RequestOptions] = None, ) -> AsyncHttpResponse[GetTurnsAgentsResponse]: """ Query conversation turn information for a conversational AI agent session. @@ -839,6 +859,12 @@ async def get_turns( agent_id : str The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + page_index : typing.Optional[int] + The page number. Starts from 1. 
+ + page_size : typing.Optional[int] + The number of dialogue turns returned per page. + request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -850,6 +876,10 @@ async def get_turns( _response = await self._client_wrapper.httpx_client.request( f"v2/projects/{jsonable_encoder(appid)}/agents/{jsonable_encoder(agent_id)}/turns", method="GET", + params={ + "page_index": page_index, + "page_size": page_size, + }, request_options=request_options, ) try: @@ -871,7 +901,7 @@ async def stop( self, appid: str, agent_id: str, *, request_options: typing.Optional[RequestOptions] = None ) -> AsyncHttpResponse[None]: """ - Stop the specified conversational agent instance. + Stop the specified conversational agent instance. The API responds after request parameters are validated, and the stop operation is processed asynchronously after the response is returned. Parameters ---------- diff --git a/src/agora_agent/agents/types/get_agents_response.py b/src/agora_agent/agents/types/get_agents_response.py index fe7e8e9..fd0aa2b 100644 --- a/src/agora_agent/agents/types/get_agents_response.py +++ b/src/agora_agent/agents/types/get_agents_response.py @@ -32,7 +32,6 @@ class GetAgentsResponse(UncheckedBaseModel): - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. 
""" diff --git a/src/agora_agent/agents/types/get_agents_response_status.py b/src/agora_agent/agents/types/get_agents_response_status.py index 2cef8c7..c170dc3 100644 --- a/src/agora_agent/agents/types/get_agents_response_status.py +++ b/src/agora_agent/agents/types/get_agents_response_status.py @@ -3,5 +3,5 @@ import typing GetAgentsResponseStatus = typing.Union[ - typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "RECOVERING", "FAILED"], typing.Any + typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "FAILED"], typing.Any ] diff --git a/src/agora_agent/agents/types/get_turns_agents_response.py b/src/agora_agent/agents/types/get_turns_agents_response.py index dca9ce1..653b3a5 100644 --- a/src/agora_agent/agents/types/get_turns_agents_response.py +++ b/src/agora_agent/agents/types/get_turns_agents_response.py @@ -5,10 +5,36 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from .get_turns_agents_response_pagination import GetTurnsAgentsResponsePagination from .get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem class GetTurnsAgentsResponse(UncheckedBaseModel): + agent_id: typing.Optional[str] = pydantic.Field(default=None) + """ + The unique identifier of the agent. + """ + + name: typing.Optional[str] = pydantic.Field(default=None) + """ + The name of the agent. + """ + + channel: typing.Optional[str] = pydantic.Field(default=None) + """ + The name of the RTC channel the agent joined. + """ + + total_turn_count: typing.Optional[int] = pydantic.Field(default=None) + """ + The total number of dialogue turns in the current session. + """ + + pagination: typing.Optional[GetTurnsAgentsResponsePagination] = pydantic.Field(default=None) + """ + Pagination information. + """ + turns: typing.Optional[typing.List[GetTurnsAgentsResponseTurnsItem]] = pydantic.Field(default=None) """ A list of conversation turns for the agent session. 
diff --git a/src/agora_agent/agent_management/types/agent_think_response.py b/src/agora_agent/agents/types/get_turns_agents_response_pagination.py similarity index 56% rename from src/agora_agent/agent_management/types/agent_think_response.py rename to src/agora_agent/agents/types/get_turns_agents_response_pagination.py index 3a3c646..8330f7c 100644 --- a/src/agora_agent/agent_management/types/agent_think_response.py +++ b/src/agora_agent/agents/types/get_turns_agents_response_pagination.py @@ -7,20 +7,24 @@ from ...core.unchecked_base_model import UncheckedBaseModel -class AgentThinkResponse(UncheckedBaseModel): - agent_id: typing.Optional[str] = pydantic.Field(default=None) +class GetTurnsAgentsResponsePagination(UncheckedBaseModel): """ - Unique identifier of the agent instance. + Pagination information. """ - channel: typing.Optional[str] = pydantic.Field(default=None) + page_index: typing.Optional[int] = pydantic.Field(default=None) """ - The name of the RTC channel where the agent is located. + The current page number; starts from 1. """ - start_ts: typing.Optional[int] = pydantic.Field(default=None) + total_pages: typing.Optional[int] = pydantic.Field(default=None) """ - Timestamp indicating when the agent was created. + The total number of pages. + """ + + is_last_page: typing.Optional[bool] = pydantic.Field(default=None) + """ + True if the current page is the last page. """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/agents/types/get_turns_agents_response_turns_item_end_metadata.py b/src/agora_agent/agents/types/get_turns_agents_response_turns_item_end_metadata.py index 0d9c61e..1e69709 100644 --- a/src/agora_agent/agents/types/get_turns_agents_response_turns_item_end_metadata.py +++ b/src/agora_agent/agents/types/get_turns_agents_response_turns_item_end_metadata.py @@ -28,8 +28,8 @@ class GetTurnsAgentsResponseTurnsItemEndMetadata(UncheckedBaseModel): - `api_leave`: The turn was interrupted because the agent left the channel. 
When `type` is `ignored`, possible values are: - - `semantic`: The turn was ignored because semantic end-of-speech detection determined no response was required. - - `keywords`: The turn was ignored because the start keyword was not detected. + - `semantic`: The turn was ignored because semantic end-of-speech detection determined no response was required. Applies when `turn_detection.config.end_of_speech.mode` is set to `semantic`. + - `keywords`: The turn was ignored because the start keyword was not detected. Applies when `turn_detection.config.start_of_speech.mode` is set to `keywords`. - `disable`: The turn was ignored because interruption is disabled for this turn. """ diff --git a/src/agora_agent/agents/types/get_turns_agents_response_turns_item_metrics_segmented_latency_ms_item.py b/src/agora_agent/agents/types/get_turns_agents_response_turns_item_metrics_segmented_latency_ms_item.py index 345d970..e849fdc 100644 --- a/src/agora_agent/agents/types/get_turns_agents_response_turns_item_metrics_segmented_latency_ms_item.py +++ b/src/agora_agent/agents/types/get_turns_agents_response_turns_item_metrics_segmented_latency_ms_item.py @@ -14,16 +14,16 @@ class GetTurnsAgentsResponseTurnsItemMetricsSegmentedLatencyMsItem(UncheckedBase When the LLM input modality is `text`, the returned segments are: - `algorithm_processing`: Algorithm processing delay. - - `asr_ttlw`: ASR Time To Last Word (TTLW) in milliseconds. - - `llm_ttft`: LLM Time To First Token (TTFT) in milliseconds. - - `llm_ftfs`: LLM First Token To First Sentence (FTFS) in milliseconds. - - `tts_ttfb`: TTS Time To First Byte (TTFB) in milliseconds. + - `asr_ttlw`: The ASR Time To Last Word (TTLW) in milliseconds. Represents the delay from when the user finishes speaking to when the ASR module outputs the last word. + - `llm_ttft`: The LLM Time To First Token (TTFT) in milliseconds. Represents the delay from when the user finishes speaking to when the LLM outputs the first token. 
+ - `llm_ftfs`: The LLM First Token To First Sentence (FTFS) in milliseconds. Represents the delay from when the LLM outputs the first token to when it outputs the first complete sentence. + - `tts_ttfb`: The TTS Time To First Byte (TTFB) in milliseconds. Represents the delay from when the TTS module receives a text request to when it outputs the first audio byte. - `transport`: Network transmission delay in milliseconds. Not returned when the user is connected using the RTC Web SDK. When the LLM input modality is `audio`, the returned segments are: - `algorithm_processing`: Algorithm processing delay. - - `asr_ttlw`: ASR Time To Last Word (TTLW) in milliseconds. - - `llm_ttfa`: LLM Time To First Audio Byte (TTFA) in milliseconds. + - `asr_ttlw`: The ASR Time To Last Word (TTLW) in milliseconds. Represents the delay from when the user finishes speaking to when the ASR module outputs the last word. + - `llm_ttfa`: The LLM Time To First Audio Byte (TTFA) in milliseconds. Represents the delay from when the user finishes speaking to when the LLM outputs the first audio byte. - `transport`: Network transmission delay in milliseconds. Not returned when the user is connected using the RTC Web SDK. """ diff --git a/src/agora_agent/agents/types/interrupt_agents_response.py b/src/agora_agent/agents/types/interrupt_agents_response.py index 79ff7f5..f8c59af 100644 --- a/src/agora_agent/agents/types/interrupt_agents_response.py +++ b/src/agora_agent/agents/types/interrupt_agents_response.py @@ -20,7 +20,7 @@ class InterruptAgentsResponse(UncheckedBaseModel): start_ts: typing.Optional[int] = pydantic.Field(default=None) """ - Timestamp when the broadcast started + Unix timestamp in seconds when the interrupt request was processed. 
""" if IS_PYDANTIC_V2: diff --git a/src/agora_agent/agents/types/list_agents_response_data_list_item_status.py b/src/agora_agent/agents/types/list_agents_response_data_list_item_status.py index 58c42e0..a00c386 100644 --- a/src/agora_agent/agents/types/list_agents_response_data_list_item_status.py +++ b/src/agora_agent/agents/types/list_agents_response_data_list_item_status.py @@ -3,5 +3,5 @@ import typing ListAgentsResponseDataListItemStatus = typing.Union[ - typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "RECOVERING", "FAILED"], typing.Any + typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "FAILED"], typing.Any ] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_advanced_features.py b/src/agora_agent/agents/types/start_agents_request_properties_advanced_features.py index 78250d7..e75f098 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_advanced_features.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_advanced_features.py @@ -14,7 +14,7 @@ class StartAgentsRequestPropertiesAdvancedFeatures(UncheckedBaseModel): enable_mllm: typing.Optional[bool] = pydantic.Field(default=None) """ - Use `mllm.enable` instead. Enable Multimodal Large Language Model for voice-to-voice processing. Enabling MLLM automatically disables ASR, LLM, and TTS since the MLLM handles end-to-end voice processing directly. See `turn_detection.mode` for turn detection options available with MLLM. + Use `mllm.enable` instead. Enable Multimodal Large Language Model for voice-to-voice processing. Enabling MLLM automatically disables ASR, LLM, and TTS since the MLLM handles end-to-end voice processing directly. See `turn_detection.type` for turn detection options available with MLLM. 
""" enable_rtm: typing.Optional[bool] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_avatar.py b/src/agora_agent/agents/types/start_agents_request_properties_avatar.py index 24e3888..8993b2c 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_avatar.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_avatar.py @@ -24,6 +24,7 @@ class StartAgentsRequestPropertiesAvatar(UncheckedBaseModel): - `akool`: Akool (Beta) - `liveavatar`: LiveAvatar (Beta) - `anam`: Anam (Beta) + - `generic`: Generic (Beta) """ params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_avatar_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_avatar_vendor.py index 2926d04..e5bcec5 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_avatar_vendor.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_avatar_vendor.py @@ -3,5 +3,5 @@ import typing StartAgentsRequestPropertiesAvatarVendor = typing.Union[ - typing.Literal["akool", "liveavatar", "anam", "heygen"], typing.Any + typing.Literal["akool", "liveavatar", "anam", "generic", "heygen"], typing.Any ] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_filler_words_content_static_config.py b/src/agora_agent/agents/types/start_agents_request_properties_filler_words_content_static_config.py index e220739..7fe264c 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_filler_words_content_static_config.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_filler_words_content_static_config.py @@ -15,7 +15,7 @@ class StartAgentsRequestPropertiesFillerWordsContentStaticConfig(UncheckedBaseMo Static filler word configuration. Used when `mode` is `static`. 
""" - phrases: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + phrases: typing.List[str] = pydantic.Field() """ List of filler word phrases. Maximum 100 filler words, each not exceeding 50 English words. """ diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm.py b/src/agora_agent/agents/types/start_agents_request_properties_llm.py index 20c391e..9ab0f62 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_llm.py @@ -44,7 +44,7 @@ class StartAgentsRequestPropertiesLlm(UncheckedBaseModel): """ LLM input modalities: - `["text"]`: Text only - - `["text", "image"]`: Text plus image; requires the selected LLM to support visual input + - `["text", "image"]`: Text plus image. Recommended configuration, requires the selected LLM to support visual input """ output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py index a8594ee..c0d7046 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py @@ -27,6 +27,12 @@ class StartAgentsRequestPropertiesLlmGreetingConfigs(UncheckedBaseModel): The delay in milliseconds before the agent plays the greeting message after a user joins the channel. """ + interruptable: typing.Optional[bool] = pydantic.Field(default=None) + """ + - `true`: Follows the global `interruption` configuration. + - `false`: Uninterruptible. The greeting plays in its entirety. If the user speaks multiple times while the greeting plays, the system merges the speech segments after the greeting ends and sends them to the LLM for a single response. 
+ """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py index e84422c..0993ebc 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py @@ -59,17 +59,7 @@ class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): failure_message: typing.Optional[str] = pydantic.Field(default=None) """ - Message played when the MLLM call fails. - """ - - max_history: typing.Optional[int] = pydantic.Field(default=None) - """ - Maximum number of conversation history messages cached for the MLLM session. - """ - - predefined_tools: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - Predefined tools available to the MLLM provider. + Agent failure message. If provided, the agent speaks this message when an MLLM request fails. 
""" vendor: typing.Optional[StartAgentsRequestPropertiesMllmVendor] = pydantic.Field(default=None) @@ -78,6 +68,7 @@ class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): - `openai`: OpenAI Realtime API - `gemini`: Google Gemini Live - `vertexai`: Google Gemini Live (Vertex AI) + - `xai`: xAI Grok Realtime API """ turn_detection: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetection] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py index 9298a0c..032979d 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py @@ -26,7 +26,7 @@ class StartAgentsRequestPropertiesMllmTurnDetection(UncheckedBaseModel): """ Turn detection mode for MLLM: - `agora_vad`: Agora VAD-based detection. - - `server_vad`: Vendor-side VAD-based detection. Supported by OpenAI Realtime API and Gemini Live. + - `server_vad`: Vendor-side VAD-based detection. Supported by OpenAI Realtime API, Gemini Live, and xAI Grok. - `semantic_vad`: Semantic-based detection. Supported by OpenAI Realtime API only. """ diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py index d27b76e..c74d8d7 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py @@ -30,7 +30,7 @@ class StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig(UncheckedBase threshold: typing.Optional[float] = pydantic.Field(default=None) """ - VAD sensitivity threshold. Applicable to OpenAI Realtime API only. 
+ VAD sensitivity threshold. Applicable to OpenAI Realtime API and xAI Grok. """ idle_timeout_ms: typing.Optional[int] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py index 519b143..0233696 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py @@ -2,4 +2,4 @@ import typing -StartAgentsRequestPropertiesMllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai"], typing.Any] +StartAgentsRequestPropertiesMllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai", "xai"], typing.Any] diff --git a/src/agora_agent/agents/types/start_agents_response.py b/src/agora_agent/agents/types/start_agents_response.py index ee3f32e..1d69a34 100644 --- a/src/agora_agent/agents/types/start_agents_response.py +++ b/src/agora_agent/agents/types/start_agents_response.py @@ -27,7 +27,6 @@ class StartAgentsResponse(UncheckedBaseModel): - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. 
""" diff --git a/src/agora_agent/agents/types/start_agents_response_status.py b/src/agora_agent/agents/types/start_agents_response_status.py index 6926aca..894df21 100644 --- a/src/agora_agent/agents/types/start_agents_response_status.py +++ b/src/agora_agent/agents/types/start_agents_response_status.py @@ -3,5 +3,5 @@ import typing StartAgentsResponseStatus = typing.Union[ - typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "RECOVERING", "FAILED"], typing.Any + typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "FAILED"], typing.Any ] diff --git a/src/agora_agent/agents/types/update_agents_response.py b/src/agora_agent/agents/types/update_agents_response.py index 9f73945..8951b77 100644 --- a/src/agora_agent/agents/types/update_agents_response.py +++ b/src/agora_agent/agents/types/update_agents_response.py @@ -27,7 +27,6 @@ class UpdateAgentsResponse(UncheckedBaseModel): - `RUNNING` (2): The agent is running. - `STOPPING` (3): The agent is stopping. - `STOPPED` (4): The agent has exited. - - `RECOVERING` (5): The agent is recovering. - `FAILED` (6): The agent failed to execute. 
""" diff --git a/src/agora_agent/agents/types/update_agents_response_status.py b/src/agora_agent/agents/types/update_agents_response_status.py index 7f4078f..c8e9133 100644 --- a/src/agora_agent/agents/types/update_agents_response_status.py +++ b/src/agora_agent/agents/types/update_agents_response_status.py @@ -3,5 +3,5 @@ import typing UpdateAgentsResponseStatus = typing.Union[ - typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "RECOVERING", "FAILED"], typing.Any + typing.Literal["IDLE", "STARTING", "RUNNING", "STOPPING", "STOPPED", "FAILED"], typing.Any ] diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index c5a0e03..c44e886 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agent-server-sdk/v1.4.0", + "User-Agent": "agora-agents/v2.0.0", "X-Fern-Language": "Python", - "X-Fern-SDK-Name": "agora-agent-server-sdk", - "X-Fern-SDK-Version": "v1.4.0", + "X-Fern-SDK-Name": "agora-agents", + "X-Fern-SDK-Version": "v2.0.0", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header diff --git a/src/agora_agent/pool_client.py b/src/agora_agent/pool_client.py index 652b3d2..bd54fff 100644 --- a/src/agora_agent/pool_client.py +++ b/src/agora_agent/pool_client.py @@ -195,7 +195,7 @@ class Agora(BaseAgora): from agora_agent import Agora, Area from agora_agent.agentkit.token import generate_convo_ai_token - raw_token = generate_convo_ai_token(app_id="...", app_certificate="...", channel_name="...", account="1") + raw_token = generate_convo_ai_token(app_id="...", app_certificate="...", channel_name="...", uid=1) client = Agora( area=Area.US, app_id="YOUR_APP_ID", @@ -323,7 +323,7 @@ def stop_agent(self, agent_id: str) -> None: app_id=self.app_id, 
app_certificate=self.app_certificate, channel_name="stop", - account=agent_id, + uid=0, ) request_options = {"additional_headers": {"Authorization": f"agora token={token}"}} @@ -534,7 +534,7 @@ async def stop_agent(self, agent_id: str) -> None: app_id=self.app_id, app_certificate=self.app_certificate, channel_name="stop", - account=agent_id, + uid=0, ) request_options = {"additional_headers": {"Authorization": f"agora token={token}"}} diff --git a/src/agora_agent/types/agent_error_response.py b/src/agora_agent/types/agent_error_response.py new file mode 100644 index 0000000..510ce74 --- /dev/null +++ b/src/agora_agent/types/agent_error_response.py @@ -0,0 +1,46 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .agent_error_response_reason import AgentErrorResponseReason + + +class AgentErrorResponse(UncheckedBaseModel): + """ + Error response returned when a Conversational AI Agent Management API request fails. + + Common HTTP status codes include: + - `400`: Invalid request parameters. + - `401`: Authentication failed. + - `403`: Unauthorized access or service not enabled. + - `404`: Agent not found or has exited. + - `409`: Agent conflict. + - `422`: Access limit exceeded. + - `429`: Request rate limit exceeded. + - `500`: Internal server error. + - `502`: Gateway error. + - `503`: Agent startup failure. + - `504`: Request timeout. + """ + + detail: typing.Optional[str] = pydantic.Field(default=None) + """ + Error detail message. + """ + + reason: typing.Optional[AgentErrorResponseReason] = pydantic.Field(default=None) + """ + Error reason code. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/agent_error_response_reason.py b/src/agora_agent/types/agent_error_response_reason.py new file mode 100644 index 0000000..c7144cf --- /dev/null +++ b/src/agora_agent/types/agent_error_response_reason.py @@ -0,0 +1,24 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AgentErrorResponseReason = typing.Union[ + typing.Literal[ + "ServiceNotEnabled", + "AccountSuspended", + "InternalError", + "InvalidPermission", + "InvalidRequestBody", + "MissingRequiredField", + "InvalidFieldValue", + "ResourceQuotaLimitExceeded", + "ConcurrencyLimitExceeded", + "ServiceUnavailable", + "ResourceAllocationFailed", + "TaskConflict", + "TaskNotFound", + "TaskOperationTimeout", + "NotImplemented", + ], + typing.Any, +] diff --git a/src/agora_agent/types/open_ai_tts_params.py b/src/agora_agent/types/open_ai_tts_params.py index 67a6e1d..3839646 100644 --- a/src/agora_agent/types/open_ai_tts_params.py +++ b/src/agora_agent/types/open_ai_tts_params.py @@ -14,7 +14,7 @@ class OpenAiTtsParams(UncheckedBaseModel): api_key: typing.Optional[str] = pydantic.Field(default=None) """ - OpenAI API key + OpenAI API key. Optional for Agora-managed OpenAI TTS usage. """ voice: str = pydantic.Field() diff --git a/src/agora_agent/types/telephony_error_response.py b/src/agora_agent/types/telephony_error_response.py new file mode 100644 index 0000000..af4e7d4 --- /dev/null +++ b/src/agora_agent/types/telephony_error_response.py @@ -0,0 +1,34 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class TelephonyErrorResponse(UncheckedBaseModel): + """ + Error response returned when a Telephony or Phone Number Management API request fails. + + The response body includes `error_type` and `description` fields (not the Agent Management `detail`/`reason` shape). + """ + + error_type: typing.Optional[str] = pydantic.Field(default=None) + """ + The type of error that occurred. + """ + + description: typing.Optional[str] = pydantic.Field(default=None) + """ + A detailed description of the error. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/version.py b/src/agora_agent/version.py index 160096b..6f7b967 100644 --- a/src/agora_agent/version.py +++ b/src/agora_agent/version.py @@ -1,3 +1,3 @@ from importlib import metadata -__version__ = metadata.version("agora-agent-server-sdk") +__version__ = metadata.version("agora-agents") diff --git a/tests/custom/test_avatar_token.py b/tests/custom/test_avatar_token.py new file mode 100644 index 0000000..fa73fc0 --- /dev/null +++ b/tests/custom/test_avatar_token.py @@ -0,0 +1,12 @@ +from agora_agent.agentkit import generate_convo_ai_token + + +def test_avatar_tokens_use_convo_ai_token_path_with_avatar_uid(): + token = generate_convo_ai_token( + app_id="0" * 32, + app_certificate="1" * 32, + channel_name="room", + uid=123, + ) + + assert token.startswith("007") diff --git a/tests/custom/test_client.py b/tests/custom/test_client.py index 77fae36..ab04ce6 100644 --- a/tests/custom/test_client.py +++ b/tests/custom/test_client.py @@ -1,265 +1,7 @@ -from agora_agent.agentkit.agent import Agent, AdvancedFeatures, InterruptionConfig, 
MllmTurnDetectionConfig, TurnDetectionConfig -from agora_agent.agentkit.constants import TurnDetectionTypeValues -import asyncio -import warnings -from agora_agent.agentkit.agent_session import AgentSession, AsyncAgentSession -from agora_agent.agentkit.vendors import DeepgramTTS, HeyGenAvatar, MicrosoftTTS, OpenAI, OpenAIRealtime -from agora_agent.agentkit import AgentThinkResponse -from typing import Any, Dict, List, Tuple +import pytest -class _AgentManagementStub: - def __init__(self) -> None: - self.calls: List[Tuple[str, str, Dict[str, Any]]] = [] - - def agent_think(self, appid, agent_id, **kwargs): # noqa: ANN001 - self.calls.append((appid, agent_id, kwargs)) - return AgentThinkResponse(agent_id=agent_id, channel="room", start_ts=1) - - -class _ClientStub: - auth_mode = "basic" - - def __init__(self) -> None: - self.agents = object() - self.agent_management = _AgentManagementStub() - - -class _AsyncAgentManagementStub: - def __init__(self) -> None: - self.calls: List[Tuple[str, str, Dict[str, Any]]] = [] - - async def agent_think(self, appid, agent_id, **kwargs): # noqa: ANN001 - self.calls.append((appid, agent_id, kwargs)) - return AgentThinkResponse(agent_id=agent_id, channel="room", start_ts=1) - - -class _AsyncClientStub: - auth_mode = "basic" - - def __init__(self) -> None: - self.agents = object() - self.agent_management = _AsyncAgentManagementStub() - - -def test_agentkit_think_routes_to_agent_management() -> None: - client = _ClientStub() - session = AgentSession( - client=client, - agent=Agent(), - app_id="appid", - name="agent", - channel="room", - token="token", - agent_uid="1", - remote_uids=["2"], - ) - session._status = "running" - session._agent_id = "agent-1" - - response = session.think("Injected instruction", on_thinking_action="interrupt") - assert response.agent_id == "agent-1" - assert len(client.agent_management.calls) == 1 - appid, agent_id, kwargs = client.agent_management.calls[0] - assert appid == "appid" - assert agent_id == 
"agent-1" - assert kwargs["text"] == "Injected instruction" - assert kwargs["on_thinking_action"] == "interrupt" - - -def test_async_agentkit_think_routes_to_agent_management() -> None: - async def _run() -> None: - client = _AsyncClientStub() - session = AsyncAgentSession( - client=client, - agent=Agent(), - app_id="appid", - name="agent", - channel="room", - token="token", - agent_uid="1", - remote_uids=["2"], - ) - session._status = "running" - session._agent_id = "agent-1" - - response = await session.think("Injected instruction", on_thinking_action="interrupt") - assert response.agent_id == "agent-1" - assert len(client.agent_management.calls) == 1 - appid, agent_id, kwargs = client.agent_management.calls[0] - assert appid == "appid" - assert agent_id == "agent-1" - assert kwargs["text"] == "Injected instruction" - assert kwargs["on_thinking_action"] == "interrupt" - - asyncio.run(_run()) - - -def test_llm_vendor_headers_are_forwarded_to_properties() -> None: - agent = Agent().with_llm( - OpenAI( - api_key="openai-key", - model="gpt-4o-mini", - headers={"X-Trace-Id": "trace-123"}, - output_modalities=["text", "audio"], - greeting_configs={"mode": "single_first"}, - template_variables={"caller_name": "Ada"}, - ) - ).with_tts(MicrosoftTTS(key="tts-key", region="eastus", voice_name="en-US-JennyNeural")) - - props = agent.to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - ) - - assert props.llm is not None - assert props.llm.headers == {"X-Trace-Id": "trace-123"} - assert props.llm.output_modalities == ["text", "audio"] - assert props.llm.greeting_configs is not None - assert props.llm.greeting_configs.mode == "single_first" - assert props.llm.template_variables == {"caller_name": "Ada"} - - -def test_with_turn_detection_forwards_config() -> None: - turn_detection = TurnDetectionConfig( - type=TurnDetectionTypeValues.AGORA_VAD, - threshold=0.5, - ) - - props = Agent().with_turn_detection(turn_detection).to_properties( - 
channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - skip_vendor_validation=True, - ) - - assert props.turn_detection == turn_detection - - -def test_with_interruption_forwards_config() -> None: - interruption = InterruptionConfig( - enable=False, - disabled_config={"strategy": "ignore"}, - ) - - props = Agent().with_interruption(interruption).to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - skip_vendor_validation=True, - ) - - assert props.interruption == interruption - - -def test_mllm_turn_detection_is_forwarded_without_legacy_style() -> None: - mllm_turn_detection = MllmTurnDetectionConfig( - mode="server_vad", - server_vad_config={"idle_timeout_ms": 5000}, - ) - props = Agent().with_mllm( - OpenAIRealtime(api_key="openai-key", turn_detection=mllm_turn_detection) - ).to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - ) - - assert props.mllm is not None - assert props.mllm.vendor == "openai" - assert "style" not in props.mllm.dict() - assert props.mllm.turn_detection == mllm_turn_detection - - -def test_with_mllm_sets_mllm_enable_without_legacy_flag() -> None: - agent = Agent().with_mllm(OpenAIRealtime(api_key="openai-key")) - - props = agent.to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - ) - - assert props.mllm is not None - assert props.mllm.enable is True - assert props.advanced_features is None - - -def test_with_mllm_removes_deprecated_enable_mllm_from_existing_advanced_features() -> None: - agent = Agent( - advanced_features=AdvancedFeatures(enable_mllm=True, enable_rtm=True) - ).with_mllm(OpenAIRealtime(api_key="openai-key")) - - props = agent.to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - ) - - assert props.mllm is not None - assert props.mllm.enable is True - assert props.advanced_features is not None - assert props.advanced_features.enable_mllm is 
None - assert props.advanced_features.enable_rtm is True - - -def test_with_mllm_drops_advanced_features_when_only_deprecated_enable_mllm_was_set() -> None: - props = Agent( - advanced_features=AdvancedFeatures(enable_mllm=True) - ).with_mllm(OpenAIRealtime(api_key="openai-key")).to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - ) - - assert props.mllm is not None - assert props.mllm.enable is True - assert props.advanced_features is None - - -def test_with_tools_sets_enable_tools() -> None: - props = Agent().with_tools().to_properties( - channel="room", - token="rtc-token", - agent_uid="1", - remote_uids=["2"], - skip_vendor_validation=True, - ) - - assert props.advanced_features is not None - assert props.advanced_features.enable_tools is True - - -def test_heygen_avatar_emits_deprecation_warning() -> None: - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - HeyGenAvatar(api_key="heygen-key", quality="high", agora_uid="42") - - assert any("HeyGenAvatar is deprecated" in str(warning.message) for warning in caught) - - -def test_deepgram_tts_vendor_config() -> None: - tts = DeepgramTTS( - api_key="deepgram-key", - model="aura-2-thalia-en", - base_url="wss://api.deepgram.com/v1/speak", - sample_rate=24000, - params={"encoding": "linear16"}, - ).to_config() - - assert tts["vendor"] == "deepgram" - assert tts["params"] == { - "api_key": "deepgram-key", - "model": "aura-2-thalia-en", - "base_url": "wss://api.deepgram.com/v1/speak", - "sample_rate": 24000, - "encoding": "linear16", - } +# Get started with writing tests with pytest at https://docs.pytest.org +@pytest.mark.skip(reason="Unimplemented") +def test_client() -> None: + assert True diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py new file mode 100644 index 0000000..faca9bf --- /dev/null +++ b/tests/custom/test_llm_vendors.py @@ -0,0 +1,60 @@ +from agora_agent import AmazonBedrock, CustomLLM, Dify, 
Groq, VertexAILLM + + +def test_groq_serializes_as_openai_compatible() -> None: + config = Groq(api_key="groq-key", model="llama-3.3-70b-versatile").to_config() + + assert config["url"] == "https://api.groq.com/openai/v1/chat/completions" + assert config["api_key"] == "groq-key" + assert config["style"] == "openai" + assert config["params"]["model"] == "llama-3.3-70b-versatile" + + +def test_custom_llm_marks_request_as_custom() -> None: + config = CustomLLM(api_key="key", model="model", base_url="https://llm.example.com/chat").to_config() + + assert config["url"] == "https://llm.example.com/chat" + assert config["api_key"] == "key" + assert config["vendor"] == "custom" + assert config["style"] == "openai" + + +def test_vertex_ai_llm_includes_project_routing() -> None: + config = VertexAILLM( + api_key="vertex-token", + model="gemini-2.0-flash", + project_id="project", + location="us-central1", + ).to_config() + + assert config["api_key"] == "vertex-token" + assert config["style"] == "gemini" + assert config["params"]["model"] == "gemini-2.0-flash" + assert config["params"]["project_id"] == "project" + assert config["params"]["location"] == "us-central1" + + +def test_amazon_bedrock_serializes_as_anthropic_style() -> None: + config = AmazonBedrock( + api_key="bedrock-key", + url="https://bedrock.example.com/messages", + model="anthropic.claude-3-5-sonnet-20241022-v2:0", + ).to_config() + + assert config["api_key"] == "bedrock-key" + assert config["style"] == "anthropic" + assert config["params"]["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" + + +def test_dify_serializes_conversation_fields() -> None: + config = Dify( + api_key="dify-key", + url="https://api.dify.ai/v1/chat-messages", + user="user-1", + conversation_id="conversation-1", + ).to_config() + + assert config["api_key"] == "dify-key" + assert config["style"] == "dify" + assert config["params"]["user"] == "user-1" + assert config["params"]["conversation_id"] == "conversation-1" diff --git 
a/tests/custom/test_presets.py b/tests/custom/test_presets.py deleted file mode 100644 index c05c477..0000000 --- a/tests/custom/test_presets.py +++ /dev/null @@ -1,135 +0,0 @@ -from agora_agent.agentkit.presets import resolve_session_presets - - -def test_minimax_preset_strips_group_id_and_url_when_no_key() -> None: - """When no key is provided, preset is inferred and credential fields are stripped.""" - properties = { - "tts": { - "vendor": "minimax", - "params": { - "group_id": "my-group", - "model": "speech-2.6-turbo", - "url": "wss://api-uw.minimax.io/ws/v1/t2a_v2", - "voice_setting": {"voice_id": "English_captivating_female1"}, - }, - } - } - preset, resolved = resolve_session_presets(None, properties) - assert preset == "minimax_speech_2_6_turbo" - params = resolved["tts"]["params"] - assert "group_id" not in params - assert "url" not in params - assert "model" not in params - assert params["voice_setting"]["voice_id"] == "English_captivating_female1" - - -def test_minimax_preset_strips_group_id_and_url_for_28_turbo() -> None: - properties = { - "tts": { - "vendor": "minimax", - "params": { - "group_id": "org-123", - "model": "speech-2.8-turbo", - "url": "wss://api.minimax.io/ws/v1/t2a_v2", - "voice_setting": {"voice_id": "some-voice"}, - }, - } - } - preset, resolved = resolve_session_presets(None, properties) - assert preset == "minimax_speech_2_8_turbo" - params = resolved["tts"]["params"] - assert "group_id" not in params - assert "url" not in params - assert "model" not in params - - -def test_minimax_preset_strips_group_id_and_url_with_underscore_model_name() -> None: - properties = { - "tts": { - "vendor": "minimax", - "params": { - "group_id": "my-group", - "model": "speech_2_6_turbo", - "url": "wss://api-uw.minimax.io/ws/v1/t2a_v2", - }, - } - } - preset, resolved = resolve_session_presets(None, properties) - assert preset == "minimax_speech_2_6_turbo" - params = resolved["tts"].get("params") or {} - assert "group_id" not in params - assert "url" 
not in params - assert "model" not in params - - -def test_minimax_preset_not_inferred_when_key_present() -> None: - """When user provides their own key, preset is NOT inferred and nothing is stripped.""" - properties = { - "tts": { - "vendor": "minimax", - "params": { - "key": "user-secret", - "group_id": "my-group", - "model": "speech-2.6-turbo", - }, - } - } - preset, resolved = resolve_session_presets(None, properties) - assert preset is None - params = resolved["tts"]["params"] - assert params.get("key") == "user-secret" - assert params.get("group_id") == "my-group" - - -def test_minimax_preset_not_inferred_when_explicit_preset_given() -> None: - """When an explicit tts preset is provided, tts inference is skipped.""" - properties = { - "tts": { - "vendor": "minimax", - "params": { - "group_id": "my-group", - "model": "speech-2.6-turbo", - }, - } - } - preset, resolved = resolve_session_presets("minimax_speech_2_6_turbo", properties) - assert preset == "minimax_speech_2_6_turbo" - # Explicit preset: tts inference is skipped, params are NOT stripped - params = resolved["tts"]["params"] - assert params.get("group_id") == "my-group" - - -def test_deepgram_preset_strips_model_and_api_key() -> None: - properties = { - "asr": { - "vendor": "deepgram", - "params": { - "model": "nova-3", - "language": "en-US", - }, - } - } - preset, resolved = resolve_session_presets(None, properties) - assert preset == "deepgram_nova_3" - params = resolved["asr"]["params"] - assert "model" not in params - assert "api_key" not in params - assert params.get("language") == "en-US" - - -def test_openai_llm_preset_strips_model_api_key_and_default_url() -> None: - properties = { - "llm": { - "vendor": "openai", - "url": "https://api.openai.com/v1/chat/completions", - "params": { - "model": "gpt-4o-mini", - }, - } - } - preset, resolved = resolve_session_presets(None, properties) - assert preset == "openai_gpt_4o_mini" - llm = resolved["llm"] - assert "api_key" not in llm - assert "url" not 
in llm - assert "model" not in (llm.get("params") or {}) diff --git a/tests/custom/test_root_exports.py b/tests/custom/test_root_exports.py new file mode 100644 index 0000000..9b2f508 --- /dev/null +++ b/tests/custom/test_root_exports.py @@ -0,0 +1,29 @@ +import pytest + +import agora_agent +import agora_agent.agentkit as agentkit + + +def test_root_exports_match_agentkit_for_common_symbols() -> None: + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + assert getattr(agora_agent, name) is getattr(agentkit, name) + + +def test_root_exports_fern_client_symbols() -> None: + assert agora_agent.Agora is not None + assert agora_agent.Area is not None + assert agora_agent.AsyncAgora is not None + + +def test_unknown_root_export_raises_attribute_error() -> None: + with pytest.raises(AttributeError): + _ = agora_agent.NotARealExportName + + +def test_dir_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in dir(agora_agent) + + +def test_all_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in agora_agent.__all__ + assert "OpenAI" in agora_agent.__all__