diff --git a/synapseclient/api/__init__.py b/synapseclient/api/__init__.py index 99bdbab33..3b02952c7 100644 --- a/synapseclient/api/__init__.py +++ b/synapseclient/api/__init__.py @@ -143,6 +143,28 @@ get_project_setting, update_project_setting, ) +from .search_services import ( + autocomplete_search, + bind_search_config_to_entity, + clear_search_config_binding, + create_column_analyzer_override, + create_search_configuration, + create_synonym_set, + create_text_analyzer, + get_column_analyzer_override, + get_search_config_binding, + get_search_configuration, + get_synonym_set, + get_text_analyzer, + list_column_analyzer_overrides, + list_search_configurations, + list_synonym_sets, + list_text_analyzers, + update_column_analyzer_override, + update_search_configuration, + update_synonym_set, + update_text_analyzer, +) from .storage_location_services import ( create_storage_location_setting, get_storage_location_setting, @@ -388,4 +410,25 @@ "create_project_setting", "update_project_setting", "delete_project_setting", + # search_services + "create_text_analyzer", + "get_text_analyzer", + "update_text_analyzer", + "list_text_analyzers", + "create_column_analyzer_override", + "get_column_analyzer_override", + "update_column_analyzer_override", + "list_column_analyzer_overrides", + "create_synonym_set", + "get_synonym_set", + "update_synonym_set", + "list_synonym_sets", + "create_search_configuration", + "get_search_configuration", + "update_search_configuration", + "list_search_configurations", + "bind_search_config_to_entity", + "get_search_config_binding", + "clear_search_config_binding", + "autocomplete_search", ] diff --git a/synapseclient/api/entity_factory.py b/synapseclient/api/entity_factory.py index 919af7227..db3472641 100644 --- a/synapseclient/api/entity_factory.py +++ b/synapseclient/api/entity_factory.py @@ -343,6 +343,7 @@ class type. This will also download the file if `download_file` is set to True. 
MaterializedView, Project, RecordSet, + SearchIndex, SubmissionView, Table, VirtualTable, @@ -379,6 +380,7 @@ class type. This will also download the file if `download_file` is set to True. concrete_types.RECORD_SET_ENTITY: RecordSet, concrete_types.SUBMISSION_VIEW: SubmissionView, concrete_types.VIRTUAL_TABLE: VirtualTable, + concrete_types.SEARCH_INDEX_ENTITY: SearchIndex, concrete_types.LINK_ENTITY: Link, concrete_types.DOCKER_REPOSITORY: DockerRepository, } diff --git a/synapseclient/api/search_services.py b/synapseclient/api/search_services.py new file mode 100644 index 000000000..32bb33ece --- /dev/null +++ b/synapseclient/api/search_services.py @@ -0,0 +1,629 @@ +"""This module is responsible for exposing the services defined at: + + +It covers TextAnalyzer, ColumnAnalyzerOverride, SynonymSet, SearchConfiguration, +SearchConfigBinding, and the synchronous SearchIndex autocomplete endpoint. +The async SearchIndex query endpoint is exposed via the +`SearchIndexQuery.send_job_and_wait_async()` method on the model class +(`models.search_management.SearchIndexQuery`), which uses the shared +`AsynchronousCommunicator` mixin. +""" + +import json +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +if TYPE_CHECKING: + from synapseclient import Synapse + + +# ---------- Text Analyzer ---------- + + +async def create_text_analyzer( + request: Dict[str, Any], + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Create a TextAnalyzer. + + + + Arguments: + request: TextAnalyzer body. Must include organizationName, name, settings. + `settings` is a JSON object (the contents of the OpenSearch + `settings.analysis` block), not a JSON-encoded string. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The created TextAnalyzer. 
+ """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_post_async( + uri="/search/text/analyzer", body=json.dumps(request) + ) + + +async def get_text_analyzer( + analyzer_id: str, + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Get a TextAnalyzer by ID. + + + + Arguments: + analyzer_id: The numeric ID of the text analyzer to retrieve. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The requested TextAnalyzer. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_get_async(uri=f"/search/text/analyzer/{analyzer_id}") + + +async def update_text_analyzer( + analyzer_id: str, + request: Dict[str, Any], + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Update a TextAnalyzer. + + + + Arguments: + analyzer_id: The path ID (must match the request body's ID). + request: The updated TextAnalyzer. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The updated TextAnalyzer. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_put_async( + uri=f"/search/text/analyzer/{analyzer_id}", body=json.dumps(request) + ) + + +async def list_text_analyzers( + organization_name: Optional[str] = None, + next_page_token: Optional[str] = None, + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """List TextAnalyzers, optionally filtered by Organization. + + + + Arguments: + organization_name: Optional filter by organization name. + next_page_token: Optional pagination token. 
+ synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + Page of TextAnalyzers and a nextPageToken if more results exist. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + body = {"organizationName": organization_name, "nextPageToken": next_page_token} + body = {k: v for k, v in body.items() if v is not None} + return await client.rest_post_async( + uri="/search/text/analyzer/list", body=json.dumps(body) + ) + + +# ---------- Column Analyzer Override ---------- + + +async def create_column_analyzer_override( + request: Dict[str, Any], + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Create a ColumnAnalyzerOverride. + + + + Each entry under `request["overrides"]` carries `analyzer` as a JSON + object — either a `$ref` reference `{"$ref": "{organizationName}-{name}"}` + to a saved TextAnalyzer or an inline TextAnalyzer literal — not a bare + qualified-name string. + + Arguments: + request: ColumnAnalyzerOverride body. Must include organizationName, name, + and overrides. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The created ColumnAnalyzerOverride. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_post_async( + uri="/search/column/analyzer/override", body=json.dumps(request) + ) + + +async def get_column_analyzer_override( + column_analyzer_override_id: str, + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Get a ColumnAnalyzerOverride by ID. + + + + Arguments: + column_analyzer_override_id: The numeric ID of the column analyzer + override to retrieve. 
+ synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The requested ColumnAnalyzerOverride. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_get_async( + uri=f"/search/column/analyzer/override/{column_analyzer_override_id}" + ) + + +async def update_column_analyzer_override( + column_analyzer_override_id: str, + request: Dict[str, Any], + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Update a ColumnAnalyzerOverride. + + + + Arguments: + column_analyzer_override_id: The path ID (must match the request body's ID). + request: The updated ColumnAnalyzerOverride. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The updated ColumnAnalyzerOverride. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_put_async( + uri=f"/search/column/analyzer/override/{column_analyzer_override_id}", + body=json.dumps(request), + ) + + +async def list_column_analyzer_overrides( + organization_name: Optional[str] = None, + next_page_token: Optional[str] = None, + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """List ColumnAnalyzerOverrides, optionally filtered by Organization. + + + + Arguments: + organization_name: Optional filter by organization name. + next_page_token: Optional pagination token. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + Page of ColumnAnalyzerOverrides and a nextPageToken if more results exist. 
+ """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + body = {"organizationName": organization_name, "nextPageToken": next_page_token} + body = {k: v for k, v in body.items() if v is not None} + return await client.rest_post_async( + uri="/search/column/analyzer/override/list", body=json.dumps(body) + ) + + +# ---------- Synonym Set ---------- + + +async def create_synonym_set( + request: Dict[str, Any], + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Create a SynonymSet. + + + + `request["definition"]` is a JSON object (the OpenSearch token-filter + definition), not a JSON-encoded string. + + Arguments: + request: SynonymSet body. Must include organizationName, name, definition. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The created SynonymSet. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_post_async( + uri="/search/synonym/set", body=json.dumps(request) + ) + + +async def get_synonym_set( + synonym_set_id: str, + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Get a SynonymSet by ID. + + + + Arguments: + synonym_set_id: The numeric ID of the synonym set to retrieve. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The requested SynonymSet. 
+ """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_get_async(uri=f"/search/synonym/set/{synonym_set_id}") + + +async def update_synonym_set( + synonym_set_id: str, + request: Dict[str, Any], + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Update a SynonymSet. + + + + Arguments: + synonym_set_id: The path ID (must match the request body's ID). + request: The updated SynonymSet. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The updated SynonymSet. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_put_async( + uri=f"/search/synonym/set/{synonym_set_id}", body=json.dumps(request) + ) + + +async def list_synonym_sets( + organization_name: Optional[str] = None, + next_page_token: Optional[str] = None, + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """List SynonymSets, optionally filtered by Organization. + + + + Arguments: + organization_name: Optional filter by organization name. + next_page_token: Optional pagination token. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + Page of SynonymSets and a nextPageToken if more results exist. 
+ """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + body = {"organizationName": organization_name, "nextPageToken": next_page_token} + body = {k: v for k, v in body.items() if v is not None} + return await client.rest_post_async( + uri="/search/synonym/set/list", body=json.dumps(body) + ) + + +# ---------- Search Configuration ---------- + + +async def create_search_configuration( + request: Dict[str, Any], + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Create a SearchConfiguration. + + + + `request["defaultAnalyzer"]` is a JSON object — either a `$ref` reference + `{"$ref": "{organizationName}-{name}"}` to a saved TextAnalyzer or an + inline TextAnalyzer literal. Each entry of + `request["columnAnalyzerOverrides"]` is likewise a `$ref` dict or an + inline ColumnAnalyzerOverride literal — not a bare qualified-name string. + + Arguments: + request: SearchConfiguration body. Must include organizationName and name. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The created SearchConfiguration. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_post_async( + uri="/search/configuration", body=json.dumps(request) + ) + + +async def get_search_configuration( + search_configuration_id: str, + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Get a SearchConfiguration by ID. + + + + Arguments: + search_configuration_id: The numeric ID of the search configuration + to retrieve. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The requested SearchConfiguration. 
+ """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_get_async( + uri=f"/search/configuration/{search_configuration_id}" + ) + + +async def update_search_configuration( + search_configuration_id: str, + request: Dict[str, Any], + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Update a SearchConfiguration. + + + + Arguments: + search_configuration_id: The path ID (must match the request body's ID). + request: The updated SearchConfiguration. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The updated SearchConfiguration. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_put_async( + uri=f"/search/configuration/{search_configuration_id}", + body=json.dumps(request), + ) + + +async def list_search_configurations( + organization_name: Optional[str] = None, + next_page_token: Optional[str] = None, + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """List SearchConfigurations, optionally filtered by Organization. + + + + Arguments: + organization_name: Optional filter by organization name. + next_page_token: Optional pagination token. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + Page of SearchConfigurations and a nextPageToken if more results exist. 
+ """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + body = {"organizationName": organization_name, "nextPageToken": next_page_token} + body = {k: v for k, v in body.items() if v is not None} + return await client.rest_post_async( + uri="/search/configuration/list", body=json.dumps(body) + ) + + +# ---------- Search Configuration Bindings ---------- + + +async def bind_search_config_to_entity( + entity_id: str, + search_configuration_id: str, + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Bind a SearchConfiguration to an entity. Replaces any existing binding. + + + + Arguments: + entity_id: The ID of the entity to bind to. + search_configuration_id: The ID of the SearchConfiguration to bind. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The created SearchConfigBinding. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + body = { + "entityId": entity_id, + "searchConfigurationId": search_configuration_id, + } + return await client.rest_put_async( + uri=f"/entity/{entity_id}/searchconfig/binding", body=json.dumps(body) + ) + + +async def get_search_config_binding( + entity_id: str, + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Get the effective SearchConfigBinding for an entity by walking up the + hierarchy. + + + + Arguments: + entity_id: The ID of the entity whose effective binding to resolve. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + The effective SearchConfigBinding. 
+ """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_get_async(uri=f"/entity/{entity_id}/searchconfig/binding") + + +async def clear_search_config_binding( + entity_id: str, + *, + synapse_client: Optional["Synapse"] = None, +) -> None: + """Clear the SearchConfigBinding on a specific entity. + + + + Arguments: + entity_id: The ID of the entity whose binding to clear. + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + await client.rest_delete_async(uri=f"/entity/{entity_id}/searchconfig/binding") + + +# ---------- Search Queries ---------- +# +# The async search query endpoint (POST /search/query/async/start + +# GET /search/query/async/get/{token}) is exposed through the shared +# AsynchronousCommunicator mixin. Use: +# +# query = SearchIndexQuery(search_index_id=..., search_query=SearchQuery(...)) +# await query.send_job_and_wait_async() +# +# The synchronous autocomplete endpoint stays here because it is not an +# async job. + + +async def autocomplete_search( + request: Dict[str, Any], + *, + synapse_client: Optional["Synapse"] = None, +) -> Dict[str, Any]: + """Synchronous autocomplete search. Caps results at 8. + + + + Arguments: + request: SearchIndexQuery body (must include searchIndexId). + synapse_client: If not passed in and caching was not disabled by + `Synapse.allow_client_caching(False)` this will use the last created + instance from the Synapse class constructor. + + Returns: + SearchQueryResults. 
+ """ + from synapseclient import Synapse + + client = Synapse.get_client(synapse_client=synapse_client) + return await client.rest_post_async( + uri="/search/autocomplete", body=json.dumps(request) + ) + + +__all__: List[str] = [ + "create_text_analyzer", + "get_text_analyzer", + "update_text_analyzer", + "list_text_analyzers", + "create_column_analyzer_override", + "get_column_analyzer_override", + "update_column_analyzer_override", + "list_column_analyzer_overrides", + "create_synonym_set", + "get_synonym_set", + "update_synonym_set", + "list_synonym_sets", + "create_search_configuration", + "get_search_configuration", + "update_search_configuration", + "list_search_configurations", + "bind_search_config_to_entity", + "get_search_config_binding", + "clear_search_config_binding", + "autocomplete_search", +] diff --git a/synapseclient/core/constants/concrete_types.py b/synapseclient/core/constants/concrete_types.py index d01cc38a9..a119c579d 100644 --- a/synapseclient/core/constants/concrete_types.py +++ b/synapseclient/core/constants/concrete_types.py @@ -89,8 +89,12 @@ MATERIALIZED_VIEW = "org.sagebionetworks.repo.model.table.MaterializedView" SUBMISSION_VIEW = "org.sagebionetworks.repo.model.table.SubmissionView" VIRTUAL_TABLE = "org.sagebionetworks.repo.model.table.VirtualTable" +SEARCH_INDEX_ENTITY = "org.sagebionetworks.repo.model.search.table.SearchIndex" DOCKER_REPOSITORY = "org.sagebionetworks.repo.model.docker.DockerRepository" +# Search Management +SEARCH_INDEX_QUERY = "org.sagebionetworks.repo.model.search.table.SearchIndexQuery" + # upload requests MULTIPART_UPLOAD_REQUEST = "org.sagebionetworks.repo.model.file.MultipartUploadRequest" MULTIPART_UPLOAD_COPY_REQUEST = ( diff --git a/synapseclient/models/__init__.py b/synapseclient/models/__init__.py index 0badee2db..c2e46cf96 100644 --- a/synapseclient/models/__init__.py +++ b/synapseclient/models/__init__.py @@ -28,6 +28,29 @@ from synapseclient.models.project_setting import ProjectSetting from 
synapseclient.models.recordset import RecordSet from synapseclient.models.schema_organization import JSONSchema, SchemaOrganization +from synapseclient.models.search_index import SearchIndex +from synapseclient.models.search_management import ( + ColumnAnalyzerOverride, + ColumnAnalyzerOverrideEntry, + FacetRequest, + FacetSortField, + KeyRange, + KeyValues, + SearchConfigBinding, + SearchConfiguration, + SearchFieldValue, + SearchHit, + SearchIndexQuery, + SearchIndexState, + SearchIndexStatus, + SearchQuery, + SearchQueryPart, + SearchQueryType, + SortDirection, + SortField, + SynonymSet, + TextAnalyzer, +) from synapseclient.models.services import FailureStrategy from synapseclient.models.storage_location import ( StorageLocation, @@ -169,6 +192,28 @@ "UploadType", # Project Setting models "ProjectSetting", + # SearchIndex / Search Management models + "SearchIndex", + "SearchIndexQuery", + "SearchIndexStatus", + "SearchIndexState", + "SearchQuery", + "SearchQueryPart", + "SearchQueryType", + "SearchHit", + "SearchFieldValue", + "KeyValues", + "KeyRange", + "FacetRequest", + "FacetSortField", + "SortField", + "SortDirection", + "SearchConfiguration", + "SearchConfigBinding", + "TextAnalyzer", + "ColumnAnalyzerOverride", + "ColumnAnalyzerOverrideEntry", + "SynonymSet", ] # Static methods to expose as functions diff --git a/synapseclient/models/mixins/asynchronous_job.py b/synapseclient/models/mixins/asynchronous_job.py index 3110f892b..c51dd5578 100644 --- a/synapseclient/models/mixins/asynchronous_job.py +++ b/synapseclient/models/mixins/asynchronous_job.py @@ -21,6 +21,7 @@ GRID_RECORD_SET_EXPORT_REQUEST, QUERY_BUNDLE_REQUEST, QUERY_TABLE_CSV_REQUEST, + SEARCH_INDEX_QUERY, SYNCHRONIZE_GRID_REQUEST, TABLE_UPDATE_TRANSACTION_REQUEST, UPLOAD_TO_TABLE_PREVIEW_REQUEST, @@ -45,6 +46,7 @@ QUERY_BUNDLE_REQUEST: "/entity/{entityId}/table/query/async", GRID_CSV_IMPORT_REQUEST: "/grid/import/csv/async", UPLOAD_TO_TABLE_PREVIEW_REQUEST: "/table/upload/csv/preview/async", + 
SEARCH_INDEX_QUERY: "/search/query/async", } diff --git a/synapseclient/models/mixins/table_components.py b/synapseclient/models/mixins/table_components.py index 2d854c444..1eea7b787 100644 --- a/synapseclient/models/mixins/table_components.py +++ b/synapseclient/models/mixins/table_components.py @@ -85,7 +85,7 @@ "DatasetCollection", "SubmissionView", ] -CLASSES_WITH_READ_ONLY_SCHEMA = ["MaterializedView", "VirtualTable"] +CLASSES_WITH_READ_ONLY_SCHEMA = ["MaterializedView", "VirtualTable", "SearchIndex"] PANDAS_TABLE_TYPE = { "floating": "DOUBLE", diff --git a/synapseclient/models/search_index.py b/synapseclient/models/search_index.py new file mode 100644 index 000000000..479626496 --- /dev/null +++ b/synapseclient/models/search_index.py @@ -0,0 +1,271 @@ +"""SearchIndex entity model. + +A SearchIndex is a Synapse entity whose content is defined by a Synapse SQL query +(`defining_sql`). An OpenSearch index is built from the query results, supporting +full-text search, faceted search, and autocomplete. 
+""" + +from collections import OrderedDict +from copy import deepcopy +from dataclasses import dataclass, field, replace +from datetime import date, datetime +from typing import Any, Dict, List, Optional, Protocol, Union + +from typing_extensions import Self + +from synapseclient import Synapse +from synapseclient.core.async_utils import async_to_sync +from synapseclient.core.constants import concrete_types +from synapseclient.core.utils import delete_none_keys +from synapseclient.models.activity import Activity +from synapseclient.models.mixins.access_control import AccessControllable +from synapseclient.models.mixins.table_components import ( + DeleteMixin, + GetMixin, + QueryMixin, + TableBase, + TableStoreMixin, +) +from synapseclient.models.table_components import Column + + +class SearchIndexSynchronousProtocol(Protocol): + """Protocol defining the synchronous interface for SearchIndex operations.""" + + def store( + self, + dry_run: bool = False, + *, + job_timeout: int = 600, + synapse_client: Optional[Synapse] = None, + ) -> "Self": + """Store metadata about a SearchIndex including the annotations.""" + return self + + def get( + self, + include_columns: bool = True, + include_activity: bool = False, + *, + synapse_client: Optional[Synapse] = None, + ) -> "Self": + """Get the metadata about the SearchIndex from Synapse.""" + return self + + def delete(self, *, synapse_client: Optional[Synapse] = None) -> None: + """Delete the SearchIndex from Synapse.""" + return None + + +@dataclass +@async_to_sync +class SearchIndex( + SearchIndexSynchronousProtocol, + AccessControllable, + TableBase, + TableStoreMixin, + DeleteMixin, + GetMixin, + QueryMixin, +): + """ + A SearchIndex is a Synapse entity whose content is defined by a Synapse SQL + query (`defining_sql`). An OpenSearch index is built from the query results, + supporting full-text search, faceted search, and autocomplete. + + The `defining_sql` must reference exactly one table-like entity. 
Multi-entity + JOIN/UNION queries are not supported. Optionally, a `search_configuration_id` + may be supplied to control the analyzer/synonym settings used when building + the index. If not specified, the configuration is resolved by walking up + the entity hierarchy. + + REST API model: + + Attributes: + id: The unique immutable ID for this entity. + name: The name of this entity. + description: The description of this entity. + etag: Synapse OCC etag. + created_on: Date this entity was created. + modified_on: Date this entity was last modified. + created_by: The ID of the user that created this entity. + modified_by: The ID of the user that last modified this entity. + parent_id: The ID of the parent entity. + version_number: The version number issued to this version on the object. + version_label: The version label for this entity. + version_comment: The version comment for this entity. + is_latest_version: If this is the latest version of the object. + columns: (Read Only) Columns derived from `defining_sql`. + defining_sql: The Synapse SQL statement that defines which columns and + rows are indexed. + search_configuration_id: ID of the SearchConfiguration to apply when + building this index. Optional. + annotations: Additional metadata associated with the entity. + activity: Provenance for this entity. + + Example: Create a new SearchIndex. 
+ + ```python + from synapseclient import Synapse + from synapseclient.models import SearchIndex + + syn = Synapse() + syn.login() + + index = SearchIndex( + name="My Search Index", + parent_id="syn12345", + defining_sql="SELECT * FROM syn67890", + ) + index = index.store() + print(f"Created SearchIndex: {index.id}") + ``` + """ + + id: Optional[str] = None + name: Optional[str] = None + description: Optional[str] = None + etag: Optional[str] = field(default=None, compare=False) + created_on: Optional[str] = field(default=None, compare=False) + modified_on: Optional[str] = field(default=None, compare=False) + created_by: Optional[str] = field(default=None, compare=False) + modified_by: Optional[str] = field(default=None, compare=False) + parent_id: Optional[str] = None + version_number: Optional[int] = field(default=None, compare=False) + version_label: Optional[str] = None + version_comment: Optional[str] = None + is_latest_version: Optional[bool] = field(default=None, compare=False) + + columns: Optional[OrderedDict[str, Column]] = field( + default_factory=OrderedDict, compare=False + ) + """(Read Only) Columns of a SearchIndex are derived from the defining SQL.""" + + defining_sql: Optional[str] = None + """The Synapse SQL statement that defines which columns and rows are indexed. + Must reference exactly one entity.""" + + search_configuration_id: Optional[str] = None + """The ID of the SearchConfiguration to apply when building this search + index. 
If not provided, the system will check for a search configuration + binding on the parent project/folder hierarchy, or use platform defaults.""" + + _last_persistent_instance: Optional["SearchIndex"] = field( + default=None, repr=False, compare=False + ) + + annotations: Optional[ + Dict[ + str, + Union[ + List[str], + List[bool], + List[float], + List[int], + List[date], + List[datetime], + ], + ] + ] = field(default_factory=dict, compare=False) + + activity: Optional[Activity] = field(default=None, compare=False) + + @property + def has_changed(self) -> bool: + """Checks if the object has changed since the last persistent instance.""" + return self._last_persistent_instance != self + + def _set_last_persistent_instance(self) -> None: + """Stash the last time this object interacted with Synapse.""" + del self._last_persistent_instance + self._last_persistent_instance = replace(self) + self._last_persistent_instance.activity = ( + replace(self.activity) if self.activity and self.activity.id else None + ) + self._last_persistent_instance.annotations = ( + deepcopy(self.annotations) if self.annotations else {} + ) + + def fill_from_dict( + self, entity: Dict[str, Any], set_annotations: bool = True + ) -> "SearchIndex": + """Populate this dataclass from a Synapse REST API entity dict.""" + self.id = entity.get("id", None) + self.name = entity.get("name", None) + self.description = entity.get("description", None) + self.parent_id = entity.get("parentId", None) + self.etag = entity.get("etag", None) + self.created_on = entity.get("createdOn", None) + self.created_by = entity.get("createdBy", None) + self.modified_on = entity.get("modifiedOn", None) + self.modified_by = entity.get("modifiedBy", None) + self.version_number = entity.get("versionNumber", None) + self.version_label = entity.get("versionLabel", None) + self.version_comment = entity.get("versionComment", None) + self.is_latest_version = entity.get("isLatestVersion", None) + self.defining_sql = 
def to_synapse_request(self) -> Dict[str, Any]:
    """Build the body the Synapse REST API expects for this SearchIndex.

    Returns:
        A dict of the form `{"entity": {...}}`. The inner dict maps the REST
        field names to this object's attributes plus the SearchIndex concrete
        type; both dicts are passed through `delete_none_keys` before being
        returned.
    """
    # (REST key, value) pairs, kept in the order the API body is assembled.
    rest_pairs = (
        ("name", self.name),
        ("description", self.description),
        ("id", self.id),
        ("etag", self.etag),
        ("createdOn", self.created_on),
        ("modifiedOn", self.modified_on),
        ("createdBy", self.created_by),
        ("modifiedBy", self.modified_by),
        ("parentId", self.parent_id),
        ("concreteType", concrete_types.SEARCH_INDEX_ENTITY),
        ("versionNumber", self.version_number),
        ("versionLabel", self.version_label),
        ("versionComment", self.version_comment),
        ("isLatestVersion", self.is_latest_version),
        ("definingSQL", self.defining_sql),
        ("searchConfigurationId", self.search_configuration_id),
    )
    entity_body = dict(rest_pairs)
    delete_none_keys(entity_body)

    wrapped = {"entity": entity_body}
    delete_none_keys(wrapped)
    return wrapped
+ ) + return await super().store_async( + dry_run=dry_run, job_timeout=job_timeout, synapse_client=synapse_client + ) + + async def get_async( + self, + include_columns: bool = True, + include_activity: bool = False, + *, + synapse_client: Optional[Synapse] = None, + ) -> "Self": + """Asynchronously fetch the SearchIndex metadata.""" + return await super().get_async( + include_columns=include_columns, + include_activity=include_activity, + synapse_client=synapse_client, + ) + + async def delete_async(self, *, synapse_client: Optional[Synapse] = None) -> None: + """Asynchronously delete this SearchIndex from Synapse.""" + await super().delete_async(synapse_client=synapse_client) diff --git a/synapseclient/models/search_management.py b/synapseclient/models/search_management.py new file mode 100644 index 000000000..8fcd531c2 --- /dev/null +++ b/synapseclient/models/search_management.py @@ -0,0 +1,894 @@ +"""Search management dataclasses. + +These dataclasses model the org-level search management resources used by +SearchIndex entities: TextAnalyzer, ColumnAnalyzerOverride, SynonymSet, +SearchConfiguration, and SearchConfigBinding. + +Each resource belongs to an Organization and is referenced by qualified name +(`{organizationName}-{name}`). Resources are publicly readable; create/update +operations are restricted to Sage Bionetworks employees. 
# ---------- Enums ----------
# Every enum below also subclasses `str`, so each member is a plain string
# equal to its value.


class SearchIndexState(str, Enum):
    """The state of a SearchIndex's OpenSearch index."""

    CREATING = "CREATING"
    ACTIVE = "ACTIVE"
    FAILED = "FAILED"


class SearchQueryPart(str, Enum):
    """Response parts that can be requested for a search query.

    These are values for the `responseParts` field on a SearchIndexQuery.
    HITS is the default part; the other members are optional additions.
    """

    HITS = "HITS"
    FACETS = "FACETS"
    TOTAL_HITS = "TOTAL_HITS"
    SELECT_COLUMNS = "SELECT_COLUMNS"


class SearchQueryType(str, Enum):
    """The type of full-text query to execute against a search index.

    REST:
    """

    SIMPLE_QUERY_STRING = "SIMPLE_QUERY_STRING"
    """Supports +, -, |, quotes, ~, *, () operators. Best for user-facing
    search boxes."""

    MATCH = "MATCH"
    """Standard full-text match. Best for programmatic queries."""

    MULTI_MATCH = "MULTI_MATCH"
    """Matches across multiple fields with configurable boost."""

    MATCH_PHRASE = "MATCH_PHRASE"
    """Exact phrase matching. Terms must appear in order and adjacent."""

    PREFIX = "PREFIX"
    """Prefix matching for autocomplete and type-ahead."""

    WILDCARD = "WILDCARD"
    """Supports * and ? wildcards. Use sparingly."""

    MATCH_ALL = "MATCH_ALL"
    """Returns all documents. Used automatically when queryText is null
    or empty."""


class SortDirection(str, Enum):
    """Sort direction for search results.

    Used both for ordering hits and for ordering facet values.

    REST:
    """

    ASC = "ASC"
    DESC = "DESC"


class FacetSortField(str, Enum):
    """Field used to organize facet values in a terms aggregation.

    REST:
    """

    COUNT = "COUNT"
    """Sort by bucket document count. Maps to OpenSearch '_count'."""

    KEY = "KEY"
    """Sort by bucket key value. Maps to OpenSearch '_key'."""
@dataclass
class TextAnalyzer:
    """Named, shareable OpenSearch
    [custom analyzer](https://docs.opensearch.org/latest/analyzers/custom-analyzer/).

    `settings` holds only the *contents of* the `settings.analysis` block of
    an OpenSearch
    [create-index](https://docs.opensearch.org/latest/api-reference/index-apis/create-index/)
    request body. It is neither the wrapping
    `{"settings": {"analysis": ...}}` envelope nor the full create-index
    body. At index-build time Synapse resolves
    `{"$ref": "{organizationName}-{name}"}` entries (allowed only inside the
    `filter` registry map) against existing SynonymSets; everything else is
    passed through to AOSS verbatim.

    Each record exposes a single externally addressable analyzer: the inner
    `analyzer` map must declare an entry named `default` and may optionally
    add a second entry named `default_search` (the asymmetric edge_ngram
    autocomplete pattern). Curators needing more analyzers create more
    TextAnalyzer records; each one is shareable across SearchConfigurations.

    Attributes:
        id: Identifier of the analyzer.
        organization_name: Name of the owning organization.
        name: Name of the analyzer within the organization.
        description: Free-text description.
        settings: Required. JSON object with the contents of the OpenSearch
            `settings.analysis` block. Allowed root keys: `char_filter`,
            `tokenizer`, `filter`, `analyzer`. The inner `analyzer` map must
            declare `default` and may declare `default_search`; any other
            entry is rejected. Cross-resource references use
            `{"$ref": "{organizationName}-{name}"}` inside the `filter`
            registry map (not in chain arrays) and must resolve to an
            existing SynonymSet.
        etag: Etag of the record.
        created_on: Creation timestamp.
        created_by: Creator.
        modified_on: Last-modification timestamp.
        modified_by: Last modifier.
    """

    id: Optional[str] = None
    organization_name: Optional[str] = None
    name: Optional[str] = None
    description: Optional[str] = None
    settings: Optional[Dict[str, Any]] = None
    etag: Optional[str] = None
    created_on: Optional[str] = None
    created_by: Optional[str] = None
    modified_on: Optional[str] = None
    modified_by: Optional[str] = None

    @property
    def qualified_name(self) -> Optional[str]:
        """The qualified name '{organizationName}-{name}' used to reference
        this analyzer from a SearchConfiguration, or None when either part
        is missing."""
        if not (self.organization_name and self.name):
            return None
        return f"{self.organization_name}-{self.name}"

    @staticmethod
    def ref(qualified_name: str) -> Dict[str, str]:
        """Wrap a qualified name as `{"$ref": qualified_name}`, suitable for a
        SearchConfiguration `defaultAnalyzer` or a
        ColumnAnalyzerOverrideEntry `analyzer`."""
        reference = {"$ref": qualified_name}
        return reference

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this instance from a REST response dict and return it.

        Keys missing from `data` leave the matching attribute set to None.
        """
        attribute_to_key = (
            ("id", "id"),
            ("organization_name", "organizationName"),
            ("name", "name"),
            ("description", "description"),
            ("settings", "settings"),
            ("etag", "etag"),
            ("created_on", "createdOn"),
            ("created_by", "createdBy"),
            ("modified_on", "modifiedOn"),
            ("modified_by", "modifiedBy"),
        )
        for attribute, rest_key in attribute_to_key:
            setattr(self, attribute, data.get(rest_key))
        return self

    def to_synapse_request(self) -> Dict[str, Any]:
        """Build the REST request body, cleaned with `delete_none_keys`."""
        request: Dict[str, Any] = dict(
            id=self.id,
            organizationName=self.organization_name,
            name=self.name,
            description=self.description,
            settings=self.settings,
            etag=self.etag,
        )
        delete_none_keys(request)
        return request
@dataclass
class ColumnAnalyzerOverrideEntry:
    """Analyzer override for a single column.

    The referenced TextAnalyzer's `analyzer.default` entry drives index-time
    analysis; if it also declares `analyzer.default_search`, that entry
    drives search-time analysis.

    Attributes:
        column_name: Name of the column the override applies to.
        analyzer: The TextAnalyzer to bind to the column. Either a `$ref`
            dict `{"$ref": "{organizationName}-{name}"}` pointing at a saved
            TextAnalyzer (preferred, supports reuse) or an inline
            TextAnalyzer literal. A bare qualified-name string is normalized
            to a `$ref` dict when serialized.
    """

    column_name: Optional[str] = None
    analyzer: Optional[Union[Dict[str, Any], str]] = None

    @classmethod
    def from_ref(
        cls, column_name: str, qualified_name: str
    ) -> "ColumnAnalyzerOverrideEntry":
        """Create an entry whose analyzer is a `$ref` to a saved TextAnalyzer."""
        reference = {"$ref": qualified_name}
        return cls(column_name=column_name, analyzer=reference)

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this entry from a REST dict and return it."""
        self.column_name = data.get("columnName")
        self.analyzer = data.get("analyzer")
        return self

    def to_synapse_request(self) -> Dict[str, Any]:
        """Build the REST body; a bare string analyzer becomes a `$ref` dict."""
        analyzer_value = self.analyzer
        if isinstance(analyzer_value, str):
            analyzer_value = {"$ref": analyzer_value}
        request = {"columnName": self.column_name, "analyzer": analyzer_value}
        delete_none_keys(request)
        return request


@dataclass
class ColumnAnalyzerOverride:
    """Shared, named resource holding per-column analyzer override entries.

    Attributes:
        id: Identifier of the resource.
        organization_name: Name of the owning organization.
        name: Name of the resource within the organization.
        description: Free-text description.
        overrides: The per-column ColumnAnalyzerOverrideEntry items.
        etag: Etag of the record.
        created_on: Creation timestamp.
        created_by: Creator.
        modified_on: Last-modification timestamp.
        modified_by: Last modifier.
    """

    id: Optional[str] = None
    organization_name: Optional[str] = None
    name: Optional[str] = None
    description: Optional[str] = None
    overrides: Optional[List[ColumnAnalyzerOverrideEntry]] = field(default_factory=list)
    etag: Optional[str] = None
    created_on: Optional[str] = None
    created_by: Optional[str] = None
    modified_on: Optional[str] = None
    modified_by: Optional[str] = None

    @property
    def qualified_name(self) -> Optional[str]:
        """'{organizationName}-{name}', or None when either part is missing."""
        if not (self.organization_name and self.name):
            return None
        return f"{self.organization_name}-{self.name}"

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this instance from a REST response dict and return it.

        A missing or null `overrides` value yields an empty list.
        """
        self.id = data.get("id")
        self.organization_name = data.get("organizationName")
        self.name = data.get("name")
        self.description = data.get("description")

        parsed_entries = []
        for raw_entry in data.get("overrides", []) or []:
            parsed_entries.append(ColumnAnalyzerOverrideEntry().fill_from_dict(raw_entry))
        self.overrides = parsed_entries

        self.etag = data.get("etag")
        self.created_on = data.get("createdOn")
        self.created_by = data.get("createdBy")
        self.modified_on = data.get("modifiedOn")
        self.modified_by = data.get("modifiedBy")
        return self

    def to_synapse_request(self) -> Dict[str, Any]:
        """Build the REST request body, cleaned with `delete_none_keys`.

        An empty `overrides` list is sent as None, like any other unset field.
        """
        serialized_overrides = None
        if self.overrides:
            serialized_overrides = [
                entry.to_synapse_request() for entry in self.overrides
            ]
        request: Dict[str, Any] = dict(
            id=self.id,
            organizationName=self.organization_name,
            name=self.name,
            description=self.description,
            overrides=serialized_overrides,
            etag=self.etag,
        )
        delete_none_keys(request)
        return request
@dataclass
class SynonymSet:
    """Shareable OpenSearch `synonym_graph` (or legacy `synonym`) token filter.

    A TextAnalyzer references it from its `settings.filter` registry map via
    `{"$ref": "{organizationName}-{name}"}`.

    Attributes:
        id: Identifier of the synonym set.
        organization_name: Name of the owning organization.
        name: Name of the synonym set within the organization.
        description: Free-text description.
        definition: Required. The full OpenSearch token filter definition as
            a JSON object. Must include `type` of `synonym` or
            `synonym_graph`. Synonyms are supplied inline via the `synonyms`
            array using OpenSearch's native syntax: `a, b, c` for equivalent
            (bidirectional), `a, b => c, d` for explicit (directional).
        etag: Etag of the record.
        created_on: Creation timestamp.
        created_by: Creator.
        modified_on: Last-modification timestamp.
        modified_by: Last modifier.
    """

    id: Optional[str] = None
    organization_name: Optional[str] = None
    name: Optional[str] = None
    description: Optional[str] = None
    definition: Optional[Dict[str, Any]] = None
    etag: Optional[str] = None
    created_on: Optional[str] = None
    created_by: Optional[str] = None
    modified_on: Optional[str] = None
    modified_by: Optional[str] = None

    @property
    def qualified_name(self) -> Optional[str]:
        """'{organizationName}-{name}', or None when either part is missing."""
        if not (self.organization_name and self.name):
            return None
        return f"{self.organization_name}-{self.name}"

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this instance from a REST response dict and return it.

        Keys missing from `data` leave the matching attribute set to None.
        """
        attribute_to_key = (
            ("id", "id"),
            ("organization_name", "organizationName"),
            ("name", "name"),
            ("description", "description"),
            ("definition", "definition"),
            ("etag", "etag"),
            ("created_on", "createdOn"),
            ("created_by", "createdBy"),
            ("modified_on", "modifiedOn"),
            ("modified_by", "modifiedBy"),
        )
        for attribute, rest_key in attribute_to_key:
            setattr(self, attribute, data.get(rest_key))
        return self

    def to_synapse_request(self) -> Dict[str, Any]:
        """Build the REST request body, cleaned with `delete_none_keys`."""
        request: Dict[str, Any] = dict(
            id=self.id,
            organizationName=self.organization_name,
            name=self.name,
            description=self.description,
            definition=self.definition,
            etag=self.etag,
        )
        delete_none_keys(request)
        return request
@dataclass
class SearchConfiguration:
    """Reusable search configuration resource.

    Points at the default TextAnalyzer for the index and lists per-column
    overrides. Synonyms are not wired here: a TextAnalyzer that wants
    synonyms references a SynonymSet directly via
    `{"$ref": "{org}-{name}"}` inside its own `settings.filter` registry map.

    Asymmetric index-time / search-time analysis is expressed inside the
    chosen TextAnalyzer's settings (declare both `analyzer.default` and
    `analyzer.default_search`), not by splitting the configuration into two
    fields.

    Attributes:
        id: Identifier of the configuration.
        organization_name: Name of the owning organization.
        name: Name of the configuration within the organization.
        description: Free-text description.
        default_analyzer: Optional. The TextAnalyzer that supplies this
            index's `analysis.analyzer.default` slot. Either a `$ref` dict
            `{"$ref": "{organizationName}-{name}"}` pointing at a saved
            TextAnalyzer (preferred, supports reuse) or an inline
            TextAnalyzer literal. A bare qualified-name string is normalized
            to a `$ref` dict on serialize. If the chosen TextAnalyzer also
            declares `analyzer.default_search`, that becomes the index's
            `analysis.analyzer.default_search`. If omitted, each column
            falls back to the system default analyzer for its data type.
        column_analyzer_overrides: Optional ordered list of
            ColumnAnalyzerOverride entries. Each entry is a `$ref` dict
            pointing at a saved ColumnAnalyzerOverride, an inline
            ColumnAnalyzerOverride literal (with its own `overrides` list),
            or a bare qualified-name string that is normalized to a `$ref`
            dict on serialize.
        etag: Etag of the record.
        created_on: Creation timestamp.
        created_by: Creator.
        modified_on: Last-modification timestamp.
        modified_by: Last modifier.
    """

    id: Optional[str] = None
    organization_name: Optional[str] = None
    name: Optional[str] = None
    description: Optional[str] = None
    default_analyzer: Optional[Union[Dict[str, Any], str]] = None
    column_analyzer_overrides: Optional[List[Union[Dict[str, Any], str]]] = field(
        default_factory=list
    )
    etag: Optional[str] = None
    created_on: Optional[str] = None
    created_by: Optional[str] = None
    modified_on: Optional[str] = None
    modified_by: Optional[str] = None

    @property
    def qualified_name(self) -> Optional[str]:
        """'{organizationName}-{name}', or None when either part is missing."""
        if not (self.organization_name and self.name):
            return None
        return f"{self.organization_name}-{self.name}"

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this instance from a REST response dict and return it.

        A missing or null `columnAnalyzerOverrides` value yields an empty list.
        """
        self.id = data.get("id")
        self.organization_name = data.get("organizationName")
        self.name = data.get("name")
        self.description = data.get("description")
        self.default_analyzer = data.get("defaultAnalyzer")
        raw_overrides = data.get("columnAnalyzerOverrides", [])
        self.column_analyzer_overrides = raw_overrides or []
        self.etag = data.get("etag")
        self.created_on = data.get("createdOn")
        self.created_by = data.get("createdBy")
        self.modified_on = data.get("modifiedOn")
        self.modified_by = data.get("modifiedBy")
        return self

    def to_synapse_request(self) -> Dict[str, Any]:
        """Build the REST request body, cleaned with `delete_none_keys`.

        Bare qualified-name strings, in `default_analyzer` and in each
        override entry, are wrapped as `{"$ref": name}`; dicts pass through
        untouched. An empty override list is sent as None.
        """

        def as_reference(value: Any) -> Any:
            # Only plain strings are wrapped; dicts and None are left alone.
            if isinstance(value, str):
                return {"$ref": value}
            return value

        serialized_overrides: Optional[List[Dict[str, Any]]] = None
        if self.column_analyzer_overrides:
            serialized_overrides = [
                as_reference(entry) for entry in self.column_analyzer_overrides
            ]

        request: Dict[str, Any] = dict(
            id=self.id,
            organizationName=self.organization_name,
            name=self.name,
            description=self.description,
            defaultAnalyzer=as_reference(self.default_analyzer),
            columnAnalyzerOverrides=serialized_overrides,
            etag=self.etag,
        )
        delete_none_keys(request)
        return request
@dataclass
class SearchConfigBinding:
    """Binding that attaches a SearchConfiguration to an entity.

    The effective configuration for an entity is resolved by walking up the
    hierarchy (entity -> folder -> project).

    Attributes:
        bind_id: Identifier of the binding.
        search_configuration_id: ID of the bound SearchConfiguration.
        object_id: ID of the object the configuration is bound to.
        object_type: Type of the bound object.
        created_by: Creator of the binding.
        created_on: Creation timestamp of the binding.
    """

    bind_id: Optional[str] = None
    search_configuration_id: Optional[str] = None
    object_id: Optional[str] = None
    object_type: Optional[str] = None
    created_by: Optional[str] = None
    created_on: Optional[str] = None

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this binding from a REST response dict and return it."""
        attribute_to_key = (
            ("bind_id", "bindId"),
            ("search_configuration_id", "searchConfigurationId"),
            ("object_id", "objectId"),
            ("object_type", "objectType"),
            ("created_by", "createdBy"),
            ("created_on", "createdOn"),
        )
        for attribute, rest_key in attribute_to_key:
            setattr(self, attribute, data.get(rest_key))
        return self


# ---------- Search Index Status / Query ----------


@dataclass
class SearchIndexStatus:
    """Build status of a SearchIndex's OpenSearch index.

    Attributes:
        search_index_id: ID of the SearchIndex the status belongs to.
        state: Current state as a SearchIndexState, or None when absent.
        changed_on: Timestamp of the last change.
        error_message: Error message reported with the status, if any.
    """

    search_index_id: Optional[str] = None
    state: Optional[SearchIndexState] = None
    changed_on: Optional[str] = None
    error_message: Optional[str] = None

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this status from a REST response dict and return it.

        A non-empty `state` string is converted to a SearchIndexState member.
        """
        self.search_index_id = data.get("searchIndexId")
        raw_state = data.get("state")
        if raw_state:
            self.state = SearchIndexState(raw_state)
        else:
            self.state = None
        self.changed_on = data.get("changedOn")
        self.error_message = data.get("errorMessage")
        return self
@dataclass
class KeyValues:
    """Multi-value filter (IN clause) for a column.

    Attributes:
        key: The column name to filter on.
        values: The values to match.
        not_: Excludes matching values when enabled; defaults to false.
            Serialized as `not` (a Python keyword).
    """

    key: Optional[str] = None
    values: Optional[List[str]] = field(default_factory=list)
    not_: Optional[bool] = None

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this filter from a REST dict and return it.

        A missing or null `values` entry yields an empty list.
        """
        self.key = data.get("key")
        raw_values = data.get("values", [])
        self.values = raw_values or []
        self.not_ = data.get("not")
        return self

    def to_synapse_request(self) -> Dict[str, Any]:
        """Build the REST body, cleaned with `delete_none_keys`.

        An empty `values` list is sent as None.
        """
        request: Dict[str, Any] = {}
        request["key"] = self.key
        request["values"] = self.values if self.values else None
        request["not"] = self.not_
        delete_none_keys(request)
        return request


@dataclass
class KeyRange:
    """Range filter on a column.

    At least one of `min_value` or `max_value` must be set.

    Attributes:
        key: The column name to filter on.
        min_value: Inclusive minimum boundary. Serialized as `min`.
        max_value: Inclusive maximum boundary. Serialized as `max`.
    """

    key: Optional[str] = None
    min_value: Optional[str] = None
    max_value: Optional[str] = None

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this filter from a REST dict and return it."""
        for attribute, rest_key in (
            ("key", "key"),
            ("min_value", "min"),
            ("max_value", "max"),
        ):
            setattr(self, attribute, data.get(rest_key))
        return self

    def to_synapse_request(self) -> Dict[str, Any]:
        """Build the REST body, cleaned with `delete_none_keys`."""
        request: Dict[str, Any] = dict(
            key=self.key,
            min=self.min_value,
            max=self.max_value,
        )
        delete_none_keys(request)
        return request
@dataclass
class FacetRequest:
    """Column to aggregate as a facet.

    Attributes:
        column_name: The name of the column to aggregate.
        max_value_count: Maximum number of facet values to return.
            Default: 25.
        sort_field: Field used to organize facet values.
        sort_direction: Sort direction for facet values.
    """

    column_name: Optional[str] = None
    max_value_count: Optional[int] = None
    sort_field: Optional[FacetSortField] = None
    sort_direction: Optional[SortDirection] = None

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this request from a REST dict and return it.

        Non-empty `sortField` / `sortDirection` strings are converted to
        their enum members; otherwise the attribute is None.
        """
        self.column_name = data.get("columnName")
        self.max_value_count = data.get("maxValueCount")

        raw_sort_field = data.get("sortField")
        if raw_sort_field:
            self.sort_field = FacetSortField(raw_sort_field)
        else:
            self.sort_field = None

        raw_direction = data.get("sortDirection")
        if raw_direction:
            self.sort_direction = SortDirection(raw_direction)
        else:
            self.sort_direction = None
        return self

    def to_synapse_request(self) -> Dict[str, Any]:
        """Build the REST body, cleaned with `delete_none_keys`.

        Enum attributes are serialized through their `.value`.
        """
        sort_field_value = self.sort_field.value if self.sort_field else None
        direction_value = self.sort_direction.value if self.sort_direction else None
        request: Dict[str, Any] = dict(
            columnName=self.column_name,
            maxValueCount=self.max_value_count,
            sortField=sort_field_value,
            sortDirection=direction_value,
        )
        delete_none_keys(request)
        return request


@dataclass
class SortField:
    """Sort specification for search results.

    Attributes:
        column_name: The column name to sort by, or '_score' for relevance.
        direction: The direction to apply when ordering results.
    """

    column_name: Optional[str] = None
    direction: Optional[SortDirection] = None

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this sort specification from a REST dict and return it."""
        self.column_name = data.get("columnName")
        raw_direction = data.get("direction")
        if raw_direction:
            self.direction = SortDirection(raw_direction)
        else:
            self.direction = None
        return self

    def to_synapse_request(self) -> Dict[str, Any]:
        """Build the REST body, cleaned with `delete_none_keys`."""
        direction_value = self.direction.value if self.direction else None
        request: Dict[str, Any] = dict(
            columnName=self.column_name,
            direction=direction_value,
        )
        delete_none_keys(request)
        return request
@dataclass
class SearchQuery:
    """Structured full-text query against a SearchIndex's OpenSearch index.

    Attributes:
        query_type: The type of full-text query to execute. Required by the
            REST API.
        query_text: The search text. Null or empty matches all documents.
        query_fields: Field names supporting boost notation (e.g.,
            'studyName^3'). Defaults to all indexed fields when empty.
        terms_filters: Multi-value filters (IN clause).
        range_filters: Range filters with min and max.
        exists_filters: Columns that must have a non-null value.
        not_exists_filters: Columns that must be null or missing.
        fuzziness: Typo tolerance: 'AUTO', '0', '1', or '2'.
        facet_requests: Columns to aggregate as facets.
        return_fields: Columns included in results; all columns returned
            when empty.
        sort: Sort order. Default: relevance descending.
        highlight: Returns highlighted snippets; defaults to false.
        offset: Zero-based pagination offset. Default: 0.
        limit: Results per page. Default: 25, maximum: 100.
    """

    query_type: Optional[SearchQueryType] = None
    query_text: Optional[str] = None
    query_fields: Optional[List[str]] = field(default_factory=list)
    terms_filters: Optional[List[KeyValues]] = field(default_factory=list)
    range_filters: Optional[List[KeyRange]] = field(default_factory=list)
    exists_filters: Optional[List[str]] = field(default_factory=list)
    not_exists_filters: Optional[List[str]] = field(default_factory=list)
    fuzziness: Optional[str] = None
    facet_requests: Optional[List[FacetRequest]] = field(default_factory=list)
    return_fields: Optional[List[str]] = field(default_factory=list)
    sort: Optional[List[SortField]] = field(default_factory=list)
    highlight: Optional[bool] = None
    offset: Optional[int] = None
    limit: Optional[int] = None

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this query from a REST dict and return it.

        Missing or null list-valued keys become empty lists; nested filter,
        facet and sort dicts are converted to their dataclasses.
        """
        raw_type = data.get("queryType")
        if raw_type:
            self.query_type = SearchQueryType(raw_type)
        else:
            self.query_type = None
        self.query_text = data.get("queryText")
        self.query_fields = data.get("queryFields", []) or []

        parsed_terms = []
        for raw_filter in data.get("termsFilters", []) or []:
            parsed_terms.append(KeyValues().fill_from_dict(raw_filter))
        self.terms_filters = parsed_terms

        parsed_ranges = []
        for raw_filter in data.get("rangeFilters", []) or []:
            parsed_ranges.append(KeyRange().fill_from_dict(raw_filter))
        self.range_filters = parsed_ranges

        self.exists_filters = data.get("existsFilters", []) or []
        self.not_exists_filters = data.get("notExistsFilters", []) or []
        self.fuzziness = data.get("fuzziness")

        parsed_facets = []
        for raw_facet in data.get("facetRequests", []) or []:
            parsed_facets.append(FacetRequest().fill_from_dict(raw_facet))
        self.facet_requests = parsed_facets

        self.return_fields = data.get("returnFields", []) or []

        parsed_sort = []
        for raw_sort in data.get("sort", []) or []:
            parsed_sort.append(SortField().fill_from_dict(raw_sort))
        self.sort = parsed_sort

        self.highlight = data.get("highlight")
        self.offset = data.get("offset")
        self.limit = data.get("limit")
        return self

    def to_synapse_request(self) -> Dict[str, Any]:
        """Build the REST body, cleaned with `delete_none_keys`.

        Empty lists are sent as None; nested dataclasses are serialized via
        their own `to_synapse_request`.
        """

        def serialize_all(items: Any) -> Optional[List[Dict[str, Any]]]:
            # Empty or unset collections are reported as None.
            if not items:
                return None
            return [item.to_synapse_request() for item in items]

        def list_or_none(items: Any) -> Any:
            return items if items else None

        request: Dict[str, Any] = dict(
            queryType=self.query_type.value if self.query_type else None,
            queryText=self.query_text,
            queryFields=list_or_none(self.query_fields),
            termsFilters=serialize_all(self.terms_filters),
            rangeFilters=serialize_all(self.range_filters),
            existsFilters=list_or_none(self.exists_filters),
            notExistsFilters=list_or_none(self.not_exists_filters),
            fuzziness=self.fuzziness,
            facetRequests=serialize_all(self.facet_requests),
            returnFields=list_or_none(self.return_fields),
            sort=serialize_all(self.sort),
            highlight=self.highlight,
            offset=self.offset,
            limit=self.limit,
        )
        delete_none_keys(request)
        return request
@dataclass
class SearchFieldValue:
    """Name/value pair returned in a SearchHit's `fields` or `highlights`.

    Attributes:
        name: The column name.
        value: The column value.
    """

    name: Optional[str] = None
    value: Optional[str] = None

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this pair from a REST dict and return it."""
        self.name, self.value = data.get("name"), data.get("value")
        return self


@dataclass
class SearchHit:
    """Single matching document in a SearchQueryResults response.

    Attributes:
        row_id: The row ID from the source table.
        row_version: The row version from the source table.
        score: The relevance score for this hit.
        fields: Column name/value pairs for the requested return fields.
        highlights: Column name/highlighted snippet pairs, if highlight was
            requested.
    """

    row_id: Optional[int] = None
    row_version: Optional[int] = None
    score: Optional[float] = None
    fields: Optional[List[SearchFieldValue]] = field(default_factory=list)
    highlights: Optional[List[SearchFieldValue]] = field(default_factory=list)

    def fill_from_dict(self, data: Dict[str, Any]) -> "Self":
        """Populate this hit from a REST dict and return it.

        Missing or null `fields` / `highlights` become empty lists.
        """
        self.row_id = data.get("rowId")
        self.row_version = data.get("rowVersion")
        self.score = data.get("score")

        parsed_fields = []
        for raw_pair in data.get("fields", []) or []:
            parsed_fields.append(SearchFieldValue().fill_from_dict(raw_pair))
        self.fields = parsed_fields

        parsed_highlights = []
        for raw_pair in data.get("highlights", []) or []:
            parsed_highlights.append(SearchFieldValue().fill_from_dict(raw_pair))
        self.highlights = parsed_highlights
        return self
@dataclass
class SearchIndexQuery(AsynchronousCommunicator):
    """Async request that queries a SearchIndex's OpenSearch index.

    Built on `AsynchronousCommunicator`: `send_job_and_wait_async()` submits
    the job, polls the Synapse async job service, and fills the response
    fields (`hits`, `total_hits`, `select_columns`, `facets`, `offset`) on
    this same instance.

    Attributes:
        concrete_type: The Synapse concrete type identifying this async
            request.
        search_index_id: The ID of the SearchIndex entity to query.
        search_query: The structured SearchQuery to execute against the
            index.
        response_parts: Optional list of additional response parts beyond
            default HITS.
        hits: Response: matching documents. Populated after
            `send_job_and_wait_async()`.
        total_hits: Response: total number of matching documents. Populated
            when SearchQueryPart.TOTAL_HITS is requested.
        select_columns: Response: columns represented in each hit's fields,
            in SELECT-clause order. Populated when
            SearchQueryPart.SELECT_COLUMNS is requested.
        facets: Response: facet aggregation results. Populated when
            SearchQueryPart.FACETS is requested. Kept as raw dicts because
            FacetColumnResult has multiple polymorphic shapes.
        offset: Response: zero-based pagination offset echoed from the
            request.

    Example: Run a search query.

        ```python
        import asyncio
        from synapseclient import Synapse
        from synapseclient.models import (
            SearchIndexQuery, SearchQuery, SearchQueryPart, SearchQueryType,
        )

        async def main():
            Synapse().login()
            query = SearchIndexQuery(
                search_index_id="syn22806626",
                search_query=SearchQuery(
                    query_type=SearchQueryType.SIMPLE_QUERY_STRING,
                    query_text="alzheimer",
                    limit=10,
                ),
                response_parts=[SearchQueryPart.HITS, SearchQueryPart.TOTAL_HITS],
            )
            await query.send_job_and_wait_async()
            print(query.total_hits, len(query.hits))

        asyncio.run(main())
        ```
    """

    concrete_type: str = concrete_types.SEARCH_INDEX_QUERY
    search_index_id: Optional[str] = None
    search_query: Optional[SearchQuery] = None
    response_parts: Optional[List[SearchQueryPart]] = field(default_factory=list)
    hits: Optional[List[SearchHit]] = field(default_factory=list)
    total_hits: Optional[int] = None
    select_columns: Optional[List[SelectColumn]] = field(default_factory=list)
    facets: Optional[List[Dict[str, Any]]] = field(default_factory=list)
    offset: Optional[int] = None

    def to_synapse_request(self) -> Dict[str, Any]:
        """Build the SearchIndexQuery body for the async-job /start endpoint.

        Only request fields are serialized; the body is cleaned with
        `delete_none_keys`. An unset query or an empty `response_parts` list
        is sent as None.
        """
        serialized_query = None
        if self.search_query:
            serialized_query = self.search_query.to_synapse_request()

        serialized_parts = None
        if self.response_parts:
            serialized_parts = [part.value for part in self.response_parts]

        request: Dict[str, Any] = dict(
            concreteType=self.concrete_type,
            searchIndexId=self.search_index_id,
            searchQuery=serialized_query,
            responseParts=serialized_parts,
        )
        delete_none_keys(request)
        return request

    def fill_from_dict(self, synapse_response: Dict[str, Any]) -> "Self":
        """Populate the response fields from a SearchQueryResults body.

        Called by `AsynchronousCommunicator.send_job_and_wait_async()` once
        the async job completes. Request fields are left untouched.
        """
        parsed_hits = []
        for raw_hit in synapse_response.get("hits", []) or []:
            parsed_hits.append(SearchHit().fill_from_dict(raw_hit))
        self.hits = parsed_hits

        self.total_hits = synapse_response.get("totalHits")

        parsed_columns = []
        for raw_column in synapse_response.get("selectColumns", []) or []:
            parsed_columns.append(SelectColumn.fill_from_dict(raw_column))
        self.select_columns = parsed_columns

        # Facets stay as the raw dicts returned by the service.
        self.facets = synapse_response.get("facets", []) or []
        self.offset = synapse_response.get("offset")
        return self
+ +These helpers idempotently upsert the chain of resources a SearchIndex needs to +have synonyms applied at search time: + + SynonymSet ──► TextAnalyzer (clone of a system analyzer + $ref to the + SynonymSet, wired into `analyzer.default_search`) + ──► SearchConfiguration (default_analyzer = the cloned analyzer, + plus optional per-column system-analyzer + overrides) + +The clone deliberately leaves `analyzer.default` (index-time) untouched and +synthesizes `analyzer.default_search` (search-time) as a copy of the default +filter chain with the `{"$ref": ""}` filter appended. Synonyms are +therefore expanded at query time only and do not bake into the index. +""" + +import copy +from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple + +from synapseclient.api.search_services import ( + create_search_configuration, + create_synonym_set, + create_text_analyzer, + list_search_configurations, + list_synonym_sets, + list_text_analyzers, +) +from synapseclient.models.search_management import ( + ColumnAnalyzerOverride, + ColumnAnalyzerOverrideEntry, + SearchConfiguration, + SynonymSet, + TextAnalyzer, +) +from synapseclient.models.table_components import ColumnType + +if TYPE_CHECKING: + from synapseclient import Synapse + +EQUIVALENT = "equivalent" +EXPLICIT = "explicit" + +SYSTEM_ANALYZER_ORG = "org.sagebionetworks" + +#: Default name of the filter-registry slot the synonym `$ref` is wired into +#: when cloning a system analyzer. Matches the convention used in the local +#: testing scripts and the Java `synonym_graph` filter examples. +DEFAULT_SYNONYM_FILTER_SLOT = "synonyms" + +SynonymRule = Tuple[str, List[str]] + + +# ---------- ColumnType → system analyzer mapping ---------- +# +# Python mirror of `ColumnTypeToOpenSearchMapping` in the repo: +# `Synapse-Repository-Services/services/repository-managers/src/main/java/` +# `org/sagebionetworks/repo/manager/search/ColumnTypeToOpenSearchMapping.java`. 
+# Each entry records the Synapse `ColumnType`, the OpenSearch field category, +# and the qualified name of the system TextAnalyzer used as the per-type +# default. Keep in sync with the Java enum. + +#: Names of the bootstrapped system TextAnalyzers (see the Java +#: `TextAnalyzerBootstrapper`). Qualified form is `org.sagebionetworks-`. +SYSTEM_ANALYZER_SCIENTIFIC = f"{SYSTEM_ANALYZER_ORG}-SCIENTIFIC" +SYSTEM_ANALYZER_STANDARD = f"{SYSTEM_ANALYZER_ORG}-STANDARD" +SYSTEM_ANALYZER_KEYWORD = f"{SYSTEM_ANALYZER_ORG}-KEYWORD" + + +COLUMN_TYPE_TO_DEFAULT_ANALYZER_QNAME: Dict[ColumnType, str] = { + # Text categories — analyzed with SCIENTIFIC by default. + ColumnType.STRING: SYSTEM_ANALYZER_SCIENTIFIC, + ColumnType.STRING_LIST: SYSTEM_ANALYZER_SCIENTIFIC, + ColumnType.MEDIUMTEXT: SYSTEM_ANALYZER_SCIENTIFIC, + ColumnType.LARGETEXT: SYSTEM_ANALYZER_SCIENTIFIC, + # LINK is text-shaped but defaults to KEYWORD per the Java mapping. + ColumnType.LINK: SYSTEM_ANALYZER_KEYWORD, + # Numeric / date — KEYWORD (no full-text analysis). + ColumnType.INTEGER: SYSTEM_ANALYZER_KEYWORD, + ColumnType.INTEGER_LIST: SYSTEM_ANALYZER_KEYWORD, + ColumnType.DATE: SYSTEM_ANALYZER_KEYWORD, + ColumnType.DATE_LIST: SYSTEM_ANALYZER_KEYWORD, + ColumnType.FILEHANDLEID: SYSTEM_ANALYZER_KEYWORD, + ColumnType.SUBMISSIONID: SYSTEM_ANALYZER_KEYWORD, + ColumnType.EVALUATIONID: SYSTEM_ANALYZER_KEYWORD, + # Identifier / boolean / double — KEYWORD. + ColumnType.ENTITYID: SYSTEM_ANALYZER_KEYWORD, + ColumnType.USERID: SYSTEM_ANALYZER_KEYWORD, + ColumnType.ENTITYID_LIST: SYSTEM_ANALYZER_KEYWORD, + ColumnType.USERID_LIST: SYSTEM_ANALYZER_KEYWORD, + ColumnType.DOUBLE: SYSTEM_ANALYZER_KEYWORD, + ColumnType.BOOLEAN: SYSTEM_ANALYZER_KEYWORD, + ColumnType.BOOLEAN_LIST: SYSTEM_ANALYZER_KEYWORD, + # JSON — STANDARD. 
def build_column_overrides(
    columns: Mapping[str, ColumnType],
    *,
    default_analyzer_qname: str,
    default_substitutes_system_qname: str = SYSTEM_ANALYZER_SCIENTIFIC,
    system_analyzer_substitutions: Optional[Mapping[str, str]] = None,
) -> List[Tuple[str, str]]:
    """Work out which columns need an explicit analyzer override.

    Each column's type is looked up in `COLUMN_TYPE_TO_DEFAULT_ANALYZER_QNAME`
    to find its system-default analyzer. A column is left to inherit the
    SearchConfiguration default (i.e. produces no override) when any of the
    following holds:

    * its column type has no entry in the mapping;
    * its system-default analyzer is `default_substitutes_system_qname`;
    * its effective analyzer (the system default after being passed through
      `system_analyzer_substitutions`) equals `default_analyzer_qname`.

    Every other column is pinned to its effective analyzer.

    Arguments:
        columns: Column name to `ColumnType`, iterated in mapping order.
        default_analyzer_qname: Qualified name of the configuration's default
            analyzer.
        default_substitutes_system_qname: Qualified name of the system
            analyzer that the default analyzer stands in for.
        system_analyzer_substitutions: Optional
            `{system_qname: replacement_qname}` map applied to each column's
            system-default analyzer.

    Returns:
        `(column_name, analyzer_qname)` pairs in the iteration order of
        `columns`.
    """
    replacement_for = dict(system_analyzer_substitutions or {})
    pinned: List[Tuple[str, str]] = []
    for col_name, col_type in columns.items():
        system_qname = COLUMN_TYPE_TO_DEFAULT_ANALYZER_QNAME.get(col_type)
        # Unmapped types, and types already covered by the default analyzer,
        # never receive an override.
        if system_qname is None or system_qname == default_substitutes_system_qname:
            continue
        effective_qname = replacement_for.get(system_qname, system_qname)
        if effective_qname != default_analyzer_qname:
            pinned.append((col_name, effective_qname))
    return pinned
EQUIVALENT rules become bidirectional `"a, b, c"` lines; + EXPLICIT rules become directional `"first => first, t2, t3"` lines.""" + synonyms: List[str] = [] + for kind, terms in rules: + if kind == EQUIVALENT: + synonyms.append(", ".join(terms)) + elif kind == EXPLICIT: + head = terms[0] + synonyms.append(f"{head} => {', '.join(terms)}") + else: + raise ValueError(f"Unknown synonym rule kind: {kind!r}") + return {"type": "synonym_graph", "synonyms": synonyms} + + +async def _find_by_name( + list_fn, + organization_name: str, + name: str, + *, + synapse_client: "Synapse", +) -> Optional[Dict[str, Any]]: + next_page_token = None + while True: + page = await list_fn( + organization_name=organization_name, + next_page_token=next_page_token, + synapse_client=synapse_client, + ) + for item in page.get("results", []) or []: + if item.get("name") == name: + return item + next_page_token = page.get("nextPageToken") + if not next_page_token: + return None + + +async def get_system_analyzer( + name: str, + *, + synapse_client: "Synapse", +) -> TextAnalyzer: + """Look up a bootstrapped system analyzer by name (e.g. `SCIENTIFIC`, + `KEYWORD`) under the `org.sagebionetworks` organization.""" + found = await _find_by_name( + list_text_analyzers, + SYSTEM_ANALYZER_ORG, + name, + synapse_client=synapse_client, + ) + if found is None: + raise ValueError(f"System analyzer '{SYSTEM_ANALYZER_ORG}-{name}' not found.") + return TextAnalyzer().fill_from_dict(found) + + +def _build_search_chain(default_chain: List[str], slot_name: str) -> List[str]: + """Build the `default_search` filter chain by reordering the index-time + `default` chain so that the synonym filter sees normalized but ungraphed + tokens. 
`synonym_graph` rejects input from graph token filters + (`word_delimiter_graph` etc.), so the chain must be: + + [lowercase, , ] + + `lowercase` is hoisted to the front (its position is irrelevant for + correctness because it produces single-position tokens), the synonym + filter goes immediately after it, and any graph-emitting filter from + the original chain is preserved AFTER the synonym filter — matching the + working pattern in scripts_v2/local_testing_search_index.py's demo + `default_search`.""" + new_chain: List[str] = ["lowercase", slot_name] + for name in default_chain: + if name != "lowercase": + new_chain.append(name) + return new_chain + + +def clone_settings_with_search_synonyms( + base_settings: Dict[str, Any], + synonym_qname: str, + *, + filter_slot_name: str = DEFAULT_SYNONYM_FILTER_SLOT, +) -> Dict[str, Any]: + """Deep-copy a system analyzer's `settings` and add a `default_search` entry + that mirrors `default`'s filter chain with `{"$ref": synonym_qname}` + inserted after `lowercase`. Synonyms are therefore applied at search time + only and see normalized but unstemmed tokens (so multi-word and + natural-language synonym rules match). + + If `default_search` already exists on the base analyzer, the synonym + filter is spliced into that chain instead of rebuilding from `default`, + so analyzers like AUTOCOMPLETE that already declare an asymmetric search + chain keep their search-time behavior.""" + settings = copy.deepcopy(base_settings or {}) + analyzers = settings.setdefault("analyzer", {}) + if "default" not in analyzers: + raise ValueError( + "Base analyzer settings missing required `analyzer.default` entry." 
async def ensure_synonym_aware_analyzer(
    organization_name: str,
    name: str,
    *,
    base_system_analyzer_name: str,
    synonym_set_qname: str,
    description: Optional[str] = None,
    filter_slot_name: str = DEFAULT_SYNONYM_FILTER_SLOT,
    synapse_client: "Synapse",
) -> TextAnalyzer:
    """Return the named TextAnalyzer, creating it from a system analyzer if absent.

    The organization's analyzers are searched by `name` first. If a match is
    found it is returned as-is; none of the other arguments are applied to it.
    Otherwise the system analyzer `base_system_analyzer_name` is fetched, its
    settings are run through `clone_settings_with_search_synonyms` with
    `{"$ref": synonym_set_qname}` registered under `filter_slot_name`, and the
    resulting analyzer is created on the server.

    Arguments:
        organization_name: Organization that owns (or will own) the analyzer.
        name: Analyzer name to look up or create.
        base_system_analyzer_name: Name of the system analyzer to clone.
        synonym_set_qname: Qualified name of the SynonymSet to reference.
        description: Optional description used only when creating.
        filter_slot_name: Filter-registry key for the synonym `$ref`.
        synapse_client: Authenticated Synapse client.

    Returns:
        The existing or newly created TextAnalyzer.
    """
    already_there = await _find_by_name(
        list_text_analyzers,
        organization_name,
        name,
        synapse_client=synapse_client,
    )
    if already_there is None:
        system_analyzer = await get_system_analyzer(
            base_system_analyzer_name, synapse_client=synapse_client
        )
        draft = TextAnalyzer(
            organization_name=organization_name,
            name=name,
            description=description,
            settings=clone_settings_with_search_synonyms(
                system_analyzer.settings or {},
                synonym_set_qname,
                filter_slot_name=filter_slot_name,
            ),
        )
        stored = await create_text_analyzer(
            request=draft.to_synapse_request(),
            synapse_client=synapse_client,
        )
        return TextAnalyzer().fill_from_dict(stored)
    return TextAnalyzer().fill_from_dict(already_there)
request=config.to_synapse_request(), + synapse_client=synapse_client, + ) + return SearchConfiguration().fill_from_dict(result) diff --git a/tests/integration/synapseclient/models/async/test_search_index_async.py b/tests/integration/synapseclient/models/async/test_search_index_async.py new file mode 100644 index 000000000..f23e77c12 --- /dev/null +++ b/tests/integration/synapseclient/models/async/test_search_index_async.py @@ -0,0 +1,199 @@ +"""Integration tests for the SearchIndex entity and related Search Management API. + +These tests exercise the wire format end-to-end against a live Synapse server. + +Notes: +- SearchIndex entities are a relatively new server feature; if the server returns + 4xx for the create request, the test is skipped rather than reported as a failure + (the suite covers the wire format unconditionally elsewhere via unit tests). +- TextAnalyzer / SearchConfiguration write endpoints are restricted to Sage + Bionetworks employees server-side; tests therefore only exercise the read + endpoints (`list_*`, `get_*`) that any authenticated caller can hit. Anything + else is gated behind a `SYNAPSE_SAGE_EMPLOYEE_TOKEN` env var. +""" + +import os +import uuid +from typing import Callable + +import pytest + +from synapseclient import Synapse +from synapseclient.api.search_services import get_text_analyzer, list_text_analyzers +from synapseclient.core.exceptions import SynapseHTTPError +from synapseclient.models import Column, ColumnType, Project, SearchIndex, Table + + +def _server_supports_search_index(exc: SynapseHTTPError) -> bool: + """The SearchIndex feature may not be enabled on every Synapse environment. 
+ Inspect the response to decide whether to fail or skip.""" + msg = str(exc).lower() + return not any( + token in msg + for token in ( + "unsupported entity type", + "concretetype", + "not allowed", + "503", + "service unavailable", + ) + ) + + +class TestSearchIndexEntity: + @pytest.fixture(autouse=True, scope="function") + def init(self, syn: Synapse, schedule_for_cleanup: Callable[..., None]) -> None: + self.syn = syn + self.schedule_for_cleanup = schedule_for_cleanup + + async def test_missing_defining_sql_raises_locally( + self, project_model: Project + ) -> None: + # GIVEN a SearchIndex with no defining_sql + index = SearchIndex(name=str(uuid.uuid4()), parent_id=project_model.id) + + # WHEN/THEN: ValueError is raised before any server contact + with pytest.raises( + ValueError, + match="The defining_sql attribute must be set for a SearchIndex.", + ): + await index.store_async(synapse_client=self.syn) + + async def test_search_index_lifecycle(self, project_model: Project) -> None: + # GIVEN a Table with a couple of columns to back the SearchIndex + table = Table( + name=str(uuid.uuid4()), + parent_id=project_model.id, + columns=[ + Column(name="title", column_type=ColumnType.STRING), + Column(name="body", column_type=ColumnType.LARGETEXT), + ], + ) + table = await table.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # WHEN creating a SearchIndex + index_name = str(uuid.uuid4()) + index = SearchIndex( + name=index_name, + description="Integration test SearchIndex", + parent_id=project_model.id, + defining_sql=f"SELECT * FROM {table.id}", + ) + + try: + index = await index.store_async(synapse_client=self.syn) + except SynapseHTTPError as exc: + if not _server_supports_search_index(exc): + pytest.skip( + f"Server does not appear to support the SearchIndex entity: {exc}" + ) + raise + + self.schedule_for_cleanup(index.id) + + # THEN the SearchIndex has the expected server-assigned fields + assert index.id is not None + assert 
class TestSearchManagementReadEndpoints:
    """Read-only paths every authenticated caller can hit."""

    @pytest.fixture(autouse=True, scope="function")
    def init(self, syn: Synapse) -> None:
        self.syn = syn

    async def _system_analyzers_or_skip(self) -> list:
        """List the `org.sagebionetworks` analyzers, skipping the test on an HTTP error."""
        try:
            listing = await list_text_analyzers(
                organization_name="org.sagebionetworks",
                synapse_client=self.syn,
            )
        except SynapseHTTPError as exc:
            # pytest.skip raises, so `listing` is always bound below.
            pytest.skip(
                "Server does not appear to expose the SearchManagementController: "
                f"{exc}"
            )
        return listing.get("results") or []

    async def test_list_text_analyzers_returns_system_analyzers(self) -> None:
        # GIVEN the bootstrapped `org.sagebionetworks` analyzers exist on the server
        analyzers = await self._system_analyzers_or_skip()
        found_names = {entry.get("name") for entry in analyzers}

        # THEN at least one of the expected system analyzers is present
        assert analyzers, "No system text analyzers returned"
        assert found_names & {"SCIENTIFIC", "STANDARD", "KEYWORD"}, found_names

    async def test_get_text_analyzer_by_id(self) -> None:
        # GIVEN we can list analyzers
        analyzers = await self._system_analyzers_or_skip()
        if not analyzers:
            pytest.skip("No system analyzers available to fetch by ID")

        first_id = analyzers[0]["id"]

        # WHEN fetching a single analyzer by ID
        single = await get_text_analyzer(first_id, synapse_client=self.syn)

        # THEN it round-trips
        assert single["id"] == first_id
        assert single["organizationName"] == "org.sagebionetworks"
+ """ + + async def test_placeholder(self) -> None: + pytest.skip("Pending Sage-employee CI auth wiring") diff --git a/tests/unit/synapseclient/api/unit_test_search_services.py b/tests/unit/synapseclient/api/unit_test_search_services.py new file mode 100644 index 000000000..df9e339b8 --- /dev/null +++ b/tests/unit/synapseclient/api/unit_test_search_services.py @@ -0,0 +1,226 @@ +"""Unit tests for synapseclient.api.search_services.""" + +import json +from unittest.mock import AsyncMock, patch + +import pytest + +import synapseclient.api.search_services as search_services + + +@pytest.fixture +def mock_client(): + """Patch Synapse.get_client to return an AsyncMock and yield it.""" + with patch("synapseclient.Synapse") as synapse_cls: + client = AsyncMock() + synapse_cls.get_client.return_value = client + yield client + + +# ---------- Text Analyzer ---------- + + +class TestTextAnalyzer: + async def test_create_posts_request_body(self, mock_client): + request = {"organizationName": "org", "name": "n", "settings": {"a": 1}} + mock_client.rest_post_async.return_value = {"id": "1"} + + result = await search_services.create_text_analyzer(request) + + assert result == {"id": "1"} + mock_client.rest_post_async.assert_awaited_once_with( + uri="/search/text/analyzer", body=json.dumps(request) + ) + + async def test_get_uses_path_id(self, mock_client): + mock_client.rest_get_async.return_value = {"id": "42"} + await search_services.get_text_analyzer("42") + mock_client.rest_get_async.assert_awaited_once_with( + uri="/search/text/analyzer/42" + ) + + async def test_update_puts_body_to_path_id(self, mock_client): + request = {"id": "42", "name": "n"} + mock_client.rest_put_async.return_value = request + + await search_services.update_text_analyzer("42", request) + + mock_client.rest_put_async.assert_awaited_once_with( + uri="/search/text/analyzer/42", body=json.dumps(request) + ) + + async def test_list_drops_none_filters(self, mock_client): + 
mock_client.rest_post_async.return_value = {"results": []} + + await search_services.list_text_analyzers() + + mock_client.rest_post_async.assert_awaited_once_with( + uri="/search/text/analyzer/list", body=json.dumps({}) + ) + + async def test_list_includes_org_and_token(self, mock_client): + mock_client.rest_post_async.return_value = {"results": []} + + await search_services.list_text_analyzers( + organization_name="org.sagebionetworks", next_page_token="tok" + ) + + mock_client.rest_post_async.assert_awaited_once_with( + uri="/search/text/analyzer/list", + body=json.dumps( + {"organizationName": "org.sagebionetworks", "nextPageToken": "tok"} + ), + ) + + +# ---------- Column Analyzer Override ---------- + + +class TestColumnAnalyzerOverride: + async def test_create(self, mock_client): + request = {"organizationName": "org", "name": "co", "overrides": []} + mock_client.rest_post_async.return_value = {"id": "1"} + + await search_services.create_column_analyzer_override(request) + + mock_client.rest_post_async.assert_awaited_once_with( + uri="/search/column/analyzer/override", body=json.dumps(request) + ) + + async def test_get(self, mock_client): + mock_client.rest_get_async.return_value = {"id": "1"} + await search_services.get_column_analyzer_override("1") + mock_client.rest_get_async.assert_awaited_once_with( + uri="/search/column/analyzer/override/1" + ) + + async def test_update(self, mock_client): + request = {"id": "1"} + mock_client.rest_put_async.return_value = request + await search_services.update_column_analyzer_override("1", request) + mock_client.rest_put_async.assert_awaited_once_with( + uri="/search/column/analyzer/override/1", body=json.dumps(request) + ) + + async def test_list_drops_none(self, mock_client): + mock_client.rest_post_async.return_value = {"results": []} + await search_services.list_column_analyzer_overrides() + mock_client.rest_post_async.assert_awaited_once_with( + uri="/search/column/analyzer/override/list", body=json.dumps({}) + 
) + + +# ---------- Synonym Set ---------- + + +class TestSynonymSet: + async def test_create(self, mock_client): + request = {"organizationName": "org", "name": "syn", "definition": {}} + mock_client.rest_post_async.return_value = {"id": "1"} + await search_services.create_synonym_set(request) + mock_client.rest_post_async.assert_awaited_once_with( + uri="/search/synonym/set", body=json.dumps(request) + ) + + async def test_get(self, mock_client): + mock_client.rest_get_async.return_value = {"id": "1"} + await search_services.get_synonym_set("1") + mock_client.rest_get_async.assert_awaited_once_with(uri="/search/synonym/set/1") + + async def test_update(self, mock_client): + request = {"id": "1"} + mock_client.rest_put_async.return_value = request + await search_services.update_synonym_set("1", request) + mock_client.rest_put_async.assert_awaited_once_with( + uri="/search/synonym/set/1", body=json.dumps(request) + ) + + async def test_list(self, mock_client): + mock_client.rest_post_async.return_value = {"results": []} + await search_services.list_synonym_sets(organization_name="org") + mock_client.rest_post_async.assert_awaited_once_with( + uri="/search/synonym/set/list", + body=json.dumps({"organizationName": "org"}), + ) + + +# ---------- Search Configuration ---------- + + +class TestSearchConfiguration: + async def test_create(self, mock_client): + request = {"organizationName": "org", "name": "cfg"} + mock_client.rest_post_async.return_value = {"id": "1"} + await search_services.create_search_configuration(request) + mock_client.rest_post_async.assert_awaited_once_with( + uri="/search/configuration", body=json.dumps(request) + ) + + async def test_get(self, mock_client): + mock_client.rest_get_async.return_value = {"id": "1"} + await search_services.get_search_configuration("1") + mock_client.rest_get_async.assert_awaited_once_with( + uri="/search/configuration/1" + ) + + async def test_update(self, mock_client): + request = {"id": "1"} + 
class TestSearchConfigBinding:
    """Wire-format checks for the entity search-config binding endpoints."""

    async def test_bind_puts_body_with_entity_and_config_id(self, mock_client):
        put_mock = mock_client.rest_put_async
        put_mock.return_value = {"bindId": "1"}

        await search_services.bind_search_config_to_entity(
            entity_id="syn1", search_configuration_id="42"
        )

        # The body carries both the entity ID and the configuration ID.
        expected_body = json.dumps({"entityId": "syn1", "searchConfigurationId": "42"})
        put_mock.assert_awaited_once_with(
            uri="/entity/syn1/searchconfig/binding",
            body=expected_body,
        )

    async def test_get_binding(self, mock_client):
        get_mock = mock_client.rest_get_async
        get_mock.return_value = {"bindId": "1"}

        await search_services.get_search_config_binding("syn1")

        get_mock.assert_awaited_once_with(uri="/entity/syn1/searchconfig/binding")

    async def test_clear_returns_none(self, mock_client):
        delete_mock = mock_client.rest_delete_async
        delete_mock.return_value = None

        outcome = await search_services.clear_search_config_binding("syn1")

        assert outcome is None
        delete_mock.assert_awaited_once_with(uri="/entity/syn1/searchconfig/binding")
uri="/search/autocomplete", body=json.dumps(request) + ) diff --git a/tests/unit/synapseclient/models/async/unit_test_search_index_async.py b/tests/unit/synapseclient/models/async/unit_test_search_index_async.py new file mode 100644 index 000000000..2fbd16d81 --- /dev/null +++ b/tests/unit/synapseclient/models/async/unit_test_search_index_async.py @@ -0,0 +1,185 @@ +"""Unit tests for the SearchIndex entity model.""" + +from unittest.mock import patch + +import pytest + +from synapseclient import Synapse +from synapseclient.core.constants import concrete_types +from synapseclient.models import SearchIndex +from synapseclient.models.mixins.table_components import ( + DeleteMixin, + GetMixin, + TableStoreMixin, +) + + +class TestSearchIndexBasics: + """Round-trip serialization and field-level behavior.""" + + @pytest.fixture(autouse=True, scope="function") + def init_syn(self, syn: Synapse) -> None: + self.syn = syn + + def test_default_fields_are_none_or_empty(self): + index = SearchIndex() + assert index.id is None + assert index.name is None + assert index.parent_id is None + assert index.defining_sql is None + assert index.search_configuration_id is None + assert index.annotations == {} + assert index.activity is None + + def test_fill_from_dict_maps_camelcase_to_snakecase(self): + index = SearchIndex().fill_from_dict( + { + "id": "syn1", + "name": "Idx", + "description": "d", + "parentId": "syn2", + "etag": "e", + "createdOn": "2024-01-01", + "modifiedOn": "2024-01-02", + "createdBy": "u1", + "modifiedBy": "u2", + "versionNumber": 3, + "versionLabel": "v3", + "versionComment": "c", + "isLatestVersion": True, + "definingSQL": "SELECT * FROM syn99", + "searchConfigurationId": "42", + } + ) + assert index.id == "syn1" + assert index.name == "Idx" + assert index.description == "d" + assert index.parent_id == "syn2" + assert index.etag == "e" + assert index.created_on == "2024-01-01" + assert index.modified_on == "2024-01-02" + assert index.created_by == "u1" + assert 
index.modified_by == "u2" + assert index.version_number == 3 + assert index.version_label == "v3" + assert index.version_comment == "c" + assert index.is_latest_version is True + assert index.defining_sql == "SELECT * FROM syn99" + assert index.search_configuration_id == "42" + + def test_fill_from_dict_skips_annotations_when_flag_false(self): + index = SearchIndex(annotations={"x": ["1"]}) + index.fill_from_dict( + {"id": "syn1", "annotations": {"y": ["2"]}}, set_annotations=False + ) + assert index.annotations == {"x": ["1"]} + + def test_to_synapse_request_emits_concrete_type_and_drops_none(self): + index = SearchIndex( + name="Idx", + parent_id="syn1", + defining_sql="SELECT * FROM syn2", + ) + body = index.to_synapse_request() + assert body == { + "entity": { + "name": "Idx", + "parentId": "syn1", + "concreteType": concrete_types.SEARCH_INDEX_ENTITY, + "definingSQL": "SELECT * FROM syn2", + } + } + + def test_to_synapse_request_includes_search_configuration_id(self): + index = SearchIndex( + name="Idx", + parent_id="syn1", + defining_sql="SELECT * FROM syn2", + search_configuration_id="cfg42", + ) + body = index.to_synapse_request()["entity"] + assert body["searchConfigurationId"] == "cfg42" + + def test_has_changed_uses_last_persistent_instance(self): + index = SearchIndex(name="Idx", parent_id="syn1", defining_sql="SELECT 1") + assert index.has_changed is True + index._set_last_persistent_instance() + assert index.has_changed is False + index.name = "renamed" + assert index.has_changed is True + + def test_set_last_persistent_instance_deepcopies_annotations(self): + index = SearchIndex( + name="Idx", + parent_id="syn1", + defining_sql="SELECT 1", + annotations={"k": ["v"]}, + ) + index._set_last_persistent_instance() + index.annotations["k"].append("v2") + assert index._last_persistent_instance.annotations == {"k": ["v"]} + + +class TestSearchIndexStoreAsync: + """`store_async` validation + super-delegation behavior.""" + + @pytest.fixture(autouse=True, 
class TestSearchIndexGetDelete:
    """`get_async` and `delete_async` thin pass-throughs."""

    @pytest.fixture(autouse=True, scope="function")
    def init_syn(self, syn: Synapse) -> None:
        self.syn = syn

    async def test_get_async_delegates_to_get_mixin(self):
        search_index = SearchIndex(id="syn1")
        # The same keyword arguments are expected to reach the mixin unchanged.
        forwarded = dict(
            include_columns=False,
            include_activity=True,
            synapse_client=self.syn,
        )

        with patch.object(GetMixin, "get_async") as patched_get:
            patched_get.return_value = search_index
            returned = await search_index.get_async(**forwarded)

        patched_get.assert_called_once_with(**forwarded)
        assert returned is search_index

    async def test_delete_async_delegates_to_delete_mixin(self):
        search_index = SearchIndex(id="syn1")

        with patch.object(DeleteMixin, "delete_async") as patched_delete:
            await search_index.delete_async(synapse_client=self.syn)

        patched_delete.assert_called_once_with(synapse_client=self.syn)
b/tests/unit/synapseclient/models/async/unit_test_search_management_async.py new file mode 100644 index 000000000..4dce85c3c --- /dev/null +++ b/tests/unit/synapseclient/models/async/unit_test_search_management_async.py @@ -0,0 +1,478 @@ +"""Unit tests for the search_management dataclasses.""" + +from unittest.mock import patch + +import pytest + +from synapseclient.core.constants import concrete_types +from synapseclient.models.search_management import ( + ColumnAnalyzerOverride, + ColumnAnalyzerOverrideEntry, + FacetRequest, + FacetSortField, + KeyRange, + KeyValues, + SearchConfigBinding, + SearchConfiguration, + SearchFieldValue, + SearchHit, + SearchIndexQuery, + SearchIndexState, + SearchIndexStatus, + SearchQuery, + SearchQueryPart, + SearchQueryType, + SortDirection, + SortField, + SynonymSet, + TextAnalyzer, +) + + +class TestTextAnalyzer: + def test_qualified_name(self): + ta = TextAnalyzer(organization_name="org", name="n") + assert ta.qualified_name == "org-n" + + def test_qualified_name_none_when_missing_pieces(self): + assert TextAnalyzer().qualified_name is None + assert TextAnalyzer(organization_name="org").qualified_name is None + + def test_ref_static_method(self): + assert TextAnalyzer.ref("org-n") == {"$ref": "org-n"} + + def test_round_trip(self): + original = { + "id": "1", + "organizationName": "org", + "name": "n", + "description": "d", + "settings": {"analyzer": {"default": {"type": "standard"}}}, + "etag": "e", + "createdOn": "c", + "createdBy": "u", + "modifiedOn": "m", + "modifiedBy": "u2", + } + ta = TextAnalyzer().fill_from_dict(original) + assert ta.id == "1" + assert ta.organization_name == "org" + assert ta.name == "n" + assert ta.settings == {"analyzer": {"default": {"type": "standard"}}} + # Server-managed timestamps are NOT echoed back in to_synapse_request. 
+ body = ta.to_synapse_request() + assert body == { + "id": "1", + "organizationName": "org", + "name": "n", + "description": "d", + "settings": {"analyzer": {"default": {"type": "standard"}}}, + "etag": "e", + } + + +class TestColumnAnalyzerOverrideEntry: + def test_from_ref_builds_ref_dict(self): + entry = ColumnAnalyzerOverrideEntry.from_ref("col1", "org-n") + assert entry.column_name == "col1" + assert entry.analyzer == {"$ref": "org-n"} + + def test_to_synapse_request_normalizes_string_to_ref(self): + entry = ColumnAnalyzerOverrideEntry(column_name="col", analyzer="org-n") + body = entry.to_synapse_request() + assert body == {"columnName": "col", "analyzer": {"$ref": "org-n"}} + + def test_to_synapse_request_preserves_dict_analyzer(self): + entry = ColumnAnalyzerOverrideEntry( + column_name="col", analyzer={"$ref": "org-n"} + ) + assert entry.to_synapse_request() == { + "columnName": "col", + "analyzer": {"$ref": "org-n"}, + } + + def test_fill_from_dict(self): + entry = ColumnAnalyzerOverrideEntry().fill_from_dict( + {"columnName": "c", "analyzer": {"$ref": "org-n"}} + ) + assert entry.column_name == "c" + assert entry.analyzer == {"$ref": "org-n"} + + +class TestColumnAnalyzerOverride: + def test_round_trip_with_nested_overrides(self): + data = { + "id": "1", + "organizationName": "org", + "name": "n", + "description": "d", + "overrides": [ + {"columnName": "c1", "analyzer": {"$ref": "org-a1"}}, + {"columnName": "c2", "analyzer": {"$ref": "org-a2"}}, + ], + "etag": "e", + "createdOn": "co", + "createdBy": "cu", + "modifiedOn": "mo", + "modifiedBy": "mu", + } + cao = ColumnAnalyzerOverride().fill_from_dict(data) + assert len(cao.overrides) == 2 + assert cao.overrides[0].column_name == "c1" + # Server-managed timestamps are NOT echoed back in to_synapse_request. 
+ body = cao.to_synapse_request() + assert body == { + "id": "1", + "organizationName": "org", + "name": "n", + "description": "d", + "overrides": [ + {"columnName": "c1", "analyzer": {"$ref": "org-a1"}}, + {"columnName": "c2", "analyzer": {"$ref": "org-a2"}}, + ], + "etag": "e", + } + + def test_qualified_name(self): + assert ( + ColumnAnalyzerOverride(organization_name="org", name="n").qualified_name + == "org-n" + ) + + +class TestSynonymSet: + def test_round_trip(self): + data = { + "id": "1", + "organizationName": "org", + "name": "syn", + "description": "d", + "definition": {"type": "synonym_graph", "synonyms": ["a, b"]}, + "etag": "e", + "createdOn": "co", + "createdBy": "cu", + "modifiedOn": "mo", + "modifiedBy": "mu", + } + ss = SynonymSet().fill_from_dict(data) + assert ss.definition["synonyms"] == ["a, b"] + body = ss.to_synapse_request() + assert body == { + "id": "1", + "organizationName": "org", + "name": "syn", + "description": "d", + "definition": {"type": "synonym_graph", "synonyms": ["a, b"]}, + "etag": "e", + } + + +class TestSearchConfiguration: + def test_string_default_analyzer_serialized_as_ref(self): + cfg = SearchConfiguration( + organization_name="org", + name="cfg", + default_analyzer="org-an", + ) + body = cfg.to_synapse_request() + assert body["defaultAnalyzer"] == {"$ref": "org-an"} + + def test_dict_default_analyzer_preserved(self): + cfg = SearchConfiguration( + organization_name="org", + name="cfg", + default_analyzer={"$ref": "org-an"}, + ) + body = cfg.to_synapse_request() + assert body["defaultAnalyzer"] == {"$ref": "org-an"} + + def test_string_overrides_normalized_to_ref(self): + cfg = SearchConfiguration( + organization_name="org", + name="cfg", + column_analyzer_overrides=["org-co1", {"$ref": "org-co2"}], + ) + body = cfg.to_synapse_request() + assert body["columnAnalyzerOverrides"] == [ + {"$ref": "org-co1"}, + {"$ref": "org-co2"}, + ] + + def test_empty_overrides_omitted(self): + cfg = 
SearchConfiguration(organization_name="org", name="cfg") + body = cfg.to_synapse_request() + assert "columnAnalyzerOverrides" not in body + + def test_fill_from_dict(self): + cfg = SearchConfiguration().fill_from_dict( + { + "id": "1", + "organizationName": "org", + "name": "cfg", + "defaultAnalyzer": {"$ref": "org-an"}, + "columnAnalyzerOverrides": [{"$ref": "org-co"}], + } + ) + assert cfg.default_analyzer == {"$ref": "org-an"} + assert cfg.column_analyzer_overrides == [{"$ref": "org-co"}] + + +class TestSearchConfigBinding: + def test_fill_from_dict(self): + binding = SearchConfigBinding().fill_from_dict( + { + "bindId": "b1", + "searchConfigurationId": "42", + "objectId": "syn1", + "objectType": "ENTITY", + "createdBy": "u", + "createdOn": "now", + } + ) + assert binding.bind_id == "b1" + assert binding.search_configuration_id == "42" + assert binding.object_id == "syn1" + + +class TestSearchIndexStatus: + def test_state_enum_coerced(self): + status = SearchIndexStatus().fill_from_dict( + {"searchIndexId": "syn1", "state": "ACTIVE", "changedOn": "now"} + ) + assert status.state == SearchIndexState.ACTIVE + assert status.search_index_id == "syn1" + + def test_state_missing_returns_none(self): + status = SearchIndexStatus().fill_from_dict({"searchIndexId": "syn1"}) + assert status.state is None + + +class TestKeyValues: + def test_python_keyword_not_serialized_as_not(self): + kv = KeyValues(key="col", values=["a", "b"], not_=True) + body = kv.to_synapse_request() + assert body == {"key": "col", "values": ["a", "b"], "not": True} + + def test_fill_from_dict_handles_not_key(self): + kv = KeyValues().fill_from_dict({"key": "c", "values": ["x"], "not": False}) + assert kv.key == "c" + assert kv.not_ is False + + def test_to_synapse_request_drops_empty_values(self): + kv = KeyValues(key="c") + assert kv.to_synapse_request() == {"key": "c"} + + +class TestKeyRange: + def test_min_max_keys(self): + kr = KeyRange(key="c", min_value="1", max_value="10") + body = 
kr.to_synapse_request() + assert body == {"key": "c", "min": "1", "max": "10"} + + def test_fill_from_dict_min_max(self): + kr = KeyRange().fill_from_dict({"key": "c", "min": "1", "max": "10"}) + assert kr.min_value == "1" + assert kr.max_value == "10" + + +class TestFacetRequest: + def test_enum_coercion_round_trip(self): + fr = FacetRequest( + column_name="c", + max_value_count=5, + sort_field=FacetSortField.COUNT, + sort_direction=SortDirection.DESC, + ) + body = fr.to_synapse_request() + assert body == { + "columnName": "c", + "maxValueCount": 5, + "sortField": "COUNT", + "sortDirection": "DESC", + } + + def test_fill_from_dict_coerces_enums(self): + fr = FacetRequest().fill_from_dict( + {"columnName": "c", "sortField": "KEY", "sortDirection": "ASC"} + ) + assert fr.sort_field == FacetSortField.KEY + assert fr.sort_direction == SortDirection.ASC + + +class TestSortField: + def test_round_trip(self): + sf = SortField(column_name="c", direction=SortDirection.DESC) + assert sf.to_synapse_request() == {"columnName": "c", "direction": "DESC"} + sf2 = SortField().fill_from_dict({"columnName": "c", "direction": "ASC"}) + assert sf2.direction == SortDirection.ASC + + +class TestSearchQuery: + def test_full_round_trip(self): + query = SearchQuery( + query_type=SearchQueryType.MATCH, + query_text="alzheimer", + query_fields=["name^2"], + terms_filters=[KeyValues(key="status", values=["active"])], + range_filters=[KeyRange(key="year", min_value="2020")], + exists_filters=["a"], + not_exists_filters=["b"], + fuzziness="AUTO", + facet_requests=[FacetRequest(column_name="c")], + return_fields=["id", "name"], + sort=[SortField(column_name="rel", direction=SortDirection.ASC)], + highlight=True, + offset=0, + limit=10, + ) + body = query.to_synapse_request() + assert body["queryType"] == "MATCH" + assert body["queryText"] == "alzheimer" + assert body["queryFields"] == ["name^2"] + assert body["termsFilters"] == [{"key": "status", "values": ["active"]}] + assert 
body["rangeFilters"] == [{"key": "year", "min": "2020"}] + assert body["existsFilters"] == ["a"] + assert body["notExistsFilters"] == ["b"] + assert body["fuzziness"] == "AUTO" + assert body["facetRequests"] == [{"columnName": "c"}] + assert body["returnFields"] == ["id", "name"] + assert body["sort"] == [{"columnName": "rel", "direction": "ASC"}] + assert body["highlight"] is True + assert body["offset"] == 0 + assert body["limit"] == 10 + + def test_minimal_query_drops_empty_lists(self): + query = SearchQuery(query_type=SearchQueryType.MATCH_ALL) + body = query.to_synapse_request() + assert body == {"queryType": "MATCH_ALL"} + + def test_fill_from_dict(self): + query = SearchQuery().fill_from_dict( + { + "queryType": "MATCH", + "queryText": "x", + "termsFilters": [{"key": "k", "values": ["v"], "not": True}], + "facetRequests": [{"columnName": "c"}], + } + ) + assert query.query_type == SearchQueryType.MATCH + assert len(query.terms_filters) == 1 + assert query.terms_filters[0].not_ is True + assert len(query.facet_requests) == 1 + + +class TestSearchHit: + def test_fill_from_dict(self): + hit = SearchHit().fill_from_dict( + { + "rowId": 5, + "rowVersion": 1, + "score": 0.75, + "fields": [{"name": "n", "value": "v"}], + "highlights": [{"name": "h", "value": "v"}], + } + ) + assert hit.row_id == 5 + assert hit.score == 0.75 + assert isinstance(hit.fields[0], SearchFieldValue) + assert hit.fields[0].name == "n" + assert hit.highlights[0].value == "v" + + +class TestSearchIndexQuery: + def test_concrete_type_default(self): + q = SearchIndexQuery() + assert q.concrete_type == concrete_types.SEARCH_INDEX_QUERY + + def test_to_synapse_request_includes_concrete_type_and_response_parts(self): + q = SearchIndexQuery( + search_index_id="syn1", + search_query=SearchQuery(query_type=SearchQueryType.MATCH_ALL), + response_parts=[SearchQueryPart.HITS, SearchQueryPart.TOTAL_HITS], + ) + body = q.to_synapse_request() + assert body["concreteType"] == 
concrete_types.SEARCH_INDEX_QUERY + assert body["searchIndexId"] == "syn1" + assert body["searchQuery"] == {"queryType": "MATCH_ALL"} + assert body["responseParts"] == ["HITS", "TOTAL_HITS"] + + def test_to_synapse_request_omits_empty_response_parts(self): + q = SearchIndexQuery( + search_index_id="syn1", + search_query=SearchQuery(query_type=SearchQueryType.MATCH_ALL), + ) + body = q.to_synapse_request() + assert "responseParts" not in body + + def test_fill_from_dict_populates_response_fields(self): + q = SearchIndexQuery() + q.fill_from_dict( + { + "hits": [ + {"rowId": 1, "score": 0.5, "fields": [{"name": "a", "value": "b"}]} + ], + "totalHits": 100, + "selectColumns": [{"name": "a", "columnType": "STRING"}], + "facets": [{"facetType": "ENUMERATION"}], + "offset": 0, + } + ) + assert len(q.hits) == 1 + assert q.hits[0].row_id == 1 + assert q.total_hits == 100 + assert len(q.select_columns) == 1 + assert q.facets == [{"facetType": "ENUMERATION"}] + assert q.offset == 0 + + async def test_send_job_and_wait_async_invokes_communicator(self): + q = SearchIndexQuery( + search_index_id="syn1", + search_query=SearchQuery(query_type=SearchQueryType.MATCH_ALL), + ) + + sentinel_response = { + "hits": [], + "totalHits": 0, + "selectColumns": [], + "facets": [], + "offset": 0, + } + + with patch( + "synapseclient.models.mixins.asynchronous_job.send_job_and_wait_async" + ) as mock_send: + + async def _fake(*args, **kwargs): + return sentinel_response + + mock_send.side_effect = _fake + + result = await q.send_job_and_wait_async(synapse_client=None) + + assert mock_send.await_count == 1 + sent_kwargs = mock_send.await_args.kwargs + assert sent_kwargs["request_type"] == concrete_types.SEARCH_INDEX_QUERY + assert sent_kwargs["request"]["concreteType"] == ( + concrete_types.SEARCH_INDEX_QUERY + ) + # Response parts populated on the same instance + assert result is q + assert q.total_hits == 0 + assert q.hits == [] + + +@pytest.mark.parametrize( + ("enum_cls", "value"), + [ + 
(SearchIndexState, "ACTIVE"), + (SearchQueryPart, "HITS"), + (SearchQueryType, "MATCH"), + (SortDirection, "ASC"), + (FacetSortField, "COUNT"), + ], +) +def test_enum_string_values(enum_cls, value): + """Each enum is a `str, Enum`, so its value equals the string literal.""" + member = enum_cls(value) + assert member.value == value + assert str(member.value) == value diff --git a/tests/unit/synapseclient/models/async/unit_test_search_setup_async.py b/tests/unit/synapseclient/models/async/unit_test_search_setup_async.py new file mode 100644 index 000000000..aeb3177cf --- /dev/null +++ b/tests/unit/synapseclient/models/async/unit_test_search_setup_async.py @@ -0,0 +1,365 @@ +"""Unit tests for synapseclient.models.services.search_setup.""" + +from unittest.mock import AsyncMock, patch + +import pytest + +from synapseclient.models.services import search_setup +from synapseclient.models.services.search_setup import ( + EQUIVALENT, + EXPLICIT, + SYSTEM_ANALYZER_KEYWORD, + SYSTEM_ANALYZER_SCIENTIFIC, + _build_search_chain, + _find_by_name, + build_column_overrides, + clone_settings_with_search_synonyms, + ensure_search_configuration, + ensure_synonym_aware_analyzer, + ensure_synonym_set, + render_synonym_definition, +) +from synapseclient.models.table_components import ColumnType + + +class TestRenderSynonymDefinition: + def test_equivalent_rule(self): + out = render_synonym_definition([(EQUIVALENT, ["a", "b", "c"])]) + assert out == {"type": "synonym_graph", "synonyms": ["a, b, c"]} + + def test_explicit_rule(self): + out = render_synonym_definition([(EXPLICIT, ["a", "b", "c"])]) + assert out == {"type": "synonym_graph", "synonyms": ["a => a, b, c"]} + + def test_mixed_rules(self): + out = render_synonym_definition( + [(EQUIVALENT, ["x", "y"]), (EXPLICIT, ["foo", "bar"])] + ) + assert out == { + "type": "synonym_graph", + "synonyms": ["x, y", "foo => foo, bar"], + } + + def test_unknown_rule_kind_raises(self): + with pytest.raises(ValueError, match="Unknown synonym rule 
kind"): + render_synonym_definition([("UNKNOWN", ["a"])]) + + +class TestBuildSearchChain: + def test_lowercase_hoisted_with_synonym_slot_after(self): + chain = _build_search_chain(["lowercase", "stop"], "syn_slot") + assert chain == ["lowercase", "syn_slot", "stop"] + + def test_lowercase_added_when_missing_from_default(self): + chain = _build_search_chain(["stop"], "syn_slot") + assert chain == ["lowercase", "syn_slot", "stop"] + + def test_empty_default(self): + chain = _build_search_chain([], "syn_slot") + assert chain == ["lowercase", "syn_slot"] + + +class TestCloneSettingsWithSearchSynonyms: + def test_basic_clone_adds_default_search(self): + base = { + "analyzer": { + "default": {"tokenizer": "standard", "filter": ["lowercase", "stop"]} + }, + "filter": {}, + } + out = clone_settings_with_search_synonyms(base, "org-syn") + assert "default" in out["analyzer"] + assert "default_search" in out["analyzer"] + assert out["filter"]["synonyms"] == {"$ref": "org-syn"} + assert out["analyzer"]["default_search"]["filter"][:2] == [ + "lowercase", + "synonyms", + ] + + def test_does_not_mutate_input(self): + base = { + "analyzer": {"default": {"tokenizer": "standard", "filter": ["lowercase"]}} + } + original_copy = { + "analyzer": {"default": {"tokenizer": "standard", "filter": ["lowercase"]}} + } + clone_settings_with_search_synonyms(base, "org-syn") + assert base == original_copy + + def test_missing_default_raises(self): + with pytest.raises(ValueError, match="missing required `analyzer.default`"): + clone_settings_with_search_synonyms({"analyzer": {}}, "org-syn") + + def test_extra_analyzer_keys_raises(self): + base = { + "analyzer": { + "default": {"tokenizer": "standard"}, + "weird_extra": {"tokenizer": "keyword"}, + } + } + with pytest.raises(ValueError, match="must contain only 'default'"): + clone_settings_with_search_synonyms(base, "org-syn") + + def test_filter_slot_collision_raises(self): + base = { + "analyzer": {"default": {"tokenizer": "standard"}}, + 
"filter": {"synonyms": {"type": "synonym"}}, + } + with pytest.raises(ValueError, match="already used in base analyzer settings"): + clone_settings_with_search_synonyms(base, "org-syn") + + def test_existing_default_search_preserved_and_extended(self): + base = { + "analyzer": { + "default": {"tokenizer": "standard", "filter": ["lowercase", "stop"]}, + "default_search": { + "tokenizer": "edge_ngram", + "filter": ["lowercase", "stem"], + }, + } + } + out = clone_settings_with_search_synonyms(base, "org-syn") + # default_search starts from existing search analyzer, not default + assert out["analyzer"]["default_search"]["tokenizer"] == "edge_ngram" + assert out["analyzer"]["default_search"]["filter"][:2] == [ + "lowercase", + "synonyms", + ] + + +class TestBuildColumnOverrides: + def test_columns_matching_default_substitution_skipped(self): + # All STRING columns default to SCIENTIFIC; if default is the synonym-aware + # clone of SCIENTIFIC, those columns inherit -> no override. + result = build_column_overrides( + {"col1": ColumnType.STRING, "col2": ColumnType.STRING}, + default_analyzer_qname="org-clone-of-scientific", + default_substitutes_system_qname=SYSTEM_ANALYZER_SCIENTIFIC, + system_analyzer_substitutions={ + SYSTEM_ANALYZER_SCIENTIFIC: "org-clone-of-scientific" + }, + ) + assert result == [] + + def test_keyword_columns_pinned_when_default_is_scientific(self): + # INTEGER → KEYWORD by default, but the default analyzer is SCIENTIFIC clone. + # That mismatch produces an override pinned to KEYWORD. 
+ result = build_column_overrides( + {"col1": ColumnType.STRING, "id_col": ColumnType.INTEGER}, + default_analyzer_qname="org-clone-of-scientific", + default_substitutes_system_qname=SYSTEM_ANALYZER_SCIENTIFIC, + system_analyzer_substitutions={ + SYSTEM_ANALYZER_SCIENTIFIC: "org-clone-of-scientific" + }, + ) + assert result == [("id_col", SYSTEM_ANALYZER_KEYWORD)] + + def test_substituted_keyword_used_when_provided(self): + result = build_column_overrides( + {"id_col": ColumnType.INTEGER}, + default_analyzer_qname="org-clone-of-scientific", + default_substitutes_system_qname=SYSTEM_ANALYZER_SCIENTIFIC, + system_analyzer_substitutions={ + SYSTEM_ANALYZER_SCIENTIFIC: "org-clone-of-scientific", + SYSTEM_ANALYZER_KEYWORD: "org-clone-of-keyword", + }, + ) + assert result == [("id_col", "org-clone-of-keyword")] + + +class TestFindByName: + async def test_returns_match_in_first_page(self): + list_fn = AsyncMock( + return_value={ + "results": [{"name": "alpha"}, {"name": "beta"}], + "nextPageToken": None, + } + ) + + out = await _find_by_name(list_fn, "org", "beta", synapse_client=None) + + assert out == {"name": "beta"} + assert list_fn.await_count == 1 + + async def test_paginates_through_pages(self): + responses = [ + {"results": [{"name": "alpha"}], "nextPageToken": "tok1"}, + {"results": [{"name": "beta"}], "nextPageToken": None}, + ] + list_fn = AsyncMock(side_effect=responses) + + out = await _find_by_name(list_fn, "org", "beta", synapse_client=None) + + assert out == {"name": "beta"} + assert list_fn.await_count == 2 + + async def test_returns_none_when_not_found(self): + list_fn = AsyncMock( + return_value={"results": [{"name": "alpha"}], "nextPageToken": None} + ) + + out = await _find_by_name(list_fn, "org", "missing", synapse_client=None) + + assert out is None + + +class TestEnsureSynonymSet: + async def test_returns_existing_when_found(self): + existing = {"id": "1", "organizationName": "org", "name": "syn"} + + with ( + patch.object( + search_setup, 
"_find_by_name", AsyncMock(return_value=existing) + ), + patch.object(search_setup, "create_synonym_set") as mock_create, + ): + + ss = await ensure_synonym_set( + "org", "syn", rules=[(EQUIVALENT, ["a", "b"])], synapse_client=None + ) + + assert ss.id == "1" + mock_create.assert_not_called() + + async def test_creates_when_missing(self): + created = {"id": "2", "organizationName": "org", "name": "syn"} + + with ( + patch.object(search_setup, "_find_by_name", AsyncMock(return_value=None)), + patch.object( + search_setup, "create_synonym_set", AsyncMock(return_value=created) + ) as mock_create, + ): + + ss = await ensure_synonym_set( + "org", "syn", rules=[(EQUIVALENT, ["a", "b"])], synapse_client=None + ) + + assert ss.id == "2" + mock_create.assert_awaited_once() + request_body = mock_create.await_args.kwargs["request"] + assert request_body["definition"] == { + "type": "synonym_graph", + "synonyms": ["a, b"], + } + + +class TestEnsureSynonymAwareAnalyzer: + async def test_returns_existing_when_found(self): + existing = {"id": "1", "organizationName": "org", "name": "ta"} + + with ( + patch.object( + search_setup, "_find_by_name", AsyncMock(return_value=existing) + ), + patch.object(search_setup, "create_text_analyzer") as mock_create, + ): + + ta = await ensure_synonym_aware_analyzer( + "org", + "ta", + base_system_analyzer_name="SCIENTIFIC", + synonym_set_qname="org-syn", + synapse_client=None, + ) + + assert ta.id == "1" + mock_create.assert_not_called() + + async def test_creates_clone_when_missing(self): + base_settings = { + "analyzer": {"default": {"tokenizer": "standard", "filter": ["lowercase"]}} + } + + async def _system_lookup(name, *, synapse_client): + from synapseclient.models.search_management import TextAnalyzer + + return TextAnalyzer( + organization_name="org.sagebionetworks", + name=name, + settings=base_settings, + ) + + created = {"id": "99", "organizationName": "org", "name": "ta"} + + with ( + patch.object(search_setup, "_find_by_name", 
AsyncMock(return_value=None)), + patch.object( + search_setup, "get_system_analyzer", side_effect=_system_lookup + ), + patch.object( + search_setup, "create_text_analyzer", AsyncMock(return_value=created) + ) as mock_create, + ): + + ta = await ensure_synonym_aware_analyzer( + "org", + "ta", + base_system_analyzer_name="SCIENTIFIC", + synonym_set_qname="org-syn", + synapse_client=None, + ) + + assert ta.id == "99" + request = mock_create.await_args.kwargs["request"] + assert "default_search" in request["settings"]["analyzer"] + assert request["settings"]["filter"]["synonyms"] == {"$ref": "org-syn"} + + +class TestEnsureSearchConfiguration: + async def test_returns_existing(self): + existing = { + "id": "5", + "organizationName": "org", + "name": "cfg", + "defaultAnalyzer": {"$ref": "org-an"}, + } + + with ( + patch.object( + search_setup, "_find_by_name", AsyncMock(return_value=existing) + ), + patch.object(search_setup, "create_search_configuration") as mock_create, + ): + + cfg = await ensure_search_configuration( + "org", + "cfg", + default_analyzer_qname="org-an", + synapse_client=None, + ) + + assert cfg.id == "5" + mock_create.assert_not_called() + + async def test_creates_with_inline_overrides(self): + created = { + "id": "6", + "organizationName": "org", + "name": "cfg", + } + + with ( + patch.object(search_setup, "_find_by_name", AsyncMock(return_value=None)), + patch.object( + search_setup, + "create_search_configuration", + AsyncMock(return_value=created), + ) as mock_create, + ): + + cfg = await ensure_search_configuration( + "org", + "cfg", + default_analyzer_qname="org-an", + column_overrides=[("col1", "org-co")], + synapse_client=None, + ) + + assert cfg.id == "6" + request = mock_create.await_args.kwargs["request"] + assert request["defaultAnalyzer"] == {"$ref": "org-an"} + inline = request["columnAnalyzerOverrides"][0] + assert inline["overrides"] == [ + {"columnName": "col1", "analyzer": {"$ref": "org-co"}} + ]