diff --git a/src/nameres/handlers/base.py b/src/nameres/handlers/base.py index df0dfb4..9755011 100644 --- a/src/nameres/handlers/base.py +++ b/src/nameres/handlers/base.py @@ -1,5 +1,7 @@ """Shared handler behavior for NameResolution API endpoints.""" +import json + from biothings.web.handlers import BaseHandler @@ -25,3 +27,7 @@ def set_default_headers(self): def options(self, *args, **kwargs): self.finish() + + def finish_json(self, response: dict | list) -> None: + self.set_header("Content-Type", "application/json; charset=UTF-8") + super().finish(json.dumps(response)) diff --git a/src/nameres/handlers/lookup.py b/src/nameres/handlers/lookup.py index 87507c6..8e940ee 100644 --- a/src/nameres/handlers/lookup.py +++ b/src/nameres/handlers/lookup.py @@ -4,10 +4,9 @@ Converted from SOLR -> Elasticsearch """ -import collections import dataclasses -import logging import json +import logging import re from typing import Optional @@ -27,7 +26,8 @@ class LookupArgumentException(Exception): @dataclasses.dataclass() class LookupQuery: - string: str + raw_string: str + query_strings: tuple[str, ...] autocomplete: Optional[bool] highlighting: Optional[bool] offset: Optional[int] @@ -146,9 +146,10 @@ def parse_boolean(argument: str | bool) -> bool: raise LookupArgumentException(lookup_message) self.lookup_queries = [] - for search_string in sanitized_lookup_strings: + for raw_string, query_strings in sanitized_lookup_strings: lookup_query = LookupQuery( - string=search_string, + raw_string=raw_string, + query_strings=query_strings, autocomplete=autocomplete_option, highlighting=highlighting_option, offset=offset_option, @@ -174,7 +175,7 @@ def _parse_lookup_string_arguments(self) -> list[str]: lookup_strings.extend(search_string_collection) return lookup_strings - def _sanitize_lookup_query(self, lookup_strings: list[str]) -> list[tuple[str]]: + def _sanitize_lookup_query(self, lookup_strings: list[str]) -> list[tuple[str, tuple[str, ...]]]: r"""Performs input sanitization on the lookup query terms. Sanitization Operations: @@ -203,7 +204,8 @@ def _sanitize_lookup_query(self, lookup_strings: list[str]) -> list[tuple[str]]: """ sanitized_lookup_strings = [] for lookup_string in lookup_strings: - lookup_string = lookup_string.strip().lower() + raw_lookup_string = lookup_string.strip() + lookup_string = raw_lookup_string.lower() windows_smart_single_quote_pattern = r"[‘’]" windows_smart_double_quote_pattern = r"[“”]" @@ -240,7 +242,11 @@ def _sanitize_lookup_query(self, lookup_strings: list[str]) -> list[tuple[str]]: fully_escaped_lookup_string = fully_escaped_lookup_string.replace("&&", " ") fully_escaped_lookup_string = fully_escaped_lookup_string.replace("||", " ") - sanitized_lookup_strings.append(set([lookup_string_with_escaped_groups, fully_escaped_lookup_string])) + query_strings = [lookup_string_with_escaped_groups] + if fully_escaped_lookup_string != lookup_string_with_escaped_groups: + query_strings.append(fully_escaped_lookup_string) + + sanitized_lookup_strings.append((raw_lookup_string, tuple(query_strings))) return sanitized_lookup_strings @@ -338,7 +344,7 @@ async def get(self): lookup_result = await lookup(self.biothings, self.lookup_queries[0], self.filters) except Exception as gen_exc: raise HTTPError(detail="Error occurred during processing.", status_code=500) from gen_exc - self.finish(lookup_result) + self.finish_json(lookup_result) async def post(self): """Returns cliques with a name or synonym that contains a specified string.""" @@ -346,7 +352,7 @@ async def post(self): lookup_result = await lookup(self.biothings, self.lookup_queries[0], self.filters) except Exception as gen_exc: raise HTTPError(detail="Error occurred during processing.", status_code=500) from gen_exc - self.finish(lookup_result) + self.finish_json(lookup_result) class NameResolutionBulkLookupHandler(BaseNameResolutionLookupHandler): @@ -359,23 +365,22 @@ class NameResolutionBulkLookupHandler(BaseNameResolutionLookupHandler): name = "bulk-lookup" - async def post(self) -> dict[str, collections.OrderedDict]: + async def post(self) -> None: """Returns cliques with a name or synonym that contains a specified string sent via batch.""" try: lookup_results = {} for lookup_query in self.lookup_queries: - lookup_result: collections.OrderedDict = await lookup(self.biothings, lookup_query, self.filters) - lookup_key: str = lookup_query.string.pop() - lookup_results[lookup_key] = lookup_result + lookup_result: list[dict] = await lookup(self.biothings, lookup_query, self.filters) + lookup_results[lookup_query.raw_string] = lookup_result except Exception as gen_exc: raise HTTPError(detail="Error occurred during processing.", status_code=500) from gen_exc self.finish(lookup_results) async def lookup( - biothings_metadata: NameResolutionAPINamespace, lookup_query: list[LookupQuery], filters: dict -) -> collections.OrderedDict: + biothings_metadata: NameResolutionAPINamespace, lookup_query: LookupQuery, filters: dict +) -> list[dict]: """Returns cliques with a name or synonym that contains a specified string.""" elasticsearch_query = _build_elasticsearch_query(lookup_query, filters) @@ -404,10 +409,7 @@ async def lookup( } lookup_response = await biothings_metadata.elasticsearch.async_client.search(**search_parameters) - # https://www.tornadoweb.org/en/stable/web.html#tornado.web.RequestHandler.write - # We have to change the API slightly here due to security requirements around returning - # list-objects as the API response - outputs = collections.OrderedDict() + outputs = [] for doc in lookup_response["hits"]["hits"]: preferred_matches = [] synonym_matches = [] @@ -419,34 +421,36 @@ async def lookup( source = doc["_source"] curie_identifier = source.get("curie", "") - outputs[curie_identifier] = dataclasses.asdict( - LookupResult( - curie=curie_identifier, - label=source.get("preferred_name", ""), - highlighting=( - { - "labels": preferred_matches, - "synonyms": synonym_matches, - } - if lookup_query.highlighting - else {} - ), - synonyms=source.get("names", []), - score=doc.get("_score", ""), - taxa=source.get("taxa", []), - clique_identifier_count=source.get("clique_identifier_count", 0), - types=[f"biolink:{d}" for d in source.get("biolink_types", [])], + outputs.append( + dataclasses.asdict( + LookupResult( + curie=curie_identifier, + label=source.get("preferred_name", ""), + highlighting=( + { + "labels": preferred_matches, + "synonyms": synonym_matches, + } + if lookup_query.highlighting + else {} + ), + synonyms=source.get("names", []), + score=doc.get("_score", ""), + taxa=source.get("taxa", []), + clique_identifier_count=source.get("clique_identifier_count", 0), + types=[f"biolink:{d}" for d in source.get("biolink_types", [])], + ) ) ) return outputs -def _build_elasticsearch_query(lookup_query: list[LookupQuery], filters: dict) -> dict: +def _build_elasticsearch_query(lookup_query: LookupQuery, filters: dict) -> dict: queries = [] # Base Query - for lookup_string in lookup_query.string: + for lookup_string in lookup_query.query_strings: queries.append( { "multi_match": { @@ -459,7 +463,7 @@ def _build_elasticsearch_query(lookup_query: list[LookupQuery], filters: dict) - # https://www.elastic.co/search-labs/blog/elasticsearch-autocomplete-search#2.-query-time if lookup_query.autocomplete: - for lookup_string in lookup_query.string: + for lookup_string in lookup_query.query_strings: queries.append( { "multi_match": { diff --git a/test/test_api_contract.py b/test/test_api_contract.py index 7c7e281..28ed103 100644 --- a/test/test_api_contract.py +++ b/test/test_api_contract.py @@ -124,10 +124,10 @@ def test_lookup_get_returns_result_shape(nameres_server): status, _, body = _request_json(nameres_server, "/lookup?string=aspirin&limit=1") assert status == 200 - assert isinstance(body, dict) + assert isinstance(body, list) assert body - first_result = next(iter(body.values())) + first_result = body[0] assert first_result["curie"] assert first_result["label"] assert isinstance(first_result["synonyms"], list) @@ -149,7 +149,7 @@ def test_lookup_accepts_issue_8_query_shape(nameres_server): status, _, body = _request_json(nameres_server, f"/lookup?{query}") assert status == 200 - assert isinstance(body, dict) + assert isinstance(body, list) def test_bulk_lookup_post_returns_results_by_input_string(nameres_server): @@ -157,19 +157,19 @@ def test_bulk_lookup_post_returns_results_by_input_string(nameres_server): nameres_server, "/bulk-lookup?limit=1", method="POST", - body={"strings": ["aspirin", "diabetes"]}, + body={"strings": [" Aspirin ", "diabetes"]}, ) assert status == 200 - assert set(body) == {"aspirin", "diabetes"} - assert isinstance(body["aspirin"], dict) - assert isinstance(body["diabetes"], dict) + assert set(body) == {"Aspirin", "diabetes"} + assert isinstance(body["Aspirin"], list) + assert isinstance(body["diabetes"], list) def test_synonyms_post_returns_known_and_missing_curies(nameres_server): lookup_status, _, lookup_body = _request_json(nameres_server, "/lookup?string=aspirin&limit=1") assert lookup_status == 200 - known_curie = next(iter(lookup_body)) + known_curie = lookup_body[0]["curie"] status, _, body = _request_json( nameres_server,