DEVEXP-509 Can now search documents

rjrudin · rjrudin · commit 1608a4490432 · 2023-07-19T14:14:13.000-04:00
diff --git a/marklogic/documents.py b/marklogic/documents.py
@@ -267,10 +267,56 @@ def _extract_values_from_header(part) -> dict:
     }
 
 
+def multipart_response_to_documents(response: Response) -> list[Document]:
+    """
+    Returns a list of Documents, one for each URI found in the various parts in the
+    given multipart response. The response is assumed to correspond to the structure
+    defined by https://docs.marklogic.com/REST/GET/v1/documents when the Accept header
+    is "multipart/mixed".
+    """
+    decoder = MultipartDecoder.from_response(response)
+
+    uris_to_documents = OrderedDict()
+
+    for part in decoder.parts:
+        header_values = _extract_values_from_header(part)
+        uri = header_values["uri"]
+        if header_values["category"] == "content":
+            content = (
+                json.loads(part.content)
+                if header_values["content_type"] == "application/json"
+                else part.content
+            )
+            content_type = header_values["content_type"]
+            version_id = header_values["version_id"]
+            if uris_to_documents.get(uri):
+                doc: Document = uris_to_documents[uri]
+                doc.content = content
+                doc.content_type = content_type
+                doc.version_id = version_id
+            else:
+                uris_to_documents[uri] = Document(
+                    uri, content, content_type=content_type, version_id=version_id
+                )
+        else:
+            doc = (
+                uris_to_documents[uri]
+                if uris_to_documents.get(uri)
+                else Document(uri, None)
+            )
+            uris_to_documents[uri] = doc
+            dict_to_metadata(json.loads(part.content), doc)
+
+    return list(uris_to_documents.values())
+
+
 class DocumentManager:
     """
-    Provides methods to simplify interacting with the /v1/documents REST endpoint
-    defined at https://docs.marklogic.com/REST/client/management.
+    Provides methods to simplify interacting with REST endpoints that either accept
+    or return documents. Primarily involves endpoints defined at
+    https://docs.marklogic.com/REST/client/management , but also includes support for
+    the search endpoint at https://docs.marklogic.com/REST/POST/v1/search which can
+    return documents as well.
     """
 
     def __init__(self, session: Session):
@@ -311,11 +357,18 @@ def write(
 
         return self._session.post("/v1/documents", data=data, headers=headers, **kwargs)
 
-    def _get_multipart_documents_response(
-        self, uris: list[str], categories: list[str], **kwargs
-    ) -> Response:
+    def read(
+        self, uris: list[str], categories: list[str] = None, **kwargs
+    ) -> Union[list[Document], Response]:
         """
-        Constructs and sends a multipart/mixed request to the v1/documents endpoint.
+        Read one or many documents via a GET to the endpoint defined at
+        https://docs.marklogic.com/REST/GET/v1/documents . If a 200 is not returned
+        by that endpoint, then the Response is returned instead.
+
+        :param uris: list of URIs to read.
+        :param categories: optional list of the categories of data to return for each
+        URI. By default, only content will be returned for each URI. See the endpoint
+        documentation for further information.
         """
         params = kwargs.pop("params", {})
         params["uri"] = uris
@@ -325,59 +378,92 @@ def _get_multipart_documents_response(
 
         headers = kwargs.pop("headers", {})
         headers["Accept"] = "multipart/mixed"
-        return self._session.get(
+        response = self._session.get(
             "/v1/documents", params=params, headers=headers, **kwargs
         )
 
-    def read(
-        self, uris: list[str], categories: list[str] = None, **kwargs
+        return (
+            multipart_response_to_documents(response)
+            if response.status_code == 200
+            else response
+        )
+
+    def search(
+        self,
+        query: Union[dict, str] = None,
+        categories: list[str] = None,
+        q: str = None,
+        start: int = None,
+        page_length: int = None,
+        options: str = None,
+        collections: list[str] = None,
+        **kwargs,
     ) -> Union[list[Document], Response]:
         """
-        Read one or many documents via a GET to the endpoint defined at
-        https://docs.marklogic.com/REST/POST/v1/documents . If a 200 is not returned
-        by that endpoint, then the Response is returned instead.
-
-        :param uris: list of URIs to read.
+        Leverages the support in the search endpoint defined at
+        https://docs.marklogic.com/REST/POST/v1/search for returning a list of
+        documents instead of a search response. Parameters that are commonly used for
+        that endpoint are included as arguments to this method for ease of use.
+
+        :param query: JSON or XML query matching one of the types supported by the
+        search endpoint. The "Content-type" header will be set based on whether this
+        is a dict, a string of JSON, or a string of XML.
         :param categories: optional list of the categories of data to return for each
         URI. By default, only content will be returned for each URI. See the endpoint
         documentation for further information.
+        :param q: optional search string.
+        :param start: index of the first result to return.
+        :param page_length: maximum number of documents to return.
+        :param options: name of a query options instance to use.
+        :param collections: restrict results to documents in these collections.
         """
-        response = self._get_multipart_documents_response(uris, categories, **kwargs)
-        if response.status_code != 200:
-            return response
-
-        decoder = MultipartDecoder.from_response(response)
-
-        # Use a dict to store URIs to Document objects so that we don't assume any
-        # order with how the metadata and content parts are returned. An OrderedDict is
-        # used to ensure that the order of the URIs is maintained, though the REST
-        # endpoint is not guaranteed to return them in the same order as provided by
-        # the user.
-        docs = OrderedDict()
-
-        for part in decoder.parts:
-            header_values = _extract_values_from_header(part)
-            uri = header_values["uri"]
-            if header_values["category"] == "content":
-                content = (
-                    json.loads(part.content)
-                    if header_values["content_type"] == "application/json"
-                    else part.content
-                )
-                content_type = header_values["content_type"]
-                version_id = header_values["version_id"]
-                if docs.get(uri):
-                    doc: Document = docs[uri]
-                    doc.content = content
-                    doc.content_type = content_type
-                    doc.version_id = version_id
-                else:
-                    docs[uri] = Document(
-                        uri, content, content_type=content_type, version_id=version_id
-                    )
-            else:
-                doc = docs[uri] if docs.get(uri) else Document(uri, None)
-                docs[uri] = doc
-                dict_to_metadata(json.loads(part.content), doc)
+        params = kwargs.pop("params", {})
+        params["format"] = "json"  # This refers to the metadata format.
+        if categories:
+            params["category"] = categories
+        if collections:
+            params["collection"] = collections
+        if q:
+            params["q"] = q
+        if start:
+            params["start"] = start
+        if page_length:
+            params["pageLength"] = page_length
+        if options:
+            params["options"] = options
+
+        headers = kwargs.pop("headers", {})
+        headers["Accept"] = "multipart/mixed"
+        data = None
 
-        return list(docs.values())
+        if query:
+            if isinstance(query, dict):
+                data = json.dumps(query)
+                headers["Content-type"] = "application/json"
+            else:
+                data = query
+                try:
+                    json.loads(query)
+                except Exception:
+                    headers["Content-type"] = "application/xml"
+                else:
+                    headers["Content-type"] = "application/json"
+
+        if data:
+            response = self._session.post(
+                "/v1/search",
+                headers=headers,
+                params=params,
+                data=data,
+                **kwargs,
+            )
+        else:
+            response = self._session.post(
+                "/v1/search", headers=headers, params=params, **kwargs
+            )
+
+        return (
+            multipart_response_to_documents(response)
+            if response.status_code == 200
+            else response
+        )
diff --git a/test-app/src/main/ml-modules/options/test-options.xml b/test-app/src/main/ml-modules/options/test-options.xml
@@ -0,0 +1,7 @@
+<options xmlns="http://marklogic.com/appservices/search">
+    <constraint name="hello">
+        <value>
+            <element ns="" name="hello"/>
+        </value>
+    </constraint>
+</options>
diff --git a/tests/test_search_docs.py b/tests/test_search_docs.py
@@ -0,0 +1,66 @@
+import json
+
+from requests import Response
+
+from marklogic import Client
+
+
+def test_structured_json_string_query(client: Client):
+    query = json.dumps({"query": {"term-query": {"text": "world"}}})
+    docs = client.documents.search(query=query)
+    assert len(docs) == 2
+
+
+def test_structured_json_query(client: Client):
+    query = {"query": {"term-query": {"text": "world"}}}
+    docs = client.documents.search(query=query)
+    assert len(docs) == 2
+
+
+def test_structured_xml_query(client: Client):
+    query = "<query xmlns='http://marklogic.com/appservices/search'>\
+        <term-query><text>world</text></term-query></query>"
+    docs = client.documents.search(query=query)
+    assert len(docs) == 2
+
+
+def test_qtext_and_start(client: Client):
+    docs = client.documents.search(q="world", start=2)
+    assert len(docs) == 1, "2 docs match, but start=2, so only 1 should be returned"
+
+
+def test_qtext_and_page_length(client: Client):
+    docs = client.documents.search(q="world", page_length=1)
+    assert len(docs) == 1
+
+
+def test_search_options(client: Client):
+    docs = client.documents.search(q="hello:world", options="test-options")
+    assert len(docs) == 1
+    assert docs[0].uri == "/doc2.xml"
+    docs = client.documents.search(q="hello:no matches", options="test-options")
+    assert len(docs) == 0
+
+
+def test_collection(client: Client):
+    docs = client.documents.search(
+        categories=["content", "collections"], collections=["test-data"]
+    )
+    assert len(docs) == 2
+
+    doc1 = next(doc for doc in docs if doc.uri == "/doc1.json")
+    assert doc1.content is not None
+    assert doc1.collections[0] == "test-data"
+
+    doc2 = next(doc for doc in docs if doc.uri == "/doc2.xml")
+    assert doc2.content is not None
+    assert doc2.collections[0] == "test-data"
+
+
+def test_not_rest_user(not_rest_user_client: Client):
+    response: Response = not_rest_user_client.documents.search(q="hello")
+    assert (
+        response.status_code == 403
+    ), """The user does not have the rest-reader privilege, so MarkLogic is expected
+    to return a 403. And the documents.search method is then expected to return the
+    Response so that the user has access to everything in it."""