Skip to content

Commit 1608a44

Browse files
committed
DEVEXP-509 Can now search documents
1 parent ecccf48 commit 1608a44

File tree

3 files changed

+211
-52
lines changed

3 files changed

+211
-52
lines changed

marklogic/documents.py

Lines changed: 138 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -267,10 +267,56 @@ def _extract_values_from_header(part) -> dict:
267267
}
268268

269269

270+
def multipart_response_to_documents(response: Response) -> list[Document]:
    """
    Converts a multipart/mixed response into Documents, producing one Document per
    distinct URI found across the parts. The response is expected to follow the
    structure described at https://docs.marklogic.com/REST/GET/v1/documents for
    requests whose Accept header is "multipart/mixed".

    :param response: the multipart response to decode.
    :return: the Documents, in the order in which each URI was first encountered.
    """
    # Keyed by URI so that the content and metadata parts of a document may arrive
    # in either order; an OrderedDict keeps the URIs in first-seen order.
    documents_by_uri = OrderedDict()

    for part in MultipartDecoder.from_response(response).parts:
        headers = _extract_values_from_header(part)
        uri = headers["uri"]

        if headers["category"] != "content":
            # Any non-content part is treated as JSON metadata for the URI.
            target = documents_by_uri.get(uri) or Document(uri, None)
            documents_by_uri[uri] = target
            dict_to_metadata(json.loads(part.content), target)
            continue

        media_type = headers["content_type"]
        if media_type == "application/json":
            body = json.loads(part.content)
        else:
            body = part.content
        version = headers["version_id"]

        existing = documents_by_uri.get(uri)
        if existing:
            # A metadata part for this URI was seen first; fill in the content.
            existing.content = body
            existing.content_type = media_type
            existing.version_id = version
        else:
            documents_by_uri[uri] = Document(
                uri, body, content_type=media_type, version_id=version
            )

    return list(documents_by_uri.values())
311+
312+
270313
class DocumentManager:
271314
"""
272-
Provides methods to simplify interacting with the /v1/documents REST endpoint
273-
defined at https://docs.marklogic.com/REST/client/management.
315+
Provides methods to simplify interacting with REST endpoints that either accept
316+
or return documents. Primarily involves endpoints defined at
317+
https://docs.marklogic.com/REST/client/management , but also includes support for
318+
the search endpoint at https://docs.marklogic.com/REST/POST/v1/search which can
319+
return documents as well.
274320
"""
275321

276322
def __init__(self, session: Session):
@@ -311,11 +357,18 @@ def write(
311357

312358
return self._session.post("/v1/documents", data=data, headers=headers, **kwargs)
313359

314-
def _get_multipart_documents_response(
315-
self, uris: list[str], categories: list[str], **kwargs
316-
) -> Response:
360+
def read(
361+
self, uris: list[str], categories: list[str] = None, **kwargs
362+
) -> Union[list[Document], Response]:
317363
"""
318-
Constructs and sends a multipart/mixed request to the v1/documents endpoint.
364+
Read one or many documents via a GET to the endpoint defined at
365+
https://docs.marklogic.com/REST/GET/v1/documents . If a 200 is not returned
366+
by that endpoint, then the Response is returned instead.
367+
368+
:param uris: list of URIs to read.
369+
:param categories: optional list of the categories of data to return for each
370+
URI. By default, only content will be returned for each URI. See the endpoint
371+
documentation for further information.
319372
"""
320373
params = kwargs.pop("params", {})
321374
params["uri"] = uris
@@ -325,59 +378,92 @@ def _get_multipart_documents_response(
325378

326379
headers = kwargs.pop("headers", {})
327380
headers["Accept"] = "multipart/mixed"
328-
return self._session.get(
381+
response = self._session.get(
329382
"/v1/documents", params=params, headers=headers, **kwargs
330383
)
331384

332-
def read(
333-
self, uris: list[str], categories: list[str] = None, **kwargs
385+
return (
386+
multipart_response_to_documents(response)
387+
if response.status_code == 200
388+
else response
389+
)
390+
391+
def search(
    self,
    query: Union[dict, str] = None,
    categories: list[str] = None,
    q: str = None,
    start: int = None,
    page_length: int = None,
    options: str = None,
    collections: list[str] = None,
    **kwargs,
) -> Union[list[Document], Response]:
    """
    Leverages the support in the search endpoint defined at
    https://docs.marklogic.com/REST/POST/v1/search for returning a list of
    documents instead of a search response. Parameters that are commonly used for
    that endpoint are included as arguments to this method for ease of use. If a
    200 is not returned by that endpoint, then the Response is returned instead.

    :param query: JSON or XML query matching one of the types supported by the
    search endpoint. The "Content-type" header will be set based on whether this
    is a dict, a string of JSON, or a string of XML.
    :param categories: optional list of the categories of data to return for each
    URI. By default, only content will be returned for each URI. See the endpoint
    documentation for further information.
    :param q: optional search string.
    :param start: index of the first result to return. Sent whenever it is not
    None.
    :param page_length: maximum number of documents to return. Sent whenever it is
    not None, so an explicit 0 is forwarded rather than dropped.
    :param options: name of a query options instance to use.
    :param collections: restrict results to documents in these collections.
    :param kwargs: passed through to the session's post call. Any "params" and
    "headers" values are copied before being extended, so the caller's objects
    are left untouched.
    :return: a list of Documents when the endpoint returns a 200, otherwise the
    Response.
    """
    # Copy caller-supplied mappings so that adding entries below does not leak
    # back into objects the caller may reuse for other requests.
    params = kwargs.pop("params", None)
    params = params.copy() if params else {}
    params["format"] = "json"  # This refers to the metadata format.
    if categories:
        params["category"] = categories
    if collections:
        params["collection"] = collections
    if q:
        params["q"] = q
    # Compare against None instead of relying on truthiness so that an explicit
    # 0 is sent to the server rather than silently replaced by its default.
    if start is not None:
        params["start"] = start
    if page_length is not None:
        params["pageLength"] = page_length
    if options:
        params["options"] = options

    headers = kwargs.pop("headers", None)
    headers = headers.copy() if headers else {}
    headers["Accept"] = "multipart/mixed"

    request_args = {"headers": headers, "params": params}
    if query:
        if isinstance(query, dict):
            request_args["data"] = json.dumps(query)
            headers["Content-type"] = "application/json"
        else:
            request_args["data"] = query
            # Anything that does not parse as JSON is assumed to be XML.
            try:
                json.loads(query)
            except (TypeError, ValueError):
                headers["Content-type"] = "application/xml"
            else:
                headers["Content-type"] = "application/json"

    # "data" is only passed when a query was given, so a request body supplied
    # through kwargs keeps working for query-less calls.
    response = self._session.post("/v1/search", **request_args, **kwargs)
    if response.status_code != 200:
        return response
    return multipart_response_to_documents(response)
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<options xmlns="http://marklogic.com/appservices/search">
2+
<constraint name="hello">
3+
<value>
4+
<element ns="" name="hello"/>
5+
</value>
6+
</constraint>
7+
</options>

tests/test_search_docs.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import json
2+
3+
from requests import Response
4+
5+
from marklogic import Client
6+
7+
8+
def test_structured_json_string_query(client: Client):
    """A structured query supplied as a JSON string returns two documents."""
    serialized_query = json.dumps({"query": {"term-query": {"text": "world"}}})
    matches = client.documents.search(query=serialized_query)
    assert len(matches) == 2
12+
13+
14+
def test_structured_json_query(client: Client):
    """A structured query supplied as a dict returns two documents."""
    term_query = {"text": "world"}
    matches = client.documents.search(query={"query": {"term-query": term_query}})
    assert len(matches) == 2
18+
19+
20+
def test_structured_xml_query(client: Client):
    """A structured query supplied as an XML string returns two documents."""
    # Adjacent string literals are joined at compile time, so the payload does not
    # pick up whatever indentation the continuation line happens to have (which a
    # backslash continuation inside the literal would embed in the XML).
    query = (
        "<query xmlns='http://marklogic.com/appservices/search'>"
        "<term-query><text>world</text></term-query></query>"
    )
    docs = client.documents.search(query=query)
    assert len(docs) == 2
25+
26+
27+
def test_qtext_and_start(client: Client):
    """Searching with a search string and start=2 returns a single document."""
    remaining = client.documents.search(q="world", start=2)
    assert len(remaining) == 1, "2 docs match, but start=2, so only 1 should be returned"
30+
31+
32+
def test_qtext_and_page_length(client: Client):
    """Searching with a search string and page_length=1 returns one document."""
    first_page = client.documents.search(q="world", page_length=1)
    assert len(first_page) == 1
35+
36+
37+
def test_search_options(client: Client):
    """
    Searches using the "test-options" query options: the constrained search string
    "hello:world" returns only /doc2.xml, and "hello:no matches" returns nothing.
    """
    options_name = "test-options"

    matching = client.documents.search(q="hello:world", options=options_name)
    assert len(matching) == 1
    assert matching[0].uri == "/doc2.xml"

    nothing = client.documents.search(q="hello:no matches", options=options_name)
    assert len(nothing) == 0
43+
44+
45+
def test_collection(client: Client):
    """
    Searches by collection while requesting content and collections, then checks
    that both expected documents come back with content and with "test-data" as
    their first collection.
    """
    docs = client.documents.search(
        categories=["content", "collections"], collections=["test-data"]
    )
    assert len(docs) == 2

    for expected_uri in ("/doc1.json", "/doc2.xml"):
        found = next(doc for doc in docs if doc.uri == expected_uri)
        assert found.content is not None
        assert found.collections[0] == "test-data"
58+
59+
60+
def test_not_rest_user(not_rest_user_client: Client):
    """A search by the non-REST user yields the raw Response with a 403 status."""
    forbidden: Response = not_rest_user_client.documents.search(q="hello")
    status = forbidden.status_code
    assert (
        status == 403
    ), """The user does not have the rest-reader privilege, so MarkLogic is expected
to return a 403. And the documents.search method is then expected to return the
Response so that the user has access to everything in it."""

0 commit comments

Comments
 (0)