Skip to content

Commit 305de78

Browse files
committed
DEVEXP-503 Can now read a batch of documents with metadata
Reworked some of the existing "write" tests to now use `client.documents.read` to verify data that was written.
1 parent aee68d2 commit 305de78

File tree

9 files changed

+383
-153
lines changed

9 files changed

+383
-153
lines changed

marklogic/documents.py

Lines changed: 152 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import json
2+
from collections import OrderedDict
23
from typing import Union
34

45
from requests import Response, Session
6+
from requests_toolbelt.multipart.decoder import MultipartDecoder
57
from urllib3.fields import RequestField
68
from urllib3.filepost import encode_multipart_formdata
79

@@ -63,27 +65,29 @@ def metadata_to_dict(metadata: Metadata) -> dict:
6365
return md
6466

6567

68+
def dict_to_metadata(metadata: dict, target_metadata: Metadata) -> None:
    """
    Populates the given Metadata instance based on the metadata dictionary as returned
    by the /v1/documents REST endpoint.

    :param metadata: the metadata of a single document as a dict; the keys read are
    "collections", "quality", "metadataValues", "properties" and "permissions".
    :param target_metadata: the instance to populate. Every one of its metadata
    attributes is overwritten, and is set to None when the corresponding key is
    absent from the dictionary.
    """
    target_metadata.collections = metadata.get("collections")
    target_metadata.quality = metadata.get("quality")
    target_metadata.metadata_values = metadata.get("metadataValues")
    target_metadata.properties = metadata.get("properties")

    # Permissions arrive as a list of {"role-name": ..., "capabilities": [...]}
    # objects and are collapsed into a dict keyed by role name.
    permissions = metadata.get("permissions")
    if permissions:
        target_metadata.permissions = {
            perm["role-name"]: perm["capabilities"] for perm in permissions
        }
    else:
        # Reset the real attribute so that stale permissions on a reused instance
        # are cleared when the response carries none.
        target_metadata.permissions = None
85+
86+
6687
class Document(Metadata):
6788
"""
68-
:param uri: the URI of the document; can be None when relying on MarkLogic to
69-
generate a URI.
70-
:param content: the content of the document.
71-
:param collections: see definition in parent class.
72-
:param permissions: see definition in parent class.
73-
:param quality: see definition in parent class.
74-
:param metadata_values: see definition in parent class.
75-
:param properties: see definition in parent class.
76-
:param content_type: the MIME type of the document; use when MarkLogic cannot
77-
determine the MIME type based on the URI.
78-
:param extension: specifies a suffix for a URI generated by MarkLogic.
79-
:param directory: specifies a prefix for a URI generated by MarkLogic.
80-
:param repair: for an XML document, the level of XML repair to perform; can be
81-
"full" or "none", with "none" being the default.
82-
:param version_id: affects updates when optimistic locking is enabled; see
83-
https://docs.marklogic.com/REST/POST/v1/documents for more information.
84-
:param temporal_document: the logical document URI for a document written to a
85-
temporal collection; requires that a "temporal-collection" parameter be included in
86-
the request.
89+
Represents a document, either as read from MarkLogic or as a document to be
90+
written to MarkLogic.
8791
"""
8892

8993
def __init__(
@@ -96,24 +100,48 @@ def __init__(
96100
metadata_values: dict = None,
97101
properties: dict = None,
98102
content_type: str = None,
103+
version_id: str = None,
99104
extension: str = None,
100105
directory: str = None,
101106
repair: str = None,
102107
extract: str = None,
103-
version_id: str = None,
104108
temporal_document: str = None,
105109
):
110+
"""
111+
:param uri: the URI of the document; can be None when relying on MarkLogic to
112+
generate a URI.
113+
:param content: the content of the document.
114+
:param collections: see definition in parent class.
115+
:param permissions: see definition in parent class.
116+
:param quality: see definition in parent class.
117+
:param metadata_values: see definition in parent class.
118+
:param properties: see definition in parent class.
119+
:param content_type: the MIME type of the document; use when MarkLogic cannot
120+
determine the MIME type based on the URI.
121+
:param version_id: affects updates when optimistic locking is enabled; see
122+
https://docs.marklogic.com/REST/POST/v1/documents for more information.
123+
:param extension: specifies a suffix for a URI generated by MarkLogic; only used
124+
when writing a document.
125+
:param directory: specifies a prefix for a URI generated by MarkLogic; only used
126+
when writing a document.
127+
:param repair: for an XML document, the level of XML repair to perform; can be
128+
"full" or "none", with "none" being the default; only used when writing a
129+
document.
130+
:param temporal_document: the logical document URI for a document written to a
131+
temporal collection; requires that a "temporal-collection" parameter be
132+
included in the request; only used when writing a document.
133+
"""
106134
super().__init__(collections, permissions, quality, metadata_values, properties)
107135
self.uri = uri
108136
self.content = content
137+
self.content_type = content_type
138+
self.version_id = version_id
109139

110140
# The following are all specific to writing a document.
111-
self.content_type = content_type
112141
self.extension = extension
113142
self.directory = directory
114143
self.repair = repair
115144
self.extract = extract
116-
self.version_id = version_id
117145
self.temporal_document = temporal_document
118146

119147
def to_request_field(self) -> RequestField:
@@ -208,6 +236,37 @@ def to_metadata_request_field(self) -> RequestField:
208236
return field
209237

210238

239+
def _extract_values_from_header(part) -> dict:
    """
    Returns a dict containing values about the document content or metadata.

    :param part: a single part of a multipart response; it must expose an "encoding"
    and a "headers" mapping whose keys and values are bytes.
    :return: a dict with the keys "uri", "category", "content_type" and
    "version_id"; the last two are None when the part does not provide them.
    :raises KeyError: if the part has no Content-Disposition header, or that header
    has no "filename" or no "category" parameter.
    """
    encoding = part.encoding
    disposition = part.headers["Content-Disposition".encode(encoding)].decode(encoding)

    disposition_values = {}
    for item in disposition.split(";"):
        # Split on the first "=" only, so that a value which itself contains "="
        # (e.g. a URI such as "/a=b.json") is kept instead of being dropped.
        name, separator, value = item.partition("=")
        # The first item will be "attachment"; it has no "=" and can be ignored.
        if separator:
            disposition_values[name.strip()] = value.strip()

    raw_content_type = part.headers.get("Content-Type".encode(encoding))
    content_type = raw_content_type.decode(encoding) if raw_content_type else None

    # The URI is normally wrapped in double quotes; remove one from each end.
    # NOTE(review): a URI containing ";" is still split apart above - confirm
    # whether such URIs need to be supported.
    uri = disposition_values["filename"]
    if uri.startswith('"'):
        uri = uri[1:]
    if uri.endswith('"'):
        uri = uri[:-1]

    return {
        "uri": uri,
        "category": disposition_values["category"],
        "content_type": content_type,
        "version_id": disposition_values.get("versionId"),
    }
268+
269+
211270
class DocumentManager:
212271
"""
213272
Provides methods to simplify interacting with the /v1/documents REST endpoint
@@ -251,3 +310,74 @@ def write(
251310
headers["Accept"] = "application/json"
252311

253312
return self._session.post("/v1/documents", data=data, headers=headers, **kwargs)
313+
314+
def _get_multipart_documents_response(
315+
self, uris: list[str], categories: list[str], **kwargs
316+
) -> Response:
317+
"""
318+
Constructs and sends a multipart/mixed request to the v1/documents endpoint.
319+
"""
320+
params = kwargs.pop("params", {})
321+
params["uri"] = uris
322+
params["format"] = "json" # This refers to the metadata format.
323+
if categories:
324+
params["category"] = categories
325+
326+
headers = kwargs.pop("headers", {})
327+
headers["Accept"] = "multipart/mixed"
328+
return self._session.get(
329+
"/v1/documents", params=params, headers=headers, **kwargs
330+
)
331+
332+
def read(
333+
self, uris: list[str], categories: list[str] = None, **kwargs
334+
) -> Union[list[Document], Response]:
335+
"""
336+
Read one or many documents via a GET to the endpoint defined at
337+
https://docs.marklogic.com/REST/POST/v1/documents . If a 200 is not returned
338+
by that endpoint, then the Response is returned instead.
339+
340+
:param uris: list of URIs to read.
341+
:param categories: optional list of the categories of data to return for each
342+
URI. By default, only content will be returned for each URI. See the endpoint
343+
documentation for further information.
344+
"""
345+
response = self._get_multipart_documents_response(uris, categories, **kwargs)
346+
if response.status_code != 200:
347+
return response
348+
349+
decoder = MultipartDecoder.from_response(response)
350+
351+
# Use a dict to store URIs to Document objects so that we don't assume any
352+
# order with how the metadata and content parts are returned. An OrderedDict is
353+
# used to ensure that the order of the URIs is maintained, though the REST
354+
# endpoint is not guaranteed to return them in the same order as provided by
355+
# the user.
356+
docs = OrderedDict()
357+
358+
for part in decoder.parts:
359+
header_values = _extract_values_from_header(part)
360+
uri = header_values["uri"]
361+
if header_values["category"] == "content":
362+
content = (
363+
json.loads(part.content)
364+
if header_values["content_type"] == "application/json"
365+
else part.content
366+
)
367+
content_type = header_values["content_type"]
368+
version_id = header_values["version_id"]
369+
if docs.get(uri):
370+
doc: Document = docs[uri]
371+
doc.content = content
372+
doc.content_type = content_type
373+
doc.version_id = version_id
374+
else:
375+
docs[uri] = Document(
376+
uri, content, content_type=content_type, version_id=version_id
377+
)
378+
else:
379+
doc = docs[uri] if docs.get(uri) else Document(uri, None)
380+
docs[uri] = doc
381+
dict_to_metadata(json.loads(part.content), doc)
382+
383+
return list(docs.values())
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"user-name": "python-not-rest-user",
3+
"description": "For tests where the user does not have the privileges required by the REST API.",
4+
"password": "password",
5+
"role": [
6+
"qconsole-user"
7+
]
8+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
function transform(context, params, content) {
2+
return {
3+
"envelope": content
4+
}
5+
};
6+
exports.transform = transform;

tests/conftest.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,29 @@
11
import pytest
2+
23
from marklogic import Client
34

5+
BASE_URL = "http://localhost:8030"
6+
47

58
@pytest.fixture
69
def client():
7-
return Client("http://localhost:8030", digest=("python-test-user", "password"))
10+
return Client(BASE_URL, digest=("python-test-user", "password"))
811

912

1013
@pytest.fixture
1114
def admin_client():
12-
return Client("http://localhost:8030", digest=("python-test-admin", "password"))
15+
return Client(BASE_URL, digest=("python-test-admin", "password"))
1316

1417

1518
@pytest.fixture
1619
def basic_client():
1720
# requests allows a tuple to be passed when doing basic authentication.
18-
return Client("http://localhost:8030", auth=("python-test-user", "password"))
21+
return Client(BASE_URL, auth=("python-test-user", "password"))
22+
23+
24+
@pytest.fixture
25+
def not_rest_user_client():
26+
return Client(BASE_URL, digest=("python-not-rest-user", "password"))
1927

2028

2129
@pytest.fixture

tests/test_get_documents.py

Lines changed: 0 additions & 54 deletions
This file was deleted.

0 commit comments

Comments
 (0)