11import json
2+ from collections import OrderedDict
23from typing import Union
34
45from requests import Response , Session
6+ from requests_toolbelt .multipart .decoder import MultipartDecoder
57from urllib3 .fields import RequestField
68from urllib3 .filepost import encode_multipart_formdata
79
@@ -63,27 +65,29 @@ def metadata_to_dict(metadata: Metadata) -> dict:
6365 return md
6466
6567
def dict_to_metadata(metadata: dict, target_metadata: Metadata) -> None:
    """
    Populates the given Metadata instance based on the metadata dictionary as returned
    by the /v1/documents REST endpoint.

    :param metadata: the metadata dictionary; the keys read are "collections",
    "quality", "metadataValues", "properties" and "permissions". A missing key results
    in the corresponding attribute being set to None.
    :param target_metadata: the instance to populate; its collections, quality,
    metadata_values, properties and permissions attributes are all overwritten.
    """
    target_metadata.collections = metadata.get("collections")
    target_metadata.quality = metadata.get("quality")
    target_metadata.metadata_values = metadata.get("metadataValues")
    target_metadata.properties = metadata.get("properties")

    # The dictionary holds permissions as a list of entries, each with a "role-name"
    # and its "capabilities"; they are flattened here into a dict keyed by role name.
    permissions = metadata.get("permissions")
    if permissions:
        target_metadata.permissions = {
            perm["role-name"]: perm["capabilities"] for perm in permissions
        }
    else:
        # Reset the real attribute so that permissions already present on the target
        # are cleared rather than left in place.
        target_metadata.permissions = None
85+
86+
6687class Document (Metadata ):
6788 """
68- :param uri: the URI of the document; can be None when relying on MarkLogic to
69- generate a URI.
70- :param content: the content of the document.
71- :param collections: see definition in parent class.
72- :param permissions: see definition in parent class.
73- :param quality: see definition in parent class.
74- :param metadata_values: see definition in parent class.
75- :param properties: see definition in parent class.
76- :param content_type: the MIME type of the document; use when MarkLogic cannot
77- determine the MIME type based on the URI.
78- :param extension: specifies a suffix for a URI generated by MarkLogic.
79- :param directory: specifies a prefix for a URI generated by MarkLogic.
80- :param repair: for an XML document, the level of XML repair to perform; can be
81- "full" or "none", with "none" being the default.
82- :param version_id: affects updates when optimistic locking is enabled; see
83- https://docs.marklogic.com/REST/POST/v1/documents for more information.
84- :param temporal_document: the logical document URI for a document written to a
85- temporal collection; requires that a "temporal-collection" parameter be included in
86- the request.
89+ Represents a document, either as read from MarkLogic or as a document to be
90+ written to MarkLogic.
8791 """
8892
8993 def __init__ (
@@ -96,24 +100,48 @@ def __init__(
96100 metadata_values : dict = None ,
97101 properties : dict = None ,
98102 content_type : str = None ,
103+ version_id : str = None ,
99104 extension : str = None ,
100105 directory : str = None ,
101106 repair : str = None ,
102107 extract : str = None ,
103- version_id : str = None ,
104108 temporal_document : str = None ,
105109 ):
110+ """
111+ :param uri: the URI of the document; can be None when relying on MarkLogic to
112+ generate a URI.
113+ :param content: the content of the document.
114+ :param collections: see definition in parent class.
115+ :param permissions: see definition in parent class.
116+ :param quality: see definition in parent class.
117+ :param metadata_values: see definition in parent class.
118+ :param properties: see definition in parent class.
119+ :param content_type: the MIME type of the document; use when MarkLogic cannot
120+ determine the MIME type based on the URI.
121+ :param version_id: affects updates when optimistic locking is enabled; see
122+ https://docs.marklogic.com/REST/POST/v1/documents for more information.
123+ :param extension: specifies a suffix for a URI generated by MarkLogic; only used
124+ when writing a document.
125+ :param directory: specifies a prefix for a URI generated by MarkLogic; only used
126+ when writing a document.
127+ :param repair: for an XML document, the level of XML repair to perform; can be
128+ "full" or "none", with "none" being the default; only used when writing a
129+ document.
130+ :param temporal_document: the logical document URI for a document written to a
131+ temporal collection; requires that a "temporal-collection" parameter be
132+ included in the request; only used when writing a document.
133+ """
106134 super ().__init__ (collections , permissions , quality , metadata_values , properties )
107135 self .uri = uri
108136 self .content = content
137+ self .content_type = content_type
138+ self .version_id = version_id
109139
110140 # The following are all specific to writing a document.
111- self .content_type = content_type
112141 self .extension = extension
113142 self .directory = directory
114143 self .repair = repair
115144 self .extract = extract
116- self .version_id = version_id
117145 self .temporal_document = temporal_document
118146
119147 def to_request_field (self ) -> RequestField :
@@ -208,6 +236,37 @@ def to_metadata_request_field(self) -> RequestField:
208236 return field
209237
210238
def _extract_values_from_header(part) -> dict:
    """
    Returns a dict containing values about the document content or metadata, as parsed
    from the Content-Disposition and Content-Type headers of a single response part.

    :param part: a body part exposing an "encoding" attribute and a "headers" mapping
    whose keys and values are bytes in that encoding.
    :return: a dict with the keys "uri", "category", "content_type" and "version_id";
    "content_type" and "version_id" are None when the part does not provide them.
    :raises KeyError: if the Content-Disposition header is missing, or if it lacks a
    "filename" or "category" parameter.
    """
    encoding = part.encoding
    disposition = part.headers["Content-Disposition".encode(encoding)].decode(encoding)

    disposition_values = {}
    for item in disposition.split(";"):
        # Split on the first "=" only, so that a value which itself contains "=" (such
        # as a URI) is kept whole instead of being silently dropped.
        # NOTE(review): a ";" inside a quoted filename would still be split above.
        name, separator, value = item.partition("=")
        # Items without "=", such as the leading "attachment", can be ignored.
        if separator:
            disposition_values[name.strip()] = value.strip()

    raw_content_type = part.headers.get("Content-Type".encode(encoding))
    content_type = raw_content_type.decode(encoding) if raw_content_type else None

    # The filename may be wrapped in double quotes; drop one from either end.
    uri = disposition_values["filename"]
    if uri.startswith('"'):
        uri = uri[1:]
    if uri.endswith('"'):
        uri = uri[:-1]

    return {
        "uri": uri,
        "category": disposition_values["category"],
        "content_type": content_type,
        "version_id": disposition_values.get("versionId"),
    }
268+
269+
211270class DocumentManager :
212271 """
213272 Provides methods to simplify interacting with the /v1/documents REST endpoint
@@ -251,3 +310,74 @@ def write(
251310 headers ["Accept" ] = "application/json"
252311
253312 return self ._session .post ("/v1/documents" , data = data , headers = headers , ** kwargs )
313+
314+ def _get_multipart_documents_response (
315+ self , uris : list [str ], categories : list [str ], ** kwargs
316+ ) -> Response :
317+ """
318+ Constructs and sends a multipart/mixed request to the v1/documents endpoint.
319+ """
320+ params = kwargs .pop ("params" , {})
321+ params ["uri" ] = uris
322+ params ["format" ] = "json" # This refers to the metadata format.
323+ if categories :
324+ params ["category" ] = categories
325+
326+ headers = kwargs .pop ("headers" , {})
327+ headers ["Accept" ] = "multipart/mixed"
328+ return self ._session .get (
329+ "/v1/documents" , params = params , headers = headers , ** kwargs
330+ )
331+
def read(
    self, uris: list[str], categories: list[str] = None, **kwargs
) -> Union[list[Document], Response]:
    """
    Read one or many documents via a GET to the endpoint defined at
    https://docs.marklogic.com/REST/GET/v1/documents . If a 200 is not returned
    by that endpoint, then the Response is returned instead.

    :param uris: list of URIs to read.
    :param categories: optional list of the categories of data to return for each
    URI. By default, only content will be returned for each URI. See the endpoint
    documentation for further information.
    :param kwargs: passed through to the underlying GET request.
    :return: a list with one Document per URI found in the response, or the
    Response itself when its status code is not 200.
    """
    response = self._get_multipart_documents_response(uris, categories, **kwargs)
    if response.status_code != 200:
        return response

    # Use a dict to store URIs to Document objects so that we don't assume any
    # order with how the metadata and content parts are returned. An OrderedDict is
    # used to ensure that the order of the URIs is maintained, though the REST
    # endpoint is not guaranteed to return them in the same order as provided by
    # the user.
    docs = OrderedDict()

    for part in MultipartDecoder.from_response(response).parts:
        header_values = _extract_values_from_header(part)
        uri = header_values["uri"]

        # Get or create the Document for this URI; whichever of its parts arrives
        # first creates it, and later parts fill in the rest.
        doc = docs.get(uri)
        if doc is None:
            doc = docs[uri] = Document(uri, None)

        if header_values["category"] == "content":
            content_type = header_values["content_type"]
            # JSON content is parsed; anything else is kept as the raw part content.
            doc.content = (
                json.loads(part.content)
                if content_type == "application/json"
                else part.content
            )
            doc.content_type = content_type
            doc.version_id = header_values["version_id"]
        else:
            # Any other category is a metadata part, which is requested as JSON.
            dict_to_metadata(json.loads(part.content), doc)

    return list(docs.values())
0 commit comments