sortxml/sortxml.py at master · Kopachris/sortxml · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#!/usr/bin/python310

"""Simple XML element sorter.

This module can be used by importing `sort_xml` or by running standalone from the command-line.

"""

#  Copyright (c) 2022, Chris Koch <kopachris@gmail.com>
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are
#  met:
#
#      (1) Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#
#      (2) Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#
#      (3)The name of the author may not be used to
#      endorse or promote products derived from this software without
#      specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
#  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
#  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
#  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
#  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
#  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
#  IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
#  POSSIBILITY OF SUCH DAMAGE.

# Version as a (major, minor, patch) tuple, plus the same value rendered as a dotted string.
__version__ = (0, 1, 0)
__version_str__ = '.'.join(map(str, __version__))

# Passed to the argument parser as the description text when the module is run as a script.
__description__ = """
    A simple XML element sorter.  Will sort the children of selected elements
    using a given attribute's value or subelement's text as the sort key.
    Example usage:
        $ python sortxml.py ARForm_orig.rdl "./DataSets/DataSet[@Name='ARForm']/Fields" Name -o ARForm.rdl
"""

import argparse as ap
import xml.etree.ElementTree as ET
from pathlib import Path
from io import TextIOWrapper
from codecs import BOM_UTF8
from decimal import Decimal
from dateutil.parser import parse as parse_dt


class NSElement(ET.Element):
    """An ElementTree.Element that remembers the builder which created it and that builder's namespaces.

    The optional keyword-only argument ``builder`` is consumed here and not forwarded to `ET.Element`.
    If the builder has an ``ns_map`` attribute, that same mapping object (not a copy) becomes the default
    ``namespaces`` argument for `find`, `findall`, `findtext` and `iterfind`.
    """

    def __init__(self, *args, **kwargs):
        # Pull `builder` out before delegating so the base class never sees it as an extra attribute.
        creator = kwargs.pop('builder', None)
        self._builder = creator
        self._ns_map = creator.ns_map if hasattr(creator, 'ns_map') else dict()
        super().__init__(*args, **kwargs)

    def find(self, path, namespaces=None):
        """Like `Element.find`, defaulting ``namespaces`` to the stored namespace map."""
        return super().find(path, self._ns_map if namespaces is None else namespaces)

    def findall(self, path, namespaces=None):
        """Like `Element.findall`, defaulting ``namespaces`` to the stored namespace map."""
        return super().findall(path, self._ns_map if namespaces is None else namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Like `Element.findtext`, defaulting ``namespaces`` to the stored namespace map."""
        return super().findtext(path, default, self._ns_map if namespaces is None else namespaces)

    def iterfind(self, path, namespaces=None):
        """Like `Element.iterfind`, defaulting ``namespaces`` to the stored namespace map."""
        return super().iterfind(path, self._ns_map if namespaces is None else namespaces)


class NSTreeBuilder(ET.TreeBuilder):
    """Subclass of ElementTree.TreeBuilder which adds namespaces in the document to the namespace registry.

    Each namespace declaration reported by the parser is stored in ``ns_map`` (prefix -> URI) and, when
    ElementTree accepts the prefix, also registered globally via `ET.register_namespace`. Every element
    is created as ``NSElement(tag, attrs, builder=self)`` so it can use ``ns_map`` for lookups.

    Any ``element_factory`` keyword supplied by the caller is discarded; all other keyword arguments are
    forwarded to `ET.TreeBuilder` unchanged.
    """

    def __init__(self, **kwargs):
        self.ns_map = dict()
        # The factory is fixed by this class, so a caller-supplied one is dropped.
        kwargs.pop('element_factory', None)
        # Elements are created through the public `element_factory` hook instead of overriding `start()`.
        # The private attributes such an override needs (`_factory`, `_elem`, `_last`, ...) belong to the
        # pure-Python TreeBuilder only. NOTE(review): they appear to be absent from the C-accelerated
        # TreeBuilder -- confirm on the target interpreter.
        super().__init__(element_factory=self._make_element, **kwargs)

    def _make_element(self, tag, attrs):
        """Element factory handed to the base class: build an `NSElement` bound to this builder."""
        return NSElement(tag, attrs, builder=self)

    def start_ns(self, prefix, uri):
        """Record a namespace declaration and try to register it with ElementTree.

        The mapping is always stored in ``ns_map``. `ET.register_namespace` raises ``ValueError`` for
        prefixes it reserves or considers invalid (for example ``ns0``); in that case the global
        registration is skipped so that parsing can continue.
        """
        self.ns_map[prefix] = uri
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # Reserved/invalid prefix: keep it in ns_map for lookups, but leave the registry alone.
            pass


def sort_xml(xml_doc, node_path, sort_attr, use_text=False, sort_as_datetime=False, sort_as_decimal=False,
             descending=False):
    """Sort the children of a selection of elements in an XML document. Returns an ElementTree representing the
    resulting whole document. ElementTree can easily be converted to string or written to a file like so:

    >>> foo_str = ET.tostring(sort_xml(xml_doc, node_path, sort_attr).getroot())
    >>> sort_xml(xml_doc, node_path, sort_attr).write('foo.xml')

    Required arguments:
    -------------------
    * `xml_doc` -- a text IO stream (such as an open file object), Path object pointing to an XML
      file, string representing the file path, or string containing the file contents of a valid XML file. Can't take
      an ElementTree instance because we need to use our own parser to keep track of namespaces. A leading
      byte order mark is removed in every case.
    * `node_path` -- a string containing the path to the node you want to sort the children of in the XPath language
      of the etree module
    * `sort_attr` -- the attribute of the child elements to use as the sort key. Surrounding whitespace is
      stripped; the name must start with a letter or underscore and otherwise contain only letters, digits
      and underscores.

    Optional arguments:
    -------------------
    * `use_text` -- use `sort_attr` as the name of a subelement of the path's children whose text will be the
      sort key (default: False)
    * `sort_as_datetime` -- try to parse the values of the sort key as a datetime using the `dateutil` module and sort
      chronologically (default: False, mutually exclusive with `sort_as_decimal`; if both are set,
      `sort_as_datetime` takes precedence)
    * `sort_as_decimal` -- try to parse the values of the sort key as a decimal and sort numerically (useful to keep
      '10' from showing up right after '1') (default: False, mutually exclusive with `sort_as_datetime`)
    * `descending` -- sort in descending order instead of ascending (default: False)

    Raises:
    -------
    * `TypeError` -- `xml_doc` is not one of the accepted input kinds, or `sort_attr` is not a non-empty string
    * `ValueError` -- `sort_attr` is not a valid name

    Errors raised while parsing the document or while converting/comparing sort keys are not caught here.
    A child that lacks the attribute/subelement produces a key of None.

    """
    # check parameters

    # xml_doc
    bom = BOM_UTF8.decode('utf-8')  # the BOM is a single character once decoded

    if isinstance(xml_doc, TextIOWrapper) and xml_doc.readable():
        # xml_doc is a readable text stream, let's read it
        # but first make sure to remove any byte order marker
        if xml_doc.encoding != 'utf-8-sig':
            xml_doc.reconfigure(encoding='utf-8-sig')
        xml_str = xml_doc.read()
    elif isinstance(xml_doc, Path) and xml_doc.is_file():
        # xml_doc is a Path object to a file
        xml_str = xml_doc.read_text('utf-8-sig')  # utf-8-sig to remove byte order marker
    elif isinstance(xml_doc, str) and _is_existing_file(xml_doc):
        # xml_doc is a filename
        xml_str = Path(xml_doc).read_text('utf-8-sig')
    elif isinstance(xml_doc, str) and len(xml_doc) > 0:
        # xml_doc hopefully contains valid XML; drop exactly the BOM character if present
        xml_str = xml_doc[len(bom):] if xml_doc.startswith(bom) else xml_doc
    else:
        raise TypeError("sort_xml() requires first parameter must be a string, readable IO stream, or path for a "
                        f"valid xml file! xml_doc: {repr(xml_doc)}")

    # sort_attr
    if not (isinstance(sort_attr, str) and len(sort_attr) > 0):
        raise TypeError("sort_xml() requires sort attribute must be a non-empty string!\n\t"
                        f"sort_attr: {repr(sort_attr)}")
    sort_attr = sort_attr.strip()
    if not (sort_attr.replace('_', '').isalnum() and (sort_attr[0].isalpha() or sort_attr[0] == '_')):
        raise ValueError("Sort attribute passed to sort_xml() is an invalid name!\n\t"
                         f"sort_attr: {repr(sort_attr)}")

    # make our element tree using our custom treebuilder and get all the parents we have to sort children of
    dom = ET.fromstring(xml_str, ET.XMLParser(target=NSTreeBuilder()))
    matching_parents = dom.findall(node_path)

    # decide once how keys are extracted, then apply the same key function to every matching parent
    sort_key = _make_sort_key(sort_attr, use_text, sort_as_datetime, sort_as_decimal)
    for par in matching_parents:
        par[:] = sorted(par, key=sort_key, reverse=descending)

    return ET.ElementTree(dom)


def _is_existing_file(candidate):
    """Return True if the string `candidate` names an existing regular file.

    A string holding XML content rather than a path can make the file-system check itself fail
    (for example "file name too long"); such failures are treated as "not a file".
    """
    try:
        return Path(candidate).is_file()
    except (OSError, ValueError):
        return False


def _make_sort_key(sort_attr, use_text, sort_as_datetime, sort_as_decimal):
    """Build the key function used to order child elements; datetime conversion wins over decimal."""
    if use_text:
        def raw_value(elem):
            return elem.findtext(sort_attr)
    else:
        def raw_value(elem):
            return elem.get(sort_attr)

    if sort_as_datetime:
        return lambda elem: parse_dt(raw_value(elem))
    if sort_as_decimal:
        return lambda elem: Decimal(raw_value(elem))
    return raw_value


if __name__ == '__main__':
    # Command-line entry point: parse arguments, sort the document, write the result to a file.
    argp = ap.ArgumentParser(description=__description__, formatter_class=ap.RawDescriptionHelpFormatter)
    argp.add_argument('-v', '--version', action='version', version=f"%(prog)s -- version {__version_str__}")
    argp.add_argument('input_file', type=Path, help="File path to the source xml file.")
    argp.add_argument('sort_xpath',
                      help="XPath-style selector for elements to sort the children of.  This has the same limitations "
                      "as Python's ElementTree module.")
    argp.add_argument('sort_attr', help="The name of the attribute to use as the sort key.")
    argp.add_argument('-r', '--reverse', '--descending', action='store_true', dest='descending',
                      help="Sort the child elements in reverse (descending) order.")
    argp.add_argument('-t', '--text', '--use-text', action='store_true', dest='use_text',
                      help="Treat the sort attribute name as the name of a subelement whose text is the sort key.")
    # --datetime and --decimal cannot be combined; argparse rejects the combination for us.
    sort_style = argp.add_mutually_exclusive_group()
    sort_style.add_argument('--datetime', '--as-datetime', action='store_true', dest='as_datetime',
                            help="Try to parse the sort key as a date/time value.  Mutually exclusive with --decimal.")
    sort_style.add_argument('--decimal', '--as-decimal', action='store_true', dest='as_decimal',
                            help="Try to parse the sort key as a decimal number.  Mutually exclusive with --datetime.")
    argp.add_argument('-o', '--output', type=Path, dest='output_file',
                      help="File path to the destination file.  (Default is to append '_sorted' to the filename.)")

    argv = argp.parse_args()

    xml_doc = argv.input_file
    sort_path = argv.sort_xpath
    sort_attr = argv.sort_attr
    sort_desc = argv.descending
    use_text = argv.use_text
    as_dt = argv.as_datetime
    as_dec = argv.as_decimal

    sorted_xml = sort_xml(xml_doc, sort_path, sort_attr, use_text, as_dt, as_dec, sort_desc)

    # argparse always defines `output_file` (it is None when -o is omitted), so the default
    # destination has to be chosen by testing for None rather than for the attribute's existence.
    if argv.output_file is None:
        new_filename = xml_doc.stem + '_sorted'
        out_file = xml_doc.with_stem(new_filename)
    else:
        out_file = argv.output_file

    out_file.write_text(ET.tostring(sorted_xml.getroot(), encoding='unicode'), encoding='utf-8')

    print(f"Output sorted file as `{out_file}`")