# Source: PyAutoHomework/docxparser.py at master · 747929791/PyAutoHomework · GitHub
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Modified copy of docx2txt

import argparse
import re
import xml.etree.ElementTree as ET
import zipfile
import os
import sys

import cv2
import numpy as np


# Namespace prefix -> namespace URI table used by qn() to build
# Clark-notation tag names; only the WordprocessingML main namespace
# ('w') is registered.
nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

# Placeholder that xml2text() appends to the extracted text for every
# element whose local name is 'graphic'.
graphic_token = '^GRAPHICTOKEN$'


def process_args():
    """
    Parse and validate the command line.

    Accepts one positional argument (the docx path) and an optional
    ``-i/--img_dir`` directory. The process exits with status 1 when the
    docx path does not exist, or when ``img_dir`` is given, does not exist
    and cannot be created.

    Returns the ``argparse.Namespace`` with ``docx`` and ``img_dir``.
    """
    cli = argparse.ArgumentParser(
        description='A pure python-based utility to extract text and images '
                    'from docx files.')
    cli.add_argument("docx", help="path of the docx file")
    cli.add_argument('-i', '--img_dir',
                     help='path of directory to extract images')
    parsed = cli.parse_args()

    # Guard: the input document must be present.
    if not os.path.exists(parsed.docx):
        print('File {} does not exist.'.format(parsed.docx))
        sys.exit(1)

    target_dir = parsed.img_dir
    # Nothing to prepare when no directory was requested or it is already
    # there.
    if target_dir is None or os.path.exists(target_dir):
        return parsed

    try:
        os.makedirs(target_dir)
    except OSError:
        print("Unable to create img_dir {}".format(target_dir))
        sys.exit(1)
    return parsed


def qn(tag):
    """
    Build a 'qualified name' from a namespace-prefixed tag.

    ``tag`` must have the form ``'prefix:name'``; the prefix is looked up
    in the module-level ``nsmap`` and the result is the Clark-notation
    name ``'{namespace-uri}name'``, which is how ElementTree reports
    namespaced tags. For example ``qn('w:t')`` returns
    ``'{http://schemas.../main}t'``.

    Raises ValueError when ``tag`` does not contain exactly one ``':'``
    and KeyError when the prefix is not registered in ``nsmap``.
    Source: https://github.com/python-openxml/python-docx/
    """
    ns_prefix, local_name = tag.split(':')
    return '{%s}%s' % (nsmap[ns_prefix], local_name)


def xml2text(xml):
    """
    Flatten a WordprocessingML XML part into plain text.

    ``xml`` is parsed with ElementTree and every element is visited in
    document order. Each element contributes to the result as follows:

    * ``w:t``: its text content (nothing if the element is empty)
    * ``w:tab``: a tab character
    * ``w:br`` / ``w:cr``: a newline
    * ``w:p``: two newlines
    * any element whose local name is ``graphic``: ``graphic_token``
    * any element whose local name contains "grid" (case-insensitive),
      except ``snapToGrid``: the local name wrapped in braces, e.g.
      ``{tblGrid}``

    All other elements are ignored.

    Returns the concatenated text.
    Adapted from: https://github.com/python-openxml/python-docx/
    """
    root = ET.fromstring(xml)

    # Resolve the qualified names once instead of once per element.
    text_tag = qn('w:t')
    tab_tag = qn('w:tab')
    break_tags = (qn('w:br'), qn('w:cr'))
    paragraph_tag = qn('w:p')

    # Collect fragments and join once at the end rather than growing a
    # string with repeated concatenation.
    pieces = []
    for child in root.iter():
        tag = child.tag
        if tag == text_tag:
            if child.text is not None:
                pieces.append(child.text)
        elif tag == tab_tag:
            pieces.append('\t')
        elif tag in break_tags:
            pieces.append('\n')
        elif tag == paragraph_tag:
            pieces.append('\n\n')
        else:
            # Local name: the part after the '{namespace}' prefix.
            tagroot = tag.split('}')[-1]
            if tagroot == 'graphic':
                pieces.append(graphic_token)
            elif 'grid' in tagroot.lower() and tagroot != 'snapToGrid':
                pieces.append('{' + tagroot + '}')
    return u''.join(pieces)


def process(docx):
    """
    Extract the text and the embedded images of a docx file.

    ``docx`` is handed to ``zipfile.ZipFile``. The text of every
    ``word/header*.xml`` part, of ``word/document.xml`` and of every
    ``word/footer*.xml`` part is converted with ``xml2text`` and
    concatenated in that order.

    Every archive member whose name contains ``word/media/image`` is
    decoded with OpenCV and rescaled (up or down, keeping the aspect
    ratio) so that it fits inside an 800x800 box. Members that OpenCV
    cannot decode are skipped.

    Returns:
        tuple: ``(text, images)`` where ``text`` is the stripped
        concatenated text and ``images`` is a list of decoded images
        ordered by the number in their file name (``image<N>.<ext>``).

    Raises:
        KeyError: if the archive has no ``word/document.xml``.
        ValueError: if an image member name carries no numeric id.
    """
    # Escape the dot and anchor the end so only real header/footer parts
    # match; there can be several of each in the zip.
    header_re = re.compile(r'word/header[0-9]*\.xml$')
    footer_re = re.compile(r'word/footer[0-9]*\.xml$')
    image_id_re = re.compile(r'word/media/image(\d+)\..+?')
    box_h, box_w = 800, 800

    text_parts = []
    numbered_images = []

    # The context manager closes the archive even when parsing or
    # decoding raises.
    with zipfile.ZipFile(docx) as zipf:
        filelist = zipf.namelist()

        # header text
        text_parts.extend(xml2text(zipf.read(fname))
                          for fname in filelist if header_re.match(fname))

        # main text
        text_parts.append(xml2text(zipf.read('word/document.xml')))

        # footer text
        text_parts.extend(xml2text(zipf.read(fname))
                          for fname in filelist if footer_re.match(fname))

        # embedded images
        for fname in filelist:
            if 'word/media/image' not in fname:
                continue
            binary = np.frombuffer(zipf.read(fname), np.uint8)
            img = cv2.imdecode(binary, cv2.IMREAD_ANYCOLOR)
            if img is None:
                # imdecode yields None for data it cannot decode
                # (e.g. vector formats such as EMF/WMF); skip those.
                continue

            # Grayscale images decode to 2-D arrays, so only take the
            # first two dimensions.
            height, width = img.shape[:2]
            ratio = min(box_h / height, box_w / width)
            # Clamp to 1 so extreme aspect ratios never ask for a
            # zero-sized image.
            new_h = max(1, int(height * ratio))
            new_w = max(1, int(width * ratio))
            img = cv2.resize(img, (new_w, new_h))

            found = image_id_re.search(fname)
            if found is None:
                raise ValueError(
                    'Cannot determine image id from {!r}'.format(fname))
            numbered_images.append((int(found.group(1)), img))

    # Sort on the id only; the arrays themselves are not comparable.
    numbered_images.sort(key=lambda pair: pair[0])
    images = [img for _, img in numbered_images]
    return (u''.join(text_parts).strip(), images)


if __name__ == '__main__':
    # Ad-hoc smoke run on a fixed sample file (command-line parsing via
    # process_args() is not wired in here): print the extracted text,
    # show each image in turn and wait for a key press, then dump the
    # repr of the text to 'test.out'.
    result = process('test.docx')
    extracted_text = result[0]
    images = result[1]

    print(extracted_text)
    for index, picture in enumerate(images):
        print('img', index)
        cv2.imshow('img', picture)
        cv2.waitKey(0)

    with open('test.out', 'w', encoding='utf-8') as out_file:
        out_file.write(repr(extracted_text))