-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocxparser.py
More file actions
158 lines (135 loc) · 5.35 KB
/
docxparser.py
File metadata and controls
158 lines (135 loc) · 5.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Modified copy of docx2txt
import argparse
import re
import xml.etree.ElementTree as ET
import zipfile
import os
import sys
import cv2
import numpy as np
# Namespace prefix -> namespace URI table; qn() looks prefixes up here when
# it builds qualified tag names.
nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
# Placeholder string that xml2text() appends to its output each time it
# meets an element whose local name is 'graphic'.
graphic_token = '^GRAPHICTOKEN$'
def process_args():
    """
    Parse the command line and validate the paths it names.

    Two arguments are read from ``sys.argv``: the positional ``docx`` path
    and the optional ``-i/--img_dir`` directory.  When ``img_dir`` is given
    it is created (including missing parents) if it does not exist yet.

    Returns the ``argparse.Namespace`` holding ``docx`` and ``img_dir``
    (``img_dir`` is ``None`` when the option was not supplied).

    Prints a message and exits with status 1 when ``docx`` does not exist,
    or when ``img_dir`` cannot be created or already exists as something
    other than a directory.
    """
    parser = argparse.ArgumentParser(description='A pure python-based utility '
                                                 'to extract text and images '
                                                 'from docx files.')
    parser.add_argument("docx", help="path of the docx file")
    parser.add_argument('-i', '--img_dir', help='path of directory '
                                                'to extract images')
    args = parser.parse_args()

    if not os.path.exists(args.docx):
        print('File {} does not exist.'.format(args.docx))
        sys.exit(1)

    if args.img_dir is not None:
        # exist_ok=True replaces the former "check, then create" sequence:
        # there is no window in which another process can create the
        # directory between the two steps, and a path that already exists
        # as a regular file is reported here instead of slipping through.
        try:
            os.makedirs(args.img_dir, exist_ok=True)
        except OSError:
            print("Unable to create img_dir {}".format(args.img_dir))
            sys.exit(1)
    return args
def qn(tag):
    """
    Expand a namespace-prefixed tag name into its Clark-notation
    ('qualified name') form, i.e. ``'{namespace-uri}localname'``.

    The prefix before the colon is looked up in the module-level ``nsmap``,
    so ``qn('w:p')`` yields ``'{<uri registered for w>}p'``.  ``tag`` must
    contain exactly one colon and use a prefix present in ``nsmap``.
    Source: https://github.com/python-openxml/python-docx/
    """
    ns_prefix, local_name = tag.split(':')
    return '{{{}}}{}'.format(nsmap[ns_prefix], local_name)
def xml2text(xml):
    """
    Flatten one WordprocessingML part into a plain string.

    Every element of the parsed tree is visited in document order and
    translated as follows:

    * ``w:t``            -> its text (nothing when the element is empty)
    * ``w:tab``          -> ``'\\t'``
    * ``w:br``, ``w:cr`` -> ``'\\n'``
    * ``w:p``            -> ``'\\n\\n'``, emitted when the paragraph element
      itself is reached, i.e. ahead of the paragraph's own content
    * any element whose local name is ``graphic`` -> ``graphic_token``
    * any element whose local name contains ``grid`` (case-insensitive),
      except ``snapToGrid`` -> the local name wrapped in braces, e.g.
      ``'{gridCol}'``

    All other elements contribute nothing.

    ``xml`` is the raw XML of the part (anything ``ET.fromstring`` accepts).
    Returns the concatenated text, not stripped.  Parse errors raised by
    ``ET.fromstring`` propagate to the caller.
    Adapted from: https://github.com/python-openxml/python-docx/
    """
    # Qualified names are loop-invariant, so resolve them once up front
    # rather than re-formatting them for every element.
    text_tag = qn('w:t')
    tab_tag = qn('w:tab')
    break_tags = (qn('w:br'), qn('w:cr'))
    paragraph_tag = qn('w:p')

    # Collect fragments and join once at the end instead of growing a string.
    pieces = []
    for child in ET.fromstring(xml).iter():
        tag = child.tag
        tagroot = tag.split('}')[-1]
        if tag == text_tag:
            if child.text is not None:
                pieces.append(child.text)
        elif tag == tab_tag:
            pieces.append('\t')
        elif tag in break_tags:
            pieces.append('\n')
        elif tag == paragraph_tag:
            pieces.append('\n\n')
        elif tagroot == 'graphic':
            # Matched on the local name only, whatever namespace it is in.
            pieces.append(graphic_token)
        elif 'grid' in tagroot.lower() and tagroot != 'snapToGrid':
            pieces.append('{' + tagroot + '}')
    return u''.join(pieces)
def process(docx):
    """
    Extract the text and the embedded raster images of a docx file.

    ``docx`` is whatever ``zipfile.ZipFile`` accepts (a path or a file-like
    object).

    Returns a tuple ``(text, images)``:

    * ``text`` is the output of ``xml2text`` for every ``word/header*.xml``
      part, then ``word/document.xml``, then every ``word/footer*.xml``
      part, concatenated in that order and stripped of surrounding
      whitespace.
    * ``images`` is a list of decoded images (numpy arrays as returned by
      ``cv2.imdecode``), each rescaled with its aspect ratio kept so that it
      just fits inside an 800x800 box, ordered by the number in the
      ``word/media/image<N>.<ext>`` file name.  Media files that OpenCV
      cannot decode (for instance vector formats) are left out.

    Raises ``ValueError`` when a ``word/media/image...`` entry carries no
    numeric id in its name.  Errors from ``zipfile`` (bad archive, missing
    ``word/document.xml``) and from ``xml2text`` propagate unchanged.
    """
    max_side = 800  # images are scaled to fit a max_side x max_side box
    text_parts = []
    numbered_images = []

    # The context manager closes the archive even if a part fails to parse.
    with zipfile.ZipFile(docx) as zipf:
        filelist = zipf.namelist()

        # get header text
        # there can be 3 header files in the zip
        header_xmls = 'word/header[0-9]*.xml'
        for fname in filelist:
            if re.match(header_xmls, fname):
                text_parts.append(xml2text(zipf.read(fname)))

        # get main text
        text_parts.append(xml2text(zipf.read('word/document.xml')))

        # get footer text
        # there can be 3 footer files in the zip
        footer_xmls = 'word/footer[0-9]*.xml'
        for fname in filelist:
            if re.match(footer_xmls, fname):
                text_parts.append(xml2text(zipf.read(fname)))

        # extract images
        for fname in filelist:
            if 'word/media/image' not in fname:
                continue

            # Explicit check instead of `assert`, which vanishes under -O.
            found = re.findall(r'word/media/image(\d+)\..+?', fname)
            if len(found) != 1:
                raise ValueError(
                    'Cannot derive an image id from {!r}'.format(fname))
            img_id = int(found[0])

            binary = np.frombuffer(zipf.read(fname), np.uint8)
            if binary.size == 0:
                continue
            img = cv2.imdecode(binary, cv2.IMREAD_ANYCOLOR)
            if img is None:
                # imdecode signals an unreadable format by returning None.
                continue

            # Grayscale images decode to 2-D arrays, so only take the first
            # two dimensions rather than unpacking three.
            n, m = img.shape[:2]
            r = min(max_side / n, max_side / m)
            # Clamp to 1 so extreme aspect ratios never request a 0-pixel
            # side, which cv2.resize rejects.
            tn, tm = max(1, int(n * r)), max(1, int(m * r))
            numbered_images.append((img_id, cv2.resize(img, (tm, tn))))

    numbered_images.sort(key=lambda pair: pair[0])
    images = [img for _, img in numbered_images]
    return (u''.join(text_parts).strip(), images)
if __name__ == '__main__':
    # Manual smoke run against a fixed sample file; process_args() is not
    # used here.
    content, pictures = process('test.docx')
    print(content)
    for idx, picture in enumerate(pictures):
        print('img', idx)
        cv2.imshow('img', picture)
        cv2.waitKey(0)
    # Keep the repr so that whitespace and token markers stay visible.
    with open('test.out', 'w', encoding='utf-8') as out:
        out.write(repr(content))