-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocx_to_json.py
More file actions
79 lines (66 loc) · 2.88 KB
/
docx_to_json.py
File metadata and controls
79 lines (66 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import json
import re
# Section mappings: heading text as it appears in the resume -> key used in
# the output JSON. Headings are matched case-sensitively, exactly as written.
section_headers = {
    "SUMMARY OF QUALIFICATIONS": "summary",
    "TECHNICAL SKILLS": "skills",
    "EDUCATION": "education",
    "EMPLOYMENT HISTORY": "experience",
}

# Inline labels that should start on their own line within a section.
_EXPERIENCE_LABELS = ("Location:", "Title:", "Duration:", "Employer:")
_SKILL_LABELS = ("Foundry Operations:", "Development:", "Cloud & Governance:")


def _break_before_labels(text, labels):
    """Return *text* with a newline inserted in front of every label."""
    for label in labels:
        text = text.replace(label, f"\n{label}")
    return text


def _split_jobs(text):
    """Split the employment-history text into one string per job.

    Each job starts at an "Employer:" label. Any text that precedes the first
    label is kept as its own entry. Whitespace-only chunks are discarded so
    that a section starting directly with "Employer:" does not produce an
    empty first entry.
    """
    processed = _break_before_labels(text, _EXPERIENCE_LABELS)
    # With a single capturing group, re.split returns
    # [text, "Employer:", text, "Employer:", text, ...].
    pieces = re.split(r"(Employer:)", processed)
    candidates = [pieces[0]]
    for body in pieces[2::2]:
        # Drop the blanks that followed the label in the source so the
        # rebuilt entry reads "Employer: X" rather than "Employer:  X".
        candidates.append("Employer: " + body.lstrip(" \t"))
    return [entry.strip() for entry in candidates if entry.strip()]


def _split_lines(text, section):
    """Split a non-experience section into its non-empty, stripped lines."""
    if section == "skills":
        text = _break_before_labels(text, _SKILL_LABELS)
    return [line.strip() for line in text.split("\n") if line.strip()]


def parse_resume(content: str) -> dict:
    """Parse plain resume text into a dict of section name -> list of strings.

    The text is cut at every known section heading (see ``section_headers``).
    Text that appears before the first heading is ignored. The returned dict
    always contains every section key, possibly with an empty list.
    """
    sections = {key: [] for key in section_headers.values()}

    # Split on any heading; the capturing group keeps the headings in the
    # result so we know which section each chunk of text belongs to.
    pattern = "|".join(re.escape(header) for header in section_headers)
    parts = re.split(f"({pattern})", content)

    current_section = None
    for index, part in enumerate(parts):
        if index % 2:
            # Odd positions are always the captured headings. Relying on the
            # position (instead of comparing upper-cased text) prevents body
            # text such as "education" from being mistaken for a heading.
            current_section = section_headers[part]
            continue

        part = part.strip()
        if not part or current_section is None:
            continue

        if current_section == "experience":
            sections[current_section].extend(_split_jobs(part))
        else:
            sections[current_section].extend(
                _split_lines(part, current_section)
            )
    return sections


def main(input_path="M2sResumeWord.txt", output_path="resume.json"):
    """Read the resume text file, parse it and write the result as JSON."""
    with open(input_path, encoding="utf-8") as f:
        content = f.read()

    sections = parse_resume(content)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(sections, f, indent=2)
    print("Resume parsing complete. Check resume.json for results.")


if __name__ == "__main__":
    main()