Skip to content

Commit a82fb32

Browse files
committed
Source code
1 parent 84a0db4 commit a82fb32

File tree

11 files changed

+696
-0
lines changed

11 files changed

+696
-0
lines changed

install.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Installs the system OCR packages, then creates a Python virtual environment
# in ./venv and installs the Python requirements into it.
# Abort on the first failing step so pip is never run against a missing venv.
set -e
sudo apt install tesseract-ocr
sudo apt install libtesseract-dev
# Use python3 explicitly (as the readme does); a bare "python" command is not
# guaranteed to exist on Debian.
python3 -m venv venv
source venv/bin/activate
pip install -r requirments.txt

readme.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
install.sh is for Debian Linux
2+
Prerequisites: apt and python above 3.8
3+
4+
The project includes 3 main parts:<br>
5+
PDF Text Extractor - extracts text from PDF<br>
6+
Image Extractor from PDF - extracts images and saves them to a folder
7+
Text Visualizer - Visualize the text to see what the computer recognizes
8+
9+
If on Debian Linux, run <h5>sudo bash install.sh</h5>
10+
Steps:
11+
1. Install tesseract-ocr and libtesseract-dev using your OS package manager
12+
2. Create a virtual env: python3 -m venv venv
13+
3. source venv/bin/activate
14+
4. Install all required libraries: pip install -r requirments.txt
15+
16+
Depending on your workload, use **main.py** if you want a graphical interface or **maincli.py** to use command-line arguments
17+
18+
For __mainCLI.py__ you can use either syntax
19+
<h6>python3 main.py PDFfile </h6> or
20+
<h6> python3 main.py PDFfile -o outputFileName</h6>
21+
22+
For __visualizer.py__ the syntax is
23+
<h6> python3 visualizer.py PDFfile</h6>

requirments.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
fitz
2+
frontend
3+
matplot
4+
py-pdf-parser
5+
pdfplumber
6+
pdf2image
7+
pymupdf
8+
pytesseract
9+
pyvoronoi
10+
pdfminer.six==20191110
11+
shapely
12+
tk

src/Postprocess.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
"""
2+
The functions in this script are long and confusing; unless you have additional characters to take out or want to redo
3+
the file, please don't edit the code, as it's easy to break. It only works well with some tables, so don't rely on it for tables
4+
5+
"""
6+
7+
8+
# This one runs over the text one final time
9+
def finalprocessing(string):
    """Join lines that were broken in the middle of a sentence.

    Walks ``string`` from left to right, looking at four consecutive
    characters at a time, and removes the line break in two situations:

    * a letter, digit or one of the listed special characters, followed by a
      space, a newline and then a lowercase letter or digit;
    * a full stop, followed by any whitespace character, a newline and then
      an uppercase letter (the whitespace character becomes a plain space).

    Args:
        string: The text to clean up.

    Returns:
        The text with every matching line break replaced by a single space.
        Text shorter than four characters is returned unchanged.
    """
    special_characters = "!@#$%^&*()+?_=,<>/\"\'[];-_–"
    i = 0
    # len() is re-read on every iteration because each join shortens the string.
    while i + 3 < len(string):
        first = string[i]
        ends_mid_sentence = (
            first.isupper()
            or first.islower()
            or first.isdigit()
            or first in special_characters
        )
        if (ends_mid_sentence
                and string[i + 1] == " "
                and string[i + 2] == "\n"
                and (string[i + 3].islower() or string[i + 3].isdigit())):
            # Keep the space, drop the newline.
            string = string[:i + 1] + " " + string[i + 3:]
        # "." is not in special_characters, so at most one of the two rules can
        # match a given position.
        elif (first == "."
                and string[i + 1].isspace()
                and string[i + 2] == "\n"
                and string[i + 3].isupper()):
            string = string[:i + 1] + " " + string[i + 3:]
        i += 1

    return string
27+
28+
29+
def PostProcessing(string):
    """Clean up raw PDF text by re-joining lines that were split mid-sentence.

    The function prints a progress message, applies a few global string
    replacements, rewrites image placeholders, then walks the text character
    by character applying an ordered list of line-joining rules. The result
    is finally passed through ``finalprocessing``.

    The rules are order-dependent: each one may rewrite the characters the
    following rules look at, so do not reorder them.

    Args:
        string: The raw text to clean up.

    Returns:
        The cleaned-up text.
    """
    print("Started Processing...")

    # Characters that may end a line or start the next one and are treated like
    # an ordinary letter when deciding whether to join lines ("." is excluded).
    special_characters = "!@#$%^&*()+?_=,<>/\"\'[];-_–"

    # Marker characters used to give more data in tables.
    weird_characters = "†‡§"

    # A dash followed by a space is taken to mean the word continues, so both
    # characters are removed.
    string = string.replace("- ", "")

    # Replacement table applied after the main loop.
    # NOTE(review): the keys look like they lost their original glyphs
    # (presumably ligature characters) somewhere along the way - confirm
    # against the text the extractor actually produces.
    char_switch = {
        "fi ": "fi",
        "": "ft",
        "ff": "ff"
    }

    # Remove the space after a newline so the rules below see the next line's
    # first real character.
    string = string.replace("\n ", "\n")

    # Rewrite image placeholders ("ð<n>Þ") as "Image <n>". Each "ð" found
    # advances the expected image number by one.
    j = 0
    count = 1
    while j < len(string):
        if string[j] == "ð":
            string = string.replace("ð" + str(count) + "Þ", f"Image {str(count)}")
            count += 1
        j += 1

    i = 0
    # len() is re-read on every iteration because the rules shorten the string.
    while i < len(string):
        # A comma with a newline two characters later: the comma is replaced by
        # a space. The bounds check prevents an IndexError when the comma is one
        # of the last two characters.
        # NOTE(review): this drops the comma itself - confirm that is intended.
        if i + 2 < len(string) and string[i] == "," and string[i + 2] == "\n":
            string = string[:i] + " " + string[i + 1:]

        if i + 2 < len(string):

            # Letter/digit/special character, then a space, then a newline:
            # keep the space and drop the newline.
            if (string[i].islower() or string[i].isdigit() or string[i] in special_characters) \
                    and string[i + 1] == " " and string[i + 2] == "\n":
                string = string[:i + 1] + " " + string[i + 3:]

            # Full stop, newline, then a special character: drop the newline.
            if string[i] == "." and string[i + 1] == "\n" \
                    and string[i + 2] in special_characters:
                string = string[:i + 1] + string[i + 2:]

            # Table marker, newline, then a digit: newline becomes a space.
            if string[i] in weird_characters and string[i + 1] == "\n" and string[i + 2].isdigit():
                string = string[:i + 1] + " " + string[i + 2:]

            # Lowercase letter or digit, newline, then a special character:
            # newline becomes a space.
            if (string[i].islower() or string[i].isdigit()) and string[i + 1] == "\n" \
                    and string[i + 2] in special_characters:
                string = string[:i + 1] + " " + string[i + 2:]

            # Digit, newline, then a lowercase letter or digit.
            # NOTE(review): the slice starts at i, so the digit at position i is
            # removed together with the newline - confirm that is intended.
            if string[i].isdigit() and string[i + 1] == "\n" \
                    and (string[i + 2].islower() or string[i + 2].isdigit()):
                string = string[:i] + " " + string[i + 2:]

            # Special character, newline, then a lowercase letter or digit:
            # newline becomes a space.
            if string[i] in special_characters and string[i + 1] == "\n" \
                    and (string[i + 2].islower() or string[i + 2].isdigit()):
                string = string[:i + 1] + " " + string[i + 2:]

            # Letter, newline, then a digit: drop the newline.
            if (string[i].islower() or string[i].isupper()) and string[i + 1] == "\n" \
                    and string[i + 2].isdigit():
                string = string[:i + 1] + string[i + 2:]

            # Dash at the end of a line: drop the dash and the newline.
            if string[i] == "-" and string[i + 1] == "\n":
                string = string[:i] + string[i + 2:]

            # Letter, newline, then a letter, special character or whitespace:
            # newline becomes a space. The dash rule above can shorten the
            # string by two characters, so the bounds are checked again here.
            if i + 2 < len(string) \
                    and (string[i].islower() or string[i].isupper()) and string[i + 1] == "\n" \
                    and (string[i + 2].islower() or string[i + 2].isupper()
                         or string[i + 2] in special_characters or string[i + 2].isspace()):
                string = string[:i + 1] + " " + string[i + 2:]
        i += 1

    for word, replacement in char_switch.items():
        # str.replace with an empty search string inserts the replacement
        # between every character, so an empty key is skipped.
        if not word:
            continue
        string = string.replace(word, replacement)

    # Collapse double spaces, close up "dash space" in compound names and pull
    # a closing bracket back onto the previous line.
    string = string.replace("  ", " ")
    string = string.replace("- ", "-")
    string = string.replace("\n)", ")")

    finalString = finalprocessing(string)

    return finalString
135+
136+
def fitzPostProcess(string):
    """Remove end-of-line hyphenation and newlines that follow whitespace.

    ``string`` is converted with ``str()``, every "-" immediately followed by
    a newline is deleted, and then each newline is dropped when the character
    before it is whitespace and the character after it is printable. A newline
    at the very start or very end of the text is always kept.

    Args:
        string: The text to clean up; any object is accepted and stringified.

    Returns:
        The cleaned-up text.
    """
    text = str(string).replace("-\n", "")
    last = len(text) - 1
    # Built in one pass instead of re-slicing the whole string per removal. A
    # removed newline is always followed by a printable character, which can
    # never itself be a newline, so each decision depends only on the
    # neighbours in the unmodified text.
    return "".join(
        ch
        for pos, ch in enumerate(text)
        if not (
            ch == "\n"
            and 0 < pos < last
            and text[pos - 1].isspace()
            and text[pos + 1].isprintable()
        )
    )
3.86 KB
Binary file not shown.
1.96 KB
Binary file not shown.

src/justimages.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import os
2+
import shutil
3+
import sys
4+
import tkinter.filedialog
5+
import fitz
6+
from Postprocess import fitzPostProcess
7+
8+
9+
def imageextraction(file_path):
    """Extract the embedded images and the plain text of a PDF.

    A folder named ``<pdf name>_output`` is created in the current working
    directory (an existing one is deleted first). Every image is saved there
    as ``image<page index>-<image index>.png`` and the text of all pages is
    written to ``text_notFinal.txt``, one form feed (0x0C) after each page.

    Args:
        file_path: Path of the PDF to read.

    Raises:
        SystemExit: With status 1 when an existing output folder cannot be
            replaced.
    """
    pdf_file = fitz.open(file_path)
    try:
        # Folder name = file name without directories and without its last
        # extension.
        folderName = file_path.rsplit("/", 1)[-1].rsplit(".", 1)[0]
        location = f"{os.getcwd()}/{folderName}_output"

        if os.path.exists(location):
            try:
                shutil.rmtree(location)
                os.mkdir(location)
            except OSError:
                print(f"\033[91mCouldn't Delete {location}, you'll need to manually delete it\033[0m")
                # Non-zero status: the extraction did not happen.
                sys.exit(1)
        else:
            os.mkdir(location)

        print("\n Extracting Images from PDF ")
        print("--------------------------------------------")
        for current_page_index in range(len(pdf_file)):
            for img_index, img in enumerate(pdf_file.get_page_images(current_page_index)):
                # The first entry of each image record is its xref number.
                image = fitz.Pixmap(pdf_file, img[0])
                target = "{}/image{}-{}.png".format(location, current_page_index, img_index)
                try:
                    # A pixmap without a colorspace is saved as-is; every other
                    # one is converted to RGB first.
                    # NOTE(review): the earlier GRAY/RGB test compared the
                    # colorspace object with name strings, which appears to
                    # never match, so in practice every image with a colorspace
                    # was already converted - confirm.
                    if image.colorspace is not None:
                        image = fitz.Pixmap(fitz.csRGB, image)
                    image.save(target)
                except Exception:
                    # Best effort: one bad image must not stop the extraction.
                    print(f"image{current_page_index}-{img_index} has invalid color space")

        print(f"\nImages extracted to {location} folder")

        # Text extraction: UTF-8 text of each page followed by a form feed.
        with open(location + "/text_notFinal.txt", "wb") as out:
            for page in pdf_file:
                out.write(page.get_text().encode("utf8"))
                out.write(bytes((12,)))
    finally:
        pdf_file.close()
84+
85+
86+
if __name__ == "__main__":
    # The file dialog needs a Tk root; it is destroyed once a choice is made.
    root = tkinter.Tk()
    file = tkinter.filedialog.askopenfilename(filetypes=[("pdf files", "*.pdf")])
    root.destroy()
    # A cancelled dialog yields an empty value (an empty string or an empty
    # tuple), so test for emptiness rather than for one specific type.
    if not file:
        print("No File Selected")
        sys.exit(0)
    imageextraction(file)

0 commit comments

Comments
 (0)