NLPatVCU
diff --git a/‎install.sh‎
Lines changed: 5 additions & 0 deletions b/‎install.sh‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎readme.md‎
Lines changed: 23 additions & 0 deletions b/‎readme.md‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎requirments.txt‎
Lines changed: 12 additions & 0 deletions b/‎requirments.txt‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎src/Postprocess.py‎
Lines changed: 147 additions & 0 deletions b/‎src/Postprocess.py‎
Lines changed: 147 additions & 0 deletions
diff --git a/‎src/__pycache__/Postprocess.cpython-38.pyc‎
3.86 KB b/‎src/__pycache__/Postprocess.cpython-38.pyc‎
3.86 KB
diff --git a/‎src/__pycache__/justimages.cpython-38.pyc‎
1.96 KB b/‎src/__pycache__/justimages.cpython-38.pyc‎
1.96 KB
diff --git a/‎src/justimages.py‎
Lines changed: 93 additions & 0 deletions b/‎src/justimages.py‎
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,5 @@
+# Installs the Tesseract system packages, then creates a virtualenv in ./venv
+# and installs the Python dependencies into it. Debian-based systems only (uses apt).
+sudo apt install tesseract-ocr
+sudo apt install libtesseract-dev
+# Debian ships the interpreter as "python3"; a bare "python" command is often absent.
+python3 -m venv venv
+source venv/bin/activate
+# "requirments.txt" is the actual (misspelled) file name in this repository.
+pip install -r requirments.txt
@@ -0,0 +1,23 @@
+install.sh is for Debian Linux
+Prerequisites: apt and python above 3.8
+
+The project includes 3 main parts:
+- PDF Text Extractor - extracts text from a PDF
+- Image Extractor from PDF - extracts the images and saves them to a folder
+- Text Visualizer - visualizes the text to show what the computer recognizes
+
+On Debian Linux, run <h5>sudo bash install.sh</h5>
+Manual steps (what install.sh does):
+1. Install tesseract-ocr and libtesseract-dev using your OS package manager
+2. Create a virtual environment: `python3 -m venv venv`
+3. Activate it: `source venv/bin/activate`
+4. Install all required libraries: `pip install -r requirments.txt`
+
+Depending on your workload, use **main.py** if you want a graphical interface or **maincli.py** to use command-line arguments
+
+For __mainCLI.py__ you can use either syntax
+<h6>python3 main.py PDFfile </h6>   or
+<h6> python3 main.py PDFfile -o outputFileName</h6>
+
+For __visualizer.py__ the syntax is
+<h6> python3 visualizer.py PDFfile</h6>
@@ -0,0 +1,12 @@
+fitz
+frontend
+matplot
+py-pdf-parser
+pdfplumber
+pdf2image
+pymupdf
+pytesseract
+pyvoronoi
+pdfminer.six==20191110
+shapely
+tk
@@ -0,0 +1,147 @@
+"""
+The post-processing function is long and confusing. Unless you have additional characters to strip out or want to
+rewrite the file, please don't edit the code, as it is easy to break. It works well with some tables but not all, so don't rely on it for tables.
+
+"""
+
+
+def finalprocessing(string):
+    """Run the last clean-up pass: join lines that were split mid-sentence.
+
+    The text is scanned left to right and two patterns are rewritten:
+      * a letter, digit or listed punctuation mark, then a space and a newline,
+        then a lowercase letter or digit: the space/newline pair becomes one space;
+      * a full stop, then a whitespace character and a newline, then an uppercase
+        letter: the whitespace/newline pair becomes one space.
+
+    Returns the rewritten string. Inputs shorter than four characters are
+    returned unchanged because every pattern spans four characters.
+    """
+    joinable_punctuation = "!@#$%^&*()+?_=,<>/\"\'[];-_–"
+    pos = 0
+    # Each pattern inspects four characters, so stop three short of the end.
+    while pos + 3 < len(string):
+        head = string[pos]
+        tail = string[pos + 3]
+        if head == ".":
+            # Sentence end followed by a wrapped line that starts a new sentence.
+            if string[pos + 1].isspace() and string[pos + 2] == "\n" and tail.isupper():
+                string = string[:pos + 1] + " " + string[pos + 3:]
+        elif string[pos + 1:pos + 3] == " \n" and (tail.islower() or tail.isdigit()):
+            # Same sentence continuing on the next line.
+            if head.isupper() or head.islower() or head.isdigit() or head in joinable_punctuation:
+                string = string[:pos + 1] + " " + string[pos + 3:]
+        pos += 1
+
+    return string
+
+
+def PostProcessing(string):
+    """Clean text extracted from a PDF so that wrapped lines read as sentences.
+
+    The passes run in a fixed order and later passes rely on earlier ones, so
+    keep the order when editing:
+      1. remove every "- " (hyphenation left behind by line wrapping);
+      2. drop the space that directly follows a newline;
+      3. rewrite the image markers "ð<n>Þ" as "Image <n>";
+      4. walk the text once, joining or tightening lines around newlines;
+      5. expand ligatures, collapse double spaces, turn "- " into "-" and pull a
+         ")" that starts a line back onto the previous line;
+      6. hand the result to finalprocessing().
+
+    Args:
+        string: raw text as extracted from the PDF.
+
+    Returns:
+        The cleaned text.
+    """
+    print("Started Processing...")
+
+    # Punctuation that is treated like a normal letter at a line break ("." is
+    # deliberately absent: a full stop ends the sentence).
+    special_characters = "!@#$%^&*()+?_=,<>/\"\'[];-_–"
+
+    # Footnote markers used in tables.
+    weird_characters = "†‡§"
+
+    # A dash followed by a space marks a word that continues after a wrap.
+    string = string.replace("- ", "")
+
+    # Ligature glyphs mapped to plain letters.
+    # NOTE(review): the key for "ft" is an empty string in this copy of the file;
+    # presumably the original glyph was lost. TODO restore the intended character.
+    char_switch = {
+        "ﬁ ": "fi",
+        "": "ft",
+        "ﬀ": "ff"
+    }
+
+    # Removing spaces after a new line keeps the rules below simple.
+    string = string.replace("\n ", "\n")
+
+    # Image markers look like "ð1Þ", "ð2Þ", ...; number them in order of appearance.
+    j = 0
+    count = 1
+    while j < len(string):
+        if string[j] == "ð":
+            string = string.replace("ð" + str(count) + "Þ", f"Image {str(count)}")
+            count += 1
+        j += 1
+
+    # Main pass. The rules are applied in this exact order at every position and
+    # a later rule sees the result of an earlier one, so do not reorder them.
+    i = 0
+    while i < len(string):
+        # Every rule looks two characters ahead, so the last two positions are
+        # skipped. The comma rule used to sit outside this guard and raised
+        # IndexError when a comma was one of the last two characters.
+        if i + 2 < len(string):
+
+            # A comma with a newline two characters later: the comma becomes a space.
+            # NOTE(review): this drops the comma from the text - confirm it is intended.
+            if string[i] == "," and string[i + 2] == "\n":
+                string = string[:i] + " " + string[i + 1:]
+
+            # Lowercase letter, digit or special character, then " " + newline:
+            # the line continues, so the newline is dropped.
+            if (string[i].islower() or string[i].isdigit() or string[i] in special_characters) \
+                    and string[i + 1] == " " and string[i + 2] == "\n":
+                string = string[:i + 1] + " " + string[i + 3:]
+
+            # Full stop, newline, then a special character: remove the newline.
+            if string[i] == "." and string[i + 1] == "\n" and string[i + 2] in special_characters:
+                string = string[:i + 1] + string[i + 2:]
+
+            # Table footers: footnote marker, newline, digit -> newline becomes a space.
+            if string[i] in weird_characters and string[i + 1] == "\n" and string[i + 2].isdigit():
+                string = string[:i + 1] + " " + string[i + 2:]
+
+            # Lowercase letter or digit, newline, special character -> space.
+            if (string[i].islower() or string[i].isdigit()) and string[i + 1] == "\n" \
+                    and string[i + 2] in special_characters:
+                string = string[:i + 1] + " " + string[i + 2:]
+
+            # Digit, newline, lowercase letter or digit -> space. The slice keeps
+            # the digit itself; slicing at [:i] used to delete it from the text.
+            if string[i].isdigit() and string[i + 1] == "\n" \
+                    and (string[i + 2].islower() or string[i + 2].isdigit()):
+                string = string[:i + 1] + " " + string[i + 2:]
+
+            # Special character, newline, lowercase letter or digit -> space.
+            if string[i] in special_characters and string[i + 1] == "\n" \
+                    and (string[i + 2].islower() or string[i + 2].isdigit()):
+                string = string[:i + 1] + " " + string[i + 2:]
+
+            # Letter, newline, digit -> the newline is removed without a space.
+            if (string[i].islower() or string[i].isupper()) and string[i + 1] == "\n" \
+                    and string[i + 2].isdigit():
+                string = string[:i + 1] + string[i + 2:]
+
+            # "-" + newline marks a word split across lines: remove both.
+            if string[i] == "-" and string[i + 1] == "\n":
+                string = string[:i] + string[i + 2:]
+
+            # Letter, newline, then a letter, special character or whitespace -> space.
+            # The bound is re-checked because the hyphen rule above may just have
+            # shortened the string by two characters.
+            if i + 2 < len(string) \
+                    and (string[i].islower() or string[i].isupper()) and string[i + 1] == "\n" \
+                    and (string[i + 2].islower() or string[i + 2].isupper()
+                         or string[i + 2] in special_characters or string[i + 2].isspace()):
+                string = string[:i + 1] + " " + string[i + 2:]
+        i += 1
+
+    # Expand the ligatures. An empty key is skipped: str.replace("", x) would
+    # insert x between every character of the text.
+    for word, replacement in char_switch.items():
+        if word:
+            string = string.replace(word, replacement)
+
+    # Final replacement of double spaces and spaces after the dash of compound names
+    string = string.replace("  ", " ")
+    string = string.replace("- ", "-")
+    string = string.replace("\n)", ")")
+
+    finalString = finalprocessing(string)
+
+    return finalString
+
+def fitzPostProcess(string):
+    """Tidy text produced by fitz: undo hyphenated wraps and redundant newlines.
+
+    The argument is converted with str() first. Every "-" directly followed by a
+    newline is removed, then any newline that sits between a whitespace character
+    and a printable character is dropped.
+
+    Returns the cleaned string.
+    """
+    text = str(string).replace("-\n", "")
+    pos = 0
+    # Three characters are inspected at a time, so stop two short of the end.
+    while pos + 2 < len(text):
+        before, middle, after = text[pos:pos + 3]
+        if middle == "\n" and before.isspace() and after.isprintable():
+            text = text[:pos + 1] + text[pos + 2:]
+        pos += 1
+
+    return text
@@ -0,0 +1,93 @@
+import os
+import shutil
+import sys
+import tkinter.filedialog
+import fitz
+from Postprocess import fitzPostProcess
+
+
+def imageextraction(file_path):
+    """Extract the embedded images and the plain text of a PDF into an output folder.
+
+    The folder is "<current directory>/<PDF name without extension>_output". If it
+    already exists it is deleted and recreated. Images are written as
+    "image<page index>-<image index>.png"; the text of all pages is written to
+    "text_notFinal.txt" with a form feed (0x0C) after each page.
+
+    Args:
+        file_path: path of the PDF to read.
+
+    The process exits with status 1 when an existing output folder cannot be
+    removed. An image that cannot be read or saved is reported and skipped.
+    """
+    pdf_file = fitz.open(file_path)
+    try:
+        # Folder name = file name without its directories and last extension.
+        folderName = file_path.rsplit("/", 1)[-1].rsplit(".", 1)[0]
+        location = f"{os.getcwd()}/{folderName}_output"
+
+        # Start from an empty output folder.
+        if os.path.exists(location):
+            try:
+                shutil.rmtree(location)
+            except OSError:
+                print(f"\033[91mCouldn't Delete {location}, you'll need to manually delete it\033[0m")
+                sys.exit(1)
+        os.mkdir(location)
+
+        print("\n        Extracting Images from PDF        ")
+        print("--------------------------------------------")
+        for current_page_index in range(len(pdf_file)):
+            for img_index, img in enumerate(pdf_file.get_page_images(current_page_index)):
+                target = "{}/image{}-{}.png".format(location, current_page_index, img_index)
+                try:
+                    # The first item of each image entry is its xref number.
+                    image = fitz.Pixmap(pdf_file, img[0])
+                    # Anything that has a colorspace is converted to RGB before it
+                    # is written as PNG; pixmaps without one are saved unchanged.
+                    if image.colorspace is not None:
+                        image = fitz.Pixmap(fitz.csRGB, image)
+                    image.save(target)
+                except Exception as error:
+                    # Best effort: report the image and carry on with the next one.
+                    print(f"image{current_page_index}-{img_index} has invalid color space ({error})")
+
+        print(f"\nImages extracted to {location} folder")
+
+        # Text of every page, separated by form feeds.
+        with open(location + "/text_notFinal.txt", "wb") as out:
+            for page in pdf_file:
+                out.write(page.get_text().encode("utf8"))
+                out.write(bytes((12,)))  # page delimiter (form feed 0x0C)
+    finally:
+        pdf_file.close()
+
+
+if __name__ == "__main__":
+    # Create a Tk root for the file dialog and tear it down once a choice is made.
+    root = tkinter.Tk()
+    file = tkinter.filedialog.askopenfilename(filetypes=[("pdf files", "*.pdf")])
+    root.destroy()
+    # A cancelled dialog yields an empty value ("" or (), depending on the Tk
+    # build), so test for emptiness instead of comparing against () only.
+    if not file:
+        print("No File Selected")
+        sys.exit(0)
+    imageextraction(file)