"""
The functions in this script are long and confusing. Unless you have additional characters to strip out or want to
redo the file, please don't edit the code, as it is easy to break. It only works well with some tables, so don't rely
on it for tables.
"""
6+
7+
# This pass runs one final time, after the main post-processing is done
def finalprocessing(string):
    """Join lines that were wrapped in the middle of a sentence.

    The text is scanned left to right and rewritten in place while scanning.
    Two patterns are collapsed into a single space:

    * a letter, digit or special character, then a space, then a newline,
      when the next line starts with a lowercase letter or a digit;
    * a full stop, then any whitespace character, then a newline, when the
      next line starts with an uppercase letter.

    Args:
        string: The text to clean up.

    Returns:
        The text with the matching " \\n" sequences replaced by one space.
        Inputs shorter than four characters are returned unchanged.
    """
    # NOTE(review): the blanks inside this literal are kept exactly as found;
    # confirm that a space is really meant to count as a "special" character.
    special_characters = "!@#$%^&*()+?_=,<>/\" \' [];-_–"

    i = 0
    # Both rules look three characters ahead, so stop while i + 3 is valid.
    while i + 3 < len(string):
        current = string[i]
        may_continue = (
            current.isupper()
            or current.islower()
            or current.isdigit()
            or current in special_characters
        )
        next_line_start = string[i + 3]

        # The newline is compared against "\n" alone: a single character can
        # never equal a longer literal, which would make both rules dead code.
        if (
            may_continue
            and string[i + 1] == " "
            and string[i + 2] == "\n"
            and (next_line_start.islower() or next_line_start.isdigit())
        ):
            string = string[:i + 1] + " " + string[i + 3:]
        elif (
            current == "."
            and string[i + 1].isspace()
            and string[i + 2] == "\n"
            and next_line_start.isupper()
        ):
            # "." never satisfies the first rule, so elif keeps both rules
            # mutually exclusive without changing which one can fire.
            string = string[:i + 1] + " " + string[i + 3:]

        i += 1

    return string
27+
28+
def _peek(text, index):
    """Return ``text[index]``, or ``""`` when ``index`` is past the end."""
    return text[index] if index < len(text) else ""


def _is_one_of(char, charset):
    """Return True when ``char`` is a non-empty character found in ``charset``."""
    # The emptiness test matters: ``"" in "abc"`` is True in Python.
    return bool(char) and char in charset


def PostProcessing(string):
    """Clean up extracted text by re-joining lines that were wrapped mid-sentence.

    Steps, in order:

    1. Remove "- " sequences and the space that follows a newline.
    2. Rewrite image markers of the form ``ð<n>Þ`` as ``Image <n> ``.
    3. Walk the text character by character and apply the join rules below.
       The rules are order-dependent and each one sees the text as left by
       the previous one, so do not reorder them.
    4. Replace ligature glyphs with plain letters, collapse double spaces,
       tidy "- " and "\\n )" sequences.
    5. Run ``finalprocessing`` over the result.

    Args:
        string: The raw text to clean up.

    Returns:
        The cleaned-up text.

    Side effects:
        Prints "Started Processing..." to standard output.
    """
    print("Started Processing...")

    # Characters that may end a line or start the next one and are treated
    # like a normal letter (everything except ".").
    # NOTE(review): the blanks inside this literal are kept exactly as found;
    # confirm that a space is really meant to count as a "special" character.
    special_characters = "!@#$%^&*()+?_=,<>/\" \' [];-_–"

    # Footnote markers used in tables.
    weird_characters = "†‡§"

    # A dash followed by a space marks a word that continues further on.
    string = string.replace("- ", "")

    # Drop the space after a newline so the rules below can rely on a line
    # starting directly with its first character.
    string = string.replace("\n ", "\n")

    # Image markers: every "ð" met while scanning triggers one replacement of
    # the marker carrying the current running number, then the number advances.
    marker_number = 1
    position = 0
    while position < len(string):
        if string[position] == "ð":
            string = string.replace(
                "ð" + str(marker_number) + "Þ", f"Image {marker_number} "
            )
            marker_number += 1
        position += 1

    i = 0
    while i < len(string):
        # Line stops at a comma and continues on a new line.
        # _peek keeps this safe when the comma is within the last two
        # characters, where a direct string[i + 2] would raise IndexError.
        # NOTE(review): this overwrites the comma itself with a space;
        # confirm that dropping the comma is intended.
        if string[i] == "," and _peek(string, i + 2) == "\n":
            string = string[:i] + " " + string[i + 1:]

        # Unless you are going to redo it, please don't change the order.
        if i + 2 < len(string):
            # Rules below may shorten the text, so later rules in the same
            # iteration read ahead through _peek instead of indexing directly.

            # Line ends with a space after anything but a full stop: join.
            if (
                (
                    string[i].islower()
                    or string[i].isdigit()
                    or _is_one_of(string[i], special_characters)
                )
                and _peek(string, i + 1) == " "
                and _peek(string, i + 2) == "\n"
            ):
                string = string[:i + 1] + " " + string[i + 3:]

            # Full stop, newline, then a special character: drop the newline.
            if (
                string[i] == "."
                and _peek(string, i + 1) == "\n"
                and _is_one_of(_peek(string, i + 2), special_characters)
            ):
                string = string[:i + 1] + string[i + 2:]

            # Table footers: footnote marker, newline, digit.
            if (
                _is_one_of(string[i], weird_characters)
                and _peek(string, i + 1) == "\n"
                and _peek(string, i + 2).isdigit()
            ):
                string = string[:i + 1] + " " + string[i + 2:]

            # Lowercase letter or digit, newline, special character.
            if (
                (string[i].islower() or string[i].isdigit())
                and _peek(string, i + 1) == "\n"
                and _is_one_of(_peek(string, i + 2), special_characters)
            ):
                string = string[:i + 1] + " " + string[i + 2:]

            # Digit, newline, then a lowercase letter or digit.
            # NOTE(review): the slice starts at string[:i], so the digit at i
            # is replaced by the space as well; confirm this is intended.
            if (
                string[i].isdigit()
                and _peek(string, i + 1) == "\n"
                and (
                    _peek(string, i + 2).islower()
                    or _peek(string, i + 2).isdigit()
                )
            ):
                string = string[:i] + " " + string[i + 2:]

            # Special character, newline, then a lowercase letter or digit.
            if (
                _is_one_of(string[i], special_characters)
                and _peek(string, i + 1) == "\n"
                and (
                    _peek(string, i + 2).islower()
                    or _peek(string, i + 2).isdigit()
                )
            ):
                string = string[:i + 1] + " " + string[i + 2:]

            # Letter, newline, digit: drop the newline without adding a space.
            if (
                (string[i].islower() or string[i].isupper())
                and _peek(string, i + 1) == "\n"
                and _peek(string, i + 2).isdigit()
            ):
                string = string[:i + 1] + string[i + 2:]

            # Hyphen at the end of a line: the word continues on the next
            # line, so remove both the hyphen and the newline.
            if string[i] == "-" and _peek(string, i + 1) == "\n":
                string = string[:i] + string[i + 2:]

            # Letter, newline, then a letter, special character or whitespace.
            if (
                (string[i].islower() or string[i].isupper())
                and _peek(string, i + 1) == "\n"
                and (
                    _peek(string, i + 2).islower()
                    or _peek(string, i + 2).isupper()
                    or _is_one_of(_peek(string, i + 2), special_characters)
                    or _peek(string, i + 2).isspace()
                )
            ):
                string = string[:i + 1] + " " + string[i + 2:]
        i += 1

    # Ligature glyphs replaced by their plain-letter spelling.
    # NOTE(review): the keys are written as explicit code points on the
    # assumption that they were the "fi" and "ff" ligature glyphs - confirm.
    # A third entry mapping to "ft" is left out because its key was empty, and
    # str.replace("", "ft") would insert "ft" between every character; restore
    # it once the intended glyph is known.
    char_switch = {
        "\ufb01": "fi",  # LATIN SMALL LIGATURE FI
        "\ufb00": "ff",  # LATIN SMALL LIGATURE FF
    }
    for word, replacement in char_switch.items():
        string = string.replace(word, replacement)

    # Final replacement of double spaces and of spaces after the dash of
    # compound names.
    string = string.replace("  ", " ")
    string = string.replace("- ", "-")
    string = string.replace("\n )", ")")

    finalString = finalprocessing(string)

    return finalString
135+
def fitzPostProcess(string):
    """Remove line breaks that follow trailing whitespace.

    The argument is converted with ``str()`` first. Every "-\\n " sequence
    (hyphen, newline, space) is deleted, then the text is scanned left to
    right: a newline that comes directly after a whitespace character and
    directly before a printable character is removed.

    Args:
        string: The value to clean up; anything accepted by ``str()``.

    Returns:
        The cleaned-up text.
    """
    string = str(string)
    string = string.replace("-\n ", "")

    i = 0
    # The rule looks two characters ahead, so stop while i + 2 is valid.
    while i + 2 < len(string):
        # The newline is compared against "\n" alone: a single character can
        # never equal a longer literal, which would make this loop dead code.
        if (
            string[i].isspace()
            and string[i + 1] == "\n"
            and string[i + 2].isprintable()
        ):
            string = string[:i + 1] + string[i + 2:]
        i += 1

    return string
0 commit comments