-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWordFrequencyReader.py
More file actions
132 lines (82 loc) · 3.32 KB
/
WordFrequencyReader.py
File metadata and controls
132 lines (82 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
## Author: Jose F. Martinez Rivera
## Student Number: 802-10-4088
## Course: ICOM4036 - 040
## Professor: Dr. Wilson Rivera
## Hand-In Date: March 12, 2013
# 1. Write a program that counts the frequencies of each word in a text,
# and output each word with its count and line numbers where it appears.
# We define a word as a contiguous sequence of non-white-space characters.
# Different capitalizations of the same character sequence should be
# considered the same word (e.g. Python and python). The output is formatted as
# follows: each line begins with a number indicating the frequency of the word,
# a white space, then the word itself, and a list of line numbers containing this
# word. You should output from the most frequent word to the least frequent. In case
# two words have the same frequency, the lexicographically smaller one comes first.
# All words are in lower case in the output.
import re
#Global list (despite its name, not a dict) with one tuple per distinct word:
#(frequency, word, lineNumbers) - the number of occurrences counted so far, the
#lower-cased word itself, and the list of line numbers on which the word appears.
wordDictionary = []
def main():
    """Count the words of 'textfile.txt' and write the report.

    Reads the input file line by line, records every word of each line
    (with its 1-based line number) through addToDictionary, then orders
    the collected entries with sort() and writes them with printOut().
    """
    #Text File location
    fileInput = 'textfile.txt'
    #The with-statement closes the file even if processing a line raises
    with read(fileInput) as file:
        #We count line numbers from 1
        for lineNumber, text in enumerate(file, 1):
            wordsInText = match(text)
            addToDictionary(wordsInText, lineNumber)
    sort()
    printOut()
#Returns an open, read-only text handle for the requested location
def read(input):
    """Open the file at path ``input`` in text read mode and return it.

    The caller owns the returned file object and is responsible for
    closing it.
    """
    return open(input, 'r')
#Matches all the words (sequences of non-whitespace characters)
def match(text):
    """Return the list of words found in ``text``, in order of appearance.

    A word is a maximal run of non-whitespace characters, so punctuation
    attached to a word stays part of it. An empty or all-whitespace string
    yields an empty list.
    """
    #Raw string: in a plain literal "\S" is an invalid escape sequence,
    #which newer Python versions warn about
    return re.findall(r'\S+', text)
#Adds all the words found in one line of the text file to the dictionary
def addToDictionary(wordList, lineNumber):
    """Record every word of one line in the global ``wordDictionary``.

    wordList   -- the words of the line, in any capitalization
    lineNumber -- the number of the line the words came from

    Words are compared and stored in lower case. A word seen for the first
    time gets a new (1, word, [lineNumber]) entry appended; a word that is
    already present has its entry replaced by one with the frequency
    increased by one and ``lineNumber`` appended to its line list. A word
    occurring several times in ``wordList`` therefore adds ``lineNumber``
    once per occurrence.
    """
    #Map each lower-cased word to the position of its first entry, so a word
    #is found with one lookup instead of a scan of the whole list per word
    positions = {}
    for position, entry in enumerate(wordDictionary):
        positions.setdefault(entry[1].lower(), position)

    for rawWord in wordList:
        word = rawWord.lower()
        position = positions.get(word)
        if position is None:
            #First occurrence: the new entry goes to the end of the list
            positions[word] = len(wordDictionary)
            wordDictionary.append((1, word, [lineNumber]))
        else:
            #Known word: increase the frequency and add the line number
            frequency, _, lines = wordDictionary[position]
            lines.append(lineNumber)
            wordDictionary[position] = (frequency + 1, word, lines)
#Prints out the words found, their frequency and line position.
def printOut():
    """Write every entry of the global ``wordDictionary`` to 'output.txt'.

    The file is created or overwritten in the current working directory.
    Each entry becomes one line with three left-aligned, space-padded
    columns: the frequency (width 5), the word (width 20) and the list of
    line numbers (width 25). Entries are written in their current order.
    """
    #The with-statement closes the file even if a write fails
    with open('output.txt', 'w') as output:
        for entry in wordDictionary:
            output.write("{: <5}{: <20}{: <25}".format(str(entry[0]), entry[1], str(entry[2])))
            output.write("\n") #Prints out a newline
#Orders the dictionary for output
def sort():
    """Sort the global ``wordDictionary`` in place for the report.

    Entries are ordered from the highest frequency to the lowest; entries
    with the same frequency are ordered by word, lexicographically smaller
    first. The built-in sort is stable, so entries that compare equal keep
    their relative order.
    """
    #Negating the frequency gives descending counts while words stay ascending
    wordDictionary.sort(key=lambda entry: (-entry[0], entry[1]))
#Runs the whole program: read the text file, count the words, write the report.
#NOTE(review): this call is unguarded, so it also runs whenever the module is
#imported; consider wrapping it in an `if __name__ == "__main__":` guard.
main()