Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added WriteUp.pdf
Binary file not shown.
Binary file added alice.pickle
Binary file not shown.
Binary file added bible.pickle
Binary file not shown.
Binary file added cant.pickle
Binary file not shown.
Binary file added don.pickle
Binary file not shown.
Binary file added frank.pickle
Binary file not shown.
Binary file added iliad.pickle
Binary file not shown.
Binary file added meta.pickle
Binary file not shown.
99 changes: 99 additions & 0 deletions mining.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import pickle
import nltk
import numpy
from nltk.tokenize import TweetTokenizer
import collections


class bookDater():
    """Estimate the publication era of an unknown text.

    Compares the unknown text's word-frequency profile against pickled
    reference books with known (approximate) publication dates and prints
    the date of the closest match.
    """

    def __init__(self):
        # Base names of pickled books and their approximate dates from
        # Project Gutenberg; the two lists are index-aligned.
        self.books = ['odyssey', 'iliad','bible', 'cant', 'shakes', 'don', 'pride', 'frank', 'alice', 'sherlock', 'wilde']
        self.times = ['~ 800BC', '~ 800BC', '~100', '~1400', '~1600', '~1600', '~1810', '~1820', '~1860', '~1890', '~1900']
        # Base name of the book whose age the program guesses.
        # NOTE(review): 'suprise' (sic) must match the filename written by
        # pickled.py — do not "fix" the spelling here alone.
        self.unknown = 'suprise'
        # Mean frequency distance to each reference book, as absolute
        # values so the closest book has the smallest score.
        self.over = numpy.absolute(self.allWords())
        # Plain-list copy retained for backward compatibility.
        self.a0 = self.over.tolist()
        # Index of the reference book with the minimum distance
        # (argmin replaces the original list.index(min(...)) round-trip).
        self.ind = int(numpy.argmin(self.over))
        # Report the estimated publication date and the raw distances.
        print(self.times[self.ind])
        print(self.over)

    def makeWords(self):
        """Build per-book word-frequency maps and token counts.

        Returns:
            (dictionaries, count): list of word->frequency Counters and a
            parallel list of total token counts, aligned with self.books.
        """
        count = []
        dictionaries = []
        # processText is stateless, so one instance serves every book.
        p = processText()
        for name in self.books:
            tokens = p.tokenizeText(p.openFile(name))
            dictionaries.append(p.wordFreq(tokens))
            count.append(len(tokens))
        return (dictionaries, count)

    def fgivenWord(self, word, dictionaries, count, u, ulen):
        """Relative frequency of `word` in each book and in the unknown text.

        Args:
            word: token to look up.
            dictionaries: per-book word-frequency mappings (Counters).
            count: per-book total token counts, aligned with dictionaries.
            u: word-frequency mapping of the unknown text.
            ulen: total token count of the unknown text.

        Returns:
            (freq, percentUnknown): list of per-book fractions, and the
            fraction of the unknown text made up of `word`.
        """
        # Counters return 0 for missing words, so absent tokens score 0.0.
        freq = [d[word] / c for d, c in zip(dictionaries, count)]
        percentUnknown = u[word] / ulen
        return (freq, percentUnknown)

    def allWords(self):
        """Mean signed frequency difference of each book vs. the unknown text.

        Averages, over every word of the unknown text, the difference
        between each book's relative frequency and the unknown's.

        Returns:
            1-D numpy array with one mean difference per reference book.
            (Replaces the original deprecated numpy.matrix return; callers
            no longer need the .A1 flatten.)
        """
        p = processText()
        tokens = p.tokenizeText(p.openFile(self.unknown))
        u = p.wordFreq(tokens)
        ulen = len(tokens)

        dicts, counts = self.makeWords()

        distance = []
        for word in u:
            f, k = self.fgivenWord(word, dicts, counts, u, ulen)
            distance.append(numpy.array(f) - k)
        # Column-wise mean: one averaged difference per reference book.
        return numpy.mean(numpy.array(distance), axis=0)

class processText():
    """Basic text-processing helpers: load pickled text, tokenize it, and
    count word frequencies."""

    # Translation table that deletes punctuation and digits everywhere in
    # the text (str.maketrans with a third argument maps chars to None).
    _DELETE = str.maketrans('', '', ',.?/1234567890~*()')

    def openFile(self, name):
        """Load the pickled text `<name>.pickle`, lowercase it, and delete
        punctuation/digit characters throughout.

        Bug fix: the original used str.strip(), which only trims characters
        from the *ends* of the whole text; str.translate() removes them
        everywhere, matching the stated intent of dropping non-alphabetic
        characters.

        NOTE(review): pickle.load on an untrusted file can execute arbitrary
        code — only unpickle files produced by the companion pickled.py.
        """
        pickledName = name + '.pickle'
        # 'with' guarantees the handle is closed (the original leaked it).
        with open(pickledName, 'rb') as opened:
            txt = pickle.load(opened)
        return txt.lower().translate(self._DELETE)

    def tokenizeText(self, t):
        """Split text `t` into a list of word tokens via NLTK's TweetTokenizer."""
        tknzr = TweetTokenizer()
        return tknzr.tokenize(t)

    def wordFreq(self, s):
        """Return a Counter mapping each word in `s` to its frequency.

        Counter yields 0 for absent words, which downstream lookups rely on.
        """
        return collections.Counter(s)


if __name__ == '__main__':
    # Script entry point: constructing bookDater runs the whole analysis
    # and prints the estimated publication date (all work is in __init__).
    bookDater()
Binary file added odyssey.pickle
Binary file not shown.
69 changes: 69 additions & 0 deletions pickled.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import requests
import pickle

# Save data to a file (will be part of your data fetching script)
# Map each reference-book base name to its Project Gutenberg plain-text URL.
# The key becomes the pickle filename consumed by mining.py: '<name>.pickle'.
BOOK_URLS = {
    'odyssey': 'https://www.gutenberg.org/files/48895/48895-0.txt',
    'iliad': 'http://www.gutenberg.org/cache/epub/6130/pg6130.txt',
    'bible': 'http://www.gutenberg.org/cache/epub/30/pg30.txt',
    'cant': 'http://www.gutenberg.org/cache/epub/2383/pg2383.txt',
    'shakes': 'http://www.gutenberg.org/cache/epub/2243/pg2243.txt',
    'don': 'http://www.gutenberg.org/cache/epub/996/pg996.txt',
    'pride': 'http://www.gutenberg.org/files/1342/1342-0.txt',
    'frank': 'http://www.gutenberg.org/cache/epub/84/pg84.txt',
    'alice': 'http://www.gutenberg.org/files/11/11-0.txt',
    'sherlock': 'http://www.gutenberg.org/cache/epub/1661/pg1661.txt',
    'wilde': 'http://www.gutenberg.org/cache/epub/844/pg844.txt',
    # 'meta': 'http://www.gutenberg.org/cache/epub/5200/pg5200.txt',  # disabled in original
    # Text whose date mining.py guesses; 'suprise' (sic) is depended on
    # by mining.py, so the spelling must not be corrected here alone.
    'suprise': 'http://www.gutenberg.org/cache/epub/147/pg147.txt',
}

# Fetch each text once and cache it locally as a pickle. 'with' guarantees
# every file handle is closed (the original's explicit close() calls would
# be skipped if requests or pickle raised mid-script).
for name, url in BOOK_URLS.items():
    text = requests.get(url).text
    with open(name + '.pickle', 'wb') as fh:
        pickle.dump(text, fh)
Binary file added pride.pickle
Binary file not shown.
Binary file added shakes.pickle
Binary file not shown.
Binary file added sherlock.pickle
Binary file not shown.
Binary file added suprise.pickle
Binary file not shown.
Binary file added wilde.pickle
Binary file not shown.