diff --git a/WriteUp.pdf b/WriteUp.pdf
new file mode 100644
index 0000000..57b6668
Binary files /dev/null and b/WriteUp.pdf differ
diff --git a/alice.pickle b/alice.pickle
new file mode 100644
index 0000000..6f33832
Binary files /dev/null and b/alice.pickle differ
diff --git a/bible.pickle b/bible.pickle
new file mode 100644
index 0000000..e3677c9
Binary files /dev/null and b/bible.pickle differ
diff --git a/cant.pickle b/cant.pickle
new file mode 100644
index 0000000..47ebd7b
Binary files /dev/null and b/cant.pickle differ
diff --git a/don.pickle b/don.pickle
new file mode 100644
index 0000000..69e3dde
Binary files /dev/null and b/don.pickle differ
diff --git a/frank.pickle b/frank.pickle
new file mode 100644
index 0000000..812272b
Binary files /dev/null and b/frank.pickle differ
diff --git a/iliad.pickle b/iliad.pickle
new file mode 100644
index 0000000..3f9b04e
Binary files /dev/null and b/iliad.pickle differ
diff --git a/meta.pickle b/meta.pickle
new file mode 100644
index 0000000..d0de415
Binary files /dev/null and b/meta.pickle differ
diff --git a/mining.py b/mining.py
new file mode 100644
index 0000000..b12cf50
--- /dev/null
+++ b/mining.py
@@ -0,0 +1,114 @@
+"""Estimate when a book was written by comparing word frequencies.
+
+The word frequencies of an unknown text are compared against pickled
+reference books from Project Gutenberg whose dates are known.
+"""
+import pickle
+import nltk
+import numpy
+from nltk.tokenize import TweetTokenizer
+import collections
+
+
+class bookDater():
+    """Guess the publication date of an unknown book from its word frequencies."""
+
+    def __init__(self):
+        """Compare the unknown book with every reference book and print the guess."""
+        #list names of pickled books and corresponding dates from Project Gutenberg
+        #(the two lists are index-aligned)
+        self.books = ['odyssey', 'iliad', 'bible', 'cant', 'shakes', 'don', 'pride', 'frank', 'alice', 'sherlock', 'wilde']
+        self.times = ['~ 800BC', '~ 800BC', '~100', '~1400', '~1600', '~1600', '~1810', '~1820', '~1860', '~1890', '~1900']
+        #name of book that program is guessing age
+        self.unknown = 'suprise'
+        #absolute mean frequency difference to each book, flattened into a
+        #1-D array with one value per entry of self.books
+        self.over = numpy.absolute(numpy.asarray(self.allWords()).ravel())
+        self.a0 = self.over.tolist()
+        #index of the reference book with the minimum difference
+        self.ind = int(numpy.argmin(self.over))
+        #prints estimated publication date and differences
+        print(self.times[self.ind])
+        print(self.over)
+
+    def makeWords(self):
+        """Return (dictionaries, count) for the reference books.
+
+        dictionaries holds one word-frequency Counter per book and count
+        holds the number of tokens in each book, both in self.books order.
+        """
+        p = processText()
+        dictionaries = []
+        count = []
+        for book in self.books:
+            sText = p.tokenizeText(p.openFile(book))
+            dictionaries.append(p.wordFreq(sText))
+            count.append(len(sText))
+        return (dictionaries, count)
+
+    def fgivenWord(self, word, dictionaries, count, u, ulen):
+        """Return (freq, percentUnknown) for one word.
+
+        freq lists, for each reference book, the fraction of its tokens that
+        are word; percentUnknown is the same fraction for the unknown book.
+        """
+        freq = [bookDict[word] / total for bookDict, total in zip(dictionaries, count)]
+        percentUnknown = u[word] / ulen
+        return (freq, percentUnknown)
+
+    def allWords(self):
+        """Return the mean frequency difference to each reference book.
+
+        For every distinct word of the unknown text the difference between
+        its frequency in each reference book and in the unknown text is
+        taken; the result is the mean of those differences over all words,
+        as a 1 x len(self.books) numpy matrix.
+
+        Raises ValueError if the unknown text has no tokens.
+        """
+        #opens and tokenizes unknown text and determines word frequency
+        p = processText()
+        u0 = p.tokenizeText(p.openFile(self.unknown))
+        u = p.wordFreq(u0)
+        ulen = len(u0)
+        if not ulen:
+            #nothing to compare, and the frequencies would divide by zero
+            raise ValueError('unknown text contains no words')
+
+        dicts, counts = self.makeWords()
+
+        distance = []
+        for word in u:
+            f, k = self.fgivenWord(word, dicts, counts, u, ulen)
+            distance.append(numpy.array(f) - k)
+        mat = numpy.matrix(distance)
+        return numpy.mean(mat, axis=0)
+
+
+class processText():
+    """Basic word processing for the pickled books."""
+
+    #characters deleted from every text before it is tokenized
+    _removed = str.maketrans('', '', ',.?/1234567890~*()')
+
+    def openFile(self, name):
+        """Load the pickle called name + '.pickle' and return its text lowercased and cleaned."""
+        #NOTE: pickle.load can run arbitrary code, so only open pickle files
+        #that were created locally (for example by pickled.py)
+        with open(name + '.pickle', 'rb') as opened:
+            txt = pickle.load(opened)
+        #str.strip only trims the ends of the string, so translate is used to
+        #delete the digits and punctuation everywhere in the text
+        return txt.lower().translate(self._removed)
+
+    def tokenizeText(self, t):
+        """Return t split into a list with one token per element."""
+        return TweetTokenizer().tokenize(t)
+
+    def wordFreq(self, s):
+        """Return a Counter mapping each token of s to its frequency."""
+        return collections.Counter(s)
+
+
+if __name__ == '__main__':
+    bookDater()
diff --git a/odyssey.pickle b/odyssey.pickle
new file mode 100644
index 0000000..c7ee610
Binary files /dev/null and b/odyssey.pickle differ
diff --git a/pickled.py b/pickled.py
new file mode 100644
index 0000000..4ba0f76
--- /dev/null
+++ b/pickled.py
@@ -0,0 +1,35 @@
+"""Download the books used by mining.py and pickle their text."""
+import requests
+import pickle
+
+#name of each pickle file (without extension) and the Project Gutenberg
+#plain-text URL it is downloaded from
+books = (
+    ('odyssey', 'https://www.gutenberg.org/files/48895/48895-0.txt'),
+    ('iliad', 'http://www.gutenberg.org/cache/epub/6130/pg6130.txt'),
+    ('bible', 'http://www.gutenberg.org/cache/epub/30/pg30.txt'),
+    ('cant', 'http://www.gutenberg.org/cache/epub/2383/pg2383.txt'),
+    ('shakes', 'http://www.gutenberg.org/cache/epub/2243/pg2243.txt'),
+    ('don', 'http://www.gutenberg.org/cache/epub/996/pg996.txt'),
+    ('pride', 'http://www.gutenberg.org/files/1342/1342-0.txt'),
+    ('frank', 'http://www.gutenberg.org/cache/epub/84/pg84.txt'),
+    ('alice', 'http://www.gutenberg.org/files/11/11-0.txt'),
+    ('sherlock', 'http://www.gutenberg.org/cache/epub/1661/pg1661.txt'),
+    ('wilde', 'http://www.gutenberg.org/cache/epub/844/pg844.txt'),
+    #('meta', 'http://www.gutenberg.org/cache/epub/5200/pg5200.txt'),
+    ('suprise', 'http://www.gutenberg.org/cache/epub/147/pg147.txt'),
+)
+
+
+def saveBook(name, url):
+    """Download the text at url and pickle it to the file name + '.pickle'."""
+    response = requests.get(url, timeout=60)
+    #fail loudly instead of pickling an HTTP error page as if it were a book
+    response.raise_for_status()
+    with open(name + '.pickle', 'wb') as pickled:
+        pickle.dump(response.text, pickled)
+
+
+# Save data to a file (will be part of your data fetching script)
+for name, url in books:
+    saveBook(name, url)
diff --git a/pride.pickle b/pride.pickle
new file mode 100644
index 0000000..4768db0
Binary files /dev/null and b/pride.pickle differ
diff --git a/shakes.pickle b/shakes.pickle
new file mode 100644
index 0000000..c43b209
Binary files /dev/null and b/shakes.pickle differ
diff --git a/sherlock.pickle b/sherlock.pickle
new file mode 100644
index 0000000..684bb23
Binary files /dev/null and b/sherlock.pickle differ
diff --git a/suprise.pickle b/suprise.pickle
new file mode 100644
index 0000000..325f9e7
Binary files /dev/null and b/suprise.pickle differ
diff --git a/wilde.pickle b/wilde.pickle
new file mode 100644
index 0000000..b7df249
Binary files /dev/null and b/wilde.pickle differ