Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added WriteUp.pdf
Binary file not shown.
Binary file added alice.pickle
Binary file not shown.
Binary file added bible.pickle
Binary file not shown.
Binary file added cant.pickle
Binary file not shown.
Binary file added don.pickle
Binary file not shown.
Binary file added frank.pickle
Binary file not shown.
Binary file added iliad.pickle
Binary file not shown.
Binary file added meta.pickle
Binary file not shown.
99 changes: 99 additions & 0 deletions mining.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import pickle
import nltk
import numpy
from nltk.tokenize import TweetTokenizer
import collections


class bookDater():
    """Estimate the publication era of an unknown text.

    Compares the unknown text's word-frequency profile against pickled
    reference books with known (approximate) publication dates and prints
    the date of the closest match.
    """

    def __init__(self):
        # Base names of pickled books and their approximate dates from
        # Project Gutenberg; the two lists are index-aligned.
        self.books = ['odyssey', 'iliad','bible', 'cant', 'shakes', 'don', 'pride', 'frank', 'alice', 'sherlock', 'wilde']
        self.times = ['~ 800BC', '~ 800BC', '~100', '~1400', '~1600', '~1600', '~1810', '~1820', '~1860', '~1890', '~1900']
        # Base name of the book whose age the program guesses.
        # NOTE(review): 'suprise' (sic) must match the filename written by
        # pickled.py — do not "fix" the spelling here alone.
        self.unknown = 'suprise'
        # Mean frequency distance to each reference book, as absolute
        # values so the closest book has the smallest score.
        self.over = numpy.absolute(self.allWords())
        # Plain-list copy retained for backward compatibility.
        self.a0 = self.over.tolist()
        # Index of the reference book with the minimum distance
        # (argmin replaces the original list.index(min(...)) round-trip).
        self.ind = int(numpy.argmin(self.over))
        # Report the estimated publication date and the raw distances.
        print(self.times[self.ind])
        print(self.over)

    def makeWords(self):
        """Build per-book word-frequency maps and token counts.

        Returns:
            (dictionaries, count): list of word->frequency Counters and a
            parallel list of total token counts, aligned with self.books.
        """
        count = []
        dictionaries = []
        # processText is stateless, so one instance serves every book.
        p = processText()
        for name in self.books:
            tokens = p.tokenizeText(p.openFile(name))
            dictionaries.append(p.wordFreq(tokens))
            count.append(len(tokens))
        return (dictionaries, count)

    def fgivenWord(self, word, dictionaries, count, u, ulen):
        """Relative frequency of `word` in each book and in the unknown text.

        Args:
            word: token to look up.
            dictionaries: per-book word-frequency mappings (Counters).
            count: per-book total token counts, aligned with dictionaries.
            u: word-frequency mapping of the unknown text.
            ulen: total token count of the unknown text.

        Returns:
            (freq, percentUnknown): list of per-book fractions, and the
            fraction of the unknown text made up of `word`.
        """
        # Counters return 0 for missing words, so absent tokens score 0.0.
        freq = [d[word] / c for d, c in zip(dictionaries, count)]
        percentUnknown = u[word] / ulen
        return (freq, percentUnknown)

    def allWords(self):
        """Mean signed frequency difference of each book vs. the unknown text.

        Averages, over every word of the unknown text, the difference
        between each book's relative frequency and the unknown's.

        Returns:
            1-D numpy array with one mean difference per reference book.
            (Replaces the original deprecated numpy.matrix return; callers
            no longer need the .A1 flatten.)
        """
        p = processText()
        tokens = p.tokenizeText(p.openFile(self.unknown))
        u = p.wordFreq(tokens)
        ulen = len(tokens)

        dicts, counts = self.makeWords()

        distance = []
        for word in u:
            f, k = self.fgivenWord(word, dicts, counts, u, ulen)
            distance.append(numpy.array(f) - k)
        # Column-wise mean: one averaged difference per reference book.
        return numpy.mean(numpy.array(distance), axis=0)

class processText():
    """Basic text-processing helpers: load pickled text, tokenize it, and
    count word frequencies."""

    # Translation table that deletes punctuation and digits everywhere in
    # the text (str.maketrans with a third argument maps chars to None).
    _DELETE = str.maketrans('', '', ',.?/1234567890~*()')

    def openFile(self, name):
        """Load the pickled text `<name>.pickle`, lowercase it, and delete
        punctuation/digit characters throughout.

        Bug fix: the original used str.strip(), which only trims characters
        from the *ends* of the whole text; str.translate() removes them
        everywhere, matching the stated intent of dropping non-alphabetic
        characters.

        NOTE(review): pickle.load on an untrusted file can execute arbitrary
        code — only unpickle files produced by the companion pickled.py.
        """
        pickledName = name + '.pickle'
        # 'with' guarantees the handle is closed (the original leaked it).
        with open(pickledName, 'rb') as opened:
            txt = pickle.load(opened)
        return txt.lower().translate(self._DELETE)

    def tokenizeText(self, t):
        """Split text `t` into a list of word tokens via NLTK's TweetTokenizer."""
        tknzr = TweetTokenizer()
        return tknzr.tokenize(t)

    def wordFreq(self, s):
        """Return a Counter mapping each word in `s` to its frequency.

        Counter yields 0 for absent words, which downstream lookups rely on.
        """
        return collections.Counter(s)


if __name__ == '__main__':
    # Script entry point: constructing bookDater runs the whole analysis
    # and prints the estimated publication date (all work is in __init__).
    bookDater()
Binary file added odyssey.pickle
Binary file not shown.
69 changes: 69 additions & 0 deletions pickled.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import requests
import pickle

# Save data to a file (will be part of your data fetching script)
# Map each reference-book base name to its Project Gutenberg plain-text URL.
# The key becomes the pickle filename consumed by mining.py: '<name>.pickle'.
BOOK_URLS = {
    'odyssey': 'https://www.gutenberg.org/files/48895/48895-0.txt',
    'iliad': 'http://www.gutenberg.org/cache/epub/6130/pg6130.txt',
    'bible': 'http://www.gutenberg.org/cache/epub/30/pg30.txt',
    'cant': 'http://www.gutenberg.org/cache/epub/2383/pg2383.txt',
    'shakes': 'http://www.gutenberg.org/cache/epub/2243/pg2243.txt',
    'don': 'http://www.gutenberg.org/cache/epub/996/pg996.txt',
    'pride': 'http://www.gutenberg.org/files/1342/1342-0.txt',
    'frank': 'http://www.gutenberg.org/cache/epub/84/pg84.txt',
    'alice': 'http://www.gutenberg.org/files/11/11-0.txt',
    'sherlock': 'http://www.gutenberg.org/cache/epub/1661/pg1661.txt',
    'wilde': 'http://www.gutenberg.org/cache/epub/844/pg844.txt',
    # 'meta': 'http://www.gutenberg.org/cache/epub/5200/pg5200.txt',  # disabled in original
    # Text whose date mining.py guesses; 'suprise' (sic) is depended on
    # by mining.py, so the spelling must not be corrected here alone.
    'suprise': 'http://www.gutenberg.org/cache/epub/147/pg147.txt',
}

# Fetch each text once and cache it locally as a pickle. 'with' guarantees
# every file handle is closed (the original's explicit close() calls would
# be skipped if requests or pickle raised mid-script).
for name, url in BOOK_URLS.items():
    text = requests.get(url).text
    with open(name + '.pickle', 'wb') as fh:
        pickle.dump(text, fh)
Binary file added pride.pickle
Binary file not shown.
Binary file added shakes.pickle
Binary file not shown.
Binary file added sherlock.pickle
Binary file not shown.
Binary file added suprise.pickle
Binary file not shown.
Binary file added wilde.pickle
Binary file not shown.