# textCleanUp.py

import re
import sys
from operator import itemgetter

import spacy
from spacy.attrs import ORTH

import fileFunctions

# Removes stop words, URLs, punctuation, numbers and symbols using spaCy.
# Walks each token of one tweet's text and appends the surviving words to two
# lists: one for the tweet itself and one for corpus-wide statistics.
def textCleanup(allWords, text, someList):
    for word in text:
        words = re.sub(r'@', "", str(word))
        if (not word.is_stop and not word.like_url and not word.is_punct
                and not word.like_num and words.isalpha() and len(words) > 1):
            someList.append(words)
            allWords.append(words.lower())
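
# A minimal usage sketch (an addition, not part of the original script): it
# assumes an installed English spaCy model, loaded with the spaCy 1.x-style
# name 'en' that this script targets, and a tokenised Doc, since the loop
# reads token attributes such as is_stop and like_url.
def demoTextCleanup():  # hypothetical helper, never called by the script
    nlp = spacy.load('en')
    doc = nlp(u"Check @food_agency https://t.co/abc123 for 10 hygiene tips !!")
    allWords = []
    tweetWords = []
    textCleanup(allWords, doc, tweetWords)
    # The URL, number and punctuation are dropped; '@' is stripped from the
    # mention before the isalpha() test.
    print(tweetWords)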

# Does a frequency count of words across the whole corpus using spaCy, and
# also builds a list of repeated words and a list of unique (once-only) words.
def frequencyCount(nlp, allWords, repeatedWords, uniqueWords, allWordsFrequency, counter):
    allWordsStr = ' '.join(allWords)
    doc = nlp(allWordsStr)
    counts = doc.count_by(ORTH)
    # Check how many times each word appears in the whole word list and put it
    # either in repeatedWords or in uniqueWords.
    for word_id, count in sorted(counts.items(), reverse=True, key=lambda item: item[1]):
        words = nlp.vocab.strings[word_id]
        frequencyTuple = (str(count), words.lower())
        frequencyTupleStr = ' '.join(frequencyTuple)
        allWordsFrequency.append(frequencyTupleStr)
        if count > 1:
            repeatedWordsTuple = (str(count), words.lower())
            repeatedWordsTupleStr = ' '.join(repeatedWordsTuple)
            repeatedWords.append(repeatedWordsTupleStr)
        else:
            uniqueWords.append(words.lower())
    # Generate txt files with the frequencies of all words and of the words
    # that appear at least twice.
    fileNameStringAllFreq = "allFrequencies"
    fileFunctions.writeTxtFrequencyFile(allWordsFrequency, fileNameStringAllFreq, counter)
    print("File with the frequencies of all words has been generated.")
    fileNameStringRep = "repeatedWords"
    fileFunctions.writeTxtFrequencyFile(repeatedWords, fileNameStringRep, counter)
    print("File with the frequencies of all repeated words has been generated.")

# Removes from every tweet any word that doesn't appear at least twice across
# the whole tweet corpus.
def removeUniqueWords(uniqueWords, allTweets, finalTweetTexts, finalTextCount):
    for tweet in allTweets:
        tweetText = []
        text = tweet[2]
        for word in text:
            words = str(word)
            if words not in uniqueWords:
                tweetText.append(words)
        # Build a row with the processed text and word count of each tweet for
        # the csv file.
        tweetTextS = str(tweetText)
        tweetTextCount = len(tweetText)
        tweetTextTuple = (tweet[0], tweetTextS, tweetTextCount)
        finalTextCount.append(tweetTextTuple)
        # Build a row with the tweet data plus the filtered text.
        tweetList = [tweet[0], tweet[1], tweetText, tweet[3]]
        finalTweetTexts.append(tweetList)
    print("The unique words have been removed from all the tweets.")

# Removes duplicate tweets (based on platform ID) and retweets*.
# * If we don't have the original tweet, one retweet per text can get through
#   the filtering: the retweet is stripped of the 'rt' marker and any mentions,
#   and the remaining string is compared against a list of original texts.
def removeDupsAndRetweets(row, location):
    twitterIDs = []
    origTexts = []
    retweetTexts = []
    rowS = []
    i = 0
    sortedRow = sorted(row, key=itemgetter(3))
    for r in sortedRow:
        i = i + 1
        if r[4] not in twitterIDs:
            twitterIDs.append(r[4])
            text = r[2].lower()
            textList = text.split()
            # Keep only purely alphabetic tokens (drops mentions, urls etc.),
            # then delete any 'rt' substrings before comparing texts.
            textStrList = [word for word in textList if word.isalpha()]
            textStrRt = ' '.join(textStrList)
            textStr = re.sub(r'rt', '', textStrRt)
            textStrStripped = textStr.strip()
            if textList[0] == "rt":
                if textStrStripped in origTexts:
                    retweetTexts.append(textStrStripped)
                else:
                    origTexts.append(textStrStripped)
                    rowS.append(r)
            elif textList[0] != "rt" and textStrStripped not in origTexts:
                origTexts.append(textStrStripped)
                rowS.append(r)
        if i % 10000 == 0:
            print("processed " + str(i) + " tweets")
    return rowS
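
# A hedged sketch of removeDupsAndRetweets (the 5-element rows are invented to
# match the indices used above: r[2] text, r[3] sort key, r[4] platform ID):
def demoRemoveDupsAndRetweets():  # hypothetical helper, never called by the script
    rows = [
        ['1', 'a', 'Great hygiene ratings', '2017-01-01', 'id1'],
        ['2', 'b', 'RT @a Great hygiene ratings', '2017-01-02', 'id2'],
        ['3', 'c', 'Great hygiene ratings', '2017-01-03', 'id1'],
    ]
    kept = removeDupsAndRetweets(rows, 'London')
    # The retweet matches the stripped original text and the third row repeats
    # platform ID 'id1', so only the original survives:
    print([r[0] for r in kept])   # ['1']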

# TODO: needs to be fixed so that tweets containing shorter keyword strings
# are filtered out properly.
# Checks a tweet's text for the filter keywords; abbreviation keywords (fhis,
# fhrs, fsa, fss) are only matched as whole alphabetic tokens.
def searchForKeywordCombos(filterKeywords, text, filterWords, nlp):
    filterCount = 0
    check = 0
    for word in filterKeywords:
        if str(word) != 'fhis' and str(word) != 'fhrs' and str(word) != 'fsa' and str(word) != 'fss':
            if word in text:
                if word not in filterWords:
                    filterCount += 1
                    filterWords.append(word)
                if filterCount >= 1:
                    check = 1
                else:
                    check = 2
        else:
            wordS = "o" + word + "t"
            if word in text:
                textList = text.split()
                for t in textList:
                    if t == word and t.isalpha() and wordS not in str(t):
                        if word not in filterWords:
                            filterCount += 1
                            filterWords.append(word)
                        if filterCount >= 1:
                            check = 1
                        else:
                            check = 2
    return check
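
# A hedged sketch of searchForKeywordCombos (nlp is accepted but unused by the
# function, so None is passed; the keywords and text are made up):
def demoSearchForKeywordCombos():  # hypothetical helper, never called by the script
    filterWords = []
    check = searchForKeywordCombos(['food hygiene', 'fsa'],
                                   'the food hygiene rating scheme', filterWords, None)
    # 'food hygiene' is found as a substring; 'fsa' is absent as a whole token.
    print(check, filterWords)   # 1 ['food hygiene']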

# Builds a csv pairing each tweet's original text and word count with its
# cleaned text and new word count.
def wordCountGen(wordCount, finalTextCount, counter):
    fileNameString = counter + "_wordCount"
    wordCountFinal = []
    for count in wordCount:
        tweetID = count[0]
        oldCount = count[2]
        origText = str(count[1]).encode(sys.stdout.encoding, errors='replace')
        for cleanCount in finalTextCount:
            if cleanCount[0] == tweetID:
                newText = str(cleanCount[1]).encode(sys.stdout.encoding, errors='replace')
                finalCountTuple = (tweetID, origText, oldCount, newText, cleanCount[2])
                wordCountFinal.append(finalCountTuple)
    fileFunctions.writeCsvFile(wordCountFinal, fileNameString)
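
# A hedged sketch of wordCountGen (assumes the project's fileFunctions module;
# the rows are invented to match the indices read above, pairing tweet '42's
# original 4-word text with its 2-word cleaned text):
def demoWordCountGen():  # hypothetical helper, never called by the script
    wordCount = [('42', 'Great hygiene ratings here', 4)]
    finalTextCount = [('42', "['hygiene', 'ratings']", 2)]
    wordCountGen(wordCount, finalTextCount, "1")
    # Writes the combined rows to a '1_wordCount' csv via fileFunctions.writeCsvFile.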