From 4832334dee3afddcea2c2882d932bb657f85d91b Mon Sep 17 00:00:00 2001 From: prrao87 Date: Wed, 5 Aug 2020 17:23:16 -0700 Subject: [PATCH 1/3] Fix bug: Lowercasing w/ lemmas in dependency bigrams works now --- corpus_toolkit_dev/corpus_tools.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/corpus_toolkit_dev/corpus_tools.py b/corpus_toolkit_dev/corpus_tools.py index bf75a42..37d5e87 100644 --- a/corpus_toolkit_dev/corpus_tools.py +++ b/corpus_toolkit_dev/corpus_tools.py @@ -30,7 +30,7 @@ dirsep = os.path.sep default_punct_list = [",",".","?","'",'"',"!",":",";","(",")","[","]","''","``","--"] #we can add more items to this if needed -default_space_list = ["\n","\t"," "," "," "] +default_space_list = ["\n","\t"," "," "," "] def doc_check(f_list,dirname,ending): if len(f_list) == 0: @@ -191,7 +191,7 @@ def write_corpus(new_dirname,corpus, dirname = False, ending = "txt"): outf.flush() outf.close() -ignore_list = [""," ", " ", " ", " "] #list of items we want to ignore in our frequency calculations +ignore_list = [""," ", " ", " ", " "] #list of items we want to ignore in our frequency calculations def frequency(corpus_list, ignore = ignore_list, calc = 'freq', normed = False): #options for calc are 'freq' or 'range' freq_dict = {} #empty dictionary @@ -430,11 +430,12 @@ def dicter(item,d): #d is a dictinoary dependent = token.text.lower() #then use the raw form of the word headt = token.head.text.lower() else: - dependent = token.lemma_ - headt = token.head.lemma_ - else: - dependent = token.lemma_ - headt = token.head.lemma_ + if lower: + dependent = token.lemma_.lower() + headt = token.head.lemma_.lower() + else: # If lower is false, don't lower + dependent = token.lemma_ + headt = token.head.lemma_ if lemma == False: #if lemma is false, use the token form if lower == True: #if lower is true, lower it From 2e62b0b31ef3465306043599178ed106e63003ef Mon Sep 17 00:00:00 2001 From: prrao87 Date: Wed, 5 Aug 2020 17:26:54 -0700 Subject: [PATCH 2/3] Fix space_list and ignore_list to be consistent with original version --- corpus_toolkit_dev/corpus_tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/corpus_toolkit_dev/corpus_tools.py b/corpus_toolkit_dev/corpus_tools.py index 37d5e87..107ff06 100644 --- a/corpus_toolkit_dev/corpus_tools.py +++ b/corpus_toolkit_dev/corpus_tools.py @@ -30,7 +30,7 @@ dirsep = os.path.sep default_punct_list = [",",".","?","'",'"',"!",":",";","(",")","[","]","''","``","--"] #we can add more items to this if needed -default_space_list = ["\n","\t"," "," "," "] +default_space_list = ["\n","\t"," "," "," "] def doc_check(f_list,dirname,ending): if len(f_list) == 0: @@ -191,7 +191,7 @@ def write_corpus(new_dirname,corpus, dirname = False, ending = "txt"): outf.flush() outf.close() -ignore_list = [""," ", " ", " ", " "] #list of items we want to ignore in our frequency calculations +ignore_list = [""," ", " ", " ", " "] #list of items we want to ignore in our frequency calculations def frequency(corpus_list, ignore = ignore_list, calc = 'freq', normed = False): #options for calc are 'freq' or 'range' freq_dict = {} #empty dictionary From d6b64d283e5b977ae5d89c71b9db74dce39dedc5 Mon Sep 17 00:00:00 2001 From: prrao87 Date: Wed, 5 Aug 2020 17:45:54 -0700 Subject: [PATCH 3/3] Forgot to include else block for -PRON- case (fixed now) --- corpus_toolkit_dev/corpus_tools.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/corpus_toolkit_dev/corpus_tools.py b/corpus_toolkit_dev/corpus_tools.py index 107ff06..6ccccd2 100644 --- a/corpus_toolkit_dev/corpus_tools.py +++ b/corpus_toolkit_dev/corpus_tools.py @@ -430,12 +430,15 @@ def dicter(item,d): #d is a dictinoary dependent = token.text.lower() #then use the raw form of the word headt = token.head.text.lower() else: - if lower: + if lower == True: dependent = token.lemma_.lower() headt = token.head.lemma_.lower() else: # If lower is false, don't lower dependent = token.lemma_ headt = token.head.lemma_ + else: #if we want Spacy's pronoun lemma + dependent = token.lemma_ + headt = token.head.lemma_ if lemma == False: #if lemma is false, use the token form if lower == True: #if lower is true, lower it