permeditted.py
import glob
import os
import re

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()
class Node:
    # One posting: the document ID, the term's frequency in that
    # document, and a pointer to the next posting.
    def __init__(self, DocID, freq=None):
        self.freq = freq
        self.doc = DocID
        self.nextval = None

class LinkedList:
    # A singly linked posting list, addressed through its head node.
    def __init__(self, head=None):
        self.head = head
def uniqueWordFreq(doc):
    # Map each stemmed token in the list to its frequency. Counting the
    # stems directly avoids the mismatch of counting raw tokens against
    # stemmed keys.
    freq = {}
    for word in doc:
        stem = ps.stem(word)
        freq[stem] = freq.get(stem, 0) + 1
    return freq
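
# A quick sanity check (sketch; the tokens are hypothetical examples):
# "running" and "runs" share the Porter stem "run", so they are counted
# together.
# uniqueWordFreq(["running", "runs", "cat"]) -> {"run": 2, "cat": 1}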

with open('Stopwords.txt') as f:
    Stopwords = f.read().split(", ")

wordsInDocs = {}
docFolder = 'C:\\Users\\Anshul\\Downloads\\dataset\\*'
DocID = 1
fileIndex = {}
# First pass: build the vocabulary and map each DocID to its file name.
for fname in glob.glob(docFolder):
    with open(fname, "r") as file:
        doc = file.read()
    doc = re.sub(r'[^a-zA-Z\s]', '', doc)
    words = word_tokenize(doc)
    words = [word.lower() for word in words if word not in Stopwords]
    wordsInDocs.update(uniqueWordFreq(words))
    fileIndex[DocID] = os.path.basename(fname)
    DocID = DocID + 1

uniqueWords = set(wordsInDocs.keys())
wordLinkedList = {}
for word in uniqueWords:
    # Start each posting list with a dummy head node.
    wordLinkedList[word] = LinkedList()
    wordLinkedList[word].head = Node(0)
DocID = 1
# Second pass: append a (DocID, frequency) node to each term's posting list.
for fname in glob.glob(docFolder):
    with open(fname, "r") as file:
        doc = file.read()
    doc = re.sub(r'[^a-zA-Z\s]', '', doc)
    words = word_tokenize(doc)
    words = [word.lower() for word in words if word not in Stopwords]
    wordsInDocs = uniqueWordFreq(words)
    for word in wordsInDocs.keys():
        current = wordLinkedList[word].head
        while current.nextval is not None:
            current = current.nextval
        current.nextval = Node(DocID, wordsInDocs[word])
    DocID = DocID + 1
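
# Sketch: reading a term's posting list back. 'caesar' is a hypothetical
# query term, not something the original script looks up. Skip the dummy
# head, then follow nextval pointers to enumerate (document, frequency).
# current = wordLinkedList[ps.stem('caesar')].head.nextval
# while current is not None:
#     print(fileIndex[current.doc], current.freq)
#     current = current.nextval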

# Write every rotation of each '$'-terminated term to the permuterm index,
# one "rotation term" pair per line.
def rot(s, n):
    return s[n:] + s[:n]

with open("PermutermIndex.txt", "w") as f:
    for key in uniqueWords:
        dockey = key + "$"
        for i in range(len(dockey), 0, -1):
            f.write(rot(dockey, i) + " " + key + "\n")
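
# Sketch: answering a single-wildcard query against the index built above.
# A query A*B is rotated so the wildcard lands at the end (B$A*); every
# rotation in PermutermIndex.txt with prefix B$A names a matching term.
# permutermLookup is a hypothetical helper, not part of the original script.
def permutermLookup(query):
    i = query.index('*')
    prefix = query[i + 1:] + '$' + query[:i]
    matches = []
    with open("PermutermIndex.txt") as idx:
        for line in idx:
            rotation, term = line.split()
            if rotation.startswith(prefix):
                matches.append(term)
    return matches

# e.g. permutermLookup('cae*') lists every indexed term starting with 'cae'.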