<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
<meta name="generator" content="pdoc 0.10.0" />
<title>BooleanIR API documentation</title>
<meta name="description" content="" />
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity="sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin>
<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity="sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin>
<link rel="stylesheet preload" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin>
<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > 
span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity="sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin></script>
<script>window.addEventListener('DOMContentLoaded', () => hljs.initHighlighting())</script>
</head>
<body>
<main>
<article id="content">
<header>
<h1 class="title">Module <code>BooleanIR</code></h1>
</header>
<section id="section-intro">
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">import numpy as np, glob, re, os, nltk, sys
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.metrics.distance import edit_distance
class Node:
""" Class which defines a node of the linked list.
Each node has a document ID along with the frequency
which stores the frequency of the term in the document
with the respective document ID """
def __init__(self ,DocID, freq = None): #Constructor to initialize node
self.freq = freq #with frequency of a term freq in
self.doc = DocID #a particular document having DocID as the document ID
self.nextval = None #the next pointer is initialised to None
class LinkedList:
""" Class to store the frequency of the term in a
particular document ID only if the document
contains the word at least once """
def __init__(self ,head = None): #Constructor to define a linked list
self.head = head #in which the head points to the first element of the linked list
def uniqueWordFreq(doc):
""" Function to find all the unique words in the document
passed as a parameter, then calculate the frequency
of each unique word in the document and return it """
uniqueWords = [] #List of the unique words found in this document
freq = {} #Dictionary mapping each unique word to its frequency in the document
for word in doc:
if word not in uniqueWords:
ps.stem(word) #Stemming via nltk's PorterStemmer (return value unused; tokens arrive pre-stemmed)
lemmatizer.lemmatize(word) #Lemmatization via nltk's WordNetLemmatizer (return value unused; tokens arrive pre-lemmatized)
uniqueWords.append(word) #Adding a word to the uniqueWords list if it did not exist in the list
for word in uniqueWords:
freq[word] = doc.count(word) #Calculating the frequency of a term in the document
return freq
def rot(str,n):
""" Function to rotate the string passed as a parameter
by n places and then return it. It is used to
calculate all the permuterm combinations possible
of the string that is passed as a parameter. """
return str[n:]+str[:n] #Returns the string after performing a left rotation of n places
Stopwords = set(stopwords.words('english')) #Creating a set of all the stop words in the English language
ps = PorterStemmer() #Defining the stemmer to use it to perform stemming on the words in the document and the query
lemmatizer = WordNetLemmatizer() #Defining the lemmatizer to use it to perform lemmatization on the words in the document and the query
""" Iterate through the list of documents in the folder to find
all the unique words present after deleting numbers and
special characters. Ignore the stopwords while finding the
unique words. Creates a dictionary of the document ID and
the respective document name. """
wordsInDocs = {} #Dictionary to store the unique words in all the documents as the keys and their frequencies as the values
docFolder = 'C:/Users/KHOOSHRIN/Documents/PythonPrograms/DataSetFiles/*' #Path for all the documents in the retrieval system
DocID = 1
fileIndex = {} #Dictionary to store the document ID as the key and the file name as the value
for file in glob.glob(docFolder): #Iterating through all files in the folder
fname = file #To store the name of the file
file = open(file , "r") #Granting only reading permissions
doc = file.read() #Reading all the text in the document
regex = re.compile(r'[^a-zA-Z\s]') #Raw-string regex matching every character except letters and whitespace
doc = re.sub(regex,'',doc) #Removing all digits and special characters from the document
words = word_tokenize(doc) #Tokenizing the words in the document to get all the unique words in the document
words = [word for word in words if word not in Stopwords] #Eliminating the stopwords from the document
words = [word.lower() for word in words] #Converting all the words to lower case to maintain uniformity
words = [ps.stem(word) for word in words] #Stemming all the unique non-stopwords in the document
words = [lemmatizer.lemmatize(word) for word in words] #Lemmatizing all the unique non-stopwords in the document
wordsInDocs.update(uniqueWordFreq(words)) #Invoking function to calculate the frequency of the unique words and store it in the dictionary
fileIndex[DocID] = os.path.basename(fname) #Storing the corresponding file name and document ID in a dictionary
DocID = DocID + 1 #Incrementing to the next document ID
uniqueWords = set(wordsInDocs.keys()) #Set of all the unique words across the documents; their permuterm combinations are added later
""" Iterate through the list of unique words stemming and lemmatizing
each term to create the linked list for each term and then find all
the permuterms for the given term and copy the same linked list
for the permuterms. This helps in wildcard query handling. """
wordLinkedList = {} #Linked list of each term in the document which stores the Document ID of the document
#permuterm = {}
#termPermuterm={}
for word in uniqueWords: #Iterating through all the unique words across the documents
wordLinkedList[word] = LinkedList() #Initialising a linked list for each unique word
wordLinkedList[word].head = Node(1, None) #Dummy head node; real postings are appended after it
DocID = 1
for file in glob.glob(docFolder): #Iterating through all files in the folder
file = open(file, "r") #Granting only reading permissions
doc = file.read() #Reading all the text in the document
regex = re.compile(r'[^a-zA-Z\s]') #Raw-string regex matching every character except letters and whitespaces
doc = re.sub(regex,'',doc) #Removing all digits and special characters from the document
words = word_tokenize(doc) #Tokenizing the words in the document to get all the unique words in the document
words = [word for word in words if word not in Stopwords] #Eliminating the stopwords from the document
words = [word.lower() for word in words] #Converting all the words to lower case to maintain uniformity
words = [ps.stem(word) for word in words] #Stemming all the unique non-stopwords in the document
words = [lemmatizer.lemmatize(word) for word in words] #Lemmatizing all the unique non-stopwords in the document
wordsInDocs=uniqueWordFreq(words) #Invoking function to calculate the frequency of the unique words in this document
for word in wordsInDocs.keys(): #Iterating through each unique word to create its linked list
current = wordLinkedList[word].head #Initialising pointer to point at the head of the linked list for that unique word
while current.nextval is not None: #Traversing through the nodes to reach the last node
current = current.nextval
current.nextval = Node(DocID ,wordsInDocs[word]) #Adding a node at the end indicating the document ID and the frequency of the word in that document
for i in range(len(word+"$"),0,-1): #Iterating through the length of the unique word
pterm = rot(word+"$",i) #Invoking function to create the permuterm and then store it
uniqueWords.add(pterm) #Adding all combinations of the permuterm to the list of unique words
wordLinkedList[pterm] = wordLinkedList[word] #The permuterms of the unique word share the same linked list as the word itself
DocID = DocID + 1 #Incrementing to the next document ID
""" Accepting query as input from the user and splitting
the query into boolean words (and, or, not) and
query words(all other words with the exception of the
three boolean words). """
booleanQuery = input('Enter your query:') #Prompting user to enter a query and then accepting and storing the query entered by the user
regex = re.compile(r'[^a-zA-Z*\s]') #Raw-string regex matching every character except letters, the wildcard character and whitespace
booleanQuery = re.sub(regex,'',booleanQuery) #Removing all digits and special characters except '*' from the query
query = booleanQuery.split() #Tokenizing the words in the query to get all the unique words in the query
queryWords = [] #List to store the query words in the query entered by the user
booleanWords = [] #List to store the boolean words (and, or, not) entered by the user in the query
for word in query: #Iterating through all the words in the query
if word.lower() != "and" and word.lower() != "or" and word.lower() != "not": #If the word is not a boolean word
queryWords.append(word.lower()) #Add the word to the list of query words
else:
booleanWords.append(word.lower()) #Add the word to the list of boolean words
""" Perform stemming and lemmatization on each query
word, then append a '$' to the end of the word to
mark where the word ends. """
queryWords = [ps.stem(word) for word in queryWords] #Stemming all the query words in the query
queryWords = [lemmatizer.lemmatize(word) for word in queryWords] #Lemmatizing all the query words in the query
queryWords = [word+"$" for word in queryWords] #Adding a '$' at the end of the query word signifying the end of the query word
""" Performing a spell check and correction on all the query
words if the spelling is wrong. This is done by comparing
the edit distance between the query words with all the
unique words across all the documents. The word is then
replaced by the word which has the minimum edit distance.
If the query word exists in the documents, the minimum edit
distance is zero and the word remains unchanged. """
countQueryWords = 0 #Counter to count the number of query words
for word in queryWords: #Iterating through all the words in query words
distance = -1 #int variable to store edit distance
minDistance = sys.maxsize #int variable to store minimum edit distance
minWord = "" #string variable to store the word having the minimum edit distance
for w in uniqueWords: #Iterating through all the words in unique words
distance = edit_distance(word,w) #Calculate the edit distance between the current query word and every word in unique words
if distance &lt; minDistance : #Replacing the minimum word and minimum edit distance
minDistance = distance #if the calculated edit distance is smaller
minWord = w
queryWords.remove(word) #Removing the misspelt word from the list of query words
word = minWord #The correctly spelt word is the unique word with the minimum edit distance from the original word, so it replaces it
queryWords.insert(countQueryWords,word) #The rightly spelt word is added into the query words list
countQueryWords = countQueryWords + 1 #Incrementing the count of query words
""" In case the query is a wildcard query, we find its permuterms
by performing rotations until '*' is the last character. We
then replace the query word with its permuterm that has '*'
as the last character which helps in wildcard query processing. """
countQueryWords = 0 #Counter to count the number of query words
for word in queryWords: #Iterating through all the words in query words
for i in range(len(word),0,-1): #Iterating through the length of the string to create the permuterms
pterm = rot(word,i) #Invoking function to create the permuterm and then store it
if pterm[-1]=='*': #Checking if this permuterm of the query word has '*' as its last character
queryWords.remove(word) #If true then the original word is removed
queryWords.insert(countQueryWords,pterm) #It is replaced by its permuterm having '*' as the last character
countQueryWords = countQueryWords + 1 #Incrementing the count of query words
TermDocumentValue = [] #List to store the vector of each term across all documents
TermDocumentIncidenceMatrix = [] #List to store the vectors of all terms across all documents, thereby forming a matrix
#PermuTermIncidenceMatrix = []
""" The term document incidence matrix is created by first creating
the vector for that term across all documents using the linked
list created for that term. The vector which is in the form of
list is then added to another list which contains the vectors
for all query words. In this way, the term document incidence
matrix is created for all the query terms. """
for word in queryWords: #Iterating through all the words in query words
if word[-1] == '*': #Checking if the query word has '*' as its last character
TermDocumentValue = [0] * len(fileIndex) #Initialising the vector for the query to term to be zero for all documents
for uniqueWord in uniqueWords: #Iterating through all the words in unique words
if uniqueWord.lower().startswith(word[:len(word)-1]): #Checking if a unique word matches the prefix of the wildcard query's permuterm
doc = wordLinkedList[uniqueWord].head #If true then a pointer which points to the head of the linked list of all such matching unique words is created
while doc.nextval is not None: #While there is a next node in the postings list
TermDocumentValue[doc.nextval.doc - 1] = 1 #then replace the initialised 0 by 1 as it exists in that document
doc = doc.nextval #Incrementing to the next node in a linked list
TermDocumentIncidenceMatrix.append(TermDocumentValue) #Adding the vector to the incidence matrix
elif word.lower() in uniqueWords: #If the word does not have '*' as its last character, check whether it is present in the unique words list
TermDocumentValue = [0] * len(fileIndex) #Initialising the vector for the query to term to be zero for all documents
doc = wordLinkedList[word].head #Pointer which points to the head of the linked list of the word
while doc.nextval is not None: #While there is a next node in the postings list
TermDocumentValue[doc.nextval.doc - 1] = 1 #then replace the initialised 0 by 1 as it exists in that document
doc = doc.nextval #Incrementing to the next node in a linked list
TermDocumentIncidenceMatrix.append(TermDocumentValue) #Adding the vector to the incidence matrix
""" Applies the unary not operator on the relevant query term.
This is used to invert the values present in the vector
for that term. The uninverted vector is then deleted from
incidence matrix and the inverted vector is inserted into
it at the same index. """
countQueryWords = 0 #Counter to count the number of query words
for word in booleanWords: #Iterating through all the words in boolean words
if word == "not" : #If the boolean term is not
#print(countQueryWords)
list1 = TermDocumentIncidenceMatrix[countQueryWords] #A copy is made of the vector of the query word on which the not operator is to be applied
res = [] #To store the vector after the not operation has been carried out
for doc in list1 : #for every int value in the vector list
if doc == 0 : #if the value is zero
res.append(1) #the value 1 is added to the resultant vector
else :
res.append(0) #else the value 0 is added to the resultant vector
TermDocumentIncidenceMatrix.remove(list1) #Remove the uninverted vector from the incidence matrix
TermDocumentIncidenceMatrix.insert(countQueryWords, res) #Add the inverted vector at the same index position
TermDocumentValue = res
booleanWords.remove(word) #Removing the not from the list of boolean words as its operation has been carried out
else :
countQueryWords = countQueryWords + 1 #Incrementing the count of query words
""" Used to perform the 'and' and 'or' boolean query operations.
Two lists are created which store the first 2 rows of the
incidence matrix. Depending on the boolean word either
bitwise and operation or bitwise or operation is applied
on the query word vectors. The result is then replaced as
the first row of the incidence matrix and the second row is deleted. """
for word in booleanWords: #Iterating through all the words in boolean words
list1 = TermDocumentIncidenceMatrix[0] #In a separate list store the first vector in the incidence matrix
list2 = TermDocumentIncidenceMatrix[1] #In a separate list store the second vector in the incidence matrix
if word == "and": #Check if the boolean word entered is and
res = [w1 & w2 for (w1,w2) in zip(list1,list2)] #Performing bitwise and on the two vectors and storing their result
TermDocumentIncidenceMatrix.remove(list1) #Removing the first vector from the incidence matrix
TermDocumentIncidenceMatrix.remove(list2) #Removing the second vector from the incidence matrix
TermDocumentIncidenceMatrix.insert(0, res) #Inserting the resultant vector at the first position
elif word == "or": #Check if the boolean word entered is or
res = [w1 | w2 for (w1,w2) in zip(list1,list2)] #Performing bitwise or on the two vectors and storing their result
TermDocumentIncidenceMatrix.remove(list1) #Removing the first vector from the incidence matrix
TermDocumentIncidenceMatrix.remove(list2) #Removing the second vector from the incidence matrix
TermDocumentIncidenceMatrix.insert(0, res) #Inserting the resultant vector at the first position
""" The final result is calculated and stored in the first row
of the incidence matrix. This list is then iterated through
and whenever the value is 1, it implies that the document
satisfies the given boolean query and its name is displayed.
If the value is 0, it skips to the value of the next document
in the resultant vector. """
result = TermDocumentIncidenceMatrix[0] #Storing the resultant vector in a list
DocID = 1 #int variable to store document ID
for index in result: #for every int value in the resultant vector list
if index == 1: #if the value is 1 then that document satisfies the boolean query entered
print(fileIndex[DocID]) #Name of the file is printed out
DocID = DocID+1</code></pre>
</details>
</section>
<section>
</section>
<section>
<h2 class="section-title" id="header-variables">Global variables</h2>
<dl>
<dt id="BooleanIR.TermDocumentIncidenceMatrix"><code class="name">var <span class="ident">TermDocumentIncidenceMatrix</span></code></dt>
<dd>
<div class="desc"><p>The term document incidence matrix is created by first creating
the vector for that term across all documents using the linked
list created for that term. The vector which is in the form of
list is then added to another list which contains the vectors
for all query words. In this way, the term document incidence
matrix is created for all the query terms.</p></div>
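<p>As an illustration (with hypothetical postings, not the driver's actual data), a minimal sketch of how such 0/1 vectors are built and combined; <code>min</code> and <code>max</code> act as bitwise AND and OR on 0/1 values:</p>
<pre><code class="python"># Hypothetical postings: term -> set of DocIDs, over four documents
postings = {"brutus": {1, 2, 4}, "caesar": {1, 3, 4}}
num_docs = 4

def incidence_vector(term):
    """0/1 vector over DocIDs 1..num_docs for a term."""
    docs = postings.get(term, set())
    return [1 if d in docs else 0 for d in range(1, num_docs + 1)]

v1 = incidence_vector("brutus")                # [1, 1, 0, 1]
v2 = incidence_vector("caesar")                # [1, 0, 1, 1]
and_vec = [min(a, b) for a, b in zip(v1, v2)]  # brutus AND caesar: [1, 0, 0, 1]
or_vec = [max(a, b) for a, b in zip(v1, v2)]   # brutus OR caesar:  [1, 1, 1, 1]</code></pre>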
</dd>
<dt id="BooleanIR.lemmatizer"><code class="name">var <span class="ident">lemmatizer</span></code></dt>
<dd>
<div class="desc"><p>Iterate through the list of documents in the folder to find
all the unique words present after deleting numbers and
special characters. Ignore the stopwords while finding the
unique words. Creates a dictionary of the document ID and
the respective document name.</p></div>
</dd>
<dt id="BooleanIR.queryWords"><code class="name">var <span class="ident">queryWords</span></code></dt>
<dd>
<div class="desc"><p>Performing a spell check and correction on all the query
words if the spelling is wrong. This is done by comparing
the edit distance between the query words with all the
unique words across all the documents. The word is then
replaced by the word which has the minimum edit distance.
If the query word exists in the documents, the minimum edit
distance is zero and the word remains unchanged.</p></div>
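<p>A self-contained sketch of this correction step; a small dynamic-programming Levenshtein function stands in here for nltk's <code>edit_distance</code>, and the vocabulary is a made-up stand-in for <code>uniqueWords</code>:</p>
<pre><code class="python">def edit_distance(a, b):
    """Classic dynamic-programming Levenshtein distance."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
        prev = curr
    return prev[-1]

vocabulary = {"retrieval", "boolean", "query", "document"}  # stand-in for uniqueWords

def correct(word):
    """Replace word by the vocabulary term with the smallest edit distance."""
    return min(vocabulary, key=lambda w: edit_distance(word, w))

correct("retreival")  # "retrieval" (distance 2)
correct("query")      # unchanged: distance 0 to itself</code></pre>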
</dd>
<dt id="BooleanIR.uniqueWords"><code class="name">var <span class="ident">uniqueWords</span></code></dt>
<dd>
<div class="desc"><p>Iterate through the list of unique words stemming and lemmatizing
each term to create the linked list for each term and then find all
the permuterms for the given term and copy the same linked list
for the permuterms. This helps in wildcard query handling.</p></div>
</dd>
</dl>
</section>
<section>
<h2 class="section-title" id="header-functions">Functions</h2>
<dl>
<dt id="BooleanIR.rot"><code class="name flex">
<span>def <span class="ident">rot</span></span>(<span>str, n)</span>
</code></dt>
<dd>
<div class="desc"><p>Function to rotate the string passed as a parameter
by n places and then return it. It is used to
calculate all the permuterm combinations possible
of the string that is passed as a parameter.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def rot(str,n):
""" Function to rotate the string passed as a parameter
by n places and then return it. It is used to
calculate all the permuterm combinations possible
of the string that is passed as a parameter. """
return str[n:]+str[:n] #Returns the string after performing a left rotation of n places</code></pre>
</details>
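<p>For example, the complete permuterm set for a term is produced by appending the end marker '$' and taking every rotation (a small sketch built on <code>rot</code>):</p>
<pre><code class="python">def rot(s, n):
    """Rotate string s left by n places."""
    return s[n:] + s[:n]

def permuterms(term):
    """All rotations of term + '$'; each one keys the same postings list."""
    marked = term + "$"
    return [rot(marked, i) for i in range(len(marked))]

permuterms("hello")
# ['hello$', 'ello$h', 'llo$he', 'lo$hel', 'o$hell', '$hello']</code></pre>
<p>A wildcard query such as <code>hel*o</code> is likewise rotated until '*' is the last character (<code>o$hel*</code>) and then matched as a prefix against the permuterm index.</p>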
</dd>
<dt id="BooleanIR.uniqueWordFreq"><code class="name flex">
<span>def <span class="ident">uniqueWordFreq</span></span>(<span>doc)</span>
</code></dt>
<dd>
<div class="desc"><p>Function to find all the unique words in the document
passed as a parameter, then calculate the frequency
of each unique word in the document and return it</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def uniqueWordFreq(doc):
""" Function to find all the unique words in the document
passed as a parameter, then calculate the frequency
of each unique word in the document and return it """
uniqueWords = [] #List of the unique words found in this document
freq = {} #Dictionary mapping each unique word to its frequency in the document
for word in doc:
if word not in uniqueWords:
ps.stem(word) #Stemming via nltk's PorterStemmer (return value unused; tokens arrive pre-stemmed)
lemmatizer.lemmatize(word) #Lemmatization via nltk's WordNetLemmatizer (return value unused; tokens arrive pre-lemmatized)
uniqueWords.append(word) #Adding a word to the uniqueWords list if it did not exist in the list
for word in uniqueWords:
freq[word] = doc.count(word) #Calculating the frequency of a term in the document
return freq</code></pre>
</details>
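<p>For comparison, the same frequency map can be built in a single linear pass with the standard library's <code>collections.Counter</code>, avoiding one <code>doc.count</code> scan per unique word (a sketch, not the module's code):</p>
<pre><code class="python">from collections import Counter

tokens = ["boolean", "retrieval", "boolean", "model"]
freq = dict(Counter(tokens))
# {'boolean': 2, 'retrieval': 1, 'model': 1}</code></pre>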
</dd>
</dl>
</section>
<section>
<h2 class="section-title" id="header-classes">Classes</h2>
<dl>
<dt id="BooleanIR.LinkedList"><code class="flex name class">
<span>class <span class="ident">LinkedList</span></span>
<span>(</span><span>head=None)</span>
</code></dt>
<dd>
<div class="desc"><p>Class to store the frequency of the term in a
particular document ID only if the document
contains the word at least once</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class LinkedList:
""" Class to store the frequency of the term in a
particular document ID only if the document
contains the word at least once """
def __init__(self ,head = None): #Constructor to define a linked list
self.head = head #in which the head points to the first element of the linked list</code></pre>
</details>
</dd>
<dt id="BooleanIR.Node"><code class="flex name class">
<span>class <span class="ident">Node</span></span>
<span>(</span><span>DocID, freq=None)</span>
</code></dt>
<dd>
<div class="desc"><p>Class which defines a node of the linked list.
Each node has a document ID along with the frequency
which stores the frequency of the term in the document
with the respective document ID</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class Node:
""" Class which defines a node of the linked list.
Each node has a document ID along with the frequency
which stores the frequency of the term in the document
with the respective document ID """
def __init__(self ,DocID, freq = None): #Constructor to initialize node
self.freq = freq #with frequency of a term freq in
self.doc = DocID #a particular document having DocID as the document ID
self.nextval = None #the next pointer is initialised to None</code></pre>
</details>
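<p>A short sketch of how these nodes form a postings list; as in the driver script, a dummy head node is used and real postings start at <code>head.nextval</code>:</p>
<pre><code class="python">class Node:
    def __init__(self, DocID, freq=None):
        self.doc = DocID
        self.freq = freq
        self.nextval = None

def append_posting(head, doc_id, freq):
    """Walk to the tail and attach a new (DocID, frequency) node."""
    current = head
    while current.nextval is not None:
        current = current.nextval
    current.nextval = Node(doc_id, freq)

head = Node(0)              # dummy head; not a real posting
append_posting(head, 1, 3)  # term occurs 3 times in document 1
append_posting(head, 4, 1)  # and once in document 4

node, postings = head.nextval, []
while node is not None:
    postings.append((node.doc, node.freq))
    node = node.nextval
# postings == [(1, 3), (4, 1)]</code></pre>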
</dd>
</dl>
</section>
</article>
<nav id="sidebar">
<h1>Index</h1>
<div class="toc">
<ul></ul>
</div>
<ul id="index">
<li><h3><a href="#header-variables">Global variables</a></h3>
<ul class="">
<li><code><a title="BooleanIR.TermDocumentIncidenceMatrix" href="#BooleanIR.TermDocumentIncidenceMatrix">TermDocumentIncidenceMatrix</a></code></li>
<li><code><a title="BooleanIR.lemmatizer" href="#BooleanIR.lemmatizer">lemmatizer</a></code></li>
<li><code><a title="BooleanIR.queryWords" href="#BooleanIR.queryWords">queryWords</a></code></li>
<li><code><a title="BooleanIR.uniqueWords" href="#BooleanIR.uniqueWords">uniqueWords</a></code></li>
</ul>
</li>
<li><h3><a href="#header-functions">Functions</a></h3>
<ul class="">
<li><code><a title="BooleanIR.rot" href="#BooleanIR.rot">rot</a></code></li>
<li><code><a title="BooleanIR.uniqueWordFreq" href="#BooleanIR.uniqueWordFreq">uniqueWordFreq</a></code></li>
</ul>
</li>
<li><h3><a href="#header-classes">Classes</a></h3>
<ul>
<li>
<h4><code><a title="BooleanIR.LinkedList" href="#BooleanIR.LinkedList">LinkedList</a></code></h4>
</li>
<li>
<h4><code><a title="BooleanIR.Node" href="#BooleanIR.Node">Node</a></code></h4>
</li>
</ul>
</li>
</ul>
</nav>
</main>
<footer id="footer">
<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.10.0</a>.</p>
</footer>
</body>
</html>