import os
from bs4 import BeautifulSoup, SoupStrainer
from collections import defaultdict
import loadbard
import ctoken
import sys
from psutil import virtual_memory
import time
import orjson
import json
import math
import utils
import warnings
import page_duplicate_util
import hashlib
from nltk.corpus import stopwords
import validators
import csimhash
import cpagerank
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
# INSTANTIATION
Bard = loadbard.LoadBard()
# FILE NAMES
DATA_DIR = "DATA\\"
# DATA_DIR = ""
INDEX_DIR = "INDEX\\"
INVERT_DIR = "INVERT\\"
# INDEX
termID_map_filename = f"{INDEX_DIR}termID.map"
docID_store_file_filename = f"{INDEX_DIR}docid.map"
supplemental_info_filename = f"{INDEX_DIR}bold-links.map"
docID_hash_filename = f"{INDEX_DIR}docID_hash.map"
invert_docID_filename = f"{INDEX_DIR}inverted_docID.map"
# INVERT
token_seek_map_filename = f"{INVERT_DIR}token_seek.map"
corpus_token_frequency_filename = f"{INVERT_DIR}corpus_token_freq.map"
inverted_bolds_filename = f"{INVERT_DIR}inverted_bolds.map"
pagerank_filename = f"{INVERT_DIR}pagerank.map"
# CONSTANTS
FILE_COUNT = utils.count_docs(DATA_DIR)
# INDEX_BUFFER = (utils.get_size_directory(DATA_DIR) / FILE_COUNT) * 55 # size/1000 BEST FOR FULL DATA SET
INDEX_BUFFER = utils.get_size_directory(DATA_DIR) / 150
FINDEX_MAX_LINES = 100000 # lines updated from 50,000 to 100,000
FINDEX_BUFFER = 10000  # limited to 10,000 because the read buffer has to be re-sorted (O(n log n)) on every pass
# RANKING
LINKS_KEY = "links"
BOLDS_KEY = "bolds"
POSITION_KEY = "position"
TFIDF_KEY = "tfidf"
# SEARCH OPTIONS
TEMP_WEIGHT = 1
BOLDS_WEIGHTING = True
POSITIONAL_WEIGHTING = False
PAGERANK_WEIGHTING = True
MD5_DUPLICATE_CHECK = True
USE_SIMHASH = False
BOOLEAN_AND = False
# NOTE: if SIMHASH is enabled, it will replace the MD5 hash check.
LOG = True
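# PIPELINE OVERVIEW (summary of the stages implemented below):
#   1. make_fragment_index(): walk DATA_DIR, tokenize every crawled page, and dump partial "bucket"
#      .index files plus the docID/termID maps and supplemental bold/link data into INDEX_DIR.
#   2. merge_frag_index(): merge the buckets into .findex files in INVERT_DIR and record a
#      token -> (file, seek position) map so postings can be read lazily at query time.
#   3. search_multiple(): load the maps, then score queries with tf-idf plus optional bold-text and
#      PageRank boosts, controlled by the SEARCH OPTIONS flags above.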
def write_bucket(inverted_index_bucket_ref, index_dir, docID):
    # Dump the in-memory bucket to a partial index file named after the docID reached so far.
    # Each line is "token~{docID: [positions]}", with tokens written in sorted order.
    with open(f"{index_dir}{docID}.index", "w+") as f:
        for token in sorted(inverted_index_bucket_ref.keys()):
            f.write(f"{token}~{dict(inverted_index_bucket_ref[token])}\n")
f.close()
def normalization_term(weighted_vector):
    # sqrt(sum(term_score ** 2)); the term scores together form the "weighted vector"
return math.sqrt(sum([term_score * term_score for term_score in weighted_vector]))
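# Worked example: normalization_term([3.0, 4.0]) == 5.0, so dividing each component by it gives the
# unit vector [0.6, 0.8]; search_multiple() uses this to turn tf-idf weights into cosine-style scores.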
def make_fragment_index():
"""
    Creates a fragmented index in the form of several 'bucket' files, a JSON map for termIDs, and a JSON map for docIDs.
    :return:
    Creates docid.map, a JSON file containing a list of URLs where the list index is the docID.
    Creates termID.map, a JSON file containing a list of terms where the list index is the termID.
    Creates several 'bucket' .index files, where the filename is the docID reached when the bucket was written out.
    Each bucket file contains alphabetically sorted rows of "term~{docID: [positions]}".
    These are partial datasets that must be merged in the next step.
"""
if Bard:
print("================================================")
print(f"LIBRARY SIZE: {utils.get_size_directory(DATA_DIR) / 1000000}mb")
print(f"LIBRARY SIZE: {FILE_COUNT} files")
print(f"INDEX BUFFER SIZE: {INDEX_BUFFER / 1000000}mb")
print(f"BUFFER FRACTION: {INDEX_BUFFER / utils.get_size_directory(DATA_DIR)}")
print("================================================")
print("<--------------MAKING FRAGMENTED INDEX-----------------> 1/2")
document_store = list()
inverted_index_bucket = defaultdict(lambda: defaultdict(list)) # {token:{docID:{positions:}}}
supplemental_info = defaultdict(lambda: defaultdict(list)) # {docID:{links:[], bolds:[]}}
docID_hashes = defaultdict(int)
tokenIDs = set()
docID = 0
doc_count = utils.count_docs(DATA_DIR)
# BARD IN!
if Bard:
Bard.start("WALKING DATA")
for subdir, dirs, files in os.walk(DATA_DIR):
for file in files:
if ".json" in file:
# If bucket is at-size, dump bucket
if sys.getsizeof(inverted_index_bucket) > INDEX_BUFFER:
print("\n<---------WRITING BUCKET--------->")
write_bucket(inverted_index_bucket, INDEX_DIR, docID)
inverted_index_bucket = defaultdict(lambda: defaultdict(list)) # {token:{docID:{positions:}}}
# Open url-database file
with open(os.path.join(subdir, file), "r") as f:
jsonfile = json.load(f)
f.close()
document_store.append(jsonfile["url"])
soup = BeautifulSoup(jsonfile["content"], features="lxml")
tokens = ctoken.tokenize(soup.text)
# GENERATE TOKEN HASH
if not USE_SIMHASH:
docID_hashes[docID] = hashlib.md5(soup.text.encode('utf-8')).hexdigest()
# EXTRACT BOLDS
for bold in soup.find_all(["b", "strong", "h1", "h2", "h3", "title"]):
[supplemental_info[docID][BOLDS_KEY].append(word) for word in ctoken.tokenize(bold.text)]
# EXTRACT LINKS
# [(link, anchor_text)]
links = list()
for link in BeautifulSoup(jsonfile["content"], features="lxml", parse_only=SoupStrainer("a")).find_all("a"):
try:
                        if link.has_attr("href") and validators.url(link["href"]):  # has_attr() checks the HTML attribute; hasattr() does not
links.append((link["href"], ctoken.tokenize(link.text)))
except KeyError:
pass
supplemental_info[docID][LINKS_KEY] = [link[0] for link in links]
# TOKENIZE!
position = 0
# INDEX ANCHOR TEXT
for link in links:
if link[1]:
for token in link[1]:
inverted_index_bucket[token][str(docID)].append(position)
position += 1
tokenIDs.add(token)
# add rest of tokens
for token in tokens:
inverted_index_bucket[token][str(docID)].append(position)
position += 1
tokenIDs.add(token)
# BARD BARDING!
percent_progress = "{:.5f}".format(docID / doc_count)
percent_progress_human = "{:.5f}".format((docID / doc_count) * 100)
if Bard: # and percent_progress % 0.000001 == 0:
log = f"{percent_progress_human}% bucket size:{sys.getsizeof(inverted_index_bucket) / 1000000}mb " \
f"{subdir}/{file} tokens:{len(tokens)} bolds:{len(supplemental_info[docID][BOLDS_KEY])} " \
f"links:{len(links)}"
Bard.update(log, replace=True)
# INCREMENTING DOCID COUNTER
docID += 1
print("<---------WRITING FINAL BUCKET--------->")
write_bucket(inverted_index_bucket, INDEX_DIR, docID)
inverted_index_bucket.clear()
print("<---WRITING DOCUMENT STORE------->")
with open(docID_store_file_filename, "w") as f:
json.dump(document_store, f)
f.close()
print("<-----WRITING TERM ID STORE------>")
sorted_tokenIDs = sorted(list(tokenIDs))
with open(termID_map_filename, "w") as f:
json.dump(sorted_tokenIDs, f)
f.close()
print("<-----WRITING SUPPLEMENTAL LINK/BOLD STORE------>")
with open(supplemental_info_filename, "w") as f:
json.dump(supplemental_info, f)
f.close()
print("<-----WRITING HASH STORE------>")
with open(docID_hash_filename, "w") as f:
json.dump(docID_hashes, f)
f.close()
print("<-----WRITING INVERTED DOCID MAP------>")
with open(docID_store_file_filename, "r") as f:
inverted_docID_map = page_duplicate_util.gen_inverted_docID_map(json.load(f))
with open(invert_docID_filename, "w") as f:
json.dump(inverted_docID_map, f)
print("<-----WRITING INVERTED BOLDS MAP------>")
utils.make_invert_bolds_term_docID(supplemental_info_filename, inverted_bolds_filename)
print("<-----WRITING PAGERANK STORE------>")
utils.make_pagerank_lib(supplemental_info_filename, docID_store_file_filename, pagerank_filename)
# BARD OUT!
if Bard:
Bard.end()
def merge_frag_index():
"""
    Opens all fragment (.index) files and reads a batch of lines from each into a read buffer. The 'minimum'
    term/token currently in the read buffer (the one with the lowest index in the termID map) is written out first.
    :return:
"""
print(f"FINDEX MAX LINES: {FINDEX_MAX_LINES}")
print(f"FINDEX BUFFER SIZE: {FINDEX_BUFFER/1000000}mb")
print("<--------------MERGING FRAGMENTED INDEX-----------------> 2/2")
# load sorted tokenIDs
with open(termID_map_filename, "r") as f:
sorted_tokenIDs = json.load(f)
if sorted_tokenIDs != sorted(sorted_tokenIDs):
raise IndexError
print("termID_map_tokens: ",len(sorted_tokenIDs))
# BARD IN!
if Bard:
Bard.start("MERGING FRAGMENT DATA")
# open all fragment files for reading
file_objects = list()
for subdir, dirs, files in os.walk(INDEX_DIR):
for file in files:
if ".index" in file:
file_objects.append(open(f"{INDEX_DIR}{file}", "r"))
# IO Token Map and Final Index
token_map_file = open(token_seek_map_filename, "w")
# open outfile for final database and the map file
outfile_ID = 0
outfile_name = f"{INVERT_DIR}{outfile_ID}.findex"
outfile = open(outfile_name, "w")
outfile_lines = 0
# establish read buffer, which contains terms that have been read but not been written to final index
read_buffer = defaultdict(lambda: defaultdict(list)) # {token:{docID:[positions]}}
# establish token map, which contains the seek index in the final output file
token_map = defaultdict(lambda: defaultdict(list)) # {token:{file:file_seek_position}}
tokenID = 0
# corpus token frequency metric
corpus_token_freq = defaultdict(int) # {token:count}
while tokenID < len(sorted_tokenIDs):
# check if outfile is sufficient line size to merit new outfile
if outfile_lines > FINDEX_MAX_LINES:
outfile.close()
print("\n<---------NEW OUTFILE--------->")
# start new outfile
outfile_ID += 1
outfile_name = f"{INVERT_DIR}{outfile_ID}.findex"
outfile = open(outfile_name, "w")
outfile_lines = 0
        # iterate through all open fragment files and add their next batch of lines to the read buffer
        # NOTE: iterate over a copy of file_objects; removing an exhausted file while looping over the
        # list directly would silently skip the file that follows it.
        for file in list(file_objects):
            for i in range(0, FINDEX_BUFFER):
                line = file.readline().split("~")
                if len(line) != 2:  # expected [term, positional_data]; anything else means the file is exhausted
                    file_objects.remove(file)
                    file.close()
                    break
token = line[0]
dictionary = line[1]
docIDs_to_positions = json.loads(dictionary.replace("'", "\"")) # required for embedded json
for docID, positions in docIDs_to_positions.items():
for position in positions:
read_buffer[token][docID].append(position)
corpus_token_freq[token] += 1
Bard.update(f"readbuffer size:{sys.getsizeof(read_buffer) / 1000000}mb\t{file.name}", replace=True)
sorted_readbuffer_ids = sorted(list(map(lambda x: sorted_tokenIDs.index(x), read_buffer.keys())))
while len(read_buffer.keys()) > 0:
if outfile_lines > FINDEX_MAX_LINES:
outfile.close()
print("\n<---------NEW OUTFILE--------->")
# start new outfile
outfile_ID += 1
outfile_name = f"{INVERT_DIR}{outfile_ID}.findex"
outfile = open(outfile_name, "w")
outfile_lines = 0
# if sorted_tokenIDs[tokenID] not in read_buffer.keys():
# continue
# find lowest token index of tokens currently in the read buffer
# get actual token from termID/tokenID
min_tokenID = sorted_readbuffer_ids.pop(0)
this_token = sorted_tokenIDs[min_tokenID]
# gets current cursor position in output file, then writes from read buffer.
# Notes this outfile position in token map # {token:{file:file_seek_position}}
outfile_position = outfile.tell()
token_map[this_token][outfile_name].append(outfile_position)
outfile.write(f"{dict(read_buffer.pop(this_token))}\n")
# increment outfile line count
outfile_lines += 1
tokenID += 1
# BARD BARDING!
percent_progress = float(tokenID / len(sorted_tokenIDs))
if Bard and round(percent_progress, 3) % 0.001 == 0:
human = "{:.5f}".format(percent_progress * 100)
Bard.update(f"{human}%"
f"\t{outfile_name}\tRead buffer:{len(read_buffer)}\t"
f"{sys.getsizeof(read_buffer) / 1000000}mb\t{outfile_lines}lines\t{this_token}"
f"\t{sorted(read_buffer.keys())[0:5]}",
replace=True)
# dump token map which contains {token:{file:file_seek_position}}
print("<-----WRITING TOKEN SEEK MAP------>")
json.dump(token_map, token_map_file)
print("<-----WRITING ------>")
with open(corpus_token_frequency_filename, "w") as f:
json.dump(corpus_token_freq, f)
f.close()
# close all files
for file in file_objects:
file.close()
outfile.close()
token_map_file.close()
# BARD OUT!
if Bard:
Bard.end()
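# Scoring sketch used by search_multiple() (hypothetical numbers):
#   query term weight    = (1 + ln(tf_query)) * ln(N / df)   e.g. tf=1, N=1000, df=10 -> 1 * ln(100) ~= 4.61
#   document term weight = 1 + ln(tf_doc)                    e.g. tf=3               -> 1 + ln(3)   ~= 2.10
# Both vectors are length-normalized with normalization_term(), and the final score is their dot
# product (cosine similarity), optionally boosted by bold-text hits and PageRank.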
def search_multiple(search_queries_ref, token_map_ref, docid_store_ref,
findex_dict, duplicate_docIDS, bold_links_map, sorted_pagerank_ref, inverted_bolds):
ret_results = defaultdict(tuple)
for query_phrase in search_queries_ref:
time_search_start = time.perf_counter()
        term_docID_positions = defaultdict(lambda: defaultdict(list))  # {term: {docID: [positions]}}
union = set() # [docID]
# QUERY
tokenized_query = ctoken.tokenize(query_phrase, trim_stopwords=False) # [query_term]
unique_query_term_freqs = defaultdict(int) # {term:freqs}
# QUERY TFIDF
query_weighted_tfidf_dict = defaultdict(float) # {term:float}
query_normalized_weighted_tfidf_dict = defaultdict(float) # {term:float}
term_idf = defaultdict(float) # {term:weighted_idf}
        # DOCUMENT TFIDF
document_normalized_weighted_tf_dict = defaultdict(lambda: defaultdict(float)) # {docID:{term:float}}
final_doc_scores = defaultdict(float) # {docID:float}}
all_stopwords = False
# FREQ'ING QUERY
        # queries longer than three tokens have English stop words removed
if len(tokenized_query) > 3:
tokenized_query = [token for token in tokenized_query if token not in set(stopwords.words('english'))]
unique_keys = set(tokenized_query)
for toke in tokenized_query:
unique_query_term_freqs[toke] += 1
# ACCESSING DOCID's AND POSITIONS
# accounts for posting information stored in multiple files
for toke in unique_keys:
# posting list(s)
try:
for file, seek in token_map_ref[toke].items():
f = findex_dict[file]
for seek_pos in seek:
f.seek(seek_pos) # sends cursor to position
posting = orjson.loads(f.readline().replace("'", "\"")) # {docID:[seek_positions]}
for docID in posting.keys():
# trim off duplicate pages
if docID not in duplicate_docIDS and docID not in term_docID_positions[toke]:
[term_docID_positions[toke][docID].append(pos) for pos in posting[docID]]
union.add(docID)
except KeyError:
# removing terms that do not exist in corpus
del unique_query_term_freqs[toke]
continue
file_time2 = time.perf_counter()
# print("filetime:", (file_time2-file_time1)*1000)
# QUERY TFIDF VECTOR
for term in unique_query_term_freqs.keys():
# weighted term frequency
weighted_tf = 1 + math.log(unique_query_term_freqs[term])
# weighted number of documents / number of documents containing term
weighted_idf = math.log(len(docid_store_ref) / len(term_docID_positions[term]))
query_tfidf = weighted_tf * weighted_idf
query_weighted_tfidf_dict[term] = query_tfidf
term_idf[term] = weighted_idf
query_n_term = normalization_term(query_weighted_tfidf_dict.values()) # sqrt(sum(query_vector))
for term in query_weighted_tfidf_dict.keys():
query_normalized_weighted_tfidf_dict[term] = query_weighted_tfidf_dict[term] / query_n_term
# BOLDS EXTRACTION
bolds_docID = set()
for term in [query for query in query_weighted_tfidf_dict.keys() if query not in set(stopwords.words('english'))]:
            try:
                bolds_docID.update(inverted_bolds[term])
            except (IndexError, KeyError):  # "except IndexError and KeyError" would only catch KeyError; use a tuple
                pass
# DOCUMENT PRUNING--------------------------------------------------
# boolean AND documents
        # boolean AND + bolds
# boolean AND + bolds + iter(union each set of docIDs for term in reverse idf order)
# still none? Nuclear option -> union with all docs containing term
datasets = []
MINIMUM_ITERS = 1000
MAXIMUM_ITERS = 2000
try:
searchdocs = set.intersection(
*[set(term_docID_positions[term].keys()) for term in term_docID_positions.keys()])
datasets.append(f"boolean AND{len(searchdocs)}")
except TypeError:
searchdocs = set()
pass
predicted_iterations = len(searchdocs) * len(query_normalized_weighted_tfidf_dict.keys())
while predicted_iterations < MINIMUM_ITERS:
if all_stopwords:
break
counter = 0
for term, idf in sorted(term_idf.items(), key=lambda x: x[1], reverse=True):
searchdocs = set.union(*[searchdocs, [docID for docID in term_docID_positions[term]]])
datasets.append(f"union-termdocs{counter}-{term}-{len(searchdocs)}")
counter += 1
predicted_iterations = len(searchdocs)*len(query_normalized_weighted_tfidf_dict.keys())
if predicted_iterations > MINIMUM_ITERS:
break
if predicted_iterations > MINIMUM_ITERS:
break
searchdocs = set.union(*[searchdocs, bolds_docID])
datasets.append(f"union-bolds{len(searchdocs)}")
predicted_iterations = len(searchdocs) * len(query_normalized_weighted_tfidf_dict.keys())
if predicted_iterations > MINIMUM_ITERS:
break
searchdocs = set.union(*[searchdocs, union])
datasets.append(f"union-union{len(searchdocs)}")
predicted_iterations = len(searchdocs) * len(query_normalized_weighted_tfidf_dict.keys())
break
while predicted_iterations > MAXIMUM_ITERS:
            # too many candidate documents; filter the result set down
searchdocs = set.intersection(*[searchdocs, bolds_docID])
datasets.append(f"intersect-bolds{len(searchdocs)}")
predicted_iterations = len(searchdocs) * len(query_normalized_weighted_tfidf_dict.keys())
if predicted_iterations < MAXIMUM_ITERS:
break
searchdocs = set.intersection(*[searchdocs, union])
datasets.append(f"intersect-union{len(searchdocs)}")
break
datasets.append(f"iterations:{len(searchdocs) * len(query_normalized_weighted_tfidf_dict.keys())}")
# DOCUMENT TF-IDF SCORING ---------------------------------
# COMPUTE DOCUMENTS TF'S
# term frequency in document, list of score values() for all terms
for docID in searchdocs:
for term in unique_query_term_freqs.keys():
# weighted document tf == 1+log(term frequency in document)
# document_normalized_weighted_tf_dict[docID].values() == "document vector" or term_scores
if len(term_docID_positions[term][docID]) == 0:
document_normalized_weighted_tf_dict[docID][term] = \
1 + math.log(1)
else:
document_normalized_weighted_tf_dict[docID][term] =\
1 + math.log(len(term_docID_positions[term][docID]))
            # NORMALIZE DOCUMENT TF
            # compute the normalization term once from the complete document vector; recomputing it
            # per-term inside the loop would mix already-normalized and raw components.
            n_term = normalization_term(document_normalized_weighted_tf_dict[docID].values())
            for term in unique_query_term_freqs.keys():
                document_normalized_weighted_tf_dict[docID][term] = \
                    document_normalized_weighted_tf_dict[docID][term] / n_term
# COMPUTE FINAL TERM-DOCUMENT RELEVANCE SCORE-----------------------
# sum(for all terms: query_term_score*document_normalized_weighted_tf_dict)
non_tfidf_weighting_factor = 1
for docID in document_normalized_weighted_tf_dict.keys():
sum_list = list()
for term in document_normalized_weighted_tf_dict[docID].keys():
doc_score = query_normalized_weighted_tfidf_dict[term] * document_normalized_weighted_tf_dict[docID][term]
                if doc_score == 0:  # it's an irrelevant document
continue
if len(document_normalized_weighted_tf_dict[docID].keys()) == 1: # the query was a single word
non_tfidf_weighting_factor = 10
if BOLDS_WEIGHTING and docID in bolds_docID:
doc_score += 0.0001 * non_tfidf_weighting_factor
# print("we struck BOLD!")
if PAGERANK_WEIGHTING:
try:
page_rank = sorted_pagerank_ref[int(docID)][1]
if page_rank > 0.001:
doc_score += page_rank*non_tfidf_weighting_factor
except IndexError:
pass
sum_list.append(doc_score)
final_doc_scores[docid_store_ref[int(docID)]] = sum(sum_list)
# SORT AND SLICE RESULTS ----------------------------
search_results = sorted(final_doc_scores.items(), key=lambda x: x[1], reverse=True)[0:SEARCH_RESULTS]
time_taken = time.perf_counter() - time_search_start
# print(tokenized_query, "results", "\n", len(final_doc_scores), "datasets", datasets)
if union:
ret_results[query_phrase] = (search_results, time_taken)
else:
ret_results[query_phrase] = ([], time_taken)
return ret_results
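# search_multiple() returns {query_phrase: ([(url, score), ...], seconds_taken)}, which the
# __main__ block below unpacks for printing.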
if __name__ == '__main__':
frag_index_exists = True
all_index_exists = True
DEBUG = False
SEARCH_RESULTS = 10
# Build Indexes
if not frag_index_exists:
make_fragment_index()
frag_index_exists = True
if not all_index_exists:
merge_frag_index()
all_index_exists = True
print("loading files into memory.....")
# open FINDEX files
findex_file_objects = dict()
for subdir, dirs, files in os.walk(INVERT_DIR):
for file in files:
if ".findex" in file:
f = open(f"{INVERT_DIR}{file}", "r")
findex_file_objects[f.name] = f
# add maps back to memory
with open(token_seek_map_filename, "r") as f:
from_file_token_map = json.load(f)
with open(docID_store_file_filename, "r") as f:
docID_store = json.load(f)
with open(docID_hash_filename, "r") as f:
docID_hash_store = json.load(f)
with open(supplemental_info_filename, "r") as f:
bolds_links_store = json.load(f)
with open(corpus_token_frequency_filename, "r") as f:
corpus_token_frequency = json.load(f)
with open(invert_docID_filename, "r") as f:
invert_docID_map = json.load(f)
with open(inverted_bolds_filename, "r") as f:
bolds_terms_docIDs = json.load(f)
with open(pagerank_filename, "r") as f:
sorted_pagerank = json.load(f)
duplicate_docIDs = page_duplicate_util.find_duplicates(docID_hash_store)
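    # duplicate_docIDs: docIDs whose page hash matches an earlier document; search_multiple() skips
    # any docID in this set when collecting postings, so exact-duplicate pages are not scored twice.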
# DEBUG ONLY
if DEBUG:
print("================================================")
PRINT_TERMS = 10
        # use a distinct local name so the imported nltk `stopwords` module is not shadowed
        # (search_multiple() still calls stopwords.words() afterwards)
        stopword_tokens = [ctoken.tokenize(i)[0] for i in stopwords.words('english')]
        sorted_corpus = [(term, count)
                         for term, count
                         in sorted(corpus_token_frequency.items(), key=lambda x: x[1], reverse=True)
                         if term not in stopword_tokens]
# avg_freq = sum(list(corpus_token_frequency.values()))/len(corpus_token_frequency)
# foo = int(avg_freq)-PRINT_TERMS
# bar = int(avg_freq)+PRINT_TERMS
print("documents in docID store: ", len(docID_store))
print("documents in hash store: ", len(docID_hash_store))
print("duplicates: ", len(duplicate_docIDs))
print("tokens: ", len(from_file_token_map))
print(f"avg", len(sorted_corpus))
avg = int(len(sorted_corpus)/2)
print(f"median count {PRINT_TERMS} terms: {sorted_corpus[avg:PRINT_TERMS]}")
print(f"top count {PRINT_TERMS} terms: {sorted_corpus[0:PRINT_TERMS]}")
print(f"bottom count {PRINT_TERMS} terms: {list(reversed(sorted_corpus))[0:PRINT_TERMS]}")
top_pageranks = sorted_pagerank[0:PRINT_TERMS]
bottom_pageranks = list(reversed(sorted_pagerank))[0:PRINT_TERMS]
print(f"top pagerank {PRINT_TERMS} terms: {top_pageranks}")
print(f"bottom pagerank {PRINT_TERMS} terms: {bottom_pageranks}")
print("================================================")
print()
# Search
search_queries = ["ai club", "master of software engineering", "MOSFET", "Dingo ate me baby", "support document",
"browser", "the university of california irvine ai club workshop", "sourcer","lawks", "lawler",
"breast cancer wisconsin", "computer science", "informatics", "rwxrwxrwx", "laveman",
"language for distributed embedded systems", "a",
"krisberg org", "kovarik@mcmail.cis.mcmaster.ca", "cbcl", ]
search_queries = ["language for distributed embedded systems",]
results = search_multiple(search_queries, from_file_token_map, docID_store,
findex_file_objects, duplicate_docIDs, bolds_links_store,
sorted_pagerank, bolds_terms_docIDs)
for query in results.keys():
result = results[query][0]
time_taken = results[query][1]
print(f"Query '{query}' took {time_taken*1000} milliseconds.")
        for index, (url, score) in enumerate(result):
            print(f"{index + 1}. {url}")
print()