Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
7071CEM-CW/revIndex.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
98 lines (83 sloc)
3.07 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from collections import defaultdict | |
from array import array | |
from bs4 import BeautifulSoup as BS | |
import csv | |
from tqdm import tqdm | |
import pickle | |
# Module-level state shared by the HashTable methods below.
hitList = defaultdict(list)      # term -> list of [docID, positions] postings, across all docs
textList, titles, ids = [], [], []   # parallel per-document lists filled by HashTable.parse()
pickleTable = defaultdict(list)  # lower-cased term -> encoded postings string, for pickling()
class HashTable:
    """Build an inverted index ("hit list") from a tagged corpus file.

    The corpus is parsed with BeautifulSoup: <date> tags supply document
    IDs, <lecname> tags supply titles, and <publications> tags supply the
    body text.  The index is written to a text file (one term per line)
    and mirrored into the module-level ``pickleTable`` so that
    :meth:`pickling` can serialize it for the search phase.
    """

    def __init__(self, source, grammerFile, output):
        """
        source      -- path of the tagged corpus file to index.
        grammerFile -- path of a newline-separated stop-word list
                       (articles, prepositions, etc. to discard).
        output      -- path of the text index file written by writeFile().
        """
        self.filename = source
        self.csvFile = output
        # One stop word per line; strip() removes the trailing '\n'.
        # Fix: use a context manager so the handle is closed (was leaked).
        with open(grammerFile, 'r') as gFile:
            self.grammerDoc = [line.strip() for line in gFile]

    def writeFile(self):
        """Write the accumulated hitList to self.csvFile.

        Each output line has the form
            term,docID:pos1,pos2,...;docID:pos1,...
        Every entry is also mirrored into the module-level pickleTable
        under the lower-cased term (lower-cased to make searching easier).
        """
        print('\n\nWriting...')
        # Fix: context manager closes the file even on error (was leaked).
        with open(self.csvFile, 'w', encoding="UTF-8") as myFile:
            for key in tqdm(hitList.keys()):  # python 3.x (.iterkeys() was 2.7)
                temp = []
                for docID, occurrence in hitList[key]:
                    temp.append(':'.join([docID, ','.join(map(str, occurrence))]))
                inStr = ';'.join(temp)
                myFile.write(key + "," + inStr + "\n")
                pickleTable[key.lower()] = inStr

    def getKeys(self, textLine):
        """Tokenize textLine and return the list of index-worthy tokens.

        Non-alphanumeric characters are collapsed to spaces, the line is
        split on whitespace, and stop words from the grammar file are
        dropped (comparison is case-sensitive, matching the lower-cased
        corpus produced by parse()).
        """
        line = re.sub('[^0-9a-zA-Z]+', ' ', textLine)  # non-ASCII/punct -> space
        return [tok for tok in line.split() if tok not in self.grammerDoc]

    def parse(self):
        """Read the corpus and fill the module-level ids/titles/textList.

        The whole file is lower-cased first because later comparisons are
        case-sensitive.  <date>, <lecname> and <publications> tags are
        assumed to appear once per document, in parallel order — TODO
        confirm against the corpus format.
        """
        pageList = []
        print('Reading the corpus...')
        # Fix: context manager closes the corpus file (was leaked).
        with open(self.filename, 'r', encoding="UTF-8") as wiki:
            for line in tqdm(wiki):
                pageList.append(line.strip().lower())
        soup = BS(' '.join(pageList), 'html.parser')
        print('\nParsing IDs...')
        for tag in tqdm(soup.find_all('date')):
            ids.append(tag.text)
        print('\nParsing Titles...')
        for tag in tqdm(soup.find_all('lecname')):
            titles.append(tag.text)
        print('\nParsing Text...')
        for tag in tqdm(soup.find_all('publications')):
            textList.append(tag.text)

    def createhashtable(self):
        """Build the inverted index and write it out.

        For each document, every term maps to [docID, positions] where
        positions are the 1-based token offsets of the term within the
        document; per-document entries are then merged into the global
        hitList.

        Fix: the original ran a first pass that stored the entire document
        text (textList[i]) as the docID plus a spurious 0 position; the
        docID is ids[i], as the original's own fallback branch showed.
        Fix: membership test replaces the bare ``except:`` that swallowed
        every exception as per-token control flow.
        """
        self.parse()
        print('\n\nIndexing...')
        for i in tqdm(range(len(textList))):
            invertedIndex = {}
            keys = self.getKeys(textList[i])
            for position, key in enumerate(keys, start=1):
                if key in invertedIndex:
                    invertedIndex[key][1].append(position)
                else:
                    # 'Q' = unsigned long long (8 bytes), not 4 as the old
                    # comment claimed; holds 1-based token positions.
                    invertedIndex[key] = [ids[i], array('Q', [position])]
            for curPage, invPag in invertedIndex.items():  # python 3.x (.iteritems() was 2.7)
                hitList[curPage].append(invPag)  # merge this document into the global index
        self.writeFile()

    def pickling(self, pickleFileName="self.dumpedDict.pickle"):
        """Serialize pickleTable so the search phase can load it.

        NOTE(review): the original opened the literal filename
        "self.dumpedDict.pickle" — almost certainly an accidental string
        literal (presumably self.csvFile was intended).  The literal is
        kept as the default for backward compatibility, but the path is
        now overridable via ``pickleFileName``.
        """
        # Fix: context manager closes the pickle file (was leaked on error).
        with open(pickleFileName, "wb") as pickleFile:
            pickle.dump(pickleTable, pickleFile)
if __name__ == "__main__": | |
invIndex = HashTable("output.dat", "grammer.rtf", "dumpedDict.pickle") | |
invIndex.createhashtable() | |
invIndex.pickling() |