Skip to content
Permalink
cfa14da4e8
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
98 lines (83 sloc) 3.07 KB
import re
from collections import defaultdict
from array import array
from bs4 import BeautifulSoup as BS
import csv
from tqdm import tqdm
import pickle
# Module-level state shared by every HashTable method; it accumulates for the
# lifetime of the process and is never reset by the class.

# term -> list of [doc entry, positions array] postings, one per document.
hitList = defaultdict(list)
# lower-cased term -> serialized postings string; this is what gets pickled.
pickleTable = defaultdict(list)
# Parallel lists filled by HashTable.parse(): text of each <publications>,
# <lecname> and <date> element found in the corpus, in document order.
textList, titles, ids = [], [], []
class HashTable:
    """Build a positional inverted index over a tagged text corpus.

    The corpus is parsed with BeautifulSoup; the text of every
    ``<publications>`` element is tokenized, stop words are removed, and the
    resulting postings are written to a comma-separated text file and kept in
    the module-level ``pickleTable`` so they can be pickled for a search step.

    All results are accumulated in the module-level containers ``hitList``,
    ``textList``, ``titles``, ``ids`` and ``pickleTable``.
    """

    # Any run of characters that is not an ASCII letter or digit is a separator.
    _NON_ALNUM = re.compile('[^0-9a-zA-Z]+')

    def __init__(self, source: str, grammerFile: str, output: str) -> None:
        """Remember the file locations and load the stop-word list.

        Args:
            source: Path of the corpus file read by ``parse``.
            grammerFile: Path of a text file with one stop word per line.
            output: Path of the index file written by ``writeFile``.
        """
        self.filename = source
        self.csvFile = output
        # ``with`` closes the handle; it was previously left open until
        # garbage collection.
        with open(grammerFile, 'r') as gFile:
            # One stop word per line, surrounding whitespace/newline removed.
            self.grammerDoc = [line.strip() for line in gFile]

    def writeFile(self) -> None:
        """Serialize ``hitList`` to ``self.csvFile`` and fill ``pickleTable``.

        Each output line is ``term,entry:pos,pos,...;entry:pos,...``. The same
        serialized postings string is stored in ``pickleTable`` under the
        lower-cased term.
        """
        print('\n\nWriting...')
        with open(self.csvFile, 'w', encoding="UTF-8") as myFile:
            for key in tqdm(hitList.keys()):
                serialized = []
                for posting in hitList[key]:
                    docID = posting[0]
                    positions = ','.join(map(str, posting[1]))
                    serialized.append(':'.join([docID, positions]))
                inStr = ';'.join(serialized)
                myFile.write(f"{key},{inStr}\n")
                # Lower-cased key to make searching easier.
                pickleTable[key.lower()] = inStr

    def getKeys(self, textLine: str) -> list:
        """Tokenize *textLine* and drop stop words.

        Every run of characters outside ``[0-9a-zA-Z]`` is treated as a
        separator. Tokens that appear in ``self.grammerDoc`` are removed; the
        comparison is exact and case-sensitive.

        Returns:
            The remaining tokens, in their original order (duplicates kept).
        """
        # NOTE(review): parse() lower-cases the corpus but the stop-word list
        # is used as loaded, so capitalized stop words never match -- confirm
        # whether the grammar file is expected to be lower-case.
        stopWords = set(self.grammerDoc)  # O(1) membership instead of a list scan
        tokens = self._NON_ALNUM.sub(' ', textLine).split()
        return [tok for tok in tokens if tok not in stopWords]

    def parse(self) -> None:
        """Read the corpus and append element texts to the module-level lists.

        The whole file is lower-cased, joined into one string and parsed with
        ``html.parser``. Text of ``<date>`` elements goes to ``ids``,
        ``<lecname>`` to ``titles`` and ``<publications>`` to ``textList``.
        """
        pageList = []
        print('Reading the corpus...')
        with open(self.filename, 'r', encoding="UTF-8") as wiki:
            for line in tqdm(wiki):
                # Comparisons are case-sensitive, so normalize to lower case.
                pageList.append(line.strip().lower())
        soup = BS(' '.join(pageList), 'html.parser')

        print('\nParsing IDs...')
        for tag in tqdm(soup.find_all('date')):
            ids.append(tag.text)
        print('\nParsing Titles...')
        for tag in tqdm(soup.find_all('lecname')):
            titles.append(tag.text)
        print('\nParsing Text...')
        for tag in tqdm(soup.find_all('publications')):
            textList.append(tag.text)

    def createhashtable(self) -> None:
        """Parse the corpus, build the postings in ``hitList`` and write them.

        For every document text a per-document index ``term -> [entry,
        positions]`` is built, where ``positions`` is an ``array('Q')``
        (unsigned 8-byte integers). The array holds one ``0`` per occurrence
        of the term followed by the 1-based token position of each occurrence.
        """
        self.parse()
        print('\n\nIndexing...')
        for text in tqdm(textList):
            # Tokenize once; both passes below walk the same token list.
            keys = self.getKeys(text)
            invertedIndex = {}

            # NOTE(review): the posting's first field is the full document
            # text, not ids[i], and the position list is padded with one 0 per
            # occurrence. The previous fallback that used ids[i] could never
            # run because this pass creates every key first. Kept as-is so the
            # output format is unchanged -- confirm whether the id was intended.
            for key in keys:
                entry = invertedIndex.get(key)
                if entry is None:
                    invertedIndex[key] = [text, array('Q', [0])]
                else:
                    entry[1].append(0)

            # Second pass: record the 1-based position of every occurrence.
            for position, key in enumerate(keys, start=1):
                invertedIndex[key][1].append(position)

            # Merge this document's postings into the global index.
            for term, posting in invertedIndex.items():
                hitList[term].append(posting)
        self.writeFile()

    def pickling(self) -> None:
        """Pickle ``pickleTable`` so the search step can load it."""
        # NOTE(review): the file name is the literal "self.dumpedDict.pickle";
        # the "self." prefix looks accidental, but the name is kept because a
        # reader elsewhere may depend on it -- confirm.
        # SECURITY: only unpickle this file from a trusted location.
        with open("self.dumpedDict.pickle", "wb") as pickleFile:
            pickle.dump(pickleTable, pickleFile)
if __name__ == "__main__":
    # Script entry point: build the index from the corpus, write it out, then
    # pickle the lookup table for the search step.
    # NOTE(review): the text index is written to "dumpedDict.pickle" even
    # though it is not a pickle; confirm that file name is intended.
    invIndex = HashTable(
        source="output.dat",
        grammerFile="grammer.rtf",
        output="dumpedDict.pickle",
    )
    invIndex.createhashtable()
    invIndex.pickling()