Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
7071CEM-CW/forwardInv.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
77 lines (63 sloc)
2.33 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Requires third-party packages: pip install tqdm beautifulsoup4
import re | |
from collections import defaultdict, OrderedDict | |
from tqdm import tqdm | |
from bs4 import BeautifulSoup as BS | |
class ForwardIndex:
    """Build a forward index (document ID -> unique keywords) from a corpus.

    The corpus file (``corpusFile``) is parsed as HTML. The text of every
    ``date`` element is used as a document ID and the text of every
    ``publications`` element as a document body; IDs and bodies are paired
    by position. Words listed in ``grammerFile`` (one per line) are dropped
    from each body, and the result is written to ``outputFile`` as one
    ``docID:word1,word2,...`` line per document.
    """

    # File locations; override on the class or on an instance before running.
    corpusFile = "output.dat"
    grammerFile = 'grammer.dat'
    outputFile = "output.file"

    # Matches every run of characters that are not ASCII letters or digits.
    _NON_ALNUM = re.compile(r'[^0-9a-zA-Z]+')

    # (path, frozenset of stop words) for the grammar file loaded most
    # recently, so getkeys() does not re-read the file for every document.
    _stopwordCache = None

    def __init__(self):
        # Mutable state lives on the instance. As class attributes these
        # containers would be shared by every ForwardIndex object.
        self.hitlist = defaultdict(list)
        self.textList = []
        self.ids = []

    def parse(self):
        """Read the corpus and fill ``self.ids`` and ``self.textList``.

        Both lists are rebuilt from scratch on every call, so calling
        ``parse`` twice does not duplicate entries.

        Raises:
            OSError: if ``corpusFile`` cannot be opened.
        """
        with open(self.corpusFile, 'r') as corpus:
            print('Reading the corpus...')
            # Comparisons are case sensitive, so lower-case everything.
            pageList = [line.strip().lower() for line in tqdm(corpus)]
        soup = BS(' '.join(pageList), 'html.parser')
        print('\nParsing IDs...')
        self.ids = [tag.text for tag in tqdm(soup.select('date'))]
        print('\nParsing Text...')
        self.textList = [
            tag.text for tag in tqdm(soup.find_all('publications'))
        ]

    def cleargrammer(self):
        """Return the lines of ``grammerFile`` as a list, newlines removed.

        Raises:
            OSError: if ``grammerFile`` cannot be opened.
        """
        with open(self.grammerFile, 'r') as gFile:
            return [line.strip('\n') for line in gFile]

    def _stopwords(self):
        """Return the grammar-file words as a set, loading them only once."""
        cache = self._stopwordCache
        # Reload when nothing is cached yet or grammerFile was re-pointed.
        if cache is None or cache[0] != self.grammerFile:
            cache = (self.grammerFile, frozenset(self.cleargrammer()))
            self._stopwordCache = cache
        return cache[1]

    def getkeys(self, textLine):
        """Return the unique keywords of ``textLine`` in first-seen order.

        Every run of non-alphanumeric characters is treated as a separator,
        the text is lower-cased, words present in the grammar file are
        removed, and duplicates are dropped.

        Args:
            textLine: the raw text of one document.

        Returns:
            A list of lower-case keyword strings (empty for empty input).
        """
        stopwords = self._stopwords()
        words = self._NON_ALNUM.sub(' ', textLine).lower().split()
        kept = [word for word in words if word not in stopwords]
        # OrderedDict.fromkeys removes duplicates while keeping order.
        return list(OrderedDict.fromkeys(kept))

    def forIndex(self):
        """Parse the corpus and fill ``self.hitlist`` (ID -> keyword list).

        IDs and texts are paired by position. If their counts differ, a
        warning is printed and only the common prefix is indexed. When the
        same ID occurs more than once, the later document replaces the
        earlier one.
        """
        self.parse()
        if len(self.ids) != len(self.textList):
            print("Ops, ID list or Text list busted!",
                  "(%d IDs, %d texts)" % (len(self.ids), len(self.textList)))
        for pID, text in zip(self.ids, self.textList):
            self.hitlist[pID] = self.getkeys(text)

    def writeFile(self):
        """Write ``self.hitlist`` to ``outputFile``, one document per line.

        Each line has the form ``docID:word1,word2,...``.

        Raises:
            OSError: if ``outputFile`` cannot be opened for writing.
        """
        with open(self.outputFile, 'w') as outFile:
            for docID, words in self.hitlist.items():
                outFile.write(docID + ":" + ','.join(words) + '\n')
                print("Done DocID: ", docID)

    def main(self):
        """Build the forward index and write it to ``outputFile``."""
        self.forIndex()
        self.writeFile()
if __name__ == '__main__':
    # Script entry point: build the forward index and write it to disk.
    ForwardIndex().main()