Skip to content
Permalink
cfa14da4e8
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
77 lines (63 sloc) 2.33 KB
pip install tqdm
import re
from collections import defaultdict, OrderedDict
from tqdm import tqdm
from bs4 import BeautifulSoup as BS
class ForwardIndex:
def __init__(self):
pass
hitlist = defaultdict(list)
corpusFile = "output.dat"
grammerFile = 'grammer.dat'
outputFile = "output.file"
textList = []
ids = []
def parse(self):
corpus = open(self.corpusFile, 'r')
pageList = []
print('Reading the corpus...')
for line in tqdm(corpus):
pageList.append(line.strip().lower()) # comparisons are case sensitive so make all lower
soup = BS(' '.join(pageList), 'html.parser')
print('\nParsing IDs...')
for i in tqdm(soup.select('date')):
self.ids.append(i.text)
print('\nParsing Text...')
for i in tqdm(soup.findAll('publications')):
self.textList.append(i.text)
def cleargrammer(self):
gFile = open(self.grammerFile, 'r')
grammerdoc = [line.strip('\n') for line in gFile] # gets the list of the grammerFile we want to clear, strips the '\n'.
return grammerdoc
def getkeys(self, textLine):
gFile = self.cleargrammer()
line = re.sub('[^0-9a-zA-Z]+', ' ', textLine) # replaces the non-ASCII values with space
line = line.lower()
line = line.split()
line = [x for x in line if x not in gFile] # eliminate the articles, prepositions etc.
line = list(OrderedDict.fromkeys(line)) #remove duplicates.
return line
def forIndex(self):
self.parse()
for i in (range(len(self.textList))):
try:
pID = self.ids[i]
line = self.getkeys(self.textList[i])
self.hitlist[pID] = line
except:
print ("Ops, ID list or Text list busted!")
def writeFile(self):
outFile = open(self.outputFile, 'w')
for docID in (self.hitlist.keys()):
inStr = ','.join(self.hitlist[docID]) # joining the list by comma separated.
wrtStr = docID + ":" + inStr
outFile.write(wrtStr)
outFile.write('\n')
print ("Done DocID: ", docID)
outFile.close()
def main(self):
self.forIndex()
self.writeFile()
if __name__ == '__main__':
fInd = ForwardIndex()
fInd.main()