Skip to content
Permalink
Browse files
Initial upload
  • Loading branch information
hawesa2 committed Nov 7, 2022
1 parent ff4ecc8 commit ca5806fdd5610015216c9149a9f9f42d9bd0855f
Showing 1 changed file with 34 additions and 0 deletions.
@@ -0,0 +1,34 @@
import urllib.request
import re
import database_interaction as databaseInteraction

# Patterns describing the markup of the vocabulary table on the source page.
# They are compiled once here instead of being re-specified for every row.
_TABLE_RE = re.compile(r'<table id="GridView3"(.*?)</table>', re.DOTALL)
_ROW_RE = re.compile(r'<tr>(.*?)</tr>')
_LINKED_WORD_RE = re.compile(
    r'<td width="120"><a href="/how-to-use/(.*?)" target="_blank">(.*?)</a></td>'
)
_PLAIN_WORD_RE = re.compile(r'<td width="120">(.*?)</td>')
_TAGS_RE = re.compile(r'<td width="300">\((.*?)\)</td>')

# Translation table that deletes the layout whitespace (CR, LF, TAB) found
# between the table's tags, so a row can be matched as a single line.
_LAYOUT_WHITESPACE = str.maketrans('', '', '\r\n\t')


def fetch_html(url):
    """Download *url* and return its body as decoded text.

    Based on the example at
    https://pythonprogramming.net/parse-website-using-regular-expressions-urllib/

    The response is used as a context manager so the connection is closed
    even if reading fails. The body is decoded with the charset announced
    in the response headers (UTF-8 when none is given) rather than being
    passed through ``str(bytes)``, which would produce the ``b'...'`` repr
    with escaped quotes and non-ASCII bytes.
    """
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as resp:
        charset = resp.headers.get_content_charset() or 'utf-8'
        return resp.read().decode(charset, errors='replace')


def parse_words_table(html):
    """Extract the vocabulary words and their tags from the page markup.

    Args:
        html: Decoded HTML text of the vocabulary page.

    Returns:
        A dict mapping each word to the string list of tags found between
        the parentheses of its type cell (e.g. ``'adverb, preposition'``).
        If a word appears in several rows, the last row wins.

    Raises:
        ValueError: If the ``GridView3`` table is not present, or if a row
            contains a word cell but no parenthesised tag cell.
    """
    table = _TABLE_RE.search(html)
    if table is None:
        raise ValueError('table "GridView3" not found in page')

    # remove whitespace chars
    table_html = table.group(1).translate(_LAYOUT_WHITESPACE)

    # add each word to a dictionary, the word being the key and the string
    # list of tags being the value
    words = {}
    for row in _ROW_RE.findall(table_html):
        # A word cell is either a link to its "how to use" page or plain text.
        linked = _LINKED_WORD_RE.search(row)
        if linked is not None:
            word = linked.group(2)
        else:
            plain = _PLAIN_WORD_RE.search(row)
            if plain is None:
                # Not a data row (no word cell at all): nothing to import.
                continue
            word = plain.group(1)

        tags = _TAGS_RE.search(row)
        if tags is None:
            raise ValueError('no tag list found for word %r' % word)
        words[word] = tags.group(1)
    return words


def main():
    """Scrape the vocabulary page and insert every word into the database."""
    url = 'https://www.talkenglish.com/vocabulary/top-2000-vocabulary.aspx'

    # get html table of words and parse each table row
    words = parse_words_table(fetch_html(url))

    # insert each word into database table
    for word, tags in words.items():
        databaseInteraction.InsertWordIntoDictionary(word, tags)


if __name__ == '__main__':
    main()

0 comments on commit ca5806f

Please sign in to comment.