"""Scrape the TalkEnglish top-2000 vocabulary page into the word database.

Run as a script, this downloads the vocabulary page, extracts each word
together with its parenthesised list of tags from the page's HTML table, and
hands every (word, tags) pair to
``database_interaction.InsertWordIntoDictionary``.
"""
# Provenance: Databases/words_list_website_parser.py, commit
# ca5806fdd5610015216c9149a9f9f42d9bd0855f ("Initial upload",
# Andrew Hawes (hawesa2), Mon, 7 Nov 2022 02:56:06 +0000).

import html
import re
import urllib.request

WORD_LIST_URL = 'https://www.talkenglish.com/vocabulary/top-2000-vocabulary.aspx'

# NOTE(review): in the revision under review every pattern except the row
# pattern was the empty string (the row pattern was a bare ``(.*?)``), so
# nothing could be extracted. The patterns below are generic reconstructions
# inferred from the surrounding comments and the group indexing; verify them
# against the live page markup before relying on the output.
TABLE_PATTERN = re.compile(r'<table\b[^>]*>(.*?)</table>', re.IGNORECASE)
ROW_PATTERN = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.IGNORECASE)
CELL_PATTERN = re.compile(r'<td\b[^>]*>(.*?)</td>', re.IGNORECASE)
LINK_PATTERN = re.compile(r'<a\b[^>]*>(.*?)</a>', re.IGNORECASE)
# A cell whose whole content is a parenthesised tag list, e.g. "(noun, verb)".
TYPES_PATTERN = re.compile(r'<td\b[^>]*>\s*\((.*?)\)\s*</td>', re.IGNORECASE)
TAG_PATTERN = re.compile(r'<[^>]+>')
LINE_BREAKS = re.compile(r'[\r\n\t]')


def fetch_page(url, timeout=30):
    """Download *url* and return its body as decoded text.

    The response is closed when the ``with`` block exits. The body is
    decoded with the charset announced in the response headers, falling back
    to UTF-8; undecodable bytes are replaced rather than raising.

    Raises:
        urllib.error.URLError: if the page cannot be retrieved.
    """
    # Request/urlopen usage adapted from:
    # https://pythonprogramming.net/parse-website-using-regular-expressions-urllib/
    request = urllib.request.Request(url)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        # Decode instead of calling str() on the bytes: str(bytes) yields the
        # b'...' repr, with escaped quotes and literal "\\n" sequences.
        return response.read().decode(charset, errors='replace')


def _clean(fragment):
    """Return *fragment* with tags removed, entities unescaped, ends stripped."""
    return html.unescape(TAG_PATTERN.sub('', fragment)).strip()


def _word_from_row(row):
    """Return the word held in one table row, or '' if none is found.

    A linked word (text of the first ``<a>`` element) takes precedence, as in
    the original try/except ordering.
    """
    link = LINK_PATTERN.search(row)
    if link:
        return _clean(link.group(1))
    # NOTE(review): heuristic fallback - the first cell that is neither a
    # purely numeric value nor the parenthesised tag list. Confirm against
    # the real column layout.
    for cell in CELL_PATTERN.findall(row):
        text = _clean(cell)
        if text and not text.isdigit() and not text.startswith('('):
            return text
    return ''


def parse_words(markup):
    """Extract ``{word: tags}`` from the first HTML table in *markup*.

    *tags* is the text found inside the parentheses of the row's tag cell,
    kept as a single string. Rows without a tag cell or without a word (for
    example header rows) are skipped. When a word occurs in several rows the
    last occurrence wins, matching plain dict assignment.

    Raises:
        ValueError: if *markup* contains no table, or no row yields a word.
    """
    # '.' does not match line breaks, so drop them before matching.
    flattened = LINE_BREAKS.sub('', markup)

    table = TABLE_PATTERN.search(flattened)
    if table is None:
        raise ValueError('no <table> element found in the page markup')

    words = {}
    for row in ROW_PATTERN.findall(table.group(1)):
        tags = TYPES_PATTERN.search(row)
        if tags is None:
            continue
        word = _word_from_row(row)
        if word:
            words[word] = html.unescape(tags.group(1)).strip()

    if not words:
        raise ValueError('no vocabulary rows could be parsed from the table')
    return words


def main():
    """Fetch the vocabulary page and insert every parsed word into the database."""
    # Imported here so the parsing helpers above can be imported and tested
    # without the project's database layer being available.
    import database_interaction as databaseInteraction

    words = parse_words(fetch_page(WORD_LIST_URL))
    for word, tags in words.items():
        databaseInteraction.InsertWordIntoDictionary(word, tags)


if __name__ == '__main__':
    main()