Initial upload

hawesa2 · Nov 7, 2022 · ca5806fdd5610015216c9149a9f9f42d9bd0855f · ca5806f
1 parent ff4ecc8
commit ca5806fdd5610015216c9149a9f9f42d9bd0855f
Showing 1 changed file with 34 additions and 0 deletions.
diff --git a/Databases/words_list_website_parser.py b/Databases/words_list_website_parser.py
@@ -0,0 +1,34 @@
+import urllib.request
+import re
+import database_interaction as databaseInteraction
+
+if __name__ == '__main__':
+    url = 'https://www.talkenglish.com/vocabulary/top-2000-vocabulary.aspx'
+
+    #example code to get html source of webpage. source: https://pythonprogramming.net/parse-website-using-regular-expressions-urllib/
+    req = urllib.request.Request(url)
+    resp = urllib.request.urlopen(req)
+    respData = resp.read()
+    #end of example
+
+    #get html table of words
+    htmlWordsTable = re.findall(r'<table id="GridView3"(.*?)</table>', str(respData))[0]
+    #remove whitespace chars
+    htmlWordsTable = htmlWordsTable.replace('\\r','').replace('\\n','').replace('\\t', '')
+    #get each table row
+    htmlTableRows = re.findall(r'<tr>(.*?)</tr>', str(htmlWordsTable))
+
+    #add each word to a dictionary, the word being the key and the string list of tags being the value
+    words = {}
+    for row in htmlTableRows:
+        try:
+            word = re.findall(r'<td width="120"><a href="/how-to-use/(.*?)" target="_blank">(.*?)</a></td>', row)[0][1]
+        except IndexError:
+            word = re.findall(r'<td width="120">(.*?)</td>', row)[0]
+
+        types = re.findall(r'<td width="300">\((.*?)\)</td>', row)[0]
+        words[word] = types
+
+    #insert each word into database table
+    for word, tags in words.items():
+        databaseInteraction.InsertWordIntoDictionary(word, tags)