From ca5806fdd5610015216c9149a9f9f42d9bd0855f Mon Sep 17 00:00:00 2001
From: "Andrew Hawes (hawesa2)" <hawesa2@coventry.ac.uk>
Date: Mon, 7 Nov 2022 02:56:06 +0000
Subject: [PATCH] Initial upload

---
 Databases/words_list_website_parser.py | 34 ++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 Databases/words_list_website_parser.py
diff --git a/Databases/words_list_website_parser.py b/Databases/words_list_website_parser.py
new file mode 100644
index 0000000..ccc0792
--- /dev/null
+++ b/Databases/words_list_website_parser.py
@@ -0,0 +1,34 @@
+import urllib.request
+import re
+import database_interaction as databaseInteraction
+
+if __name__ == '__main__':
+    url = 'https://www.talkenglish.com/vocabulary/top-2000-vocabulary.aspx'
+
+    # Fetch the page; 'with' closes the connection. Based on: https://pythonprogramming.net/parse-website-using-regular-expressions-urllib/
+    with urllib.request.urlopen(urllib.request.Request(url)) as resp:
+        # Decode to real text; str() on the bytes would give the b'...' repr with escaped quotes and newlines.
+        pageHtml = resp.read().decode(resp.headers.get_content_charset() or 'utf-8', errors='replace')
+
+    # Drop line breaks and tabs first so the '.' in the patterns below can span whole elements.
+    pageHtml = pageHtml.replace('\r', '').replace('\n', '').replace('\t', '')
+    # Isolate the HTML table that holds the word list, then split it into rows.
+    tableMatch = re.search(r'<table id="GridView3"(.*?)</table>', pageHtml)
+    if tableMatch is None:
+        raise SystemExit('Could not find table "GridView3" in ' + url)
+    htmlTableRows = re.findall(r'<tr>(.*?)</tr>', tableMatch.group(1))
+
+    # Map each word to its tag string (the text inside the parentheses of the width-300 cell).
+    words = {}
+    for row in htmlTableRows:
+        # The word cell is either a "how-to-use" link or plain text; the link form takes priority.
+        linked = re.search(r'<td width="120"><a href="/how-to-use/.*?" target="_blank">(.*?)</a></td>', row)
+        plain = re.search(r'<td width="120">(.*?)</td>', row)
+        types = re.search(r'<td width="300">\((.*?)\)</td>', row)
+        if (linked or plain) and types:
+            words[(linked or plain).group(1)] = types.group(1)
+        else:  # row without a word or tag cell: report it rather than abort the whole import
+            print('Skipping unrecognised table row: ' + row)
+    # Insert each word with its tags into the database table.
+    for word, tags in words.items():
+        databaseInteraction.InsertWordIntoDictionary(word, tags)