Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Chatbot/TrainingDatasetSort.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
209 lines (177 sloc)
10.4 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3 | |
import json | |
from datetime import datetime | |
import time | |
# Month of the Reddit comment dump to process; it also names the SQLite file.
timeframe = '2015-01'
# SQL statements queued for the next batched write (see transaction_bldr).
sql_transaction = []
# Input rows whose 1-based position is below this value are skipped.
start_row = 0
# Rows without a stored parent are purged each time this many rows were read.
cleanup = 1000000

# The database file is the timeframe followed by "2.db" (here "2015-012.db").
connection = sqlite3.connect(timeframe + '2.db')
c = connection.cursor()
def create_table():
    """Create the parent_reply table unless the database already has it."""
    ddl = (
        "CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, "
        "comment TEXT, subreddit TEXT, unix INT, score INT)"
    )
    c.execute(ddl)
def format_data(data):
    """Flatten a comment body onto one line and normalise its quotes.

    Newline characters become the token " newlinechar ", carriage returns
    become " returnchar ", and double quotes become single quotes.

    Args:
        data: The raw comment text.

    Returns:
        The transformed text.
    """
    # The escape sequences must be "\n" and "\r"; the two-character strings
    # "/n" and "/r" never match an actual line break.
    return (
        data.replace("\n", " newlinechar ")
        .replace("\r", " returnchar ")
        .replace('"', "'")
    )
def find_parent(pid):
    """Return the stored text of the comment whose comment_id equals pid.

    Args:
        pid: Identifier to look up in the comment_id column of parent_reply.

    Returns:
        The comment column of the matching row, or False when no row matches
        or the query fails.
    """
    try:
        # Bind pid as a parameter instead of formatting it into the SQL text,
        # so a quote inside the value can neither break nor alter the query.
        c.execute("SELECT comment FROM parent_reply WHERE comment_id = ? LIMIT 1", (pid,))
        result = c.fetchone()
    except sqlite3.Error:
        # Lookups are best-effort: a failed query is treated as "not found".
        return False
    if result is None:
        return False
    return result[0]
def find_existing_score(pid):
    """Return the score of the row already stored under parent_id = pid.

    Args:
        pid: Identifier to look up in the parent_id column of parent_reply.

    Returns:
        The score column of the matching row, or False when no row matches
        or the query fails.
    """
    try:
        # Bind pid as a parameter instead of formatting it into the SQL text,
        # so a quote inside the value can neither break nor alter the query.
        c.execute("SELECT score FROM parent_reply WHERE parent_id = ? LIMIT 1", (pid,))
        result = c.fetchone()
    except sqlite3.Error:
        # Lookups are best-effort: a failed query is treated as "not found".
        return False
    if result is None:
        return False
    return result[0]
def acceptable(data):
    """Tell whether a comment body is usable.

    A body is rejected when it is empty, has more than 80 space-separated
    pieces, is longer than 1000 characters, or is exactly one of the
    placeholders '[deleted]' / '[removed]'.
    """
    piece_count = len(data.split(' '))
    char_count = len(data)
    if char_count < 1 or piece_count > 80:
        return False
    if char_count > 1000:
        return False
    return data not in ('[deleted]', '[removed]')
def sql_insert_replace_comment(parentid, commentid, parent, comment, subreddit, time, score):
    """Queue an UPDATE that overwrites the row stored under parentid.

    The statement is rendered as complete SQL text (no bind placeholders)
    because transaction_bldr accepts and later executes a bare string.

    Args:
        parentid: Value of the parent_id column; also selects the row to update.
        commentid: New value for the comment_id column.
        parent: New value for the parent column; None or False is stored as NULL.
        comment: New value for the comment column.
        subreddit: New value for the subreddit column.
        time: Creation time, converted with int() for the unix column.
        score: Comment score, converted with int() for the score column.
    """
    def literal(value):
        # Render a value as a SQLite literal. A single quote inside a string
        # literal is escaped by doubling it; a missing parent becomes NULL.
        if value is None or value is False:
            return "NULL"
        return "'{}'".format(str(value).replace("'", "''"))

    try:
        key = literal(parentid)
        sql = (
            "UPDATE parent_reply SET parent_id = {}, comment_id = {}, parent = {}, comment = {}, "
            "subreddit = {}, unix = {}, score = {} WHERE parent_id = {};"
        ).format(key, literal(commentid), literal(parent), literal(comment),
                 literal(subreddit), int(time), int(score), key)
        transaction_bldr(sql)
    except Exception as e:
        # Best-effort: report the problem and carry on with the next row.
        print('s0 insertion', str(e))
def sql_insert_no_parent(parentid, commentid, comment, subreddit, time, score):
    """Queue an INSERT for a comment whose parent text is not stored.

    The parent column is left out of the statement, so it is stored as NULL.
    The statement is rendered as complete SQL text because transaction_bldr
    accepts and later executes a bare string.

    Args:
        parentid: Value for the parent_id column.
        commentid: Value for the comment_id column.
        comment: Value for the comment column.
        subreddit: Value for the subreddit column.
        time: Creation time, converted with int() for the unix column.
        score: Comment score, converted with int() for the score column.
    """
    def quote(value):
        # A single quote inside a SQLite string literal is escaped by doubling
        # it; without this any text containing an apostrophe yields invalid SQL.
        return "'{}'".format(str(value).replace("'", "''"))

    try:
        sql = (
            "INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score) "
            "VALUES ({}, {}, {}, {}, {}, {});"
        ).format(quote(parentid), quote(commentid), quote(comment), quote(subreddit),
                 int(time), int(score))
        transaction_bldr(sql)
    except Exception as e:
        # Best-effort: report the problem and carry on with the next row.
        print('s0 insertion', str(e))
def sql_insert_has_parent_comment(parentid, commentid, parent, comment, subreddit, time, score):
    """Queue an INSERT for a comment together with the text of its parent.

    The statement is rendered as complete SQL text because transaction_bldr
    accepts and later executes a bare string.

    Args:
        parentid: Value for the parent_id column.
        commentid: Value for the comment_id column.
        parent: Value for the parent column.
        comment: Value for the comment column.
        subreddit: Value for the subreddit column.
        time: Creation time, converted with int() for the unix column.
        score: Comment score, converted with int() for the score column.
    """
    def quote(value):
        # A single quote inside a SQLite string literal is escaped by doubling
        # it; without this any text containing an apostrophe yields invalid SQL.
        return "'{}'".format(str(value).replace("'", "''"))

    try:
        sql = (
            "INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) "
            "VALUES ({}, {}, {}, {}, {}, {}, {});"
        ).format(quote(parentid), quote(commentid), quote(parent), quote(comment),
                 quote(subreddit), int(time), int(score))
        transaction_bldr(sql)
    except Exception as e:
        # Best-effort: report the problem and carry on with the next row.
        print('s0 insertion', str(e))
def transaction_bldr(sql):
    """Queue one SQL statement and write the queue out once it exceeds 1000.

    The statement is appended to the module-level sql_transaction list. When
    the list holds more than 1000 statements they are all executed on the
    shared cursor, committed together, and the list is reset. A statement
    that fails is skipped so that one bad row does not lose the whole batch;
    the number of skipped statements is reported once per batch.

    Args:
        sql: A complete SQL statement as text.
    """
    global sql_transaction
    sql_transaction.append(sql)
    if len(sql_transaction) <= 1000:
        return

    # An explicit BEGIN raises if a transaction is already open on this
    # connection, so only issue it when none is active.
    if not connection.in_transaction:
        c.execute('BEGIN TRANSACTION')
    failed = 0
    for statement in sql_transaction:
        try:
            c.execute(statement)
        except Exception:
            # Deliberately best-effort, but (unlike a bare except) this lets
            # KeyboardInterrupt and SystemExit through.
            failed += 1
    connection.commit()
    if failed:
        print('transaction_bldr: {} of {} statements failed'.format(failed, len(sql_transaction)))
    sql_transaction = []
if __name__ == '__main__':
    # Script entry point: stream the comment dump line by line, keep the
    # qualifying comments in parent_reply, and periodically purge rows that
    # never got a parent text.
    create_table()
    row_counter = 0
    paired_rows = 0

    # Subreddit whitelist (the original author's own addition to this script).
    # Built once, as a set, so the per-row membership test does not rebuild
    # and scan a long list. Matching is case-sensitive.
    desired_subreddit = frozenset([
        'AskReddit', 'AskScience', 'askengineers', 'Foodforthought', 'YouShouldKnow',
        'anime', 'manga', 'todayilearned', 'wikipedia', 'rpg', 'DnD', 'programming',
        'learnprogramming', 'python', 'java', 'javascript', 'learnpython', 'excel',
        'unity3d', 'linux', 'linux_gaming', 'legaladvice', 'advice', 'whowouldwin',
        'wouldyourather', 'skeptic', 'conspiracy', 'karmaconspiracy', 'UFOs',
        'conspiratard', 'empiredidnothingwrong', 'dadjokes', 'punny', 'puns',
        'ProgrammerHumor', 'politicalhumor', 'funandsad', 'audiophiles', 'headphones',
        'audioengineering', 'worldnews', 'news', 'UpliftingNews', 'gamernews',
        'tropicalweather', 'awesome', 'TrueReddit', 'dataisbeautiful', 'DataHoarder',
        'bestof', 'DepthHub', 'BestOfReports', 'bestoflegaladvice', 'creepy',
        'creepypasta', 'starwars', 'startrek', 'explainlikeimfive', 'gamedev',
        'engineering', 'ubuntu', 'cscareerquestions', 'EngineeringStudents',
        'askengineers', 'InternetIsBeautiful', 'google', 'web_design', 'discordapp',
        'gaming', 'Games', 'outside', 'truegaming', 'gamephysics', 'webgames',
        'IndieGaming', 'patientgamers', 'AndroidGaming', 'randomactofgaming',
        'speedrun', 'gamemusic', 'emulation', 'MMORPG', 'gamecollecting', 'hitboxporn',
        'gamingcirclejerk', 'gamersriseup', 'gamingdetails', 'gaming4gamers',
        'retrogaming', 'DotA2', 'starcraft', 'smashbros', 'dayz', 'civ',
        'KerbalSpaceProgram', 'masseffect', 'clashofclans', 'starbound',
        'heroesofthestorm', 'terraria', 'dragonage', 'citiesskylines', 'smite',
        'bindingofisaac', 'eve', 'starcitizen', 'animalcrossing', 'metalgearsolid',
        'elitedangerous', 'bloodborne', 'monsterhunter', 'warframe', 'undertale',
        'thedivision', 'stardewvalley', 'nomanskythegame', 'totalwar', 'pathofexile',
        'ClashRoyale', 'crusaderkings', 'dwarffortress', 'eu4', 'thesims',
        'assasinscreed', 'playrust', 'forhonor', 'stellaris', 'kingdomhearts',
        'blackdesertonline', 'factorio', 'Warhammer', 'splatoon', 'rimworld', 'Xcom',
        'streetfighter', 'paydaytheheist', 'MonsterHunterWorld', 'Seaofthieves',
        'cyberpunkgames', 'warhammer40k', 'paladins', 'osugame', 'spidermanps4',
        'persona5', 'horizon', 'reddeadredemption', 'mountainblade', 'deadbydaylight',
        'farcry', 'hoi4', 'warthunder', 'grandorder', 'divinityorignalsin',
        'escapefromtarkov', 'theexpanse', 'darkestdungeon', 'forza', 'godofwar', 'ark',
        'bioshock', 'edh', 'summonerswar', 'duellinks', 'arma', 'pathfinderrpg',
        'footballmanagergames', 'kingdomcome', 'subnautica', 'thelastofus', 'doom',
        'borderlands', 'borderlands2', 'Darksouls', 'Darksouls2', 'Darksouls3',
        'diablo', 'diablo3', 'elderscrollsonline', 'ElderScrolls', 'Skyrim',
        'skyrimmods', 'fallout', 'fo4', 'fo6', 'fireemblem', 'FortniteBR', 'Fortnite',
        'FortniteBattleRoyale', 'GrandTheftAutoV', 'gtav', 'gtaonline', 'hearthstone',
        'minecraft', 'overwatch', 'PUBATTLEGROUNDS', 'PUBG', 'rocketleague', 'witcher',
        'halo', 'fifa', 'nba2k', 'leagueoflegends', 'Pokemon', 'zelda',
    ])

    # JSON text is UTF-8; naming the encoding keeps the read independent of
    # the platform's default code page.
    with open('C:/chatbot_data/Reddit_comments/RC_{}'.format(timeframe),
              buffering=1000, encoding='utf-8') as f:
        for row in f:
            row_counter += 1
            if row_counter >= start_row:
                try:
                    row = json.loads(row)
                    parent_id = row['parent_id']
                    # NOTE(review): parent_id values in Reddit dumps appear to carry a
                    # type prefix such as "t1_", while row['id'] may not. If so, the
                    # stored comment_id can never equal a later parent_id and
                    # find_parent will not match - confirm against the data.
                    comment_id = row['id']
                    body = format_data(row['body'])
                    created_utc = row['created_utc']
                    score = row['score']
                    subreddit = row['subreddit']

                    # Only comments with a score of at least 3 from a whitelisted
                    # subreddit are considered.
                    if score >= 3 and subreddit in desired_subreddit:
                        # Looked up only for rows that pass the filter, so
                        # rejected rows cost no database query.
                        parent_data = find_parent(parent_id)
                        existing_comment_score = find_existing_score(parent_id)
                        if existing_comment_score:
                            # A reply to this parent is already stored: replace it
                            # only when the new one scores higher.
                            if score > existing_comment_score and acceptable(body):
                                # Keyword arguments pin each id to the parameter the
                                # helper declares (parentid first, commentid second).
                                sql_insert_replace_comment(
                                    parentid=parent_id, commentid=comment_id, parent=parent_data,
                                    comment=body, subreddit=subreddit, time=created_utc, score=score)
                        elif acceptable(body):
                            if parent_data:
                                sql_insert_has_parent_comment(
                                    parentid=parent_id, commentid=comment_id, parent=parent_data,
                                    comment=body, subreddit=subreddit, time=created_utc, score=score)
                                paired_rows += 1
                            else:
                                sql_insert_no_parent(
                                    parentid=parent_id, commentid=comment_id, comment=body,
                                    subreddit=subreddit, time=created_utc, score=score)
                except Exception as e:
                    # A malformed row must not stop the run; report it and move on.
                    print(str(e))

            if row_counter % 100000 == 0:
                print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now())))

            # Clean up data in which the parent comment is not found.
            if row_counter > start_row:
                if row_counter % cleanup == 0:
                    print("Cleanin up!")
                    sql = "DELETE FROM parent_reply WHERE parent IS NULL"
                    c.execute(sql)
                    connection.commit()
                    c.execute("VACUUM")
                    connection.commit()

    # transaction_bldr only writes once more than 1000 statements are queued,
    # so whatever is still pending at end of input has to be written here.
    for pending in sql_transaction:
        try:
            c.execute(pending)
        except Exception as e:
            print(str(e))
    connection.commit()
    del sql_transaction[:]
    connection.close()