Skip to content
Permalink
2f85e7af45
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
209 lines (177 sloc) 10.4 KB
import sqlite3
import json
from datetime import datetime
import time
# --- Run configuration -------------------------------------------------
# Month of the comment dump; used in the database and input file names.
timeframe = '2015-01'
# Rows whose 1-based position is below this value are skipped by the driver loop.
start_row = 0
# The driver purges parent-less rows every `cleanup` rows read.
cleanup = 1000000

# --- Shared state ------------------------------------------------------
# SQL statements queued by transaction_bldr() until the next batch commit.
sql_transaction = []

# One connection/cursor pair is shared by every function in this module.
_db_path = '{}2.db'.format(timeframe)
connection = sqlite3.connect(_db_path)
c = connection.cursor()
def create_table():
    """Create the parent_reply table if the database does not have it yet.

    Columns: parent_id (primary key), comment_id (unique), parent, comment,
    subreddit, unix and score. Runs on the module-level cursor `c`.
    """
    ddl = (
        "CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, "
        "comment TEXT, subreddit TEXT, unix INT, score INT)"
    )
    c.execute(ddl)
def format_data(data):
    """Normalize a comment body so it fits on one line.

    Newline characters become the token " newlinechar ", carriage returns
    become " returnchar ", and double quotes become single quotes.

    The previous version searched for the two-character texts "/n" and "/r"
    (forward slash), so real line breaks were never replaced.

    Args:
        data: Raw comment text.

    Returns:
        The normalized text.
    """
    return (
        data.replace("\n", " newlinechar ")
        .replace("\r", " returnchar ")
        .replace('"', "'")
    )
def find_parent(pid):
    """Return the stored text of the comment whose comment_id equals *pid*.

    The driver passes a new comment's parent ID here to find out whether the
    parent comment is already stored in parent_reply.

    Args:
        pid: ID to match against the comment_id column.

    Returns:
        The `comment` column of the first matching row, or False when no row
        matches or the query fails.
    """
    try:
        # Bind pid as a parameter instead of formatting it into the SQL text,
        # so quotes or other special characters in it cannot break the query.
        c.execute("SELECT comment FROM parent_reply WHERE comment_id = ? LIMIT 1", (pid,))
        result = c.fetchone()
    except sqlite3.Error:
        # Lookup is best-effort: a failed query is treated as "not found".
        return False
    return result[0] if result is not None else False
def find_existing_score(pid):
    """Return the score of the row already stored under parent_id *pid*.

    parent_id is the table's primary key, so at most one row exists per
    parent. The driver uses the returned score to decide whether a new reply
    should replace the stored one.

    Args:
        pid: ID to match against the parent_id column.

    Returns:
        The `score` column of the matching row, or False when no row matches
        or the query fails.
    """
    try:
        # Bind pid as a parameter instead of formatting it into the SQL text,
        # so quotes or other special characters in it cannot break the query.
        c.execute("SELECT score FROM parent_reply WHERE parent_id = ? LIMIT 1", (pid,))
        result = c.fetchone()
    except sqlite3.Error:
        # Lookup is best-effort: a failed query is treated as "not found".
        return False
    return result[0] if result is not None else False
def acceptable(data):
    """Tell whether a comment body is worth storing.

    A body is rejected when it is empty, has more than 80 space-separated
    words, is longer than 1000 characters, or is one of the placeholders
    '[deleted]' / '[removed]'.
    """
    word_count = len(data.split(' '))
    char_count = len(data)

    wrong_size = word_count > 80 or char_count < 1 or char_count > 1000
    placeholder = data in ('[deleted]', '[removed]')
    return not (wrong_size or placeholder)
def sql_insert_replace_comment(parentid, commentid, parent, comment, subreddit, time, score):
    """Queue an UPDATE that overwrites the row stored under *parentid*.

    The previous version called .format() on a statement that used `?`
    placeholders, so no value was ever substituted and the queued statement
    failed when executed. The statement is now built as complete SQL text,
    because transaction_bldr() executes queued strings without parameters.

    Args:
        parentid: Value for parent_id; also selects the row to overwrite.
        commentid: Value for comment_id.
        parent: Parent comment text; a falsy value is stored as NULL.
        comment: Reply text.
        subreddit: Subreddit name.
        time: Timestamp stored in the `unix` column after int() conversion
            (the parameter name shadows the `time` module inside this function).
        score: Comment score, stored as an integer.
    """
    def quote(value):
        # SQL string literal: single-quoted, embedded single quotes doubled.
        return "'{}'".format(str(value).replace("'", "''"))

    try:
        sql = (
            "UPDATE parent_reply SET parent_id = {pid}, comment_id = {cid}, parent = {parent}, "
            "comment = {comment}, subreddit = {subreddit}, unix = {unix}, score = {score} "
            "WHERE parent_id = {pid};"
        ).format(
            pid=quote(parentid),
            cid=quote(commentid),
            # find_parent() reports a missing parent as False; store NULL rather than the text 'False'.
            parent=quote(parent) if parent else "NULL",
            comment=quote(comment),
            subreddit=quote(subreddit),
            unix=int(time),
            score=int(score),
        )
        transaction_bldr(sql)
    except Exception as e:
        # Report the actual error; the old call printed the literal text "str(e)".
        print('s0 insertion', str(e))
def sql_insert_no_parent(parentid, commentid, comment, subreddit, time, score):
    """Queue an INSERT for a comment whose parent text was not found.

    The `parent` column is left unset (NULL). String values are emitted as
    SQL literals with embedded single quotes doubled; previously any comment
    containing a quote produced an invalid statement that was dropped later.

    Args:
        parentid: Value for parent_id.
        commentid: Value for comment_id.
        comment: Comment text.
        subreddit: Subreddit name.
        time: Timestamp stored in the `unix` column after int() conversion
            (the parameter name shadows the `time` module inside this function).
        score: Comment score, stored as an integer.
    """
    def quote(value):
        # SQL string literal: single-quoted, embedded single quotes doubled.
        return "'{}'".format(str(value).replace("'", "''"))

    try:
        sql = (
            "INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score) "
            "VALUES ({},{},{},{},{},{});"
        ).format(
            quote(parentid),
            quote(commentid),
            quote(comment),
            quote(subreddit),
            int(time),
            int(score),
        )
        transaction_bldr(sql)
    except Exception as e:
        # Report the actual error; the old call printed the literal text "str(e)".
        print('s0 insertion', str(e))
def sql_insert_has_parent_comment(parentid, commentid, parent, comment, subreddit, time, score):
    """Queue an INSERT for a comment together with its parent's text.

    String values are emitted as SQL literals with embedded single quotes
    doubled; previously any text containing a quote produced an invalid
    statement that was dropped later.

    Args:
        parentid: Value for parent_id.
        commentid: Value for comment_id.
        parent: Parent comment text.
        comment: Reply text.
        subreddit: Subreddit name.
        time: Timestamp stored in the `unix` column after int() conversion
            (the parameter name shadows the `time` module inside this function).
        score: Comment score, stored as an integer.
    """
    def quote(value):
        # SQL string literal: single-quoted, embedded single quotes doubled.
        return "'{}'".format(str(value).replace("'", "''"))

    try:
        sql = (
            "INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) "
            "VALUES ({},{},{},{},{},{},{});"
        ).format(
            quote(parentid),
            quote(commentid),
            quote(parent),
            quote(comment),
            quote(subreddit),
            int(time),
            int(score),
        )
        transaction_bldr(sql)
    except Exception as e:
        # Report the actual error; the old call printed the literal text "str(e)".
        print('s0 insertion', str(e))
def transaction_bldr(sql):
    """Queue *sql* and execute the queue in one transaction once it is large enough.

    Statements accumulate in the module-level list `sql_transaction`. When
    the list holds more than 1000 statements they are all executed on the
    shared cursor, committed together, and the list is reset.

    Args:
        sql: A complete SQL statement (no bound parameters).
    """
    global sql_transaction
    sql_transaction.append(sql)
    if len(sql_transaction) <= 1000:
        return

    c.execute('BEGIN TRANSACTION')
    for statement in sql_transaction:
        try:
            c.execute(statement)
        except Exception:
            # Deliberately best-effort: one bad statement (e.g. a duplicate key)
            # must not abort the batch. `except Exception` rather than a bare
            # `except` so KeyboardInterrupt/SystemExit still stop the run.
            pass
    connection.commit()
    sql_transaction = []
# Subreddits whose comments are kept. Built once (not per row) as a frozenset
# so the per-row membership test is O(1). Matching is case-sensitive.
# NOTE(review): several names here may differ in case from the real subreddit
# names, in which case they never match -- confirm against the data.
DESIRED_SUBREDDITS = frozenset([
    'AskReddit', 'AskScience', 'askengineers', 'Foodforthought', 'YouShouldKnow',
    'anime', 'manga', 'todayilearned', 'wikipedia', 'rpg', 'DnD', 'programming',
    'learnprogramming', 'python', 'java', 'javascript', 'learnpython', 'excel',
    'unity3d', 'linux', 'linux_gaming', 'legaladvice', 'advice', 'whowouldwin',
    'wouldyourather', 'skeptic', 'conspiracy', 'karmaconspiracy', 'UFOs',
    'conspiratard', 'empiredidnothingwrong', 'dadjokes', 'punny', 'puns',
    'ProgrammerHumor', 'politicalhumor', 'funandsad', 'audiophiles', 'headphones',
    'audioengineering', 'worldnews', 'news', 'UpliftingNews', 'gamernews',
    'tropicalweather', 'awesome', 'TrueReddit', 'dataisbeautiful', 'DataHoarder',
    'bestof', 'DepthHub', 'BestOfReports', 'bestoflegaladvice', 'creepy',
    'creepypasta', 'starwars', 'startrek', 'explainlikeimfive', 'gamedev',
    'engineering', 'ubuntu', 'cscareerquestions', 'EngineeringStudents',
    'askengineers', 'InternetIsBeautiful', 'google', 'web_design', 'discordapp',
    'gaming', 'Games', 'outside', 'truegaming', 'gamephysics', 'webgames',
    'IndieGaming', 'patientgamers', 'AndroidGaming', 'randomactofgaming',
    'speedrun', 'gamemusic', 'emulation', 'MMORPG', 'gamecollecting', 'hitboxporn',
    'gamingcirclejerk', 'gamersriseup', 'gamingdetails', 'gaming4gamers',
    'retrogaming', 'DotA2', 'starcraft', 'smashbros', 'dayz', 'civ',
    'KerbalSpaceProgram', 'masseffect', 'clashofclans', 'starbound',
    'heroesofthestorm', 'terraria', 'dragonage', 'citiesskylines', 'smite',
    'bindingofisaac', 'eve', 'starcitizen', 'animalcrossing', 'metalgearsolid',
    'elitedangerous', 'bloodborne', 'monsterhunter', 'warframe', 'undertale',
    'thedivision', 'stardewvalley', 'nomanskythegame', 'totalwar', 'pathofexile',
    'ClashRoyale', 'crusaderkings', 'dwarffortress', 'eu4', 'thesims',
    'assasinscreed', 'playrust', 'forhonor', 'stellaris', 'kingdomhearts',
    'blackdesertonline', 'factorio', 'Warhammer', 'splatoon', 'rimworld', 'Xcom',
    'streetfighter', 'paydaytheheist', 'MonsterHunterWorld', 'Seaofthieves',
    'cyberpunkgames', 'warhammer40k', 'paladins', 'osugame', 'spidermanps4',
    'persona5', 'horizon', 'reddeadredemption', 'mountainblade', 'deadbydaylight',
    'farcry', 'hoi4', 'warthunder', 'grandorder', 'divinityorignalsin',
    'escapefromtarkov', 'theexpanse', 'darkestdungeon', 'forza', 'godofwar', 'ark',
    'bioshock', 'edh', 'summonerswar', 'duellinks', 'arma', 'pathfinderrpg',
    'footballmanagergames', 'kingdomcome', 'subnautica', 'thelastofus', 'doom',
    'borderlands', 'borderlands2', 'Darksouls', 'Darksouls2', 'Darksouls3',
    'diablo', 'diablo3', 'elderscrollsonline', 'ElderScrolls', 'Skyrim',
    'skyrimmods', 'fallout', 'fo4', 'fo6', 'fireemblem', 'FortniteBR', 'Fortnite',
    'FortniteBattleRoyale', 'GrandTheftAutoV', 'gtav', 'gtaonline', 'hearthstone',
    'minecraft', 'overwatch', 'PUBATTLEGROUNDS', 'PUBG', 'rocketleague', 'witcher',
    'halo', 'fifa', 'nba2k', 'leagueoflegends', 'Pokemon', 'zelda',
])

# Comments scoring below this are ignored.
MIN_SCORE = 3


def _process_row(line):
    """Parse one JSON comment line and queue the matching SQL statement.

    Args:
        line: One line of the dump file (a JSON object).

    Returns:
        True when the comment was queued together with its parent's text,
        False otherwise (filtered out, replaced an existing row, or queued
        without a parent).

    Raises:
        Whatever json.loads or the field lookups raise for a malformed row;
        the caller reports the error and moves on.
    """
    row = json.loads(line)
    # Reddit's 'parent_id' is a fullname with a type prefix ("t1_", "t3_", ...)
    # while 'id' is the bare ID. Strip the prefix so a parent_id can match a
    # stored comment_id; a value without "_" is left unchanged.
    parent_id = row['parent_id'].split('_', 1)[-1]
    comment_id = row['id']
    body = format_data(row['body'])
    created_utc = row['created_utc']
    score = row['score']
    subreddit = row['subreddit']

    if score < MIN_SCORE or subreddit not in DESIRED_SUBREDDITS:
        return False
    # Every storing path requires an acceptable body, so test it before
    # touching the database.
    if not acceptable(body):
        return False

    existing_comment_score = find_existing_score(parent_id)
    if existing_comment_score:
        # A reply to this parent is already stored: keep the higher-scored one.
        if score > existing_comment_score:
            # Arguments go in (parent, comment) order to match the helper
            # signatures; they were previously passed swapped.
            sql_insert_replace_comment(parent_id, comment_id, find_parent(parent_id), body,
                                       subreddit, created_utc, score)
        return False

    parent_data = find_parent(parent_id)
    if parent_data:
        sql_insert_has_parent_comment(parent_id, comment_id, parent_data, body,
                                      subreddit, created_utc, score)
        return True

    sql_insert_no_parent(parent_id, comment_id, body, subreddit, created_utc, score)
    return False


if __name__ == '__main__':
    create_table()
    row_counter = 0
    paired_rows = 0
    with open('C:/chatbot_data/Reddit_comments/RC_{}'.format(timeframe), buffering=1000) as f:
        for row in f:
            row_counter += 1
            if row_counter >= start_row:
                try:
                    if _process_row(row):
                        paired_rows += 1
                except Exception as e:
                    # A malformed row must not stop the import; report it and continue.
                    print(str(e))
            if row_counter % 100000 == 0:
                print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now())))
            # Clean up data in which the parent comment is not found.
            if row_counter > start_row and row_counter % cleanup == 0:
                print("Cleanin up!")
                c.execute("DELETE FROM parent_reply WHERE parent IS NULL")
                connection.commit()
                # VACUUM cannot run inside a transaction, hence the commit just before it.
                c.execute("VACUUM")
                connection.commit()

    # transaction_bldr() only executes its queue once it exceeds the batch
    # size, so the final statements would otherwise never reach the database.
    if sql_transaction:
        for statement in sql_transaction:
            try:
                c.execute(statement)
            except Exception:
                # Same best-effort policy as the batched path: skip bad statements.
                pass
        connection.commit()
        del sql_transaction[:]