# create_movie_corpus.py

"""this file is used in conjunction with the cornell movie dialogue corpus as mentioned below to make a corpus for chatterbot training data"""

"""NOTE: for this file to work the "movie_lines.txt" file must be placed in the same folder as this file"""
import sqlite3 #standard library
# This file uses the Cornell Movie-Dialogs Corpus, which can be found at: https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html


"""takes the row and returns the plaintext"""
def get_text(row):
    """Return the part of ``row`` that follows its 24th "+" character.

    In the movie corpus the metadata fields and the spoken text are
    separated by marker strings that each contain six "+" symbols, so
    skipping 24 of them lands right at the start of the text.  The result
    is returned as-is (nothing is stripped), so any whitespace or newline
    that follows the 24th "+" is kept.

    Args:
        row: one raw line read from ``movie_lines.txt``.

    Returns:
        Everything in ``row`` after the 24th "+".

    Raises:
        IndexError: if ``row`` contains fewer than 24 "+" characters.
    """
    plus_needed = 24  # 4 separators x 6 plus signs each
    pos = -1
    for found in range(plus_needed):
        # str.find jumps straight to the next "+", so the row is scanned
        # once instead of being re-sliced for every single character.
        pos = row.find("+", pos + 1)
        if pos == -1:
            raise IndexError(
                'row has only {} "+" characters, expected {}: {!r}'.format(
                    found, plus_needed, row))
    return row[pos + 1:]

"""the yml file that this file creates requires the first few lines to be in a format similar to, but not the same as the rest of the file, this function creates that

NOTE: because this function opens the file in "write" mode, it overwrites any existing copy of the corpus in the same folder"""
def make_initial():
    """Write the fixed three-line header of ``movie_lines_corpus.yml``.

    The file is opened in write mode, so an existing
    ``movie_lines_corpus.yml`` in the current working directory is replaced.
    """
    header = (
        "categories:\n",
        "- movie_lines\n",
        "conversations:\n",
    )
    with open("movie_lines_corpus.yml", 'w') as corpus_file:
        corpus_file.writelines(header)

# Single characters deleted from every line of dialogue.  They are removed
# because they may cause errors during the training process; the price is
# that this punctuation is lost.
_STRIPPED_CHARS = str.maketrans("", "", "-:\"'*\t[]`{}();|!<>")

_SKIPPED_ROWS = 16  # leading rows of movie_lines.txt that are never written
_MAX_ROWS_READ = 100  # rows consumed (skipped ones included); change this to resize the corpus


def _clean_text(text):
    """Return ``text`` without the unwanted characters and with tidy spacing.

    Runs of whitespace are collapsed to a single space (and trimmed at both
    ends) rather than deleted, so neighbouring words are never glued
    together.
    """
    text = text.translate(_STRIPPED_CHARS)
    # "&quot" must be removed before the bare "&", otherwise "quot" stays behind.
    text = text.replace("&quot", "").replace("&", "")
    return " ".join(text.split())


make_initial()

with open("movie_lines_corpus.yml", 'a', encoding="utf-8") as corp:
    # NOTE(review): the corpus file is commonly read as ISO-8859-1 and that
    # codec never fails on any byte -- confirm against your copy of the file.
    with open("movie_lines.txt", encoding="iso-8859-1") as lines:
        line_counter = 0
        for row in lines:

            if line_counter < _SKIPPED_ROWS:
                line_counter += 1
                print("illegal line")
                continue

            # The text is kept as str: formatting an encoded bytes object
            # would write its b'...' repr into the corpus on Python 3.
            text = _clean_text(get_text(row))

            #some lines need to have 2 dashes, others need 1, this writes each in an appropriate way
            if line_counter % 2 == 1:
                corp.write("  - {}".format(text))
            else:
                corp.write("- - {}".format(text))
            line_counter += 1
            corp.write("\n")
            if line_counter == _MAX_ROWS_READ:
                break

"""THE FILE CREATED BY THIS CALLED 'movie_lines_corpus.yml' MUST BE MOVED TO THE 'custom' FOLDER IN THE CHATTERBOT LIBRARY FOLDER TO WORK"""
	"""this file is used in conjunction with the cornell movie dialogue corpus as mentioned below to make a corpus for chatterbot training data"""

	"""NOTE: for this file to work the "movie_lines.txt" file must be placed in the same folder as this file"""
	import sqlite3 #standard library
	# This file uses the Cornell Movie-Dialogs Corpus, which can be found at: https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html


	"""takes the row and returns the plaintext"""
	def get_text(row):
	    """Return the part of ``row`` that follows its 24th "+" character.

	    In the movie corpus the metadata fields and the spoken text are
	    separated by marker strings that each contain six "+" symbols, so
	    skipping 24 of them lands right at the start of the text.  The result
	    is returned as-is (nothing is stripped), so any whitespace or newline
	    that follows the 24th "+" is kept.

	    Args:
	        row: one raw line read from ``movie_lines.txt``.

	    Returns:
	        Everything in ``row`` after the 24th "+".

	    Raises:
	        IndexError: if ``row`` contains fewer than 24 "+" characters.
	    """
	    plus_needed = 24  # 4 separators x 6 plus signs each
	    pos = -1
	    for found in range(plus_needed):
	        # str.find jumps straight to the next "+", so the row is scanned
	        # once instead of being re-sliced for every single character.
	        pos = row.find("+", pos + 1)
	        if pos == -1:
	            raise IndexError(
	                'row has only {} "+" characters, expected {}: {!r}'.format(
	                    found, plus_needed, row))
	    return row[pos + 1:]

	"""the yml file that this file creates requires the first few lines to be in a format similar to, but not the same as the rest of the file, this function creates that

	NOTE: because this function opens the file in "write" mode, it overwrites any existing copy of the corpus in the same folder"""
	def make_initial():
	    """Write the fixed three-line header of ``movie_lines_corpus.yml``.

	    The file is opened in write mode, so an existing
	    ``movie_lines_corpus.yml`` in the current working directory is replaced.
	    """
	    with open("movie_lines_corpus.yml", 'w') as corp:
	        corp.write("categories:\n")
	        corp.write("- movie_lines\n")
	        corp.write("conversations:\n")


	# Single characters deleted from every line of dialogue.  They are removed
	# because they may cause errors during the training process; the price is
	# that this punctuation is lost.
	_STRIPPED_CHARS = str.maketrans("", "", "-:\"'*\t[]`{}();|!<>")

	_SKIPPED_ROWS = 16  # leading rows of movie_lines.txt that are never written
	_MAX_ROWS_READ = 100  # rows consumed (skipped ones included); change this to resize the corpus


	def _clean_text(text):
	    """Return ``text`` without the unwanted characters and with tidy spacing.

	    Runs of whitespace are collapsed to a single space (and trimmed at both
	    ends) rather than deleted, so neighbouring words are never glued
	    together.
	    """
	    text = text.translate(_STRIPPED_CHARS)
	    # "&quot" must be removed before the bare "&", otherwise "quot" stays behind.
	    text = text.replace("&quot", "").replace("&", "")
	    return " ".join(text.split())


	make_initial()

	with open("movie_lines_corpus.yml", 'a', encoding="utf-8") as corp:
	    # NOTE(review): the corpus file is commonly read as ISO-8859-1 and that
	    # codec never fails on any byte -- confirm against your copy of the file.
	    with open("movie_lines.txt", encoding="iso-8859-1") as lines:
	        line_counter = 0
	        for row in lines:

	            if line_counter < _SKIPPED_ROWS:
	                line_counter += 1
	                print("illegal line")
	                continue

	            # The text is kept as str: formatting an encoded bytes object
	            # would write its b'...' repr into the corpus on Python 3.
	            text = _clean_text(get_text(row))

	            #some lines need to have 2 dashes, others need 1, this writes each in an appropriate way
	            # The second line of a pair is indented by two spaces so that it
	            # lines up under the first one in the YAML list.
	            if line_counter % 2 == 1:
	                corp.write("  - {}".format(text))
	            else:
	                corp.write("- - {}".format(text))
	            line_counter += 1
	            corp.write("\n")
	            if line_counter == _MAX_ROWS_READ:
	                break

	"""THE FILE CREATED BY THIS CALLED 'movie_lines_corpus.yml' MUST BE MOVED TO THE 'custom' FOLDER IN THE CHATTERBOT LIBRARY FOLDER TO WORK"""