Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
"""this file is used in conjunction with the cornell movie dialogue corpus as mentioned below to make a corpus for chatterbot training data"""
"""NOTE: for this file to work the "movie_lines.txt" file must be placed in the same folder as this file"""
import sqlite3 #standard library
#this file makes use of the Cornell movie lines corpus which can be found at : https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html
"""takes the row and returns the plaintext"""
def get_text(row):
    """Return the dialogue portion of one ``movie_lines.txt`` row.

    Fields in the corpus are separated by the marker ``+++$+++``, which
    contains six "+" characters.  The dialogue is the field after the fourth
    marker, so everything following the 24th "+" in the row is returned
    unchanged (including any leading space and trailing newline).

    Args:
        row: One raw line read from ``movie_lines.txt``.

    Returns:
        The text after the 24th "+" character, or an empty string when the
        row holds fewer than 24 "+" characters (e.g. a blank or malformed
        line) instead of failing with ``IndexError``.
    """
    plusses_left = 24
    # Single pass over the row; the row is sliced once, when the last "+" of
    # the fourth separator has been seen.
    for index, char in enumerate(row):
        if char == "+":
            plusses_left -= 1
            if plusses_left == 0:
                return row[index + 1:]
    return ""
"""the yml file that this file creates requires the first few lines to be in a format similar to, but not the same as the rest of the file, this function creates that
NOTE: because this function opens the file in "write" mode, it deletes any other copy of the corpus in the same folder"""
def make_initial(path="movie_lines_corpus.yml"):
    """Create the corpus file and write its YAML header.

    The header consists of the ``categories`` list (with the single entry
    ``movie_lines``) followed by the ``conversations:`` key under which the
    dialogue pairs are appended afterwards.

    NOTE: the file is opened in "w" mode, so an existing file at ``path`` is
    overwritten.

    Args:
        path: Location of the corpus file to create.  Defaults to
            ``movie_lines_corpus.yml`` in the current working directory.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, "w", encoding="utf-8") as corp:
        corp.write("categories:\n")
        corp.write("- movie_lines\n")
        corp.write("conversations:\n")
# Characters deleted from every line of dialogue because they may cause errors
# during the training process; this comes at the cost of losing punctuation.
_UNSAFE_CHARS = str.maketrans("", "", "-:\"'*\t[]`{}();|!<>")


def _clean_text(text):
    """Return ``text`` stripped of unsafe characters, with tidy whitespace."""
    text = text.translate(_UNSAFE_CHARS)
    # "&quot" is removed only after ";" has been deleted, and a bare "&" only
    # after that, so the entity "&quot;" disappears completely.
    text = text.replace("&quot", "")
    text = text.replace("&", "")
    # Collapse whitespace runs to one space and trim both ends (this also
    # drops the trailing newline); words must stay separated for training.
    return " ".join(text.split())


make_initial()
# Input is decoded as ISO-8859-1, which accepts every byte value, so reading
# cannot fail on non-ASCII dialogue.
# NOTE(review): the corpus encoding is assumed, not verified - confirm.
with open("movie_lines_corpus.yml", "a", encoding="utf-8") as corp, \
        open("movie_lines.txt", encoding="iso-8859-1") as lines:
    line_counter = 0
    for row in lines:
        # The first 16 rows are skipped and only reported on stdout.
        if line_counter < 16:
            line_counter += 1
            print("illegal line")
            continue
        text = _clean_text(get_text(row))  # plain dialogue text
        # Each conversation is a nested YAML list: the first line opens it
        # with "- - ", the second is indented two spaces so that its dash
        # lines up with the inner list.
        if line_counter % 2 == 1:
            prefix = "  - "
        else:
            prefix = "- - "
        corp.write("{}{}\n".format(prefix, text))
        line_counter += 1
        # This value can be changed to alter the size of the corpus; with the
        # 16 skipped rows, 100 yields 84 written lines (42 pairs).
        if line_counter == 100:
            break
"""THE FILE CREATED BY THIS CALLED 'movie_lines_corpus.yml' MUST BE MOVED TO THE 'custom' FOLDER IN THE CHATTERBOT LIBRARY FOLDER TO WORK"""