Skip to content
Permalink
Browse files
Update create_movie_corpus.py
  • Loading branch information
cookf2 committed Nov 29, 2018
1 parent 2059e7b commit a2e1464ad2ce4044b663e4bc816082007ea9743d
Showing 1 changed file with 8 additions and 4 deletions.
@@ -1,5 +1,7 @@
#make corpus
import sqlite3
"""this file is used in conjunction with the cornell movie dialogue corpus as mentioned below to make a corpus for chatterbot training data"""

"""NOTE: for this file to work the "movie_lines.txt" file must be placed in the same folder as this file"""
import sqlite3 #standard library
#this file makes use of the Cornell Movie-Dialogs Corpus, which can be found at: https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html


@@ -12,7 +14,9 @@ def get_text(row):
row = row[1:]
return row

"""the yml file that this file creates requires the first few lines to be in a format similar to, but not the same as the rest of the file, this function creates that"""
"""the yml file that this file creates requires the first few lines to be in a format similar to, but not the same as the rest of the file, this function creates that
NOTE: because this function opens the file in "write" mode, it deletes any existing copy of the corpus in the same folder"""
def make_initial():
with open("movie_lines_corpus.yml",'w') as corp:
corp.write("categories:\n")
@@ -65,7 +69,7 @@ with open("movie_lines_corpus.yml",'a') as corp:
corp.write("- - {}".format(text))
line_counter += 1
corp.write("\n")
if line_counter == 100000: #this value can be changed, this will change the size of the corpus
if line_counter == 100: #this value can be changed, this will change the size of the corpus
break

"""THE FILE CREATED BY THIS CALLED 'movie_lines_corpus.yml' MUST BE MOVED TO THE 'custom' FOLDER IN THE CHATTERBOT LIBRARY FOLDER TO WORK"""

0 comments on commit a2e1464

Please sign in to comment.