Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Python_Telegram_Chat_bot/create_movie_corpus.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
75 lines (63 sloc)
3.32 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""this file is used in conjunction with the cornell movie dialogue corpus as mentioned below to make a corpus for chatterbot training data""" | |
"""NOTE: for this file to work the "movie_lines.txt" file must be placed in the same folder as this file""" | |
import sqlite3 #standard library | |
#this file makes use of the Cornell movie lines corpus which can be found at : https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html | |
"""takes the row and returns the plaintext""" | |
def get_text(row):
    """Return the dialogue text that follows the metadata in a corpus row.

    In the movie corpus the metadata fields and the text are separated by
    marker strings that each contain six "+" symbols, so the text starts
    right after the 24th "+" of the row.

    Args:
        row: One raw line of the corpus file.

    Returns:
        Everything after the 24th "+" in ``row``, unmodified (surrounding
        whitespace and the trailing newline are kept). An empty string is
        returned when the row holds fewer than 24 "+" characters, instead of
        failing with an IndexError on the exhausted string.
    """
    plusses_seen = 0
    # Single pass over the row; avoids re-slicing the string per character.
    for index, char in enumerate(row):
        if char == "+":
            plusses_seen += 1
            if plusses_seen == 24:
                return row[index + 1:]
    # Malformed or empty row: there is no text section to return.
    return ""
"""the yml file that this file creates requires the first few lines to be in a format similar to, but not the same as the rest of the file, this function creates that | |
NOTE: as this function opens the file in "write" mode, it deletes any other copy of the corpus in the same folder"""
def make_initial():
    """Create "movie_lines_corpus.yml" holding only the corpus header lines.

    The file is opened in write mode, so an existing file with the same name
    in the current working directory is overwritten.
    """
    header_lines = (
        "categories:\n",
        "- movie_lines\n",
        "conversations:\n",
    )
    with open("movie_lines_corpus.yml", "w") as out_file:
        out_file.writelines(header_lines)
# Characters removed from every line because they may cause errors during the
# training process; this comes at the cost of most punctuation being lost.
# The double quote is listed once even though the original code stripped it
# twice.
# NOTE(review): the plain space is part of this set because the original code
# replaced " " with "", which joins all words of a line together - confirm
# whether a run of several spaces was intended instead.
_STRIPPED_CHARS = "-:\"'*\t []`{}();|!<>&"
_STRIP_TABLE = str.maketrans("", "", _STRIPPED_CHARS)

# Start from a fresh corpus file that contains only the header.
make_initial()

# The corpus is written as UTF-8 text. Bytes of the input that cannot be
# decoded are dropped, in keeping with the clean-up applied to every line.
with open("movie_lines_corpus.yml", "a", encoding="utf-8") as corp:
    with open("movie_lines.txt", encoding="utf-8", errors="ignore") as lines:
        line_counter = 0
        for row in lines:
            # The first 16 rows of the input are skipped.
            if line_counter < 16:
                line_counter += 1
                print("illegal line")
                continue

            # Plain text of the row with the problematic characters and the
            # surrounding whitespace (including the newline) removed. The
            # text stays a str: formatting encoded bytes would write their
            # "b'...'" representation into the corpus.
            text = get_text(row).translate(_STRIP_TABLE).strip()

            # An even counter opens a new conversation ("- - "), an odd one
            # continues it; the continuation needs two leading spaces so it
            # lines up with the nested YAML sequence.
            prefix = "  - " if line_counter % 2 == 1 else "- - "
            corp.write("{}{}\n".format(prefix, text))
            line_counter += 1

            # This value can be changed to change the size of the corpus.
            if line_counter == 100:
                break
"""THE FILE CREATED BY THIS CALLED 'movie_lines_corpus.yml' MUST BE MOVED TO THE 'custom' FOLDER IN THE CHATTERBOT LIBRARY FOLDER TO WORK""" |