"""Preprocessing helpers that turn a table of chess games into model inputs.

The two entry points, :func:`mate_games` and :func:`draw_and_mate_games`,
filter the table, derive opening / time-control / capture-count features and
return a ``(features, target)`` pair where the target is the encoded
``winner`` column.

The input table must provide the columns ``id``, ``rated``, ``created_at``,
``last_move_at``, ``turns``, ``victory_status``, ``winner``,
``increment_code``, ``white_id``, ``black_id``, ``moves``, ``opening_eco``
and ``opening_name``; any other columns are passed through as features.
"""
# Provenance: recovered from git patch 181075c056b42c0fa3c3866f4d61247fba85229b
# by Boyan-Yordanov, Sat, 4 Mar 2023 16:18:01 +0000, subject
# "encapsulated the preprocessing into usefull functions" (new file
# preprocess.py, 90 insertions).

import warnings

import numpy as np
import pandas as pd

# No function in this module uses the plotting libraries; they are kept so
# that the names ``plt`` and ``sns`` remain available to importers, but made
# optional so the preprocessing helpers still import in a minimal environment.
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None
try:
    import seaborn as sns
except ImportError:
    sns = None

warnings.filterwarnings('ignore')


def extract_games(df):
    """Return the items yielded by iterating *df* as a list.

    Callers in this module pass the ``moves`` column, so each item is one
    game's move string.
    """
    return list(df)


def extract_moves(games):
    """Split each game's move string into white's and black's moves.

    Args:
        games: iterable of move strings, moves separated by spaces (commas
            are accepted as separators too).

    Returns:
        ``(all_white_moves, all_black_moves)``: two lists holding exactly one
        list of moves per input game, in input order. White owns the
        even-positioned moves (0, 2, ...) and black the odd-positioned ones.
    """
    all_white_moves = []
    all_black_moves = []
    for game in games:
        # Treat commas like spaces so that every game yields exactly ONE
        # entry per colour; callers attach the results back to the games
        # row by row and depend on that one-to-one correspondence.
        moves = game.replace(",", " ").split()
        all_white_moves.append(moves[0::2])
        all_black_moves.append(moves[1::2])
    return all_white_moves, all_black_moves


def number_of_takes(player_games):
    """Count captures per game.

    Args:
        player_games: iterable of games, each an iterable of move strings.

    Returns:
        A list with, for every game, the total number of ``"x"`` characters
        (the capture marker in algebraic notation) across its moves.
    """
    return [sum(move.count("x") for move in game) for game in player_games]


def _split_pair(column, separator):
    """Split each string once on *separator*; always return columns 0 and 1."""
    # ``n`` must be passed by keyword: it is keyword-only in pandas 2.x.
    parts = column.str.split(separator, n=1, expand=True)
    # When no value contains the separator pandas emits a single column;
    # reindex so the second part exists (as missing values) regardless.
    return parts.reindex(columns=[0, 1])


def _build_features(chess_data, keep_status, encoded_columns,
                    drop_victory_status):
    """Shared pipeline behind ``mate_games`` and ``draw_and_mate_games``.

    Args:
        chess_data: the raw games table (not modified).
        keep_status: boolean Series aligned with *chess_data* selecting the
            rows whose ``victory_status`` should be kept.
        encoded_columns: columns to replace by integer label codes, in order.
        drop_victory_status: whether ``victory_status`` is removed from the
            features.

    Returns:
        ``(x, y)``: the feature DataFrame (everything except ``winner``) and
        the encoded ``winner`` Series, both with a fresh 0..n-1 index.
    """
    # Imported here, as in the original functions, so that importing this
    # module does not itself require scikit-learn.
    from sklearn.preprocessing import LabelEncoder

    # Keep rated games with a wanted ending and more than 4 turns.
    keep = (
        chess_data["rated"].ne(False)
        & keep_status
        & chess_data["turns"].gt(4)
    )
    # Work on an explicit copy so the caller's frame is never touched and no
    # in-place edits are made on a slice.
    frame = chess_data[keep].drop_duplicates(subset=["id"]).copy()

    # "Opening: Variation" -> the part before / after the first ": ".
    openings = _split_pair(frame["opening_name"], ": ")
    frame["whites_opening"] = openings[0]
    frame["blacks_opening"] = openings[1]

    # "<time_limit>+<increment>" -> two integer columns.
    clock = _split_pair(frame["increment_code"], "+").astype("int")
    frame["time_limit"] = clock[0]
    frame["increment"] = clock[1]

    dropped = ["id", "rated", "white_id", "black_id",
               "opening_name", "increment_code"]
    if drop_victory_status:
        dropped.append("victory_status")
    frame = frame.drop(columns=dropped)

    for column_name in encoded_columns:
        frame[column_name] = LabelEncoder().fit_transform(frame[column_name])

    # Capture counts are assigned positionally (plain lists, one entry per
    # row). Concatenating separately indexed frames along axis=1 would align
    # on index labels instead, and after filtering those labels are no longer
    # 0..n-1, so the counts would land on the wrong games or become NaN.
    white_moves, black_moves = extract_moves(extract_games(frame["moves"]))
    frame["white_took"] = number_of_takes(white_moves)
    frame["black_took"] = number_of_takes(black_moves)

    frame = frame.drop(columns=["moves", "last_move_at", "created_at"])
    frame = frame[frame["turns"] < 200].reset_index(drop=True)

    # Select the target by name: its position depends on which columns were
    # dropped above, so a fixed positional index is not reliable.
    x = frame.drop(columns=["winner"])
    y = frame["winner"]
    return x, y


def mate_games(chess_data):
    """Build features/target from rated games that ended in checkmate.

    Keeps games with ``rated != False``, ``victory_status == "mate"`` and
    ``4 < turns < 200``, de-duplicated on ``id``. ``victory_status`` is
    dropped; ``winner``, ``whites_opening``, ``blacks_opening`` and
    ``opening_eco`` are label-encoded.

    Args:
        chess_data: DataFrame of games (see the module docstring for the
            required columns). It is not modified.

    Returns:
        ``(x, y)``: feature DataFrame without ``winner``, and the encoded
        ``winner`` Series.
    """
    return _build_features(
        chess_data,
        keep_status=chess_data["victory_status"].eq("mate"),
        encoded_columns=["winner", "whites_opening", "blacks_opening",
                         "opening_eco"],
        drop_victory_status=True,
    )


def draw_and_mate_games(chess_data):
    """Build features/target from rated games not ended by timeout or resign.

    Keeps games with ``rated != False``, ``victory_status`` different from
    ``"outoftime"`` and ``"resign"``, and ``4 < turns < 200``, de-duplicated
    on ``id``. ``victory_status`` is kept as a label-encoded feature alongside
    the encoded ``winner``, ``whites_opening``, ``blacks_opening`` and
    ``opening_eco``.

    Args:
        chess_data: DataFrame of games (see the module docstring for the
            required columns). It is not modified.

    Returns:
        ``(x, y)``: feature DataFrame without ``winner``, and the encoded
        ``winner`` Series.
    """
    return _build_features(
        chess_data,
        keep_status=~chess_data["victory_status"].isin(
            ["outoftime", "resign"]),
        encoded_columns=["winner", "whites_opening", "blacks_opening",
                         "opening_eco", "victory_status"],
        drop_victory_status=False,
    )