training_algorithm.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forward_statistics =['value', 'was_home', 'last_season_position', 'percent_value',\n",
    "       'position rank', 'goals_scored_ex', 'assists_ex', 'total_points_ex',\n",
    "       'minutes_ex', 'goals_conceded_ex', 'creativity_ex', 'influence_ex',\n",
    "       'threat_ex', 'bonus_ex', 'bps_ex', 'ict_index_ex', 'now_cost_ex', 'GW', 'opponent_last_season_position',\n",
    "        'mean assists 3','mean bonus 3', 'mean bps 3','mean creativity 3', 'mean goals_scored 3',\n",
    "       'mean ict_index 3', 'mean influence 3', 'mean minutes 3', 'mean penalties_missed 3',  'mean threat 3',\n",
    "       'mean total_points 3','mean value 3', 'mean match_result 3', 'std bps 3', 'std creativity 3',\n",
    "       'std ict_index 3', 'std influence 3', 'std minutes 3',\n",
    "       'std threat 3', 'std total_points 3', 'std value 3']\n",
    "\n",
    "leak_columns = [\n",
    "    \"name\",\n",
    "    \"team\",\n",
    "] \n",
    "\n",
    "\n",
    "dropped_columns = [\n",
    "    \"season\",\n",
    "    \"opponent\",\n",
    "    \"match_result\",\n",
    " \n",
    "    \"assists\",\n",
    "    \"penalties_missed\",\n",
    "    \"bonus\",\n",
    "    \"bps\",\n",
    "    \"clean_sheets\",\n",
    "    \"creativity\",\n",
    "    \"goals_conceded\",\n",
    "    \"goals_scored\",\n",
    "    \"ict_index\",\n",
    "    \"influence\",\n",
    "    \"own_goals\",\n",
    "    \"penalties_saved\",\n",
    "    \"red_cards\",\n",
    "    \"saves\",\n",
    "    \"selected\",\n",
    "    \"threat\",\n",
    "    \"transfers_balance\",\n",
    "    \"transfers_in\",\n",
    "    \"transfers_out\",\n",
    "    \"yellow_cards\",\n",
    "    \"team Goal scored\",\n",
    "    \"team Goal conceded\"\n",
    "] \n",
    "\n",
    "midfielder_statistics =['value', 'was_home', 'last_season_position', 'percent_value',\n",
    "       'position rank', 'goals_scored_ex', 'assists_ex', 'total_points_ex',\n",
    "       'minutes_ex', 'goals_conceded_ex', 'creativity_ex', 'influence_ex',\n",
    "       'threat_ex', 'bonus_ex', 'bps_ex', 'ict_index_ex', 'now_cost_ex', 'GW', 'opponent_last_season_position',\n",
    "        'mean assists 3','mean bonus 3', 'mean bps 3','mean creativity 3', 'mean goals_scored 3',\n",
    "       'mean ict_index 3', 'mean influence 3', 'mean minutes 3', 'mean penalties_missed 3',  'mean threat 3',\n",
    "       'mean total_points 3','mean value 3', 'mean match_result 3', 'std bps 3', 'std creativity 3',\n",
    "       'std ict_index 3', 'std influence 3', 'std minutes 3',\n",
    "       'std threat 3', 'std total_points 3', 'std value 3']\n",
    "\n",
    "goalkeeper_statistics = ['value', 'was_home', 'last_season_position', 'percent_value',\n",
    "       'position rank', 'total_points_ex', 'minutes_ex', 'goals_conceded_ex',\n",
    "       'bonus_ex', 'bps_ex', 'ict_index_ex', 'clean_sheets_ex',\n",
    "       'red_cards_ex', 'now_cost_ex', 'GW', 'opponent_last_season_position',\n",
    "       'mean bonus 3', 'mean bps 3', 'mean clean_sheets 3', 'mean goals_conceded 3',\n",
    "       'mean ict_index 3',  'mean minutes 3',\n",
    "       'mean own_goals 3',  'mean penalties_saved 3',\n",
    "        'mean saves 3',  'mean threat 3',\n",
    "       'mean total_points 3',\n",
    "       'mean value 3', 'mean match_result 3', 'std bps 3',\n",
    "       'std ict_index 3', 'std influence 3', 'std minutes 3',\n",
    "       'std threat 3', 'std total_points 3', 'std value 3']\n",
    "\n",
    "statistics =['value', 'position','was_home', 'last_season_position', 'percent_value',\n",
    "       'position rank', 'goals_scored_ex', 'assists_ex', 'total_points_ex',\n",
    "       'minutes_ex', 'goals_conceded_ex', 'creativity_ex', 'influence_ex',\n",
    "       'threat_ex', 'bonus_ex', 'bps_ex', 'ict_index_ex', 'clean_sheets_ex',\n",
    "       'yellow_cards_ex','now_cost_ex', 'GW', 'opponent_last_season_position', 'mean assists 3',\n",
    "       'mean bonus 3', 'mean bps 3', 'mean clean_sheets 3',\n",
    "       'mean creativity 3', 'mean goals_conceded 3', 'mean goals_scored 3',\n",
    "       'mean ict_index 3', 'mean influence 3', 'mean minutes 3',\n",
    "       'mean own_goals 3',\n",
    "       'mean red_cards 3',  'mean threat 3','mean total_points 3',\n",
    "       'mean value 3', 'mean match_result 3', 'std bps 3', 'std creativity 3',\n",
    "       'std ict_index 3', 'std influence 3', 'std minutes 3',\n",
    "       'std threat 3', 'std total_points 3', 'std value 3','mean saves 3','mean assists all',\n",
    "       'mean bonus all', 'mean bps all', 'mean clean_sheets all',\n",
    "       'mean creativity all', 'mean goals_conceded all', 'mean goals_scored all',\n",
    "       'mean ict_index all', 'mean influence all', 'mean minutes all',\n",
    "       'mean own_goals all',\n",
    "       'mean red_cards all',  'mean threat all','mean total_points all',\n",
    "       'mean value all', 'mean match_result all',\n",
    "       'mean team Goal scored 3','mean team Goal scored all','mean team Goal conceded 3','mean team Goal conceded all',\"ratio_goal_scored all\",\"ratio_goal_scored 3\",\n",
    "       'opp mean team Goal scored 3','opp mean team Goal conceded 3','opp mean team Goal scored all','opp mean team Goal conceded all',\"opp mean match_result all\"]\n",
    "\n",
    "\n",
    "date_cols=[\"day_of week\",\"month\",\"hour\",\"week\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_minutes(val):\n",
    "    if val > 10:\n",
    "        return 1\n",
    "    else:\n",
    "        return 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pip install catboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from hyperopt import tpe,hp,fmin,STATUS_OK,Trials\n",
    "from hyperopt.pyll.base import scope"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import warnings\n",
    "import os\n",
    "\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.ensemble import (\n",
    "    RandomForestClassifier,\n",
    "    RandomForestRegressor,\n",
    "    GradientBoostingRegressor,\n",
    ")\n",
    "from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.linear_model import LinearRegression, Lasso, Ridge\n",
    "from sklearn.metrics import (\n",
    "    mean_squared_error,\n",
    "    mean_absolute_error,\n",
    "    confusion_matrix,\n",
    "    accuracy_score,\n",
    "    f1_score,\n",
    ")\n",
    "from lightgbm import LGBMRegressor, LGBMClassifier\n",
    "from catboost import CatBoostClassifier, CatBoostRegressor\n",
    "from sklearn.model_selection import StratifiedKFold,KFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gameweek=26"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "train = pd.read_csv(\"C:\\\\Users\\\\prane\\\\Downloads\\\\FPL\\\\GW_PointsPredictor\\\\Cleaned_Data\\\\cleaned_previous_seasons.csv\", index_col=0)\n",
    "\n",
    "\n",
    "old_gameweek_paths = [f\"C:\\\\Users\\\\prane\\\\Downloads\\\\FPL\\\\GW_PointsPredictor\\\\Cleaned_Data\\\\2023-24\\\\GW{i}.csv\" for i in range(1, gameweek)]\n",
    "old_gameweek_cleaned = [pd.read_csv(path) for path in old_gameweek_paths]\n",
    "old_gameweeks = pd.concat(old_gameweek_cleaned, ignore_index=True)\n",
    "\n",
    "# Ensure old gameweeks data has the same columns as the initial training data\n",
    "old_gameweeks = old_gameweeks[train.columns]\n",
    "\n",
    "# Combine the initial training data with the old gameweeks data\n",
    "train = pd.concat([train, old_gameweeks], ignore_index=True)\n",
    "\n",
    "# Load data for the current gameweek to be used for prediction\n",
    "test = pd.read_csv(f\"C:\\\\Users\\\\prane\\\\Downloads\\\\FPL\\\\GW_PointsPredictor\\\\Cleaned_Data\\\\2023-24\\\\GW{gameweek}.csv\", index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os \n",
    "directory_path = f\"C:\\\\Users\\\\prane\\\\Downloads\\\\FPL\\\\GW_PointsPredictor\\\\Predicted_outcomes\"\n",
    "\n",
    "if not os.path.exists(directory_path):\n",
    "    os.makedirs(directory_path, exist_ok=True)\n",
    "    print(f\"Directory {directory_path} created!\")\n",
    "else:\n",
    "    print(f\"Directory {directory_path} already exists.\")  \n",
    "   \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train[\"position\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train[\"position\"]=train[\"position\"].replace({\"GKP\":\"GK\" })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Function to preprocess and feature engineer the dataset\n",
    "def preprocess_and_feature_engineer(df):\n",
    "    df[\"index\"] = df[\"name\"] + df[\"kickoff_time\"].astype(\"str\")\n",
    "    df.drop_duplicates(\"index\", keep=\"last\", inplace=True)\n",
    "    df = df.set_index(\"index\")\n",
    "    df[\"date\"] = pd.to_datetime(df[\"kickoff_time\"], utc=True, format='ISO8601')\n",
    "    df[\"day_of_week\"] = df[\"date\"].dt.day_name()\n",
    "    df[\"month\"] = df[\"date\"].dt.month\n",
    "    df[\"hour\"] = df[\"date\"].dt.hour\n",
    "    df[\"week\"] = df[\"date\"].dt.isocalendar().week\n",
    "    df.drop([\"kickoff_time\", \"date\"], axis=1, inplace=True)\n",
    "    return df\n",
    "\n",
    "# Function to convert categorical columns to numerical\n",
    "def convert_categorical_to_numerical(df, categorical_columns):\n",
    "    for col in df.columns:\n",
    "        if df[col].dtype == \"object\" and col in categorical_columns:\n",
    "            df[col], _ = pd.factorize(df[col])\n",
    "    return df\n",
    "\n",
    "# Function to handle missing columns\n",
    "def add_missing_columns(df, expected_columns):\n",
    "    missing_columns = [col for col in expected_columns if col not in df.columns]\n",
    "    print(\"Missing columns:\", missing_columns)\n",
    "    for col in missing_columns:\n",
    "        df[col] = 0\n",
    "    return df\n",
    "\n",
    "# Applying the preprocessing steps\n",
    "train = preprocess_and_feature_engineer(train)\n",
    "test = preprocess_and_feature_engineer(test)\n",
    "\n",
    "# Ensure test data has the same columns as train data\n",
    "test = test[train.columns]\n",
    "\n",
    "# Copy datasets before further modification (if necessary)\n",
    "train_copy = train.copy()\n",
    "test_copy = test.copy()\n",
    "\n",
    "# Apply conversions\n",
    "train[\"minutes\"] = train[\"minutes\"].apply(convert_minutes)\n",
    "train = convert_categorical_to_numerical(train, [\"team\", \"name\", \"position\"])\n",
    "test = convert_categorical_to_numerical(test, [\"team\", \"name\", \"position\"])\n",
    "\n",
    "# Update target and drop unused columns\n",
    "target = train[[\"minutes\", \"GW\", \"position\"]]\n",
    "train.drop([\"total_points\", \"minutes\"], axis=1, inplace=True)\n",
    "test.drop([\"total_points\", \"minutes\"], axis=1, inplace=True)\n",
    "\n",
    "# Handle dropped columns\n",
    "dropped_columns = ['team Goal scored', 'team Goal conceded']\n",
    "train.drop(dropped_columns, axis=1, inplace=True, errors='ignore')\n",
    "test.drop(dropped_columns, axis=1, inplace=True, errors='ignore')\n",
    "\n",
    "# Convert 'position' to categorical\n",
    "train[\"position\"] = train[\"position\"].astype(\"category\")\n",
    "test[\"position\"] = test[\"position\"].astype(\"category\")\n",
    "\n",
    "# Replace values in 'was_home'\n",
    "train[\"was_home\"] = train[\"was_home\"].replace({True: 0, False: 1})\n",
    "test[\"was_home\"] = test[\"was_home\"].replace({True: 0, False: 1})\n",
    "\n",
    "# Ensure all expected columns are present\n",
    "train = add_missing_columns(train, expected_columns)\n",
    "test = add_missing_columns(test, expected_columns)\n",
    "\n",
    "# Select relevant columns\n",
    "train = train[statistics + leak_columns + date_cols]\n",
    "test = test[statistics + leak_columns + date_cols]\n",
    "\n",
    "# Splitting the dataset for training\n",
    "x, val, y, y_val = train_test_split(\n",
    "    train.drop(leak_columns, axis=1),\n",
    "    target[\"minutes\"],  # Change to \"total_points\" as needed\n",
    "    test_size=0.1,\n",
    "    random_state=0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "params={'colsample_bylevel': 0.8070621518153563, 'learning_rate': 0.04765984972709895, 'max_depth': 7, 'reg_lambda': 5, 'scale_pos_weight': 2.5,'subsample': 0.6794390204583894}\n",
    "model=CatBoostClassifier(**params,cat_features=[\"position\"],random_state=0,early_stopping_rounds=500,use_best_model=True,verbose=500,n_estimators=10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit(x, y,eval_set=[(val,y_val)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(confusion_matrix(model.predict(val), y_val))\n",
    "print(\n",
    "    f\"starting Accuracy score {accuracy_score(model.predict(val), y_val)}\"\n",
    ")\n",
    "\n",
    "print(\n",
    "    f\" starting f1 score: {f1_score(model.predict(val), y_val)}\"\n",
    ")\n",
    "\n",
    "feature_importance = pd.DataFrame(\n",
    "    {\"column\": x.columns, \"imp\": model.feature_importances_}\n",
    ").sort_values(\n",
    "    \"imp\", ascending=False\n",
    ")  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_importance.head(50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_importance.tail(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_val"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for col in ['value', 'percent_value']:\n",
    "    if col in data_for_prediction.columns:\n",
    "        data_for_prediction[col] = pd.to_numeric(data_for_prediction[col], errors='coerce')\n",
    "        data_for_prediction[col].fillna(data_for_prediction[col].mean(), inplace=True)\n",
    "\n",
    "\n",
    "for col in ['month', 'hour', 'week']:\n",
    "    if col in data_for_prediction.columns:\n",
    "       \n",
    "        data_for_prediction[col] = data_for_prediction[col].astype(float)\n",
    "        \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predicted_minutes=model.predict(val)\n",
    "val_=pd.DataFrame({\"ind\":val.index,\"actul_minutes\":y_val,\"predicted_minutes\":predicted_minutes,\"position\":val[\"position\"]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"STARTING GOALKEEPERS PERFORMANCE!!!\")\n",
    "a=val_[val_[\"position\"]==\"GK\"][\"actul_minutes\"]\n",
    "b=val_[val_[\"position\"]==\"GK\"][\"predicted_minutes\"]\n",
    "print(confusion_matrix(a,b))\n",
    "print(\n",
    "    f\"starting Accuracy score {accuracy_score(a,b)}\"\n",
    ")\n",
    "\n",
    "print(\n",
    "    f\" starting f1 score: {f1_score(a,b)}\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"STARTING DEFENDERS PERFORMANCE!!!\")\n",
    "a=val_[val_[\"position\"]==\"DEF\"][\"actul_minutes\"]\n",
    "b=val_[val_[\"position\"]==\"DEF\"][\"predicted_minutes\"]\n",
    "print(confusion_matrix(a,b))\n",
    "print(\n",
    "    f\"starting Accuracy score {accuracy_score(a,b)}\"\n",
    ")\n",
    "\n",
    "print(\n",
    "    f\" starting f1 score: {f1_score(a,b)}\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"STARTING MIDFIELDERS PERFORMANCE!!!\")\n",
    "a=val_[val_[\"position\"]==\"MID\"][\"actul_minutes\"]\n",
    "b=val_[val_[\"position\"]==\"MID\"][\"predicted_minutes\"]\n",
    "print(confusion_matrix(a,b))\n",
    "print(\n",
    "    f\"starting Accuracy score {accuracy_score(a,b)}\"\n",
    ")\n",
    "\n",
    "print(\n",
    "    f\" starting f1 score: {f1_score(a,b)}\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"STARTING FORWARDS PERFORMANCE!!!\")\n",
    "a=val_[val_[\"position\"]==\"FWD\"][\"actul_minutes\"]\n",
    "b=val_[val_[\"position\"]==\"FWD\"][\"predicted_minutes\"]\n",
    "print(confusion_matrix(a,b))\n",
    "print(\n",
    "    f\"starting Accuracy score {accuracy_score(a,b)}\"\n",
    ")\n",
    "\n",
    "print(\n",
    "    f\" starting f1 score: {f1_score(a,b)}\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_copy[test_copy[\"minutes\"]==1][\"position\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "train = train_copy[train_copy[\"minutes\"] > 0]\n",
    "test = test_copy[test_copy[\"minutes\"] > 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ensure test data has the same columns as train data\n",
    "test = test[train.columns]\n",
    "\n",
    "# Re-apply preprocessing steps with refactored functions\n",
    "train = preprocess_and_feature_engineer(train)\n",
    "test = preprocess_and_feature_engineer(test)\n",
    "\n",
    "# Apply conversions\n",
    "train[\"minutes\"] = train[\"minutes\"].apply(convert_minutes)  # Only if needed, depends on context\n",
    "train = convert_categorical_to_numerical(train, [\"team\", \"name\", \"position\"])\n",
    "test = convert_categorical_to_numerical(test, [\"team\", \"name\", \"position\"])\n",
    "\n",
    "# Update target for total_points and adjust dataset accordingly\n",
    "target = train[[\"total_points\", \"GW\", \"position\"]]  # Adjusted to use \"total_points\" as target\n",
    "train.drop([\"total_points\", \"minutes\"], axis=1, inplace=True)  # Remove target and unused columns\n",
    "test.drop([\"total_points\", \"minutes\"], axis=1, inplace=True)  # Ensure consistency with train\n",
    "\n",
    "# Dropping columns not needed in the model and handling categorical data\n",
    "dropped_columns = ['team Goal scored', 'team Goal conceded']\n",
    "train.drop(dropped_columns, axis=1, inplace=True, errors='ignore')\n",
    "test.drop(dropped_columns, axis=1, inplace=True, errors='ignore')\n",
    "\n",
    "# Handle 'position' as categorical and 'was_home' replacement\n",
    "train[\"position\"] = train[\"position\"].astype(\"category\")\n",
    "test[\"position\"] = test[\"position\"].astype(\"category\")\n",
    "train[\"was_home\"] = train[\"was_home\"].replace({True: 0, False: 1})\n",
    "test[\"was_home\"] = test[\"was_home\"].replace({True: 0, False: 1})\n",
    "\n",
    "expected_columns = statistics + leak_columns + date_cols\n",
    "\n",
    "# Ensure all expected columns are present and select relevant columns for modeling\n",
    "train = add_missing_columns(train, expected_columns)\n",
    "test = add_missing_columns(test, expected_columns)\n",
    "train = train[statistics + leak_columns + date_cols]\n",
    "test = test[statistics + leak_columns + date_cols]\n",
    "\n",
    "# Splitting the dataset for training with \"total_points\" as the target\n",
    "x, val, y, y_val = train_test_split(\n",
    "    train.drop(leak_columns, axis=1),\n",
    "    target[\"total_points\"],  # Updated target\n",
    "    test_size=0.1,\n",
    "    random_state=0,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import KFold\n",
    "\n",
    "#cross_validator to splite the data into folds\n",
    "folds=KFold(n_splits=8,shuffle=True,random_state=0)\n",
    "\n",
    "#a dataframe to store the predictions made by each fold\n",
    "predictions_df=pd.DataFrame()\n",
    "\n",
    "#list to save the mean absolute errors from validatingon each folds\n",
    "rmse_val=[]\n",
    "rmse_X=[]\n",
    "\n",
    "#a simple catboost regressor\n",
    "model=LGBMRegressor(**{'colsample_bytree': 0.4199299182268318, 'learning_rate': 0.0032874466037521254, 'max_depth': 9, 'min_split_gain': 0.5685369160138952, 'num_leaves': 99, 'reg_alpha': 0.5621526419488447, 'reg_lambda': 0, 'subsample': 0.6534153111773866}, verbose=-50,random_state=0,early_stopping_rounds=200,n_estimators=10000)\n",
    "\n",
    "#train, make predictions and check the validation accuracy on  each fold\n",
    "for i,(train_index,test_index) in enumerate(folds.split(train.drop(leak_columns, axis=1),target[\"total_points\"])):\n",
    "    train_fold=train.drop(leak_columns, axis=1).iloc[train_index]\n",
    "    val_fold=train.drop(leak_columns, axis=1).iloc[test_index]\n",
    "    y_fold=target[\"total_points\"].iloc[train_index]\n",
    "    y_val_fold=target[\"total_points\"].iloc[test_index]\n",
    "\n",
    "\n",
    "    model.fit(train_fold,y_fold,eval_set=[(val_fold,y_val_fold)])\n",
    "    print(i+1)\n",
    "    prediction=model.predict(test.drop(leak_columns, axis=1))\n",
    "    predictions_df[i]=prediction\n",
    "    rmse_val.append(mean_squared_error(model.predict(val_fold),y_val_fold,squared=False))\n",
    "    rmse_X.append(mean_squared_error(model.predict(train_fold),y_fold,squared=False))\n",
    "print(rmse_val)\n",
    "print(rmse_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(np.mean(rmse_val))\n",
    "print(np.mean(rmse_X))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions_df[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[\"points\"]=np.mean(predictions_df, axis=1).values\n",
    "\n",
    "test[leak_columns + [\"points\", \"value\"]].sort_values(\n",
    "    \"points\", ascending=False\n",
    ").to_csv(\"points.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[test[\"position\"]==\"MID\"].sort_values(by=\"points\",ascending=False).head(5)[[\"name\",\"points\",\"team\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[test[\"position\"]==\"DEF\"].sort_values(by=\"points\",ascending=False).head(5)[[\"name\",\"points\",\"team\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[test[\"position\"]==\"GKP\"].sort_values(by=\"points\",ascending=False).head(5)[[\"name\",\"points\",\"team\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[test[\"position\"]==\"FWD\"].sort_values(by=\"points\",ascending=False).head(5)[[\"name\",\"points\",\"team\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[\"points\"].sort_values(ascending=False).head(50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "feature_importance = pd.DataFrame(\n",
    "    {\"column\": x.columns, \"imp\": model.feature_importances_}\n",
    ").sort_values(\n",
    "    \"imp\", ascending=False\n",
    ")  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_copy[test_copy[\"position\"]==\"DEF\"][[\"name\",\"team\",\"minutes\"]].to_csv(f\"C:\\\\Users\\\\prane\\\\Downloads\\\\FPL\\\\GW_PointsPredictor\\\\Predicted_outcomes\\\\GW{gameweek}\\\\defenders_minutes.csv\")\n",
    "test_copy[test_copy[\"position\"]==\"GKP\"][[\"name\",\"team\",\"minutes\"]].to_csv(f\"C:\\\\Users\\\\prane\\\\Downloads\\\\FPL\\\\GW_PointsPredictor\\\\Predicted_outcomes\\\\GW{gameweek}\\\\goalkeepers_minutes.csv\")\n",
    "test_copy[test_copy[\"position\"]==\"MID\"][[\"name\",\"team\",\"minutes\"]].to_csv(f\"C:\\\\Users\\\\prane\\\\Downloads\\\\FPL\\\\GW_PointsPredictor\\\\Predicted_outcomes\\\\GW{gameweek}\\\\midfielders_minutes.csv\")\n",
    "test_copy[test_copy[\"position\"]==\"FWD\"][[\"name\",\"team\",\"minutes\"]].to_csv(f\"C:\\\\Users\\\\prane\\\\Downloads\\\\FPL\\\\GW_PointsPredictor\\\\Predicted_outcomes\\\\GW{gameweek}\\\\forwards_minutes.csv\")\n",
    "test[test[\"position\"]==\"DEF\"][[\"name\",\"team\",\"points\",\"value\"]].to_csv(f\"C:\\\\Users\\\\prane\\\\Downloads\\\\FPL\\\\GW_PointsPredictor\\\\Predicted_outcomes\\\\GW{gameweek}\\\\defenders_points.csv\")\n",
    "test[test[\"position\"]==\"GKP\"][[\"name\",\"team\",\"points\",\"value\"]].to_csv(f\"C:\\\\Users\\\\prane\\\\Downloads\\\\FPL\\\\GW_PointsPredictor\\\\Predicted_outcomes\\\\GW{gameweek}\\\\goalkeepers_points.csv\")\n",
    "test[test[\"position\"]==\"MID\"][[\"name\",\"team\",\"points\",\"value\"]].to_csv(f\"C:\\\\Users\\\\prane\\\\Downloads\\\\FPL\\\\GW_PointsPredictor\\\\Predicted_outcomes\\\\GW{gameweek}\\\\midfielders_points.csv\")\n",
    "test[test[\"position\"]==\"FWD\"][[\"name\",\"team\",\"points\",\"value\"]].to_csv(f\"C:\\\\Users\\\\prane\\\\Downloads\\\\FPL\\\\GW_PointsPredictor\\\\Predicted_outcomes\\\\GW{gameweek}\\\\forwards_points.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "4.4.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}