Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np \n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.metrics import confusion_matrix\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from imblearn.over_sampling import RandomOverSampler"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#-----------utility functions---------"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"# Random search function\n",
"def rand_search(model, params, n_iter, X_train, y_train):\n",
" random_search = RandomizedSearchCV(model, params, cv=5,n_jobs=-1, scoring='accuracy', n_iter=n_iter, return_train_score=False)\n",
" random_result = random_search.fit(X_train, y_train)\n",
" print(pd.DataFrame(random_search.cv_results_)[['mean_test_score','params']])\n",
" print(\"Best model accuracy: {:.3f}\".format(random_result.best_score_))\n",
" print('Using', random_result.best_params_)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"##--combine season data\n",
"prem10 = pd.read_csv(\"data/10:11.csv\")\n",
"prem11 = pd.read_csv(\"data/11:12.csv\")\n",
"prem12 = pd.read_csv(\"data/12:13.csv\")\n",
"prem13 = pd.read_csv(\"data/13:14.csv\")\n",
"prem14 = pd.read_csv(\"data/14:15.csv\")\n",
"prem15 = pd.read_csv(\"data/15:16.csv\")\n",
"prem16 = pd.read_csv(\"data/16:17.csv\")\n",
"prem17 = pd.read_csv(\"data/17:18.csv\")\n",
"prem18 = pd.read_csv(\"data/18:19.csv\")\n",
"prem19 = pd.read_csv(\"data/19:20.csv\")\n",
"prem20 = pd.read_csv(\"data/20:21.csv\")\n",
"prem21 = pd.read_csv(\"data/21:22.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"dfAllSeasons = pd.concat([prem10, prem11, prem12, prem13,\n",
"prem14, prem15, prem16, prem17,prem18, prem19, prem20\n",
"], axis=0)\n",
"dfAllSeasons.reset_index(drop=True, inplace=True) #reset index"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"dfAllSeasons.reset_index(drop=True, inplace=True) #reset index"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Div</th>\n <th>Date</th>\n <th>HomeTeam</th>\n <th>AwayTeam</th>\n <th>FTHG</th>\n <th>FTAG</th>\n <th>FTR</th>\n <th>HTHG</th>\n <th>HTAG</th>\n <th>HTR</th>\n <th>...</th>\n <th>AvgC&lt;2.5</th>\n <th>AHCh</th>\n <th>B365CAHH</th>\n <th>B365CAHA</th>\n <th>PCAHH</th>\n <th>PCAHA</th>\n <th>MaxCAHH</th>\n <th>MaxCAHA</th>\n <th>AvgCAHH</th>\n <th>AvgCAHA</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>E0</td>\n <td>14/08/10</td>\n <td>Aston Villa</td>\n <td>West Ham</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>1</th>\n <td>E0</td>\n <td>14/08/10</td>\n <td>Blackburn</td>\n <td>Everton</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>2</th>\n <td>E0</td>\n <td>14/08/10</td>\n <td>Bolton</td>\n <td>Fulham</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>D</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>D</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>3</th>\n <td>E0</td>\n <td>14/08/10</td>\n <td>Chelsea</td>\n <td>West Brom</td>\n <td>6.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>4</th>\n <td>E0</td>\n <td>14/08/10</td>\n <td>Sunderland</td>\n <td>Birmingham</td>\n <td>2.0</td>\n <td>2.0</td>\n <td>D</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>4176</th>\n <td>E0</td>\n <td>23/05/2021</td>\n <td>Liverpool</td>\n <td>Crystal Palace</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>...</td>\n <td>3.49</td>\n <td>-2.25</td>\n <td>1.86</td>\n <td>2.04</td>\n <td>1.88</td>\n <td>2.03</td>\n <td>1.98</td>\n <td>2.14</td>\n <td>1.88</td>\n <td>2.00</td>\n </tr>\n <tr>\n <th>4177</th>\n <td>E0</td>\n <td>23/05/2021</td>\n <td>Man City</td>\n <td>Everton</td>\n <td>5.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>...</td>\n <td>2.77</td>\n <td>-1.75</td>\n <td>2.01</td>\n <td>1.89</td>\n <td>1.99</td>\n <td>1.89</td>\n <td>2.20</td>\n <td>2.00</td>\n <td>2.03</td>\n <td>1.85</td>\n </tr>\n <tr>\n <th>4178</th>\n <td>E0</td>\n <td>23/05/2021</td>\n <td>Sheffield United</td>\n <td>Burnley</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>...</td>\n <td>2.05</td>\n <td>0.00</td>\n <td>2.04</td>\n <td>1.86</td>\n <td>2.05</td>\n <td>1.86</td>\n <td>2.17</td>\n <td>1.90</td>\n <td>2.03</td>\n <td>1.84</td>\n </tr>\n <tr>\n <th>4179</th>\n <td>E0</td>\n <td>23/05/2021</td>\n <td>West Ham</td>\n <td>Southampton</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>H</td>\n <td>...</td>\n <td>2.14</td>\n <td>-0.75</td>\n <td>2.00</td>\n <td>1.90</td>\n <td>2.02</td>\n <td>1.91</td>\n <td>2.06</td>\n <td>2.01</td>\n <td>1.99</td>\n <td>1.89</td>\n </tr>\n <tr>\n <th>4180</th>\n <td>E0</td>\n <td>23/05/2021</td>\n <td>Wolves</td>\n <td>Man United</td>\n <td>1.0</td>\n <td>2.0</td>\n <td>A</td>\n <td>1.0</td>\n <td>2.0</td>\n <td>A</td>\n <td>...</td>\n <td>1.62</td>\n <td>-0.25</td>\n <td>2.04</td>\n <td>1.86</td>\n <td>2.10</td>\n <td>1.84</td>\n <td>2.10</td>\n <td>1.94</td>\n <td>2.00</td>\n <td>1.88</td>\n </tr>\n </tbody>\n</table>\n<p>4181 rows × 139 columns</p>\n</div>",
"text/plain": " Div Date HomeTeam AwayTeam FTHG FTAG FTR HTHG \\\n0 E0 14/08/10 Aston Villa West Ham 3.0 0.0 H 2.0 \n1 E0 14/08/10 Blackburn Everton 1.0 0.0 H 1.0 \n2 E0 14/08/10 Bolton Fulham 0.0 0.0 D 0.0 \n3 E0 14/08/10 Chelsea West Brom 6.0 0.0 H 2.0 \n4 E0 14/08/10 Sunderland Birmingham 2.0 2.0 D 1.0 \n... .. ... ... ... ... ... .. ... \n4176 E0 23/05/2021 Liverpool Crystal Palace 2.0 0.0 H 1.0 \n4177 E0 23/05/2021 Man City Everton 5.0 0.0 H 2.0 \n4178 E0 23/05/2021 Sheffield United Burnley 1.0 0.0 H 1.0 \n4179 E0 23/05/2021 West Ham Southampton 3.0 0.0 H 2.0 \n4180 E0 23/05/2021 Wolves Man United 1.0 2.0 A 1.0 \n\n HTAG HTR ... AvgC<2.5 AHCh B365CAHH B365CAHA PCAHH PCAHA MaxCAHH \\\n0 0.0 H ... NaN NaN NaN NaN NaN NaN NaN \n1 0.0 H ... NaN NaN NaN NaN NaN NaN NaN \n2 0.0 D ... NaN NaN NaN NaN NaN NaN NaN \n3 0.0 H ... NaN NaN NaN NaN NaN NaN NaN \n4 0.0 H ... NaN NaN NaN NaN NaN NaN NaN \n... ... .. ... ... ... ... ... ... ... ... \n4176 0.0 H ... 3.49 -2.25 1.86 2.04 1.88 2.03 1.98 \n4177 0.0 H ... 2.77 -1.75 2.01 1.89 1.99 1.89 2.20 \n4178 0.0 H ... 2.05 0.00 2.04 1.86 2.05 1.86 2.17 \n4179 0.0 H ... 2.14 -0.75 2.00 1.90 2.02 1.91 2.06 \n4180 2.0 A ... 1.62 -0.25 2.04 1.86 2.10 1.84 2.10 \n\n MaxCAHA AvgCAHH AvgCAHA \n0 NaN NaN NaN \n1 NaN NaN NaN \n2 NaN NaN NaN \n3 NaN NaN NaN \n4 NaN NaN NaN \n... ... ... ... \n4176 2.14 1.88 2.00 \n4177 2.00 2.03 1.85 \n4178 1.90 2.03 1.84 \n4179 2.01 1.99 1.89 \n4180 1.94 2.00 1.88 \n\n[4181 rows x 139 columns]"
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfAllSeasons"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"##-------- DATA EXPLORATION---------"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "Text(0, 0.5, 'Frequency')"
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEICAYAAACuxNj9AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAbuklEQVR4nO3de7xVdZ3/8dc7MC95LY6GgB5R1NBR1CNj3tK0UStT59cYTCWahjb6q8aaQnPUaih/zZiNPycdLIZ0vIQ6pk7SCJViCOHBUETFUFGPEBy1Em8o+Jk/1nfr4rjPWRvZt8N+Px+P8zhrfdftc/aG/d7f71p7bUUEZmZmfXlXowswM7Pm57AwM7NCDgszMyvksDAzs0IOCzMzK+SwMDOzQg4LswpI2kHSi5IGVGl/IWmXNH2FpH+s0n7XqlPSnZJOq8a+0/6mSRpXrf1Z/zGw0QVYa5H0Ym52M2AVsCbNnx4R19S/qmIR8RSweY32fUYl60laApwWETP62FfV6pR0IbBLRHwmt/9jqrFv638cFlZXEfHmC1klL35WOUkDI2J1o+uwDZOHoawpSHqXpAmSHpP0nKSpkt6bW36DpD9I+rOkmZL2yC2bIumHaYjkRUmzJL1f0g8k/VHSI5L26eW435T0/9P0RpJekvS9NL+ppFclbSOpPQ0dDUzL7pT07XSslZLukDSoj7/vHyQtk7RU0ud6LJsi6Z/S9CBJ/y3pT5Kel3R3emyuBnYAbkt/49dyNZ0q6SngVz3rTHaWNDc9dreUHldJh0nq6lHLEklHSjoaOBf4VDre/bm/+7Tcc3aepCclrZB0laSt0rJSHeMkPSXpWUnf6OvfgDU3h4U1iy8CxwMfArYH/gj8W275NGAEsC1wH9BzuOpE4DxgENnQ1uy03iDgRuD7vRz3LuCwNL0/8IdUA8AHgUUR8cdetv1b4JRU07uBr5ZbKb3wfhX4SPobjuxlfwBfAbqANmA7shfsiIjPAk8Bx0bE5hHxvdw2HwI+ABzVyz5PAj5H9riuBi7t4/iQHfAXwHeAn6bj7V1mtZPTz+HAcLLhr8t6rHMwsBtwBHC+pA8UHduak8PCmsXpwDcioisiVgEXAp8svUOOiMkRsTK3bO/Su9jk5oiYFxGvAjcDr0bEVRGxBvgpULZnQRYqIyS9DzgU+DEwRNLmZC/Cd/VR839ExKMR8QowFRjVy3onpnUfjIiXUv29eR0YDOwYEa9HxN1RfAO3CyPipVRHOVfnjv2PwIlVOlH/aeD7EfF4RLwInAOM6dGr+WZEvBIR9wP3A+VCx/oBh4U1ix2Bm9Pwy5+Ah8lOfG8naYCki9IQ1QvAkrRNfthneW76lTLzZU/6phfYTrJgOJQsHO4BDqI4LP6Qm365t2OQvaN/Ojf/ZB/7/GdgMXCHpMclTehj3ZKn12H5k8BGrP3YvVPbs/bf8iTZedDtcm2VPkbW5BwW1iyeBo6JiK1zP5tExDNkwz3HkQ3fbAW0p21UpWPfBXyYrPdxb5o/ChgNzKzC/pcBw3LzO/S2Yuo9fSUihgPHAmdLOqK0uLfNCo7f89ivA88CL5FdkQZA6m20rcN+l5KFfH7fq1k7qG0D4bCwZnEFMFHSjgCS2iQdl5ZtQXYe4jmyF7fvVPnYd5GN6z8UEa8BdwKnAU9ERHcV9j8VOFnSSEmbARf0tqKkj0vaRZKAF8h6V6VLi5eTnRtYV5/JHftbwI1peO5RYBNJH5O0Edk5n41z2y0H2iX19jpxHfD3knZKw3alcxy+ImsD5LCwZvGvwK1kwy8rgTnAX6ZlV5ENcTwDPJSWVdM9wKa81Yt4CHiV6vQqiIhpwA+AX5ENMf2qj9VHADOAF8nOp/wwIu5My74LnJeG6sqeTO/F1cAUsiGhTcguJiAi/gz8HfAjssf2JbKT6yU3pN/PSbqvzH4np33PBJ4ge8z+7zrUZf2I/OVHZmZWxD0LMzMr5LAwM7NCDgszMyvksDAzs0Ib7I0EBw0aFO3t7Y0uw8ysX5k3b96zEdHWs32DDYv29nY6OzsbXYaZWb8iqewdBjwMZWZmhRwWZmZWyGFhZmaFHBZmZlbIYWFmZoUcFmZmVshhYWZmhRwWZmZWyGFhZmaFNthPcNdb+4SfN7qEmlly0ccaXYKZNZh7FmZmVshhYWZmhRwWZmZWyGFhZmaFHBZmZlbIYWFmZoUcFmZmVshhYWZmhRwWZmZWyGFhZmaFahYWkiZLWiHpwVzbTyXNTz9LJM1P7e2SXsktuyK3zX6SFkhaLOlSSapVzWZmVl4t7w01BbgMuKrUEBGfKk1Luhj4c279xyJiVJn9XA6MB+YAtwNHA9OqX66ZmfWmZj2LiJgJPF9uWeodnAhc19c+JA0GtoyI2RERZMFzfJVLNTOzAo06Z3EIsDwifp9r20nS7yTdJemQ1DYE6Mqt05XaypI0XlKnpM7u7u7qV21m1qIaFRZjWbtXsQzYISL2Ac4GrpW0JVDu/ET0ttOImBQRHRHR0dbWVtWCzcxaWd2/z0LSQOCvgf1KbRGxCliVpudJegzYlawnMTS3+VBgaf2qNTMzaEzP4kjgkYh4c3hJUpukAWl6ODACeDwilgErJR2QznOcBNzSgJrNzFpaLS+dvQ6YDewmqUvSqWnRGN5+YvtQ4AFJ9wM3AmdEROnk+BeAHwGLgcfwlVBmZnVXs2GoiBjbS/vJZdpuAm7qZf1OYM+qFmdmZuvEn+A2M7NCDgszMyvksDAzs0IOCzMzK+SwMDOzQg4LMzMr5LAwM7NCDgszMyvksDAzs0IOCzMzK+SwMDOzQg4LMzMr5LAwM7NCDgszMyvksDAzs0IOCzMzK+SwMDOzQg4LMzMrVMvv4J4saYWkB3NtF0p6RtL89PPR3LJzJC2WtEjSUbn2/SQtSMsulaRa1WxmZuXVsmcxBTi6TPslETEq/dwOIGkkMAbYI23zQ0kD0vqXA+OBEemn3D7NzKyGahYWETETeL7C1Y8Dro+IVRHxBLAYGC1pMLBlRMyOiACuAo6vScFmZtarRpyzOEvSA2mYapvUNgR4OrdOV2obkqZ7tpuZWR3VOywuB3YGRgHLgItTe7nzENFHe1mSxkvqlNTZ3d29nqWamVlJXcMiIpZHxJqIeAO4EhidFnUBw3KrDgWWpvahZdp72/+kiOiIiI62trbqFm9m1sLqGhbpHETJCUDpSqlbgTGSNpa0E9mJ7LkRsQxYKemAdBXUScAt9azZzMxgYK12LOk64DBgkKQu4ALgMEmjyIaSlgCnA0TEQklTgYeA1cCZEbEm7eoLZFdWbQpMSz9mZlZHNQuLiBhbpvnHfaw/EZhYpr0T2LOKpZmZ2TryJ7jNzKyQw8LMzAo5LMzMrJDDwszMCjkszMyskMPCzMwKOSzMzKyQw8LMzAo5LMzMrJDDwszMCjkszMyskMPCzMwKOSzMzKyQw8LMzAo5LMzMrJDDwszMCjkszMyskMPCzMwKOSzMzKxQzb6DW9Jk4OPAiojYM7X9M3As8BrwGHBKRPxJUjvwMLAobT4nIs5I2+wHTAE2BW4HvhQRUau6rfW0T/h5o0uoqSUXfazRJdgGoJY9iynA0T3apgN7RsRewKPAObllj0XEqPRzRq79cmA8MCL99NynmZnVWM3CIiJmAs/3aLsjIlan2TnA0L72IWkwsGVEzE69iauA42tQrpmZ9aGR5yw+B0zLze8k6XeS7pJ0SGobAnTl1ulKbWVJGi+pU1Jnd3d39Ss2M2tRDQkLSd8AVgPXpKZlwA4RsQ9wNnCtpC0Bldm81/MVETEpIjoioqOtra3aZZuZtayaneDujaRxZCe+jyidqI6IVcCqND1P0mPArmQ9ifxQ1VBgaX0rNjOzuvYsJB0NfB34RES8nGtvkzQgTQ8nO5H9eEQsA1ZKOkCSgJOAW+pZs5mZ1fbS2euAw4BBkrqAC8iuftoYmJ699r95ieyhwLckrQbWAGdEROnk+Bd469LZaax9nsPMzOqgZmEREWPLNP+4l3VvAm7qZVknsGcVSzMzs3VU0TCUJL9Ym5m1sErPWVwhaa6kv5O0dS0LMjOz5lNRWETEwcCngWFAp6RrJX2kppWZmVnTqPhqqIj4PXAe2dVMHwIulfSIpL+uVXFmZtYcKj1nsZekS8hu9vdh4NiI+ECavqSG9ZmZWROo9Gqoy4ArgXMj4pVSY0QslXReTSozM7OmUWlYfBR4JSLWAEh6F7BJRLwcEVfXrDozM2sKlZ6zmEH2obiSzVKbmZm1gErDYpOIeLE0k6Y3q01JZmbWbCoNi5ck7VuaSd9e90of65uZ2Qak0nMWXwZukFS64+tg4FM1qcjMzJpORWEREfdK2h3Yjew7Jh6JiNdrWpmZmTWNdbmR4P5Ae9pmH0lExFU1qcrMzJpKRWEh6WpgZ2A+2S3EIfvGOoeFmVkLqLRn0QGMLH2znZmZtZZKr4Z6EHh/LQsxM7PmVWnPYhDwkKS5pO/KBoiIT9SkKjMzayqVhsWFtSzCzMyaW6WXzt4laUdgRETMkLQZMKC2pZmZWbOo9BblnwduBP49NQ0BflawzWRJKyQ9mGt7r6Tpkn6ffm+TW3aOpMWSFkk6Kte+n6QFadmlkrQOf5+ZmVVBpcNQZwKjgd9C9kVIkrYt2GYK2a3N85fXTgB+GREXSZqQ5r8uaSQwBtgD2B6YIWnXdJfby4HxwBzgduBoYFqFdZvZBq59ws8bXUJNLbnoY40uAaj8aqhVEfFaaUbSQLLPWfQqImYCz/doPg74SZr+CXB8rv36iFgVEU8Ai4HRkgYDW0bE7HTZ7lW5bczMrE4qDYu7JJ0LbJq+e/sG4LZ3cLztImIZQPpd6p0MAZ7OrdeV2oak6Z7tZUkaL6lTUmd3d/c7KM/MzMqpNCwmAN3AAuB0suGgan5DXrnzENFHe1kRMSkiOiKio62trWrFmZm1ukqvhnqD7GtVr1zP4y2XNDgilqUhphWpvQsYlltvKLA0tQ8t025mZnVU6dVQT0h6vOfPOzjercC4ND0OuCXXPkbSxpJ2AkYAc9NQ1UpJB6SroE7KbWNmZnWyLveGKtkE+BvgvX1tIOk64DBgkKQu4ALgImCqpFOBp9J+iIiFkqYCDwGrgTNL3/cNfIHsyqpNya6C8pVQZmZ1Vukw1HM9mn4g6TfA+X1sM7aXRUf0sv5EYGKZ9k5gz0rqNDOz2qj0FuX75mbfRdbT2KImFZmZWdOpdBjq4tz0amAJcGLVqzEzs6ZU6TDU4bUuxMzMmlelw1Bn97U8Ir5fnXLMzKwZrcvVUPuTXeIKcCwwk7U/dW1mZhuodfnyo30jYiWApAuBGyLitFoVZmZmzaPS233sALyWm38NaK96NWZm1pQq7VlcDcyVdDPZvZlOYO1bj5uZ2Qas0quhJkqaBhySmk6JiN/VriwzM2smlQ5DAWwGvBAR/wp0pXs4mZlZC6j0RoIXAF8HzklNGwH/WauizMysuVTaszgB+ATwEkBELMW3+zAzaxmVhsVr6WtNA0DSe2pXkpmZNZtKw2KqpH8Htpb0eWAG6/9FSGZm1k8UXg2VvnTop8DuwAvAbsD5ETG9xrWZmVmTKAyLiAhJP4uI/QAHhJlZC6p0GGqOpP1rWomZmTWtSj/BfThwhqQlZFdEiazTsVetCjMzs+bRZ1hI2iEingKOqdYBJe1Gdg6kZDjZ17NuDXwe6E7t50bE7Wmbc4BTgTXAFyPif6pVj5mZFSvqWfyM7G6zT0q6KSL+z/oeMCIWAaMAJA0AngFuBk4BLomIf8mvL2kkMAbYA9gemCFp14hYs761mJlZZYrOWSg3PbwGxz8CeCwinuxjneOA6yNiVUQ8ASwGRtegFjMz60VRWEQv09UyBrguN3+WpAckTZa0TWobwtpfstSV2t5G0nhJnZI6u7u7y61iZmbvQFFY7C3pBUkrgb3S9AuSVkp6YX0OLOndZLcQuSE1XQ7sTDZEtQy4uLRqmc3LBldETIqIjojoaGtrW5/yzMwsp89zFhExoIbHPga4LyKWp2MtLy2QdCXw32m2CxiW224osLSGdZmZWQ/rcovyahtLbghK0uDcshOAB9P0rcAYSRun26KPAObWrUozM6v4cxZVJWkz4CPA6bnm70kaRTbEtKS0LCIWSpoKPASsBs70lVBmZvXVkLCIiJeB9/Vo+2wf608EJta6LjMzK6+Rw1BmZtZPOCzMzKyQw8LMzAo5LMzMrJDDwszMCjkszMyskMPCzMwKOSzMzKyQw8LMzAo5LMzMrJDDwszMCjkszMyskMPCzMwKOSzMzKyQw8LMzAo5LMzMrJDDwszMCjkszMyskMPCzMwKNSQsJC2RtEDSfEmdqe29kqZL+n36vU1u/XMkLZa0SNJRjajZzKyVNbJncXhEjIqIjjQ/AfhlRIwAfpnmkTQSGAPsARwN/FDSgEYUbGbWqpppGOo44Cdp+ifA8bn26yNiVUQ8ASwGRte/PDOz1tWosAjgDknzJI1PbdtFxDKA9Hvb1D4EeDq3bVdqextJ4yV1Surs7u6uUelmZq1nYIOOe1BELJW0LTBd0iN9rKsybVFuxYiYBEwC6OjoKLuOmZmtu4b0LCJiafq9AriZbFhpuaTBAOn3irR6FzAst/lQYGn9qjUzs7qHhaT3SNqiNA38FfAgcCswLq02DrglTd8KjJG0saSdgBHA3PpWbWbW2hoxDLUdcLOk0vGvjYhfSLoXmCrpVOAp4G8AImKhpKnAQ8Bq4MyIWNOAus3MWlbdwyIiHgf2LtP+HHBEL9tMBCbWuDQzM+tFM106a2ZmTcphYWZmhRwWZmZWyGFhZmaFHBZmZlbIYWFmZoUcFmZmVshhYWZmhRwWZmZWyGFhZmaFHBZmZlbIYWFmZoUcFmZmVshhYWZmhRwWZmZWyGFhZmaFHBZmZlbIYWFmZoXqHhaShkn6taSHJS2U9KXUfqGkZyTNTz8fzW1zjqTFkhZJOqreNZuZtbq6fwc3sBr4SkTcJ2kLYJ6k6WnZJRHxL/mVJY0ExgB7ANsDMyTtGhFr6lq1mVkLq3vPIiKWRcR9aXol8DAwpI9NjgOuj4hVEfEEsBgYXftKzcyspKHnLCS1A/sAv01NZ0l6QNJkSduktiHA07nNuuglXCSNl9QpqbO7u7tWZZuZtZyGhYWkzYGbgC9HxAvA5cDOwChgGXBxadUym0e5fUbEpIjoiIiOtra26hdtZtaiGhIWkjYiC4prIuK/ACJieUSsiYg3gCt5a6ipCxiW23wosLSe9ZqZtbpGXA0l4MfAwxHx/Vz74NxqJwAPpulbgTGSNpa0EzACmFuves3MrDFXQx0EfBZYIGl+ajsXGCtpFNkQ0xLgdICIWChpKvAQ2ZVUZ/pKKDOz+qp7WETEbyh/HuL2PraZCEysWVFmZtYnf4LbzMwKOSzMzKyQw8LMzAo5LMzMrJDDwszMCjkszMyskMPCzMwKOSzMzKyQw8LMzAo5LMzMrJDDwszMCjkszMyskMPCzMwKOSzMzKyQw8LMzAo5LMzMrJDDwszMCjkszMyskMPCzMwK9ZuwkHS0pEWSFkua0Oh6zMxaSb8IC0kDgH8DjgFGAmMljWxsVWZmraNfhAUwGlgcEY9HxGvA9cBxDa7JzKxlDGx0ARUaAjydm+8C/rLnSpLGA+PT7IuSFtWhtkYZBDxbjwPp/9XjKC2lbs8d+PmrgQ39+duxXGN/CQuVaYu3NURMAibVvpzGk9QZER2NrsPWnZ+7/q1Vn7/+MgzVBQzLzQ8FljaoFjOzltNfwuJeYISknSS9GxgD3NrgmszMWka/GIaKiNWSzgL+BxgATI6IhQ0uq9FaYrhtA+Xnrn9ryedPEW8b+jczM1tLfxmGMjOzBnJYmJlZIYdFDUh6scf8yZIuq9Ox95Y0Pzc/VtLLkjZK838h6YE0fU89auqvJJ0gKSTtXqfj/U7SqDQ9UNJLkj6TWz5P0r6SviXpyHrU1AokrZE0X9JCSfdLOluSXxt78AOy4VkA7ChpizR/IPAIsE9ufhZARBxY//L6lbHAb8iuvquHe8ieH4C9gUWleUnvAYYD90fE+RExo041tYJXImJUROwBfAT4KHBBz5Uk9YsLgmrFYVFnknaU9EtJD6TfO6T2KZIul/RrSY9L+pCkyZIeljQlt/1fSZot6T5JN0jaPL//iHiD7FLj0ifc9yO7r1bpRehAshelN3tAkg6TdKekGyU9IukaSeU+CNky0uN6EHAqKSwkDUjPjSRtLekNSYemZXdL2kXSaEn3pF7CPZJ2yy0fldv/LEl79TjsLNZ+nq4AStuMBu6LiDXp38on036WSPpm+vewoF69oA1VRKwguwvEWel5Pjn9P7sNuEPS5un/benxPg5A0tckfTFNXyLpV2n6CEn/2bA/qIocFrWxaerWzk9DQt/KLbsMuCoi9gKuAS7NLdsG+DDw98BtwCXAHsBfSBolaRBwHnBkROwLdAJnlzn+PcCB6d3oG8CdrP0iNKvMNvsAXya7UeNwshfKVnY88IuIeBR4XtK+EbEGeJTsMToYmAccImljYGhELCbrxR0aEfsA5wPfSfv7EXAygKRdgY0j4oEex8z3LA4EZgKrUi+xt+cN4Nn07+Fy4Kvr9VcbEfE42Wvjtqnpg8C4iPgw8CpwQnq8DwcuTm+sZgKHpPU7gM3T0O/BwN31rL9WHBa1UerWjoqIUWQvGiUfBK5N01eT/WMquS2ya5kXAMsjYkHqKSwE2oEDyF6oZqUQGkf5+7iU3qGOBu6NiMeAXSS1AZun/ww9zY2IrnS8+el4rWws2Q0rSb/Hpum7gUPTz3fJnr/9yXpzAFsBN0h6kLfCHuAG4OPpBeRzwJSeB4yIJcC7Jb0f2J1sGKrUS3yzR1jGf6Xf8/DzVi35nvX0iHg+1/6ddN5vBtl967Yje+z3S8G+CphNFhqHsIGERUuPwTWJ/AddVqXfb+SmS/MDgTVk/3DH0rc5ZC9gB5P9o4Xslilj6P0FJ3+8NbTwvw1J7yPr4e0pKcg+CBqSvkb2H/8MYHuyNwH/ABxG9s4S4NvAryPiBEntZL06IuJlSdPJ7pZ8ItkLSTmzgU8CyyIiJM0h6+WNJnteyyk9dy39vFWLpOFkj+WK1PRSbvGngTZgv4h4XdISYJPc9Clk/8ceIOt57Aw8XKfSa8o9i/q7h7dOmH6a7ARqpeYAB0naBUDSZmlIYy0RsZLsLr0n81ZYzCYbZvIVUMU+STZUuGNEtEfEMOAJsvD9Ldm7/Dci4lWyXtjpvPXucSvgmTR9co/9/ohs2PHe3DvVnmaRDUPmn7eTgD9ExJ/W78+yIqn3fQVwWZT/xPJWwIoUDoezds9+Jtkw4EzeelMxv5f99DsOi/r7InBK6sZ+FvhSpRtGRDfZC9B1afs5ZMMV5cwiGxcv3dp9Ntm5CIdFsbHAzT3abgL+NiJWkQVx6V3+3cAWZEOHAN8DvitpFlmP5E0RMQ94AfiPPo49i+x5mp22WZb24+etdkrnGBeSDS3dAXyzl3WvATokdZK92Xskt+xuYDAwOyKWk53f2CCGoMC3+zCrG0nbkw1L7Z7ODZn1G+5ZmNWBpJPIhrC+4aCw/sg9CzMzK+SehZmZFXJYmJlZIYeFmZkVcliYmVkhh4WZmRX6X6wcJruZY/EUAAAAAElFTkSuQmCC\n",
"text/plain": "<Figure size 432x288 with 1 Axes>"
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"###---- distribution of wins\n",
"labels= ['Home Win','Away Win', 'Draw']\n",
"promo_count = pd.value_counts(dfAllSeasons['FTR'], sort = True)\n",
"promo_count.plot(kind = 'bar', rot = 0)\n",
"plt.title('Team win distribution')\n",
"plt.xticks(range(3),labels)\n",
"plt.xlabel('')\n",
"plt.ylabel('Frequency')"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "H 1881\nA 1280\nD 1019\nName: FTR, dtype: int64"
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# value of frequency of results\n",
"dfAllSeasons['FTR'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"##-------------------DATA PREPROCESSING--------------------"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"dfNew = dfAllSeasons.filter(['FTR', 'Attendence', 'HS', 'AS', 'HST', 'AST', 'HHW', 'AHW', 'HC', 'AC', 'HF', 'AF', 'HFKC', 'AFKC', \n",
"'HO', 'AO', 'HY', 'AY', 'HR', 'AR', 'HBP', 'ABP'])"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"data": {
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>FTR</th>\n <th>HS</th>\n <th>AS</th>\n <th>HST</th>\n <th>AST</th>\n <th>HC</th>\n <th>AC</th>\n <th>HF</th>\n <th>AF</th>\n <th>HY</th>\n <th>AY</th>\n <th>HR</th>\n <th>AR</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>H</td>\n <td>23.0</td>\n <td>12.0</td>\n <td>11.0</td>\n <td>2.0</td>\n <td>16.0</td>\n <td>7.0</td>\n <td>15.0</td>\n <td>15.0</td>\n <td>1.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>H</td>\n <td>7.0</td>\n <td>17.0</td>\n <td>2.0</td>\n <td>12.0</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>19.0</td>\n <td>14.0</td>\n <td>2.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>D</td>\n <td>13.0</td>\n <td>12.0</td>\n <td>9.0</td>\n <td>7.0</td>\n <td>4.0</td>\n <td>8.0</td>\n <td>12.0</td>\n <td>13.0</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>H</td>\n <td>18.0</td>\n <td>10.0</td>\n <td>13.0</td>\n <td>4.0</td>\n <td>3.0</td>\n <td>1.0</td>\n <td>10.0</td>\n <td>10.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>D</td>\n <td>6.0</td>\n <td>13.0</td>\n <td>2.0</td>\n <td>7.0</td>\n <td>3.0</td>\n <td>6.0</td>\n <td>13.0</td>\n <td>10.0</td>\n <td>3.0</td>\n <td>3.0</td>\n <td>1.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>4176</th>\n <td>H</td>\n <td>19.0</td>\n <td>5.0</td>\n <td>5.0</td>\n <td>4.0</td>\n <td>14.0</td>\n <td>1.0</td>\n <td>10.0</td>\n <td>8.0</td>\n <td>2.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>4177</th>\n <td>H</td>\n <td>21.0</td>\n <td>8.0</td>\n <td>11.0</td>\n <td>3.0</td>\n <td>7.0</td>\n <td>5.0</td>\n <td>8.0</td>\n <td>10.0</td>\n <td>2.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>4178</th>\n <td>H</td>\n <td>12.0</td>\n <td>10.0</td>\n <td>3.0</td>\n <td>3.0</td>\n <td>8.0</td>\n <td>9.0</td>\n <td>11.0</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>4179</th>\n <td>H</td>\n <td>14.0</td>\n <td>17.0</td>\n <td>7.0</td>\n <td>5.0</td>\n <td>2.0</td>\n <td>3.0</td>\n <td>5.0</td>\n <td>9.0</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>4180</th>\n <td>A</td>\n <td>14.0</td>\n <td>9.0</td>\n <td>4.0</td>\n <td>4.0</td>\n <td>6.0</td>\n <td>2.0</td>\n <td>14.0</td>\n <td>3.0</td>\n <td>4.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n </tbody>\n</table>\n<p>4181 rows × 13 columns</p>\n</div>",
"text/plain": " FTR HS AS HST AST HC AC HF AF HY AY HR AR\n0 H 23.0 12.0 11.0 2.0 16.0 7.0 15.0 15.0 1.0 2.0 0.0 0.0\n1 H 7.0 17.0 2.0 12.0 1.0 3.0 19.0 14.0 2.0 1.0 0.0 0.0\n2 D 13.0 12.0 9.0 7.0 4.0 8.0 12.0 13.0 1.0 3.0 0.0 0.0\n3 H 18.0 10.0 13.0 4.0 3.0 1.0 10.0 10.0 1.0 0.0 0.0 0.0\n4 D 6.0 13.0 2.0 7.0 3.0 6.0 13.0 10.0 3.0 3.0 1.0 0.0\n... .. ... ... ... ... ... ... ... ... ... ... ... ...\n4176 H 19.0 5.0 5.0 4.0 14.0 1.0 10.0 8.0 2.0 2.0 0.0 0.0\n4177 H 21.0 8.0 11.0 3.0 7.0 5.0 8.0 10.0 2.0 2.0 0.0 0.0\n4178 H 12.0 10.0 3.0 3.0 8.0 9.0 11.0 1.0 3.0 1.0 0.0 0.0\n4179 H 14.0 17.0 7.0 5.0 2.0 3.0 5.0 9.0 0.0 3.0 0.0 0.0\n4180 A 14.0 9.0 4.0 4.0 6.0 2.0 14.0 3.0 4.0 1.0 0.0 0.0\n\n[4181 rows x 13 columns]"
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfNew"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dfAllSeasons.duplicated().sum()#no duplicated values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dfAllSeasons.isnull().sum()#no null values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dfAllSeasons.isna().sum()#no nan values"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "CategoricalDtype(categories=['A', 'D', 'H'], ordered=False)"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfNew[\"FTR\"] = dfNew[\"FTR\"].astype('category')\n",
"dfNew[\"FTR\"].dtypes"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"dfNew[\"results\"] = dfNew[\"FTR\"].cat.codes"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"dfNew.drop(['FTR'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>HS</th>\n <th>AS</th>\n <th>HST</th>\n <th>AST</th>\n <th>HC</th>\n <th>AC</th>\n <th>HF</th>\n <th>AF</th>\n <th>HY</th>\n <th>AY</th>\n <th>HR</th>\n <th>AR</th>\n <th>results</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>23.0</td>\n <td>12.0</td>\n <td>11.0</td>\n <td>2.0</td>\n <td>16.0</td>\n <td>7.0</td>\n <td>15.0</td>\n <td>15.0</td>\n <td>1.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>2</td>\n </tr>\n <tr>\n <th>1</th>\n <td>7.0</td>\n <td>17.0</td>\n <td>2.0</td>\n <td>12.0</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>19.0</td>\n <td>14.0</td>\n <td>2.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>13.0</td>\n <td>12.0</td>\n <td>9.0</td>\n <td>7.0</td>\n <td>4.0</td>\n <td>8.0</td>\n <td>12.0</td>\n <td>13.0</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>18.0</td>\n <td>10.0</td>\n <td>13.0</td>\n <td>4.0</td>\n <td>3.0</td>\n <td>1.0</td>\n <td>10.0</td>\n <td>10.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>2</td>\n </tr>\n <tr>\n <th>4</th>\n <td>6.0</td>\n <td>13.0</td>\n <td>2.0</td>\n <td>7.0</td>\n <td>3.0</td>\n <td>6.0</td>\n <td>13.0</td>\n <td>10.0</td>\n <td>3.0</td>\n <td>3.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>4176</th>\n <td>19.0</td>\n <td>5.0</td>\n <td>5.0</td>\n <td>4.0</td>\n <td>14.0</td>\n <td>1.0</td>\n <td>10.0</td>\n <td>8.0</td>\n <td>2.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>2</td>\n </tr>\n <tr>\n <th>4177</th>\n <td>21.0</td>\n <td>8.0</td>\n <td>11.0</td>\n <td>3.0</td>\n <td>7.0</td>\n <td>5.0</td>\n <td>8.0</td>\n <td>10.0</td>\n <td>2.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>2</td>\n </tr>\n <tr>\n <th>4178</th>\n <td>12.0</td>\n <td>10.0</td>\n <td>3.0</td>\n <td>3.0</td>\n <td>8.0</td>\n <td>9.0</td>\n <td>11.0</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>2</td>\n </tr>\n <tr>\n <th>4179</th>\n <td>14.0</td>\n <td>17.0</td>\n <td>7.0</td>\n <td>5.0</td>\n <td>2.0</td>\n <td>3.0</td>\n <td>5.0</td>\n <td>9.0</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>2</td>\n </tr>\n <tr>\n <th>4180</th>\n <td>14.0</td>\n <td>9.0</td>\n <td>4.0</td>\n <td>4.0</td>\n <td>6.0</td>\n <td>2.0</td>\n <td>14.0</td>\n <td>3.0</td>\n <td>4.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>4181 rows × 13 columns</p>\n</div>",
"text/plain": " HS AS HST AST HC AC HF AF HY AY HR AR \\\n0 23.0 12.0 11.0 2.0 16.0 7.0 15.0 15.0 1.0 2.0 0.0 0.0 \n1 7.0 17.0 2.0 12.0 1.0 3.0 19.0 14.0 2.0 1.0 0.0 0.0 \n2 13.0 12.0 9.0 7.0 4.0 8.0 12.0 13.0 1.0 3.0 0.0 0.0 \n3 18.0 10.0 13.0 4.0 3.0 1.0 10.0 10.0 1.0 0.0 0.0 0.0 \n4 6.0 13.0 2.0 7.0 3.0 6.0 13.0 10.0 3.0 3.0 1.0 0.0 \n... ... ... ... ... ... ... ... ... ... ... ... ... \n4176 19.0 5.0 5.0 4.0 14.0 1.0 10.0 8.0 2.0 2.0 0.0 0.0 \n4177 21.0 8.0 11.0 3.0 7.0 5.0 8.0 10.0 2.0 2.0 0.0 0.0 \n4178 12.0 10.0 3.0 3.0 8.0 9.0 11.0 1.0 3.0 1.0 0.0 0.0 \n4179 14.0 17.0 7.0 5.0 2.0 3.0 5.0 9.0 0.0 3.0 0.0 0.0 \n4180 14.0 9.0 4.0 4.0 6.0 2.0 14.0 3.0 4.0 1.0 0.0 0.0 \n\n results \n0 2 \n1 2 \n2 1 \n3 2 \n4 1 \n... ... \n4176 2 \n4177 2 \n4178 2 \n4179 2 \n4180 0 \n\n[4181 rows x 13 columns]"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfNew #2 == home win, 1 == Draw, 0 = away win"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"# divide into x and y\n",
"X = dfNew.drop([\"results\"], axis=1)\n",
"y = dfNew[\"results\"]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0) # split into train and test data"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "MLPClassifier(activation='identity', alpha=1e-05, hidden_layer_sizes=(5,),\n learning_rate_init=0.01, max_iter=10000, random_state=1,\n solver='lbfgs')"
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#-------- neural networks-----------\n",
"from sklearn.neural_network import MLPClassifier\n",
"clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,), activation='identity', learning_rate='constant', learning_rate_init=0.01, random_state=1, max_iter=10000)\n",
"clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "0.569377990430622"
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"#make parameters\n",
"\n",
"solvers = ['sgd', 'lbfgs', 'adam']\n",
"activation = ['identity', 'logistic', 'tanh', 'relu']\n",
"hiddenLayerSize = [1,2,3,4,5]\n",
"alpha = [1e-1,1e-2,1e-3,1e-4,1e-5]\n",
"learningRate = ['constant', 'invscaling', 'adapting']\n",
"learningRateInit = [1e-1,1e-2,1e-3,1e-4,1e-5]\n",
"nnParams = dict(solver=solvers, alpha=alpha, hidden_layer_sizes=hiddenLayerSize, activation=activation, learning_rate=learningRate, learning_rate_init=learningRateInit)\n"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/aaronokoroh/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py:918: UserWarning: One or more of the test scores are non-finite: [0.58403149 nan 0.56877769 nan 0.44617223 0.44617223\n",
" nan 0.58761893 0.5810415 0.44617223 nan nan\n",
" 0.52063899 nan 0.58612551 nan nan 0.58492969\n",
" 0.44617223 nan 0.25628474 0.58672341 0.58612237 nan\n",
" nan nan 0.5855276 0.25628474 0.58791878 0.58761983]\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" mean_test_score params\n",
"0 0.584031 {'solver': 'adam', 'learning_rate_init': 0.001...\n",
"1 NaN {'solver': 'lbfgs', 'learning_rate_init': 0.00...\n",
"2 0.568778 {'solver': 'adam', 'learning_rate_init': 0.000...\n",
"3 NaN {'solver': 'adam', 'learning_rate_init': 0.1, ...\n",
"4 0.446172 {'solver': 'lbfgs', 'learning_rate_init': 0.01...\n",
"5 0.446172 {'solver': 'sgd', 'learning_rate_init': 0.01, ...\n",
"6 NaN {'solver': 'adam', 'learning_rate_init': 0.1, ...\n",
"7 0.587619 {'solver': 'lbfgs', 'learning_rate_init': 1e-0...\n",
"8 0.581042 {'solver': 'adam', 'learning_rate_init': 0.001...\n",
"9 0.446172 {'solver': 'sgd', 'learning_rate_init': 0.1, '...\n",
"10 NaN {'solver': 'lbfgs', 'learning_rate_init': 0.00...\n",
"11 NaN {'solver': 'lbfgs', 'learning_rate_init': 0.00...\n",
"12 0.520639 {'solver': 'adam', 'learning_rate_init': 1e-05...\n",
"13 NaN {'solver': 'lbfgs', 'learning_rate_init': 0.00...\n",
"14 0.586126 {'solver': 'lbfgs', 'learning_rate_init': 0.01...\n",
"15 NaN {'solver': 'lbfgs', 'learning_rate_init': 0.1,...\n",
"16 NaN {'solver': 'sgd', 'learning_rate_init': 1e-05,...\n",
"17 0.584930 {'solver': 'lbfgs', 'learning_rate_init': 0.00...\n",
"18 0.446172 {'solver': 'sgd', 'learning_rate_init': 0.0001...\n",
"19 NaN {'solver': 'lbfgs', 'learning_rate_init': 0.01...\n",
"20 0.256285 {'solver': 'sgd', 'learning_rate_init': 0.0001...\n",
"21 0.586723 {'solver': 'lbfgs', 'learning_rate_init': 0.00...\n",
"22 0.586122 {'solver': 'adam', 'learning_rate_init': 0.1, ...\n",
"23 NaN {'solver': 'sgd', 'learning_rate_init': 1e-05,...\n",
"24 NaN {'solver': 'sgd', 'learning_rate_init': 0.01, ...\n",
"25 NaN {'solver': 'sgd', 'learning_rate_init': 0.0001...\n",
"26 0.585528 {'solver': 'lbfgs', 'learning_rate_init': 0.00...\n",
"27 0.256285 {'solver': 'sgd', 'learning_rate_init': 0.0001...\n",
"28 0.587919 {'solver': 'lbfgs', 'learning_rate_init': 0.00...\n",
"29 0.587620 {'solver': 'adam', 'learning_rate_init': 0.1, ...\n",
"Best model recall: 0.588\n",
"Using {'solver': 'lbfgs', 'learning_rate_init': 0.0001, 'learning_rate': 'invscaling', 'hidden_layer_sizes': 1, 'alpha': 0.001, 'activation': 'logistic'}\n"
]
}
],
"source": [
"# do random search\n",
"rand_search(MLPClassifier(random_state=0, max_iter=10000),nnParams, 30, X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "LogisticRegression(max_iter=1000)"
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# ---------logistic regression------------ No major hyperparameters\n",
"logReg = LogisticRegression(max_iter=1000)\n",
"logReg.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "0.569377990430622"
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logReg.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#---------------------K NEAREST NEIGHBOURS---------------"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "KNeighborsClassifier(n_neighbors=25)"
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# --------K nearest neighbours-----------\n",
"Knn = KNeighborsClassifier(n_neighbors=25)\n",
"Knn.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "0.5804425837320574"
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Knn.score(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"neighbors = range(1,50)\n",
"metric = ['euclidean', 'manhattan', 'minkowski']\n",
"weights = ['uniform', 'distance']\n",
"knnParams = dict(metric=metric, n_neighbors=neighbors, weights=weights)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/aaronokoroh/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py:285: UserWarning: The total space of parameters 294 is smaller than n_iter=500. Running 294 iterations. For exhaustive searches, use GridSearchCV.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" mean_test_score params\n",
"0 0.498507 {'weights': 'uniform', 'n_neighbors': 1, 'metr...\n",
"1 0.498507 {'weights': 'distance', 'n_neighbors': 1, 'met...\n",
"2 0.449763 {'weights': 'uniform', 'n_neighbors': 2, 'metr...\n",
"3 0.501798 {'weights': 'distance', 'n_neighbors': 2, 'met...\n",
"4 0.485648 {'weights': 'uniform', 'n_neighbors': 3, 'metr...\n",
".. ... ...\n",
"289 0.604964 {'weights': 'distance', 'n_neighbors': 47, 'me...\n",
"290 0.560107 {'weights': 'uniform', 'n_neighbors': 48, 'met...\n",
"291 0.603169 {'weights': 'distance', 'n_neighbors': 48, 'me...\n",
"292 0.556520 {'weights': 'uniform', 'n_neighbors': 49, 'met...\n",
"293 0.601376 {'weights': 'distance', 'n_neighbors': 49, 'me...\n",
"\n",
"[294 rows x 2 columns]\n",
"Best model accuracy: 0.605\n",
"Using {'weights': 'distance', 'n_neighbors': 47, 'metric': 'euclidean'}\n"
]
}
],
"source": [
"rand_search(KNeighborsClassifier(),knnParams, 500, X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#-----------------------RANDOM FOREST---------------------------------------------------"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "RandomForestClassifier(max_depth=6, random_state=3)"
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# -------random forest---------------\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"rfc = RandomForestClassifier(max_depth=6, random_state=3)\n",
"rfc.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "0.5801435406698564"
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rfc.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"n_estimators = range(1,150)\n",
"max_depth = range(1,100)\n",
"min_samples_leaf = range(1,100)\n",
"min_samples_split = range(1,100)\n",
"max_features = ['auto', 'log2', 'sqrt']\n",
"criterion = ['gini', 'entropy']\n",
"rfcParams = dict(n_estimators=n_estimators, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split, max_features=max_features, criterion=criterion) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" mean_test_score params\n",
"0 0.533005 {'n_estimators': 148, 'min_samples_split': 26,...\n",
"1 0.562623 {'n_estimators': 142, 'min_samples_split': 59,...\n",
"2 0.457213 {'n_estimators': 139, 'min_samples_split': 73,...\n",
"3 0.536339 {'n_estimators': 69, 'min_samples_split': 54, ...\n",
"4 0.536230 {'n_estimators': 23, 'min_samples_split': 85, ...\n",
".. ... ...\n",
"95 0.569180 {'n_estimators': 18, 'min_samples_split': 15, ...\n",
"96 0.457213 {'n_estimators': 23, 'min_samples_split': 17, ...\n",
"97 0.565902 {'n_estimators': 43, 'min_samples_split': 53, ...\n",
"98 0.457213 {'n_estimators': 79, 'min_samples_split': 17, ...\n",
"99 0.480328 {'n_estimators': 93, 'min_samples_split': 29, ...\n",
"\n",
"[100 rows x 2 columns]\n",
"Best model recall: 0.589\n",
"Using {'n_estimators': 8, 'min_samples_split': 54, 'min_samples_leaf': 33, 'max_features': 'auto', 'max_depth': 93, 'criterion': 'gini'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/aaronokoroh/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py:918: UserWarning: One or more of the test scores are non-finite: [0.53300546 0.56262295 0.45721311 0.5363388 0.53622951 0.55928962\n",
" 0.45721311 0.53956284 0.4736612 nan 0.45721311 0.57907104\n",
" 0.54945355 0.56590164 0.57896175 0.52978142 nan 0.55595628\n",
" 0.45721311 0.4936612 0.53612022 0.56928962 0.54289617 nan\n",
" 0.45721311 0.45721311 0.45721311 0.53628415 0.54617486 0.57251366\n",
" 0.45721311 0.54289617 0.54945355 0.54289617 0.57579235 0.55934426\n",
" 0.54617486 0.53628415 0.56912568 0.57251366 0.51994536 0.54617486\n",
" 0.5626776 0.45721311 0.44404372 0.57907104 0.56595628 0.45721311\n",
" 0.56923497 0.57579235 0.45721311 0.55928962 nan 0.55928962\n",
" 0.5757377 0.57251366 0.56256831 0.57245902 0.45721311 0.57245902\n",
" 0.49 0.58885246 0.45721311 0.55606557 0.45721311 0.53300546\n",
" 0.45721311 0.45721311 0.55595628 0.56923497 0.45721311 0.5495082\n",
" 0.45721311 0.45076503 0.52644809 0.45721311 0.55928962 0.53628415\n",
" 0.55617486 0.56918033 0.57579235 0.45721311 0.5363388 0.57251366\n",
" 0.54945355 0.54617486 nan 0.45721311 0.56262295 0.45721311\n",
" 0.55934426 0.52978142 0.54295082 0.51 0.45721311 0.56918033\n",
" 0.45721311 0.56590164 0.45721311 0.48032787]\n",
" warnings.warn(\n"
]
}
],
"source": [
"rand_search(RandomForestClassifier(random_state=0),rfcParams, 100, X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#####----------------MATCH PREDICTIONS----------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"teamInfoData = df.filter(['Date','AwayTeam','HomeTeam','FTR', 'Attendence', 'HS', 'AS', 'HST', 'AST', 'HHW', 'AHW', 'HC', 'AC', 'HF', 'AF', 'HFKC', 'AFKC', \n",
"'HO', 'AO', 'HY', 'AY', 'HR', 'AR', 'HBP', 'ABP'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Date</th>\n <th>AwayTeam</th>\n <th>HomeTeam</th>\n <th>FTR</th>\n <th>HS</th>\n <th>AS</th>\n <th>HST</th>\n <th>AST</th>\n <th>HC</th>\n <th>AC</th>\n <th>HF</th>\n <th>AF</th>\n <th>HY</th>\n <th>AY</th>\n <th>HR</th>\n <th>AR</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>13/08/2021</td>\n <td>Arsenal</td>\n <td>Brentford</td>\n <td>H</td>\n <td>8</td>\n <td>22</td>\n <td>3</td>\n <td>4</td>\n <td>2</td>\n <td>5</td>\n <td>12</td>\n <td>8</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>14/08/2021</td>\n <td>Leeds</td>\n <td>Man United</td>\n <td>H</td>\n <td>16</td>\n <td>10</td>\n <td>8</td>\n <td>3</td>\n <td>5</td>\n <td>4</td>\n <td>11</td>\n <td>9</td>\n <td>1</td>\n <td>2</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>14/08/2021</td>\n <td>Brighton</td>\n <td>Burnley</td>\n <td>A</td>\n <td>14</td>\n <td>14</td>\n <td>3</td>\n <td>8</td>\n <td>7</td>\n <td>6</td>\n <td>10</td>\n <td>7</td>\n <td>2</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>14/08/2021</td>\n <td>Crystal Palace</td>\n <td>Chelsea</td>\n <td>H</td>\n <td>13</td>\n <td>4</td>\n <td>6</td>\n <td>1</td>\n <td>5</td>\n <td>2</td>\n <td>15</td>\n <td>11</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>14/08/2021</td>\n <td>Southampton</td>\n <td>Everton</td>\n <td>H</td>\n <td>14</td>\n <td>6</td>\n <td>6</td>\n <td>3</td>\n <td>6</td>\n <td>8</td>\n <td>13</td>\n <td>15</td>\n <td>2</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>276</th>\n <td>13/03/2022</td>\n <td>Norwich</td>\n <td>Leeds</td>\n <td>H</td>\n <td>13</td>\n <td>12</td>\n <td>7</td>\n <td>4</td>\n <td>6</td>\n <td>2</td>\n <td>18</td>\n <td>8</td>\n <td>3</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>277</th>\n <td>13/03/2022</td>\n <td>Watford</td>\n <td>Southampton</td>\n <td>A</td>\n <td>13</td>\n <td>9</td>\n <td>7</td>\n <td>5</td>\n <td>11</td>\n <td>3</td>\n <td>8</td>\n <td>12</td>\n <td>3</td>\n <td>2</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>278</th>\n <td>13/03/2022</td>\n <td>Aston Villa</td>\n <td>West Ham</td>\n <td>H</td>\n <td>11</td>\n <td>13</td>\n <td>4</td>\n <td>7</td>\n <td>5</td>\n <td>7</td>\n <td>9</td>\n <td>3</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>279</th>\n <td>13/03/2022</td>\n <td>Leicester</td>\n <td>Arsenal</td>\n <td>H</td>\n <td>21</td>\n <td>6</td>\n <td>8</td>\n <td>3</td>\n <td>2</td>\n <td>6</td>\n <td>8</td>\n <td>10</td>\n <td>0</td>\n <td>3</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>280</th>\n <td>14/03/2022</td>\n <td>Man City</td>\n <td>Crystal Palace</td>\n <td>D</td>\n <td>7</td>\n <td>18</td>\n <td>1</td>\n <td>4</td>\n <td>2</td>\n <td>6</td>\n <td>6</td>\n <td>11</td>\n <td>3</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>281 rows × 16 columns</p>\n</div>",
"text/plain": " Date AwayTeam HomeTeam FTR HS AS HST AST HC AC \\\n0 13/08/2021 Arsenal Brentford H 8 22 3 4 2 5 \n1 14/08/2021 Leeds Man United H 16 10 8 3 5 4 \n2 14/08/2021 Brighton Burnley A 14 14 3 8 7 6 \n3 14/08/2021 Crystal Palace Chelsea H 13 4 6 1 5 2 \n4 14/08/2021 Southampton Everton H 14 6 6 3 6 8 \n.. ... ... ... .. .. .. ... ... .. .. \n276 13/03/2022 Norwich Leeds H 13 12 7 4 6 2 \n277 13/03/2022 Watford Southampton A 13 9 7 5 11 3 \n278 13/03/2022 Aston Villa West Ham H 11 13 4 7 5 7 \n279 13/03/2022 Leicester Arsenal H 21 6 8 3 2 6 \n280 14/03/2022 Man City Crystal Palace D 7 18 1 4 2 6 \n\n HF AF HY AY HR AR \n0 12 8 0 0 0 0 \n1 11 9 1 2 0 0 \n2 10 7 2 1 0 0 \n3 15 11 0 0 0 0 \n4 13 15 2 0 0 0 \n.. .. .. .. .. .. .. \n276 18 8 3 1 0 0 \n277 8 12 3 2 0 0 \n278 9 3 1 0 0 0 \n279 8 10 0 3 0 0 \n280 6 11 3 1 0 0 \n\n[281 rows x 16 columns]"
},
"execution_count": 514,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"teamInfoData"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def getLastFive(team):\n",
" df = teamInfoData.query('HomeTeam==\"%s\" | AwayTeam==\"%s\"'%(team,team)).tail(10)\n",
" ## home stats\n",
" homeTeam = df.query('HomeTeam==\"%s\"'%(team))\n",
" homeStats = homeTeam.filter(['HS', 'HST', 'HC', 'HF', 'HY', 'HR'])\n",
" homeSum = homeStats.sum(axis=0) # sum all home stats\n",
"\n",
" ## away stats\n",
" awayTeam = df.query('AwayTeam==\"%s\"'%(team))\n",
" awayStats = awayTeam.filter(['AS', 'AST', 'AC', 'AF', 'AY', 'AR'])\n",
" awaySum = awayStats.sum(axis=0) # sum all home stats\n",
"\n",
" # add both stats togther\n",
" totalSum = pd.concat([awaySum, homeSum])\n",
"\n",
" # now sum the home and away stats together\n",
"\n",
" totalSum['S'] = totalSum['HS'] + totalSum['AS']\n",
" totalSum['ST'] = totalSum['HST'] + totalSum['AST']\n",
" totalSum['C'] = totalSum['HC'] + totalSum['AC']\n",
" totalSum['F'] = totalSum['HF'] + totalSum['AF']\n",
" totalSum['Y'] = totalSum['HY'] + totalSum['AY']\n",
" totalSum['R'] = totalSum['HR'] + totalSum['AR']\n",
"\n",
" finalSum = totalSum.drop(['HS', 'HST', 'HC', 'HF', 'HY', 'HR','AS', 'AST', 'AC', 'AF', 'AY', 'AR'])\n",
"\n",
" return finalSum\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "1"
},
"execution_count": 516,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"team1 = getLastFive('Leeds')\n",
"team2 = getLastFive('Burnley')\n",
"#team1[5]\n",
"\n",
"\n",
"logReg.predict([[team1[0],team2[0],team1[1],team2[1],team1[2],team2[2],team1[3],team2[3],team1[4],team2[4],team1[5],team2[5]]])[0]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"dfAllSeasons = pd.concat([prem10, prem11, prem12, prem13,\n",
"prem14, prem15, prem16, prem17,prem18, prem20, prem20\n",
"], axis=0)\n",
"df3.reset_index(drop=True, inplace=True) #reset index"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.8 64-bit ('base': conda)",
"name": "python388jvsc74a57bd02232eecaa32ceb858866bb7932d64b19f721470f3fd7d04d24aa9a9fefabd8f8"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"metadata": {
"interpreter": {
"hash": "2232eecaa32ceb858866bb7932d64b19f721470f3fd7d04d24aa9a9fefabd8f8"
}
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2
}