Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
GW_PointsPredictor/Feature_engineering.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
106 lines (85 sloc)
4.78 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from GW_PointsPredictor.Utility_functions import * | |
import numpy as np | |
import pandas as pd | |
from scipy import stats | |
# Consolidated mapping for club positions across different seasons | |
# Final league position of every club, keyed first by season label and then by
# club name. Lookups are exact string matches, so the club spellings here must
# match the spellings used in the data being joined against.
#
# Fixes relative to the previous table:
#   * "Bunrley" -> "Burnley" (2020-21) and "Chelea" -> "Chelsea" (2021-22);
#     the misspelled keys made those clubs fall back to the default position.
#   * 2022-23: Leicester finished 18th and Leeds Utd 19th (they were swapped).
season_positions = {
    "2019-20": {
        "Liverpool": 1, "Man City": 2, "Man Utd": 3, "Chelsea": 4,
        "Leicester": 5, "Spurs": 6, "Wolves": 7, "Arsenal": 8,
        "Sheffield Utd": 9, "Burnley": 10, "Southampton": 11, "Everton": 12,
        "Newcastle": 13, "Crystal Palace": 14, "Brighton": 15, "West Ham": 16,
        "Aston Villa": 17, "AFC Bournemouth": 18, "Watford": 19,
        "Norwich City": 20,
    },
    "2020-21": {
        "Man City": 1, "Man Utd": 2, "Liverpool": 3, "Chelsea": 4,
        "Leicester": 5, "West Ham": 6, "Spurs": 7, "Arsenal": 8,
        "Leeds Utd": 9, "Everton": 10, "Aston Villa": 11, "Newcastle": 12,
        "Wolves": 13, "Crystal Palace": 14, "Southampton": 15, "Brighton": 16,
        "Burnley": 17, "Fulham": 18, "West Brom": 19,
        "Sheffield Utd": 20,
    },
    "2021-22": {
        "Man City": 1, "Liverpool": 2, "Chelsea": 3, "Spurs": 4,
        "Arsenal": 5, "Man Utd": 6, "West Ham": 7, "Leicester": 8,
        "Brighton": 9, "Wolves": 10, "Newcastle": 11, "Crystal Palace": 12,
        "Brentford": 13, "Aston Villa": 14, "Southampton": 15, "Everton": 16,
        "Leeds Utd": 17, "Burnley": 18, "Watford": 19,
        "Norwich City": 20,
    },
    "2022-23": {
        "Man City": 1, "Arsenal": 2, "Man Utd": 3, "Newcastle": 4,
        "Liverpool": 5, "Brighton": 6, "Aston Villa": 7, "Spurs": 8,
        "Brentford": 9, "Fulham": 10, "Crystal Palace": 11, "Chelsea": 12,
        "Wolves": 13, "West Ham": 14, "AFC Bournemouth": 15,
        "Nottingham Forest": 16, "Everton": 17, "Leicester": 18,
        "Leeds Utd": 19, "Southampton": 20,
    },
}
# split_test partitions the data with boolean filters rather than iterating row by row
def split_test(data, gameweek):
    """
    Partition a DataFrame on its "GW" column.

    Returns a 2-tuple: the rows whose "GW" equals `gameweek`, followed by
    all remaining rows.
    """
    in_gameweek = data["GW"] == gameweek
    return data[in_gameweek], data[~in_gameweek]
# Vectorized approach to check_win, drastically improving efficiency | |
def check_win(df):
    """
    Award match points per row: 3 for a win, 1 for a draw, 0 for a loss.

    The outcome is judged from the perspective given by the "was_home"
    column, using "team_h_score" and "team_a_score". Returns a plain list
    with one entry per row of `df`.
    """
    away_margin = df["team_a_score"] - df["team_h_score"]
    at_home = df["was_home"]
    # A win is either a home side with a negative away margin or an away
    # side with a positive one.
    won = ((away_margin < 0) & at_home) | ((away_margin > 0) & ~at_home)
    drawn = away_margin == 0
    # `won` and `drawn` cannot both hold for a row, so the nesting order
    # does not affect the outcome.
    awarded = np.where(won, 3, np.where(drawn, 1, 0))
    return awarded.tolist()
# Season position retrieval uses the season_positions dictionary
def get_season_pos(club, year):
    """
    Look up a club's position in `season_positions` for the given season.

    Falls back to 20 when either the season or the club has no entry.
    """
    table_for_year = season_positions.get(year, {})
    return table_for_year.get(club, 20)
# Using pandas .mode() method directly | |
def find_mode(vals):
    """
    Return the first value reported by `vals.mode()`, or NaN when that
    result is empty. Assumes `vals` is a pandas Series.
    """
    modes = vals.mode()
    if modes.empty:
        return np.nan
    return modes.iloc[0]
def find_mean(vals):
    """
    Average the entries of `vals`, skipping every entry equal to -1.

    Returns NaN when nothing is left after the -1 entries are dropped.
    """
    kept = [entry for entry in vals if entry != -1]
    if not kept:
        return np.nan
    return np.mean(kept)
def find_max(vals):
    """
    Return `vals.max()`. Assumes `vals` exposes a `.max()` method, e.g. a
    pandas Series.
    """
    largest = vals.max()
    return largest
# Calculating standard deviation with pandas .std() | |
def find_std(vals):
    """
    Return `vals.std()`. Assumes `vals` exposes a `.std()` method, e.g. a
    pandas Series.
    """
    spread = vals.std()
    return spread
# Counting occurrences of a specific value | |
def find_value_count(vals, to_count):
    """
    Count how many times `to_count` occurs in `vals`, using
    `vals.value_counts()`. Returns 0 when the value is not present.
    """
    return vals.value_counts().get(to_count, 0)
def get_opp_team(data):
    """
    Work out the opposing team for every row of the DataFrame.

    Where "team" matches "home_team" the opponent is "away_team";
    otherwise it is "home_team". The result is a NumPy array with one
    entry per row.
    """
    playing_at_home = data["team"] == data["home_team"]
    opponents = np.where(playing_at_home, data["away_team"], data["home_team"])
    return opponents
def create_features(data, mean_features, std_features, no_last_stats):
    """
    Add per-row aggregate columns derived from the "last_<N>_<feature>" columns.

    For each name in `mean_features` a "mean_<feature>_<N>" column is written
    using `find_mean`; for each name in `std_features` a "std_<feature>_<N>"
    column is written using `np.std`. Here <N> is `no_last_stats`.

    The columns are added to `data` in place, and the same object is returned.
    """
    # NOTE(review): find_mean skips -1 entries while np.std does not, so the
    # two aggregates may be computed over different values -- confirm intended.
    aggregations = (
        ("mean", mean_features, find_mean),
        ("std", std_features, np.std),
    )
    for prefix, feature_names, reducer in aggregations:
        for feature in feature_names:
            source_column = f"last_{no_last_stats}_{feature}"
            target_column = f"{prefix}_{feature}_{no_last_stats}"
            data[target_column] = data[source_column].apply(reducer)
    return data
def calculate_rolling_stats(data, stat, n=3):
    """
    Add rolling mean and rolling sum columns of `stat` over each player's
    previous games.

    Parameters:
    data (pd.DataFrame): Player stats; must contain 'name', 'date' and `stat` columns.
    stat (str): Name of the column to aggregate.
    n (int): Size of the rolling window, in games.

    Returns:
    pd.DataFrame: A copy of `data` sorted by 'name' then 'date', with the extra
    columns '<stat>_rolling_mean_last_<n>' and '<stat>_rolling_sum_last_<n>'.
    """
    ordered = data.sort_values(by=['name', 'date'])
    per_player = ordered.groupby('name')[stat]

    # The trailing shift() moves each window result down one row, so the value
    # stored for a game only reflects the games before it.
    def prior_mean(series):
        return series.rolling(window=n, min_periods=1).mean().shift()

    def prior_sum(series):
        return series.rolling(window=n, min_periods=1).sum().shift()

    ordered[f'{stat}_rolling_mean_last_{n}'] = per_player.transform(prior_mean)
    ordered[f'{stat}_rolling_sum_last_{n}'] = per_player.transform(prior_sum)
    return ordered