# Feature_engineering.py

from GW_PointsPredictor.Utility_functions import *
import numpy as np
import pandas as pd
from scipy import stats


# Consolidated mapping for club positions across different seasons
# Club name -> league position, keyed by season label ("YYYY-YY").
# get_season_pos() falls back to 20 for any club/season missing here, so a
# misspelled key silently turns into a last-place lookup; keep the spellings
# identical across seasons.
# NOTE(review): the 2022-23 entries for "Leeds Utd" (18) and "Leicester" (19)
# look swapped relative to the published final table -- confirm before changing.
season_positions = {
    "2019-20": {
        "Liverpool": 1, "Man City": 2, "Man Utd": 3, "Chelsea": 4, "Leicester": 5,
        "Spurs": 6, "Wolves": 7, "Arsenal": 8, "Sheffield Utd": 9, "Burnley": 10,
        "Southampton": 11, "Everton": 12, "Newcastle": 13, "Crystal Palace": 14,
        "Brighton": 15, "West Ham": 16, "Aston Villa": 17, "AFC Bournemouth": 18,
        "Watford": 19, "Norwich City": 20,
    },
    "2020-21": {
        "Man City": 1, "Man Utd": 2, "Liverpool": 3, "Chelsea": 4, "Leicester": 5,
        "West Ham": 6, "Spurs": 7, "Arsenal": 8, "Leeds Utd": 9, "Everton": 10,
        "Aston Villa": 11, "Newcastle": 12, "Wolves": 13, "Crystal Palace": 14,
        "Southampton": 15, "Brighton": 16, "Burnley": 17, "Fulham": 18,
        "West Brom": 19, "Sheffield Utd": 20,
    },
    "2021-22": {
        "Man City": 1, "Liverpool": 2, "Chelsea": 3, "Spurs": 4, "Arsenal": 5,
        "Man Utd": 6, "West Ham": 7, "Leicester": 8, "Brighton": 9, "Wolves": 10,
        "Newcastle": 11, "Crystal Palace": 12, "Brentford": 13, "Aston Villa": 14,
        "Southampton": 15, "Everton": 16, "Leeds Utd": 17, "Burnley": 18,
        "Watford": 19, "Norwich City": 20,
    },
    "2022-23": {
        "Man City": 1, "Arsenal": 2, "Man Utd": 3, "Newcastle": 4, "Liverpool": 5,
        "Brighton": 6, "Aston Villa": 7, "Spurs": 8, "Brentford": 9, "Fulham": 10,
        "Crystal Palace": 11, "Chelsea": 12, "Wolves": 13, "West Ham": 14,
        "AFC Bournemouth": 15, "Nottingham Forest": 16, "Everton": 17,
        "Leeds Utd": 18, "Leicester": 19, "Southampton": 20,
    },
}

# Simplify split_test to directly split data without iterating
def split_test(data, gameweek):
    """
    Partition ``data`` on its "GW" column.

    Returns a pair ``(rows_for_gameweek, remaining_rows)``: the first frame
    holds the rows whose "GW" equals ``gameweek``, the second holds every
    other row.
    """
    in_gameweek = data["GW"] == gameweek
    return data[in_gameweek], data[~in_gameweek]

# Vectorized approach to check_win, drastically improving efficiency
def check_win(df):
    """
    Award match points per row: 3 for a win, 1 for a draw, 0 for a loss.

    A row counts as a win when the home score is higher and "was_home" is
    set, or when the away score is higher and "was_home" is not set. Equal
    scores are a draw regardless of venue. The result is a plain Python list
    aligned with the rows of ``df``.
    """
    away_margin = df["team_a_score"] - df["team_h_score"]
    at_home = df["was_home"]

    won = ((away_margin < 0) & at_home) | ((away_margin > 0) & ~at_home)

    # Start from win/loss, then let draws override.
    awarded = np.where(won, 3, 0)
    awarded = np.where(away_margin == 0, 1, awarded)
    return awarded.tolist()

# Season position retrieval using the season_positions dictionary
def get_season_pos(club, year):
    """
    Look up ``club``'s position in the ``season_positions`` table for ``year``.

    Returns 20 when either the season or the club is not present in the table.
    """
    standings = season_positions.get(year)
    if standings is None:
        return 20
    return standings.get(club, 20)

# Using pandas .mode() method directly
def find_mode(vals):
    """
    Return the first entry of ``vals.mode()``, or NaN if there is none.

    ``vals`` is assumed to be a pandas Series.
    """
    modes = vals.mode()
    if modes.empty:
        return np.nan
    return modes.iloc[0]


def find_mean(vals):
    """
    Average the entries of ``vals`` after dropping every entry equal to -1.

    Returns NaN when nothing is left after the -1 entries are removed
    (including when ``vals`` is empty).
    """
    kept = [entry for entry in vals if entry != -1]
    if not kept:
        return np.nan
    return np.mean(kept)


def find_max(vals):
    """Return the maximum of ``vals`` by delegating to its ``max()`` method."""
    largest = vals.max()
    return largest

# Calculating standard deviation with pandas .std()
def find_std(vals):
    """Return the standard deviation of ``vals`` by delegating to its ``std()`` method."""
    spread = vals.std()
    return spread


# Counting occurrences of a specific value
def find_value_count(vals, to_count):
    """
    Count how many times ``to_count`` occurs in ``vals``.

    Uses ``vals.value_counts()`` and returns 0 when ``to_count`` is absent.
    """
    return vals.value_counts().get(to_count, 0)

def get_opp_team(data):
    """
    Pick the opposing team for every row of ``data``.

    Where "team" equals "home_team" the opponent is "away_team"; otherwise it
    is "home_team". The result is a NumPy array aligned with the rows.
    """
    plays_at_home = data["team"] == data["home_team"]
    return np.where(plays_at_home, data["away_team"], data["home_team"])


def create_features(data, mean_features, std_features, no_last_stats):
    """
    Add per-row mean and standard-deviation columns derived from history columns.

    For each name in ``mean_features`` the column ``last_<N>_<name>`` is
    reduced with ``find_mean`` into ``mean_<name>_<N>``; for each name in
    ``std_features`` it is reduced with ``np.std`` into ``std_<name>_<N>``,
    where ``<N>`` is ``no_last_stats``. ``data`` is modified in place and
    also returned.
    """
    for feature in mean_features:
        history = data[f"last_{no_last_stats}_{feature}"]
        data[f"mean_{feature}_{no_last_stats}"] = history.apply(find_mean)

    for feature in std_features:
        history = data[f"last_{no_last_stats}_{feature}"]
        data[f"std_{feature}_{no_last_stats}"] = history.apply(np.std)

    return data

def calculate_rolling_stats(data, stat, n=3):
    """
    Add trailing-window mean and sum columns for ``stat``, computed per player.

    Rows are first ordered by 'name' and then 'date'. Within each 'name'
    group, a rolling window of up to ``n`` rows is aggregated and the result
    is shifted down by one row, so each row only sees earlier rows of the
    same group (the first row of every group is NaN).

    Parameters:
    data (pd.DataFrame): Frame with 'name', 'date' and ``stat`` columns.
    stat (str): Column to aggregate.
    n (int): Window length in rows.

    Returns:
    pd.DataFrame: A sorted copy of ``data`` with the extra columns
    ``<stat>_rolling_mean_last_<n>`` and ``<stat>_rolling_sum_last_<n>``.
    """
    ordered = data.sort_values(by=['name', 'date'])
    per_player = ordered.groupby('name')[stat]

    def _window(series):
        # min_periods=1 lets short histories still produce a value.
        return series.rolling(window=n, min_periods=1)

    # shift() pushes each aggregate onto the following row, excluding the current game.
    prior_mean = per_player.transform(lambda s: _window(s).mean().shift())
    prior_sum = per_player.transform(lambda s: _window(s).sum().shift())

    ordered[f'{stat}_rolling_mean_last_{n}'] = prior_mean
    ordered[f'{stat}_rolling_sum_last_{n}'] = prior_sum
    return ordered
	from GW_PointsPredictor.Utility_functions import *
	import numpy as np
	import pandas as pd
	from scipy import stats


	# Consolidated mapping for club positions across different seasons
	# Club name -> league position, keyed by season label ("YYYY-YY").
	# Lookups that miss a club fall back to 20 in get_season_pos(), so key
	# spellings must match across seasons ("Burnley" and "Chelsea" were misspelled).
	# NOTE(review): the 2022-23 entries for "Leeds Utd" (18) and "Leicester" (19)
	# look swapped relative to the published final table -- confirm before changing.
	season_positions = {
	    "2019-20": {
	        "Liverpool": 1, "Man City": 2, "Man Utd": 3, "Chelsea": 4, "Leicester": 5,
	        "Spurs": 6, "Wolves": 7, "Arsenal": 8, "Sheffield Utd": 9, "Burnley": 10,
	        "Southampton": 11, "Everton": 12, "Newcastle": 13, "Crystal Palace": 14,
	        "Brighton": 15, "West Ham": 16, "Aston Villa": 17, "AFC Bournemouth": 18,
	        "Watford": 19, "Norwich City": 20,
	    },
	    "2020-21": {
	        "Man City": 1, "Man Utd": 2, "Liverpool": 3, "Chelsea": 4, "Leicester": 5,
	        "West Ham": 6, "Spurs": 7, "Arsenal": 8, "Leeds Utd": 9, "Everton": 10,
	        "Aston Villa": 11, "Newcastle": 12, "Wolves": 13, "Crystal Palace": 14,
	        "Southampton": 15, "Brighton": 16, "Burnley": 17, "Fulham": 18,
	        "West Brom": 19, "Sheffield Utd": 20,
	    },
	    "2021-22": {
	        "Man City": 1, "Liverpool": 2, "Chelsea": 3, "Spurs": 4, "Arsenal": 5,
	        "Man Utd": 6, "West Ham": 7, "Leicester": 8, "Brighton": 9, "Wolves": 10,
	        "Newcastle": 11, "Crystal Palace": 12, "Brentford": 13, "Aston Villa": 14,
	        "Southampton": 15, "Everton": 16, "Leeds Utd": 17, "Burnley": 18,
	        "Watford": 19, "Norwich City": 20,
	    },
	    "2022-23": {
	        "Man City": 1, "Arsenal": 2, "Man Utd": 3, "Newcastle": 4, "Liverpool": 5,
	        "Brighton": 6, "Aston Villa": 7, "Spurs": 8, "Brentford": 9, "Fulham": 10,
	        "Crystal Palace": 11, "Chelsea": 12, "Wolves": 13, "West Ham": 14,
	        "AFC Bournemouth": 15, "Nottingham Forest": 16, "Everton": 17,
	        "Leeds Utd": 18, "Leicester": 19, "Southampton": 20,
	    },
	}

	# Simplify split_test to directly split data without iterating
	def split_test(data, gameweek):
	    """
	    Split ``data`` into the rows for one gameweek and all remaining rows.

	    Args:
	        data: DataFrame with a "GW" column.
	        gameweek: Value compared against the "GW" column.

	    Returns:
	        Tuple ``(data_gw, data_other_gw)``: rows whose "GW" equals
	        ``gameweek``, followed by rows whose "GW" differs.
	    """
	    data_gw = data[data["GW"] == gameweek]
	    data_other_gw = data[data["GW"] != gameweek]
	    return data_gw, data_other_gw

	# Vectorized approach to check_win, drastically improving efficiency
	def check_win(df):
	    """
	    Return match points for each row: 3 for a win, 1 for a draw, 0 for a loss.

	    Args:
	        df: DataFrame with "team_a_score", "team_h_score" and "was_home"
	            columns ("was_home" is assumed to be boolean).

	    Returns:
	        A list with one integer per row of ``df``.
	    """
	    # Positive means the away side scored more, negative the home side.
	    result = df["team_a_score"] - df["team_h_score"]
	    home_win = (result < 0) & df["was_home"]
	    away_win = (result > 0) & ~df["was_home"]
	    draw = result == 0
	    points = np.where(draw, 1, np.where(home_win | away_win, 3, 0))
	    return points.tolist()

	#season position retrieval to use the season_positions dictionary
	def get_season_pos(club, year):
	    """
	    Retrieve a club's position for a season from ``season_positions``.

	    Args:
	        club: Club name used as the inner dictionary key.
	        year: Season label used as the outer dictionary key.

	    Returns:
	        The stored position, or 20 when the season or club is not found.
	    """
	    return season_positions.get(year, {}).get(club, 20)

	# Using pandas .mode() method directly
	def find_mode(vals):
	    """
	    Return the first mode of ``vals``, or NaN when ``vals.mode()`` is empty.

	    Args:
	        vals: Assumed to be a pandas Series.
	    """
	    mode_val = vals.mode()
	    return mode_val.iloc[0] if not mode_val.empty else np.nan


	def find_mean(vals):
	    """
	    Compute the mean of the provided values, ignoring entries equal to -1.

	    Args:
	        vals: Iterable of numbers.

	    Returns:
	        The mean of the remaining values, or NaN when none remain.
	    """
	    vals_filtered = [val for val in vals if val != -1]
	    return np.mean(vals_filtered) if vals_filtered else np.nan


	def find_max(vals):
	    """Return ``vals.max()``; ``vals`` must provide a ``max()`` method."""
	    return vals.max()

	# Calculating standard deviation with pandas .std()
	def find_std(vals):
	    """Return ``vals.std()``; ``vals`` must provide a ``std()`` method."""
	    return vals.std()


	# Counting occurrences of a specific value
	def find_value_count(vals, to_count):
	    """
	    Return how many times ``to_count`` appears in ``vals``.

	    Args:
	        vals: Object providing ``value_counts()`` (e.g. a pandas Series).
	        to_count: Value whose occurrences are counted.

	    Returns:
	        The occurrence count, or 0 when the value does not appear.
	    """
	    counts = vals.value_counts()
	    return counts.get(to_count, 0)

	def get_opp_team(data):
	    """
	    Return the opponent team for each row of ``data``.

	    Args:
	        data: DataFrame with "team", "home_team" and "away_team" columns.

	    Returns:
	        NumPy array holding "away_team" where "team" equals "home_team",
	        and "home_team" otherwise.
	    """
	    return np.where(data["team"] == data["home_team"], data["away_team"], data["home_team"])


	def create_features(data, mean_features, std_features, no_last_stats):
	    """
	    Create mean and standard-deviation features from "last N" history columns.

	    Args:
	        data: DataFrame containing ``last_<N>_<feature>`` columns.
	        mean_features: Feature names reduced with ``find_mean``.
	        std_features: Feature names reduced with ``np.std``.
	        no_last_stats: The ``<N>`` used in source and target column names.

	    Returns:
	        ``data`` itself, with ``mean_<feature>_<N>`` and
	        ``std_<feature>_<N>`` columns added in place.
	    """
	    for feature in mean_features:
	        data[f"mean_{feature}_{no_last_stats}"] = data[f"last_{no_last_stats}_{feature}"].apply(find_mean)
	    for feature in std_features:
	        data[f"std_{feature}_{no_last_stats}"] = data[f"last_{no_last_stats}_{feature}"].apply(np.std)
	    return data

	def calculate_rolling_stats(data, stat, n=3):
	    """
	    Calculate rolling mean and sum of ``stat`` over the last N games per player.

	    Parameters:
	    data (pd.DataFrame): DataFrame with 'name', 'date' and ``stat`` columns.
	    stat (str): The statistic column name to calculate rolling stats for.
	    n (int): Number of games to include in rolling calculation.

	    Returns:
	    pd.DataFrame: Copy of ``data`` sorted by 'name' then 'date', with the
	    new columns ``<stat>_rolling_mean_last_<n>`` and
	    ``<stat>_rolling_sum_last_<n>``.
	    """
	    data = data.sort_values(by=['name', 'date'])

	    # Group by player and calculate rolling stats, shifting to not include the current game
	    data[f'{stat}_rolling_mean_last_{n}'] = data.groupby('name')[stat].transform(
	        lambda x: x.rolling(window=n, min_periods=1).mean().shift()
	    )
	    data[f'{stat}_rolling_sum_last_{n}'] = data.groupby('name')[stat].transform(
	        lambda x: x.rolling(window=n, min_periods=1).sum().shift()
	    )

	    return data