# Model selection and testing.py

#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Load the cleaned trips table and keep only the rows whose 'Week' value is 31.
# NOTE(review): the result is named week_32 although the filter is == 31 —
# confirm whether the 'Week' column is zero-based.
df = pd.read_csv('Cleaned_Trips_by_Distance.csv')
is_target_week = df['Week'] == 31
week_32 = df[is_target_week]

# Target: positional rows start_index..end_index (inclusive) of the filtered
# 'Number of Trips 10-25' column.
start_index = 88
end_index = 94
row_window = slice(start_index, end_index + 1)
y = week_32['Number of Trips 10-25'].iloc[row_window]

# Feature: the 'Trips 25-100 Miles' column of the second file, reshaped into a
# single-column 2-D array.
# NOTE(review): nothing here checks that x and y have the same number of rows —
# confirm the two files line up.
df_full = pd.read_csv('Trips_Full Data.csv')
x = df_full['Trips 25-100 Miles'].values.reshape(-1, 1)

#Function to fit and calculate regression models
def fit_and_evaluate_model(model, x, y, model_name):
    """Fit ``model`` on ``(x, y)`` and print its R-squared on that same data.

    ``model.fit`` is called on the object passed in, so the caller's estimator
    is fitted afterwards. The score is computed on the data used for fitting
    (an in-sample figure), printed with two decimals, and not returned.

    Args:
        model: Object exposing ``fit(x, y)`` and ``predict(x)``.
        x: Feature matrix handed to ``fit`` and ``predict``.
        y: Target values handed to ``fit`` and used as the reference for R-squared.
        model_name: Label printed in front of the score.
    """
    model.fit(x, y)
    r2 = r2_score(y, model.predict(x))
    print(f"{model_name} - R-squared: {r2:.2f}")

# Build the three candidate estimators first.
linear_model = LinearRegression()

# Degree-2 polynomial expansion of x, fed to a second linear regression.
poly_features = PolynomialFeatures(degree=2)
x_poly = poly_features.fit_transform(x)
poly_model = LinearRegression()

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit and report each candidate in a fixed order: linear, polynomial, random forest.
for estimator, features, label in (
    (linear_model, x, "Linear Regression"),
    (poly_model, x_poly, "Polynomial Regression (Degree=2)"),
    (rf_model, x, "Random Forest Regression"),
):
    fit_and_evaluate_model(estimator, features, y, label)


# In[2]:


import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from dask_ml.model_selection import train_test_split

# Load the cleaned trips table and keep only the rows whose 'Week' value is 31.
# NOTE(review): the result is named week_32 although the filter is == 31 —
# confirm whether the 'Week' column is zero-based.
df = pd.read_csv('Cleaned_Trips_by_Distance.csv')
is_target_week = df['Week'] == 31
week_32 = df[is_target_week]

# Target: positional rows start_index..end_index (inclusive) of the filtered
# 'Number of Trips 10-25' column.
start_index = 88
end_index = 94
row_window = slice(start_index, end_index + 1)
y = week_32['Number of Trips 10-25'].iloc[row_window]

# Feature: the 'Trips 25-100 Miles' column of the second file, reshaped into a
# single-column 2-D array.
# NOTE(review): nothing here checks that x and y have the same number of rows —
# confirm the two files line up.
df_full = pd.read_csv('Trips_Full Data.csv')
x = df_full['Trips 25-100 Miles'].values.reshape(-1, 1)

# Shuffle and split x/y into training and test portions (test_size=0.2),
# with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion only.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(x_train, y_train)

# Score the model on the rows it was trained on (R-squared and MSE).
y_pred_train = rf_model.predict(x_train)
train_r2 = r2_score(y_true=y_train, y_pred=y_pred_train)
train_mse = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
print(f"Random Forest Regression - R-squared on Training Set: {train_r2:.2f}")
print(f"Random Forest Regression - MSE on Training Set: {train_mse:.2f}")

# Score the model on the held-out rows (R-squared and MSE).
y_pred_test = rf_model.predict(x_test)
test_r2 = r2_score(y_true=y_test, y_pred=y_pred_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
print(f"Random Forest Regression - R-squared on Test Set: {test_r2:.2f}")
print(f"Random Forest Regression - MSE on Test Set: {test_mse:.2f}")


# In[ ]:


# In[ ]:
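	# NOTE(review): from this point the file repeats both notebook cells above with a
	# leading tab on every line; confirm whether this second copy is meant to be kept.
	#!/usr/bin/env python
	# coding: utf-8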

	# In[1]:


	import pandas as pd
	import numpy as np
	from sklearn.linear_model import LinearRegression
	from sklearn.preprocessing import PolynomialFeatures
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import r2_score

	# Load the cleaned trips table and keep only the rows whose 'Week' value is 31.
	# NOTE(review): the result is named week_32 although the filter is == 31 —
	# confirm whether the 'Week' column is zero-based.
	df = pd.read_csv('Cleaned_Trips_by_Distance.csv')
	is_target_week = df['Week'] == 31
	week_32 = df[is_target_week]

	# Target: positional rows start_index..end_index (inclusive) of the filtered
	# 'Number of Trips 10-25' column.
	start_index = 88
	end_index = 94
	row_window = slice(start_index, end_index + 1)
	y = week_32['Number of Trips 10-25'].iloc[row_window]

	# Feature: the 'Trips 25-100 Miles' column of the second file, reshaped into a
	# single-column 2-D array.
	# NOTE(review): nothing here checks that x and y have the same number of rows —
	# confirm the two files line up.
	df_full = pd.read_csv('Trips_Full Data.csv')
	x = df_full['Trips 25-100 Miles'].values.reshape(-1, 1)

	#Function to fit and calculate regression models
	def fit_and_evaluate_model(model, x, y, model_name):
	    """Fit ``model`` on ``(x, y)`` and print its R-squared on that same data.

	    ``model.fit`` is called on the object passed in, so the caller's estimator
	    is fitted afterwards. The score is computed on the data used for fitting
	    (an in-sample figure), printed with two decimals, and not returned.

	    Args:
	        model: Object exposing ``fit(x, y)`` and ``predict(x)``.
	        x: Feature matrix handed to ``fit`` and ``predict``.
	        y: Target values handed to ``fit`` and used as the reference for R-squared.
	        model_name: Label printed in front of the score.
	    """
	    # The body must be indented under the ``def``; without that indentation
	    # Python rejects the function with an IndentationError.
	    model.fit(x, y)
	    y_pred = model.predict(x)
	    r2 = r2_score(y, y_pred)
	    print(f"{model_name} - R-squared: {r2:.2f}")

	# Build the three candidate estimators first.
	linear_model = LinearRegression()

	# Degree-2 polynomial expansion of x, fed to a second linear regression.
	poly_features = PolynomialFeatures(degree=2)
	x_poly = poly_features.fit_transform(x)
	poly_model = LinearRegression()

	rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

	# Fit and report each candidate in a fixed order: linear, polynomial, random forest.
	for estimator, features, label in (
	    (linear_model, x, "Linear Regression"),
	    (poly_model, x_poly, "Polynomial Regression (Degree=2)"),
	    (rf_model, x, "Random Forest Regression"),
	):
	    fit_and_evaluate_model(estimator, features, y, label)


	# In[2]:


	import pandas as pd
	import numpy as np
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import r2_score, mean_squared_error
	from dask_ml.model_selection import train_test_split

	# Load the cleaned trips table and keep only the rows whose 'Week' value is 31.
	# NOTE(review): the result is named week_32 although the filter is == 31 —
	# confirm whether the 'Week' column is zero-based.
	df = pd.read_csv('Cleaned_Trips_by_Distance.csv')
	is_target_week = df['Week'] == 31
	week_32 = df[is_target_week]

	# Target: positional rows start_index..end_index (inclusive) of the filtered
	# 'Number of Trips 10-25' column.
	start_index = 88
	end_index = 94
	row_window = slice(start_index, end_index + 1)
	y = week_32['Number of Trips 10-25'].iloc[row_window]

	# Feature: the 'Trips 25-100 Miles' column of the second file, reshaped into a
	# single-column 2-D array.
	# NOTE(review): nothing here checks that x and y have the same number of rows —
	# confirm the two files line up.
	df_full = pd.read_csv('Trips_Full Data.csv')
	x = df_full['Trips 25-100 Miles'].values.reshape(-1, 1)

	# Shuffle and split x/y into training and test portions (test_size=0.2),
	# with a fixed random_state.
	x_train, x_test, y_train, y_test = train_test_split(
	    x,
	    y,
	    test_size=0.2,
	    shuffle=True,
	    random_state=42,
	)

	# Fit a 100-tree random forest on the training portion only.
	rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
	rf_model.fit(x_train, y_train)

	# Score the model on the rows it was trained on (R-squared and MSE).
	y_pred_train = rf_model.predict(x_train)
	train_r2 = r2_score(y_true=y_train, y_pred=y_pred_train)
	train_mse = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
	print(f"Random Forest Regression - R-squared on Training Set: {train_r2:.2f}")
	print(f"Random Forest Regression - MSE on Training Set: {train_mse:.2f}")

	# Score the model on the held-out rows (R-squared and MSE).
	y_pred_test = rf_model.predict(x_test)
	test_r2 = r2_score(y_true=y_test, y_pred=y_pred_test)
	test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
	print(f"Random Forest Regression - R-squared on Test Set: {test_r2:.2f}")
	print(f"Random Forest Regression - MSE on Test Set: {test_mse:.2f}")


	# In[ ]:





	# In[ ]: