Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
5011CEM_SourceCode/Model selection and testing.py
Go to file
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
101 lines (71 sloc)
2.82 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[1]: | |
import pandas as pd | |
import numpy as np | |
from sklearn.linear_model import LinearRegression | |
from sklearn.preprocessing import PolynomialFeatures | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.metrics import r2_score | |
# Load the inputs for the model comparison.
# Positional bounds (inclusive) of the rows taken from the filtered week.
start_index = 88
end_index = 94

# Target: rows of the cleaned data set whose 'Week' value is 31.
# NOTE(review): the variable is called week_32 but the filter value is 31 -
# confirm which of the two is intended.
df = pd.read_csv('Cleaned_Trips_by_Distance.csv')
week_32 = df.loc[df['Week'] == 31]
y = week_32.iloc[start_index:end_index + 1]['Number of Trips 10-25']

# Feature: the 25-100 mile trip counts from the full data set, reshaped to a
# single-column 2-D array.
# NOTE(review): x uses every row of this file while y is a 7-row slice -
# presumably the file has exactly 7 rows; verify.
df_full = pd.read_csv('Trips_Full Data.csv')
x = np.reshape(df_full['Trips 25-100 Miles'].values, (-1, 1))
# Function to fit and calculate regression models
def fit_and_evaluate_model(model, x, y, model_name):
    """Fit ``model`` on ``(x, y)`` and report its in-sample R-squared.

    The score is computed on the same data the model was fitted on, so it
    describes goodness of fit rather than performance on unseen data.

    Args:
        model: Estimator object exposing ``fit(x, y)`` and ``predict(x)``.
        x: Feature matrix passed to both ``fit`` and ``predict``.
        y: Target values passed to ``fit`` and compared against predictions.
        model_name: Label placed at the start of the printed report line.

    Returns:
        The value computed by ``r2_score(y, y_pred)``.
    """
    model.fit(x, y)
    y_pred = model.predict(x)
    r2 = r2_score(y, y_pred)
    print(f"{model_name} - R-squared: {r2:.2f}")
    # Return the score in addition to printing it so callers can compare
    # models programmatically; callers that ignore the result are unaffected.
    return r2
# Build the three candidate models first, then score them in turn.

# Plain linear regression on the raw feature.
linear_model = LinearRegression()

# Degree-2 polynomial regression: expand x with PolynomialFeatures, then fit a
# linear regression on the expanded features.
poly_features = PolynomialFeatures(degree=2)
x_poly = poly_features.fit_transform(x)
poly_model = LinearRegression()

# Random forest with 100 trees and a fixed random_state.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit each model and print its R-squared (linear, polynomial, random forest).
fit_and_evaluate_model(linear_model, x, y, "Linear Regression")
fit_and_evaluate_model(poly_model, x_poly, y, "Polynomial Regression (Degree=2)")
fit_and_evaluate_model(rf_model, x, y, "Random Forest Regression")
# In[2]: | |
import pandas as pd | |
import numpy as np | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.metrics import r2_score, mean_squared_error | |
from dask_ml.model_selection import train_test_split | |
# Read the target and feature data for the train/test evaluation.
# Inclusive positional row bounds applied to the filtered week.
start_index = 88
end_index = 94

# Target values: 'Number of Trips 10-25' for rows whose 'Week' value is 31.
# NOTE(review): the name week_32 does not match the filter value 31 - confirm
# which one is intended.
df = pd.read_csv('Cleaned_Trips_by_Distance.csv')
week_32 = df.loc[df['Week'] == 31]
y = week_32.iloc[start_index:end_index + 1]['Number of Trips 10-25']

# Feature values: 'Trips 25-100 Miles' from the full data set as one column.
# NOTE(review): presumably this file has as many rows as the y slice (7);
# verify, otherwise the split below receives inconsistent lengths.
df_full = pd.read_csv('Trips_Full Data.csv')
x = np.reshape(df_full['Trips 25-100 Miles'].values, (-1, 1))
# Shuffle and split: 80% of the samples for training, 20% held out for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

# Fit a 100-tree random forest on the training portion only.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(x_train, y_train)
# Training set: predict, score (R-squared and MSE), and report.
y_pred_train = rf_model.predict(x_train)
train_r2, train_mse = (
    r2_score(y_train, y_pred_train),
    mean_squared_error(y_train, y_pred_train),
)
print(
    f"Random Forest Regression - R-squared on Training Set: {train_r2:.2f}",
    f"Random Forest Regression - MSE on Training Set: {train_mse:.2f}",
    sep="\n",
)

# Test set: same metrics on the held-out samples.
y_pred_test = rf_model.predict(x_test)
test_r2, test_mse = (
    r2_score(y_test, y_pred_test),
    mean_squared_error(y_test, y_pred_test),
)
print(
    f"Random Forest Regression - R-squared on Test Set: {test_r2:.2f}",
    f"Random Forest Regression - MSE on Test Set: {test_mse:.2f}",
    sep="\n",
)
# In[ ]: | |
# In[ ]: | |