Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Create model.py
 Loading branch information
Showing
1 changed file
with
64 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number  Diff line number  Diff line change 

@@ -0,0 +1,64 @@
import pandas as pd  
from sklearn.linear_model import LinearRegression  
from sklearn.model_selection import train_test_split  
from sklearn.metrics import mean_squared_error, r2_score  
from sklearn.preprocessing import StandardScaler  
import numpy as np  


# Pipeline: load two trip CSV exports, align them on 'Date', fit a linear
# regression on two distance-band predictors, and print validation metrics.
#
# NOTE(review): the column labels 'Trips 125 Miles', 'Trips 25100 Miles' and
# 'Number of Trips 510' look like range labels whose hyphens were lost
# (e.g. '1-25', '25-100', '5-10') -- confirm against the CSV headers.

# Load the data
trips_full_data_df = pd.read_csv('Trips_Full Data.csv')
trips_by_distance_df = pd.read_csv('Trips_by_Distance.csv')

# Parse 'Date' as datetime in both frames so the two indexes compare equal
# when the frames are joined below.
trips_full_data_df['Date'] = pd.to_datetime(trips_full_data_df['Date'])
trips_by_distance_df['Date'] = pd.to_datetime(trips_by_distance_df['Date'])

# Set 'Date' as index to prepare for the index-based join
trips_full_data_df.set_index('Date', inplace=True)
trips_by_distance_df.set_index('Date', inplace=True)

# Predictor rows: keep only rows labelled 'Week 32' in 'Week of Date'.
week_32_full = trips_full_data_df[trips_full_data_df['Week of Date'] == 'Week 32']

# Target rows: keep rows whose 'Week' equals 31, then sum the target column
# per date so there is a single value for each 'Date'.
# NOTE(review): the two files are filtered on different week labels (32 vs 31);
# presumably they number weeks differently -- verify the dates actually overlap.
week_31_distance = trips_by_distance_df[trips_by_distance_df['Week'] == 31]
week_31_distance_grouped = week_31_distance.groupby('Date').agg({'Number of Trips 510': 'sum'})

# Join on the shared 'Date' index. DataFrame.join defaults to a left join, so
# predictor dates without a matching target row get NaN in the target column.
merged_data = week_32_full.join(week_31_distance_grouped)

# Drop incomplete rows. The check and the drop look at every column of the
# merged frame, not just the target.
if merged_data.isnull().values.any():
    print("Warning: NaN values found after merging. Check alignment of 'Date' columns.")
    merged_data.dropna(inplace=True)

# Fail early with an actionable message instead of an opaque error from
# train_test_split when nothing survived the join/dropna.
if merged_data.empty:
    raise ValueError(
        "No rows left after merging on 'Date' and dropping NaN values; "
        "check that both files cover the same dates."
    )

# Predictors and target
X = merged_data[['Trips 125 Miles', 'Trips 25100 Miles']]
y = merged_data['Number of Trips 510']

# Hold out 20% of the rows for validation; fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse it for validation, so
# validation statistics do not influence the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the validation set
y_pred = model.predict(X_val_scaled)

# Calculate R² on the validation set
r2 = r2_score(y_val, y_pred)

# Calculate RMSE on the validation set (square root of the MSE)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

# Coefficients are expressed in standardized-feature units because the model
# was fitted on the scaled predictors.
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")

print(f"Linear Regression  Coefficient of determination (R^2) on validation set: {r2}")
print(f"Linear Regression  Root Mean Square Error (RMSE) on validation set: {rmse}")