Skip to content
Permalink
21d0c3f3d9
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
64 lines (45 sloc) 2.33 KB
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
# Input files and column names used by the regression.
FULL_DATA_CSV = 'Trips_Full Data.csv'
BY_DISTANCE_CSV = 'Trips_by_Distance.csv'
PREDICTOR_COLUMNS = ['Trips 1-25 Miles', 'Trips 25-100 Miles']
TARGET_COLUMN = 'Number of Trips 5-10'


def load_trip_data(full_path=FULL_DATA_CSV, distance_path=BY_DISTANCE_CSV):
    """Read both trip CSV files and index each frame by its parsed 'Date'.

    Args:
        full_path: Path of the CSV holding the predictor columns.
        distance_path: Path of the CSV holding the target column.

    Returns:
        A ``(trips_full_data_df, trips_by_distance_df)`` tuple; both frames
        have a datetime 'Date' index so they can be joined on it.
    """
    trips_full_data_df = pd.read_csv(full_path)
    trips_by_distance_df = pd.read_csv(distance_path)

    # Convert 'Date' to datetime (no-op if it already is) and use it as the
    # index, which is what the later index-based join aligns on.
    for frame in (trips_full_data_df, trips_by_distance_df):
        frame['Date'] = pd.to_datetime(frame['Date'])
        frame.set_index('Date', inplace=True)

    return trips_full_data_df, trips_by_distance_df


def build_regression_frame(trips_full_data_df, trips_by_distance_df,
                           predictor_week='Week 32', target_week=31):
    """Join the predictor rows with the per-date summed target column.

    Both frames must be indexed by 'Date'.

    Args:
        trips_full_data_df: Frame with a 'Week of Date' column and the
            predictor columns.
        trips_by_distance_df: Frame with a 'Week' column and the target
            column.
        predictor_week: Value of 'Week of Date' selecting the predictor rows.
        target_week: Value of 'Week' selecting the target rows.

    Returns:
        The selected predictor rows with the target column (summed per date)
        joined on the 'Date' index. If the join produced any NaN, a warning
        is printed and every row containing a NaN is dropped.
    """
    predictors = trips_full_data_df[
        trips_full_data_df['Week of Date'] == predictor_week
    ]

    # Several rows may share one date in the distance data, so the target is
    # summed per date before joining.
    targets = trips_by_distance_df[trips_by_distance_df['Week'] == target_week]
    targets_by_date = targets.groupby('Date').agg({TARGET_COLUMN: 'sum'})

    # Left join on the 'Date' index: dates with no target row become NaN.
    merged_data = predictors.join(targets_by_date)
    if merged_data.isnull().values.any():
        print("Warning: NaN values found after merging. Check alignment of 'Date' columns.")
        merged_data = merged_data.dropna()
    return merged_data


def fit_and_evaluate(merged_data, test_size=0.2, random_state=42):
    """Fit a linear regression on standardized predictors and score it.

    Args:
        merged_data: Frame containing the predictor and target columns.
        test_size: Fraction of rows held out for validation.
        random_state: Seed for the train/validation split.

    Returns:
        A ``(model, r2, rmse)`` tuple, where ``r2`` and ``rmse`` are computed
        on the validation split.

    Raises:
        ValueError: If ``merged_data`` has no rows, i.e. nothing survived
            the merge and NaN removal.
    """
    if merged_data.empty:
        # Fail with the real cause instead of an opaque splitter error.
        raise ValueError(
            "No rows left after merging and dropping NaN values; "
            "check that the 'Date' values of both data sets overlap."
        )

    X = merged_data[PREDICTOR_COLUMNS]
    y = merged_data[TARGET_COLUMN]
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training split only, then apply the same
    # transform to the validation split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # Predict on the validation set and compute R^2 and RMSE there.
    y_pred = model.predict(X_val_scaled)
    r2 = r2_score(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return model, r2, rmse


def main():
    """Load the CSV files, fit the regression and print its metrics."""
    trips_full_data_df, trips_by_distance_df = load_trip_data()
    merged_data = build_regression_frame(trips_full_data_df, trips_by_distance_df)
    model, r2, rmse = fit_and_evaluate(merged_data)

    print(f"Coefficients: {model.coef_}")
    print(f"Intercept: {model.intercept_}")
    print(f"Linear Regression - Coefficient of determination (R^2) on validation set: {r2}")
    print(f"Linear Regression - Root Mean Square Error (RMSE) on validation set: {rmse}")


if __name__ == "__main__":
    main()