Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BIG-data/model.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
64 lines (45 sloc)
2.33 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Fit a linear regression that predicts the per-date total of
# 'Number of Trips 5-10' from two trip-distance features, then print the
# fitted parameters and the validation R^2 / RMSE.

# Columns the model actually consumes; everything else in the merged frame
# is ignored, so NaNs elsewhere must not cause rows to be discarded.
FEATURE_COLUMNS = ['Trips 1-25 Miles', 'Trips 25-100 Miles']
TARGET_COLUMN = 'Number of Trips 5-10'
MODEL_COLUMNS = FEATURE_COLUMNS + [TARGET_COLUMN]

# Load the data
trips_full_data_df = pd.read_csv('Trips_Full Data.csv')
trips_by_distance_df = pd.read_csv('Trips_by_Distance.csv')

# Parse 'Date' in both frames and use it as the index so they can be joined on it
trips_full_data_df['Date'] = pd.to_datetime(trips_full_data_df['Date'])
trips_by_distance_df['Date'] = pd.to_datetime(trips_by_distance_df['Date'])
trips_full_data_df.set_index('Date', inplace=True)
trips_by_distance_df.set_index('Date', inplace=True)

# Predictor rows: entries labelled 'Week 32' in the full-data file
week_32_full = trips_full_data_df[trips_full_data_df['Week of Date'] == 'Week 32']

# Target: rows with Week == 31 in the by-distance file, summed per date
week_31_distance = trips_by_distance_df[trips_by_distance_df['Week'] == 31]
week_31_distance_grouped = week_31_distance.groupby('Date').agg({TARGET_COLUMN: 'sum'})

# Left-join the per-date target onto the predictor rows by their 'Date' index.
# NOTE(review): predictors are filtered to 'Week 32' and the target to
# Week == 31; the join only produces usable rows if those two filters select
# overlapping dates (e.g. the files number weeks differently) -- confirm.
merged_data = week_32_full.join(week_31_distance_grouped)

# Only NaNs in the columns the model uses matter; dropping on every column
# would throw away otherwise valid rows because of unrelated missing values.
if merged_data[MODEL_COLUMNS].isnull().values.any():
    print("Warning: NaN values found after merging. Check alignment of 'Date' columns.")
merged_data = merged_data.dropna(subset=MODEL_COLUMNS)

# Fail early with an actionable message instead of letting train_test_split
# raise an opaque error on an empty (or single-row) frame.
if len(merged_data) < 2:
    raise ValueError(
        f"Need at least 2 complete rows to build train/validation sets, "
        f"found {len(merged_data)}. Check that the 'Date' values of the "
        f"Week 32 predictors and Week 31 targets overlap."
    )

X = merged_data[FEATURE_COLUMNS]
y = merged_data[TARGET_COLUMN]

# Hold out 20% of the rows for validation; fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to
# the validation rows so no validation statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the validation set
y_pred = model.predict(X_val_scaled)

# Calculate R² on the validation set
r2 = r2_score(y_val, y_pred)

# Calculate RMSE on the validation set as the square root of the MSE
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")
print(f"Linear Regression - Coefficient of determination (R^2) on validation set: {r2}")
print(f"Linear Regression - Root Mean Square Error (RMSE) on validation set: {rmse}")