Skip to content
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
Cannot retrieve contributors at this time
50 lines (40 sloc) 2.36 KB
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
# Step 1: Data Preparation
# Load the two trip datasets from the current working directory.
trips_by_distance = pd.read_csv("trips_by_distance.csv")
trips_full_data = pd.read_csv("Trips_Full_Data.csv")

# Step 2: Merge datasets on the shared "Population Staying at Home" column.
merged_data = pd.merge(trips_by_distance, trips_full_data, on=["Population Staying at Home"])

# Step 3: Handle non-numeric values using One-Hot Encoding.
merged_data = pd.get_dummies(merged_data, columns=["Population Staying at Home"], drop_first=True)

# Step 4: Feature Engineering — drop identifier and date columns that carry
# no predictive signal for the regression target.
merged_data.drop(
    ['County Name', 'Row ID', 'Week of Date', 'Level_y', 'Date_y',
     'Week Ending Date', 'Level_x', 'Date_x', 'State Postal Code',
     'Month of Date'],
    axis=1, inplace=True,
)

# Fail fast if any non-numeric columns survived the encoding step:
# RandomForestRegressor requires a fully numeric feature matrix.
non_numeric_cols = merged_data.select_dtypes(exclude=['float', 'int']).columns
if non_numeric_cols.any():
    # BUG FIX: these two lines were not indented under the `if`, so the
    # script (as written) was a syntax error; indented naively, it would
    # have raised ValueError unconditionally.
    print("Non-numeric values found in columns:", non_numeric_cols)
    raise ValueError("Non-numeric values found in input data. Please preprocess the data appropriately.")

# Step 5: Model Selection — split into feature matrix X and target y.
X = merged_data.drop(["Number of Trips", "State FIPS", "County FIPS", "Week", "Month"], axis=1)
y = merged_data["Number of Trips"]

# Step 6: Model Training (Random Forest Regressor).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# BUG FIX: the original line read
#     model = RandomForestRegressor(n_estimators=100, random_state=42), y_train)
# which is a syntax error merging construction and fitting. Construct the
# estimator first, then fit it on the training split.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 7: Model Evaluation and Validation on the held-out test split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Step 8: Model Interpretation (Optional for Random Forest)
# Random Forest exposes per-feature importances; collect them for inspection.
feature_importances = pd.DataFrame({"Feature": X.columns, "Importance": model.feature_importances_})

# Step 9: Simulation (Optional for Random Forest)
# You can simulate travel frequency based on different trip lengths using the
# trained model, e.g.:
# simulated_data = model.predict(new_data)