Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
5011-BIG-DATA/number1d.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
50 lines (40 sloc)
2.36 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Predict "Number of Trips" from merged mobility datasets with a Random Forest.

Pipeline: load two CSVs, merge them on "Population Staying at Home",
one-hot encode that join key, drop identifier/metadata columns, train a
RandomForestRegressor on an 80/20 split, and report test MSE plus
per-feature importances.

Requires "trips_by_distance.csv" and "Trips_Full_Data.csv" in the
working directory.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder  # NOTE(review): unused — pd.get_dummies is used below instead

# Step 1: Data preparation — load both source datasets.
trips_by_distance = pd.read_csv("trips_by_distance.csv")
trips_full_data = pd.read_csv("Trips_Full_Data.csv")

# Step 2: Merge datasets on the shared "Population Staying at Home" column.
# NOTE(review): joining on a population *count* matches rows only where the
# values are exactly equal — confirm this is the intended join key rather
# than a date or geography identifier.
merged_data = pd.merge(trips_by_distance, trips_full_data, on=["Population Staying at Home"])

# Step 3: One-hot encode the join-key column so it enters the model as
# indicator features rather than a raw magnitude (drop_first avoids the
# dummy-variable trap).
merged_data = pd.get_dummies(merged_data, columns=["Population Staying at Home"], drop_first=True)

# Step 4: Feature engineering — drop identifier/metadata columns that carry
# no predictive signal (the _x/_y suffixes come from the merge above).
merged_data.drop(
    ['County Name', 'Row ID', 'Week of Date', 'Level_y', 'Date_y',
     'Week Ending Date', 'Level_x', 'Date_x', 'State Postal Code',
     'Month of Date'],
    axis=1, inplace=True)

# Guard: the regressor needs a numeric feature matrix.
# BUG FIX: the original used `non_numeric_cols.any()` — calling .any() on an
# Index of string column names is invalid in modern pandas (non-boolean
# Index reductions raise TypeError); test emptiness with len() instead.
# NOTE(review): exclude=['float', 'int'] will also flag bool dummy columns
# if this pandas version's get_dummies emits dtype=bool — confirm intended.
non_numeric_cols = merged_data.select_dtypes(exclude=['float', 'int']).columns
if len(non_numeric_cols) > 0:
    print("Non-numeric values found in columns:", non_numeric_cols)
    raise ValueError("Non-numeric values found in input data. Please preprocess the data appropriately.")

# Step 5: Model selection — separate target from features, also dropping
# geographic/time codes that would leak identity rather than signal.
X = merged_data.drop(["Number of Trips", "State FIPS", "County FIPS", "Week", "Month"], axis=1)
y = merged_data["Number of Trips"]

# Step 6: Model training — 80/20 split, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 7: Model evaluation on the held-out test set.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Step 8: Model interpretation — per-feature importances from the fitted forest.
feature_importances = pd.DataFrame({"Feature": X.columns, "Importance": model.feature_importances_})
print(feature_importances)

# Step 9: Simulation (optional) — the trained model can score hypothetical
# trip-length scenarios, e.g.:
# simulated_data = model.predict(new_data)