Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
5011-BIG-DATA/number1d.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
50 lines (40 sloc)
2.36 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Predict "Number of Trips" from merged mobility datasets with a Random Forest.

Pipeline: load two CSVs, merge them on "Population Staying at Home",
one-hot encode that join key, drop identifier/metadata columns, train a
RandomForestRegressor on an 80/20 split, and report test MSE plus
per-feature importances.

Requires "trips_by_distance.csv" and "Trips_Full_Data.csv" in the
working directory.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder  # NOTE(review): unused — pd.get_dummies is used below instead

# Step 1: Data preparation — load both source datasets.
trips_by_distance = pd.read_csv("trips_by_distance.csv")
trips_full_data = pd.read_csv("Trips_Full_Data.csv")

# Step 2: Merge datasets on the shared "Population Staying at Home" column.
# NOTE(review): joining on a population *count* matches rows only where the
# values are exactly equal — confirm this is the intended join key rather
# than a date or geography identifier.
merged_data = pd.merge(trips_by_distance, trips_full_data, on=["Population Staying at Home"])

# Step 3: One-hot encode the join-key column so it enters the model as
# indicator features rather than a raw magnitude (drop_first avoids the
# dummy-variable trap).
merged_data = pd.get_dummies(merged_data, columns=["Population Staying at Home"], drop_first=True)

# Step 4: Feature engineering — drop identifier/metadata columns that carry
# no predictive signal (the _x/_y suffixes come from the merge above).
merged_data.drop(
    ['County Name', 'Row ID', 'Week of Date', 'Level_y', 'Date_y',
     'Week Ending Date', 'Level_x', 'Date_x', 'State Postal Code',
     'Month of Date'],
    axis=1, inplace=True)

# Guard: the regressor needs a numeric feature matrix.
# BUG FIX: the original used `non_numeric_cols.any()` — calling .any() on an
# Index of string column names is invalid in modern pandas (non-boolean
# Index reductions raise TypeError); test emptiness with len() instead.
# NOTE(review): exclude=['float', 'int'] will also flag bool dummy columns
# if this pandas version's get_dummies emits dtype=bool — confirm intended.
non_numeric_cols = merged_data.select_dtypes(exclude=['float', 'int']).columns
if len(non_numeric_cols) > 0:
    print("Non-numeric values found in columns:", non_numeric_cols)
    raise ValueError("Non-numeric values found in input data. Please preprocess the data appropriately.")

# Step 5: Model selection — separate target from features, also dropping
# geographic/time codes that would leak identity rather than signal.
X = merged_data.drop(["Number of Trips", "State FIPS", "County FIPS", "Week", "Month"], axis=1)
y = merged_data["Number of Trips"]

# Step 6: Model training — 80/20 split, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 7: Model evaluation on the held-out test set.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Step 8: Model interpretation — per-feature importances from the fitted forest.
feature_importances = pd.DataFrame({"Feature": X.columns, "Importance": model.feature_importances_})
print(feature_importances)

# Step 9: Simulation (optional) — the trained model can score hypothetical
# trip-length scenarios, e.g.:
# simulated_data = model.predict(new_data)