# DataProject/1dold.py
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
# Referenced Aula and the code Documentation | |
# Load the two trip datasets; the paths are relative to the working directory.
df_full = pd.read_csv('Trips_Full_Data.csv')
df_distance = pd.read_csv('Trips_by_Distance.csv')

# Drop every row of the full dataset that contains a missing value.
# dropna() leaves no NaN behind, so no fillna() is needed afterwards.
df_full = df_full.dropna()

# Take the first run of digits in 'Week of Date' as an integer week number.
# https://www.geeksforgeeks.org/python-pandas-series-str-extract/
df_full['Week'] = (
    df_full['Week of Date'].str.extract(r'(\d+)', expand=False).astype(int)
)

# Keep only the rows for week 32.
df_full_week32 = df_full[df_full['Week'] == 32]

# Inner-join the week-32 rows to the distance data on 'Level'.
# https://www.w3schools.com/python/pandas/ref_df_merge.asp
# NOTE(review): 'Level' is not a per-row key (the author notes the rows are
# all 'national'), so this join pairs each week-32 row with every
# df_distance row that has the same Level. Columns taken from the two frames
# are then unrelated row by row, which would explain an R^2 of ~0 in the
# regressions below. TODO confirm whether a date/week column should be part
# of the join key.
merged_df = pd.merge(df_full_week32, df_distance, on='Level', how='inner')
# https://www.w3schools.com/python/pandas/ref_df_merge.asp | |
# People vs Distance: | |
# Predictor: the 'Trips 25-100 Miles' column, selected with a list so X stays
# a 2-D frame (scikit-learn estimators take a feature matrix).
# Response: the 'Number of Trips 10-25' column as a 1-D series.
X = merged_df[['Trips 25-100 Miles']]
y = merged_df['Number of Trips 10-25']

# Linear model: least-squares fit of y on the single predictor.
model = LinearRegression()
model.fit(X, y)
r_sq = model.score(X, y)  # R^2 measured on the same rows used for fitting
y_pred = model.predict(X)

print("Linear model:")
print("Coefficient of determination (R^2):", r_sq)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
print("Predicted response:\n", y_pred)
# The R^2 value is 0, so I don't think the relationship is linear. I am going to use a polynomial model now.
# Degree-2 polynomial regression: expand the single predictor into the
# columns [1, x, x^2], then fit an ordinary linear model on those columns.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

poly_model = LinearRegression()
poly_model.fit(X_poly, y)
y_pred = poly_model.predict(X_poly)
r_sq = poly_model.score(X_poly, y)  # R^2 on the fitting data, not held-out data

print("Polynomial model:")
print("Predicted response:\n", y_pred)
print("Intercept:", poly_model.intercept_)
print("Coefficients:", poly_model.coef_)
print("Coefficient of determination (R^2):", r_sq)
# Linear model: | |
# ('Coefficient of determination (R^2):', 0.0)
# ('Intercept:', 179005342.22197562) | |
# ('Coefficients:', array([-4.52300271e-16])) | |
# ('Predicted response:\n', array([1.79005342e+08, 1.79005342e+08, 1.79005342e+08, ..., | |
# 1.79005342e+08, 1.79005342e+08, 1.79005342e+08])) | |
# Polynomial model: | |
# ('Predicted response:\n', array([1.79005342e+08, 1.79005342e+08, 1.79005342e+08, ..., | |
# 1.79005342e+08, 1.79005342e+08, 1.79005342e+08])) | |
# ('Intercept:', 179005345.47188395) | |
# ('Coefficients:', array([ 0.00000000e+00, -7.46003515e-08, 4.26695402e-16])) | |
# ('Coefficient of determination (R^2):', -2.220446049250313e-16) | |
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the degree-2 expansion on the training rows only, then apply that same
# fitted transformer to the test rows.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Initialize and fit the polynomial regression model on the training split.
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)
y_pred = poly_model.predict(X_test_poly)

# Evaluate the model: both metrics compare held-out test responses with the
# model's predictions for those same test rows.
r_squared = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("Trained model R value (R^2):", r_squared)
print("Mean squared error:", mse)