Skip to content
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
Cannot retrieve contributors at this time
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Explicit per-column dtypes for the trips-by-distance CSV; handed to the CSV
# reader through its ``dtype=`` argument.
# Identifier-like columns (FIPS codes, postal code, Row ID) and the raw date
# are read as strings; every population/trip count is read as float64
# (presumably so missing values can be represented as NaN -- confirm against
# the data); Week and Month are read as int64.
column_types = {
    'Level': 'str',
    'Date': 'str',
    'State FIPS': 'str',
    'State Postal Code': 'str',
    'County FIPS': 'str',
    'County Name': 'str',
    'Population Staying at Home': 'float64',
    'Population Not Staying at Home': 'float64',
    'Number of Trips': 'float64',
    'Number of Trips <1': 'float64',
    'Number of Trips 1-3': 'float64',
    'Number of Trips 3-5': 'float64',
    'Number of Trips 5-10': 'float64',
    'Number of Trips 10-25': 'float64',
    'Number of Trips 25-50': 'float64',
    'Number of Trips 50-100': 'float64',
    'Number of Trips 100-250': 'float64',
    'Number of Trips 250-500': 'float64',
    'Number of Trips >=500': 'float64',
    'Row ID': 'str',
    'Week': 'int64',
    'Month': 'int64',
}
# Partition size handed to the dask CSV reader via ``blocksize=``.
block_size = "20MB"

# County/state level trips-by-distance data, read with dask using the
# explicit column dtypes declared in ``column_types``.
data_df = dd.read_csv(
    "Cleaned_Trips_by_Distance.csv",
    blocksize=block_size,
    dtype=column_types,
)

# Per-distance-range trip counts, read eagerly with pandas and default dtypes.
full_data = pd.read_csv("Trips_Full Data.csv")
def clean_dataframe_col(df, cols):
    """Return ``df`` without the rows that have a missing value in ``cols``.

    Args:
        df: A dataframe exposing ``dropna(subset=...)`` (pandas or dask).
        cols: Column names to check. A single column name given as a plain
            string is treated as a one-element list instead of being
            iterated character by character.

    Returns:
        The dataframe produced by dropping, column by column, every row whose
        value in that column is missing. The input object is not modified;
        with an empty ``cols`` it is returned as-is.
    """
    if isinstance(cols, str):
        cols = [cols]
    # One dropna per column: a row is removed as soon as any listed column
    # is missing for it.
    for col in cols:
        df = df.dropna(subset=[col])
    return df
def convert_to_Date(df, type_df):
    """Convert the ``'Date'`` column of ``df`` to a datetime dtype.

    Args:
        df: Dataframe with a ``'Date'`` column. The column is reassigned on
            this object, so the caller's dataframe is modified.
        type_df: ``"pd"`` to convert with ``pd.to_datetime``; ``"dd"`` to
            convert with ``astype('M8[us]')`` (the path used for dask
            dataframes). Any other value leaves ``df`` untouched.

    Returns:
        The same ``df`` object that was passed in.
    """
    if type_df == "pd":
        df['Date'] = pd.to_datetime(df['Date'])
    elif type_df == "dd":
        df['Date'] = df['Date'].astype('M8[us]')
    return df
def number_of_trips(df):
    """Print the sum of the ``'Number of Trips'`` column of ``df``.

    Args:
        df: Dataframe with a ``'Number of Trips'`` column. If summing the
            column yields an object with a ``compute()`` method (as the dask
            path does), it is computed first; otherwise the sum is used
            directly, so a plain pandas dataframe works as well.

    Returns:
        None. The total is written to standard output.
    """
    total_trips = df['Number of Trips'].sum()
    # Deferred results need an explicit compute(); eager ones are already
    # concrete values.
    if hasattr(total_trips, 'compute'):
        total_trips = total_trips.compute()
    print(f"Total number of trips: {total_trips}")
def clean_populations(data_df, full_data):
    """Report and chart stay-at-home population and trips per distance range.

    Two things are produced:

    1. From ``data_df``: rows with a missing 'Population Staying at Home'
       are dropped, the column is averaged per 'Week', the weekly means and
       the integer-truncated mean of those means are printed, and the weekly
       means are drawn as a bar chart.
    2. From ``full_data``: the mean of each 'Trips ... Miles' column is
       drawn as a seaborn bar plot, one bar per distance range.

    Both figures are created through ``matplotlib.pyplot`` and left open;
    this function neither shows nor saves them.

    Args:
        data_df: Dataframe with 'Week' and 'Population Staying at Home'
            columns whose grouped mean supports ``.compute()`` (dask).
        full_data: Dataframe containing the ten 'Trips ... Miles' columns
            listed in ``trips_columns`` below.

    Returns:
        None.
    """
    stay_home_col = 'Population Staying at Home'

    # --- Weekly average of people staying at home -------------------------
    data = clean_dataframe_col(data_df, [stay_home_col])
    weekly_means = data.groupby('Week')[stay_home_col].mean().compute()
    print("Weekly Means:\n", weekly_means)
    print("Total Mean:\n", int(weekly_means.mean()))

    plt.figure(figsize=(10, 6))
    # Draw the bars: without this call the titled figure stays empty.
    plt.bar(weekly_means.index, weekly_means.values)
    plt.title('BarChart of People Staying at Home vs Week')
    plt.xlabel('Week')
    plt.ylabel('Average Population Staying at Home')

    # --- Average number of trips per distance range -----------------------
    trips_columns = [
        'Trips <1 Mile', 'Trips 1-3 Miles', 'Trips 3-5 Miles',
        'Trips 5-10 Miles', 'Trips 10-25 Miles', 'Trips 25-50 Miles',
        'Trips 50-100 Miles', 'Trips 100-250 Miles', 'Trips 250-500 Miles',
        'Trips 500+ Miles',
    ]
    trip_values = full_data[trips_columns].mean()

    plt.figure(figsize=(12, 8))
    sns.barplot(x=trip_values.index, y=trip_values.values, palette='viridis')
    plt.xlabel('Distance Range', fontsize=12)
    plt.ylabel('Average Number of Trips', fontsize=12)
    plt.title('Average Number of Trips for Each Distance Range', fontsize=14)
    # Column names are long; slant them so they do not overlap.
    plt.xticks(rotation=45, ha='right')
# Run the analysis on the two datasets loaded at module level.
clean_populations(data_df=data_df, full_data=full_data)