#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Explicit dtype for every column of "Cleaned_Trips_by_Distance.csv", passed to
# dd.read_csv so dask does not have to infer types per partition.
# Identifier-like columns (FIPS codes, postal code, Row ID) and Date are read
# as text -- presumably to keep codes intact; Date can be parsed afterwards
# with convert_to_Date. Population and trip counts are floats; Week and Month
# are integers.
column_types = {
    'Level': 'str',
    'Date': 'str',
    'State FIPS': 'str',
    'State Postal Code': 'str',
    'County FIPS': 'str',
    'County Name': 'str',
    'Population Staying at Home': 'float64',
    'Population Not Staying at Home': 'float64',
    'Number of Trips': 'float64',
    'Number of Trips <1': 'float64',
    'Number of Trips 1-3': 'float64',
    'Number of Trips 3-5': 'float64',
    'Number of Trips 5-10': 'float64',
    'Number of Trips 10-25': 'float64',
    'Number of Trips 25-50': 'float64',
    'Number of Trips 50-100': 'float64',
    'Number of Trips 100-250': 'float64',
    'Number of Trips 250-500': 'float64',
    'Number of Trips >=500': 'float64',
    'Row ID': 'str',
    'Week': 'int64',
    'Month': 'int64',
}
# Size of each block dask reads from the CSV.
block_size = "20MB"

# Trips-by-distance data as a dask DataFrame, typed via column_types.
data_df = dd.read_csv(
    "Cleaned_Trips_by_Distance.csv",
    dtype=column_types,
    blocksize=block_size,
)

# The second dataset is loaded in full with pandas.
full_data = pd.read_csv("Trips_Full Data.csv")
def clean_dataframe_col(df, cols):
    """Return *df* without the rows that have a missing value in any of *cols*.

    Works with any frame exposing a pandas-style ``dropna(subset=...)``
    (pandas and dask DataFrames both do).

    Args:
        df: Frame to filter. It is not modified; ``dropna`` returns a new frame.
        cols: A column name, or an iterable of column names, that must be
            non-null in every kept row.

    Returns:
        The filtered frame, or *df* itself when *cols* is empty.
    """
    # A bare string would otherwise be iterated character by character.
    subset = [cols] if isinstance(cols, str) else list(cols)
    if not subset:
        return df
    # A single dropna over all columns keeps exactly the rows the original
    # per-column loop kept, without one full pass per column.
    return df.dropna(subset=subset)
def convert_to_Date(df, type_df):
    """Convert the ``'Date'`` column of *df* to datetimes and return *df*.

    Args:
        df: Frame with a 'Date' column; the column is reassigned on this
            same object.
        type_df: ``"pd"`` to parse with ``pd.to_datetime``, ``"dd"`` to cast
            with ``astype('M8[us]')``. Any other value leaves *df* untouched.

    Returns:
        The same *df* object that was passed in.
    """
    if type_df == "pd":
        converted = pd.to_datetime(df['Date'])
    elif type_df == "dd":
        converted = df['Date'].astype('M8[us]')
    else:
        # Unrecognised frame kind: hand the frame back unchanged.
        return df

    df['Date'] = converted
    return df
def number_of_trips(df):
    """Print and return the sum of the ``'Number of Trips'`` column.

    Args:
        df: dask or pandas DataFrame containing a 'Number of Trips' column.

    Returns:
        The total of the column (earlier versions only printed it).
    """
    total_trips = df['Number of Trips'].sum()
    # dask hands back a lazy result that must be computed; pandas already
    # returns the value, which has no .compute().
    if hasattr(total_trips, 'compute'):
        total_trips = total_trips.compute()
    print(f"Total number of trips: {total_trips}")
    return total_trips
def clean_populations(data_df, full_data):
    """Report weekly stay-at-home averages and plot two bar charts.

    Prints the per-week mean of 'Population Staying at Home' and the integer
    mean of those weekly means, then draws:

    1. the weekly means as a bar chart, and
    2. the mean of each trip-distance column of *full_data*.

    Args:
        data_df: dask DataFrame with 'Week' and 'Population Staying at Home'
            columns (the grouped mean is materialised with ``.compute()``).
        full_data: pandas DataFrame containing the 'Trips ... Miles' columns
            listed in ``trips_columns``.

    Returns:
        None.
    """
    # Rows without a stay-at-home figure are excluded before averaging.
    data = clean_dataframe_col(data_df, ['Population Staying at Home'])
    weekly_means = data.groupby('Week')['Population Staying at Home'].mean().compute()
    print("Weekly Means:\n", weekly_means)
    print("Total Mean:\n", int(weekly_means.mean()))

    # Figure 1: weekly averages. The bars are drawn explicitly; without this
    # call the figure only had a title and a y-label.
    plt.figure(figsize=(10, 6))
    plt.bar(weekly_means.index, weekly_means.values)
    plt.xlabel('Week')
    plt.ylabel('Average Population Staying at Home')
    plt.title('BarChart of People Staying at Home vs Week')

    # Figure 2: average number of trips per distance bucket.
    trips_columns = [
        'Trips <1 Mile', 'Trips 1-3 Miles', 'Trips 3-5 Miles',
        'Trips 5-10 Miles', 'Trips 10-25 Miles', 'Trips 25-50 Miles',
        'Trips 50-100 Miles', 'Trips 100-250 Miles', 'Trips 250-500 Miles',
        'Trips 500+ Miles',
    ]
    trip_values = full_data[trips_columns].mean()
    plt.figure(figsize=(12, 8))
    sns.barplot(x=trip_values.index, y=trip_values.values, palette='viridis')
    plt.xlabel('Distance Range', fontsize=12)
    plt.ylabel('Average Number of Trips', fontsize=12)
    plt.title('Average Number of Trips for Each Distance Range', fontsize=14)
    plt.xticks(rotation=45, ha='right')

    # Display both figures; when run as a script nothing is shown otherwise.
    plt.show()
# Run the analysis on the frames loaded at module level: prints the weekly
# stay-at-home means and builds the figures.
clean_populations(data_df, full_data)