Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
5011CEM_SourceCode/Plotting for dask.py
Go to file
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
102 lines (73 sloc)
2.77 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import pandas as pd | |
import dask.dataframe as dd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
# Explicit dtype for each column of the cleaned trips-by-distance CSV:
# identifier/label columns are strings, population and trip counts are floats,
# and the calendar fields (week, month) are integers.
column_types = {
    **{
        label: 'str'
        for label in (
            'Level',
            'Date',
            'State FIPS',
            'State Postal Code',
            'County FIPS',
            'County Name',
        )
    },
    **{
        label: 'float64'
        for label in (
            'Population Staying at Home',
            'Population Not Staying at Home',
            'Number of Trips',
            'Number of Trips <1',
            'Number of Trips 1-3',
            'Number of Trips 3-5',
            'Number of Trips 5-10',
            'Number of Trips 10-25',
            'Number of Trips 25-50',
            'Number of Trips 50-100',
            'Number of Trips 100-250',
            'Number of Trips 250-500',
            'Number of Trips >=500',
        )
    },
    'Row ID': 'str',
    **{label: 'int64' for label in ('Week', 'Month')},
}
# Chunk size passed to dask as `blocksize` when reading the large CSV.
block_size = "20MB"

# Trips-by-distance data as a dask dataframe, read with the explicit
# per-column dtypes from `column_types`.
data_df = dd.read_csv(
    "Cleaned_Trips_by_Distance.csv",
    dtype=column_types,
    blocksize=block_size,
)

# The full-data file is loaded directly with pandas.
full_data = pd.read_csv("Trips_Full Data.csv")
def clean_dataframe_col(df, cols):
    """Drop every row that has a missing value in any of the given columns.

    Args:
        df: A dataframe exposing ``dropna(subset=...)`` (pandas or dask).
        cols: Iterable of column names that must be non-null in kept rows.

    Returns:
        The filtered dataframe. When ``cols`` is empty, ``df`` itself is
        returned unchanged.
    """
    required = list(cols)
    if not required:
        # Nothing to check, so there is nothing to drop.
        return df
    # A single dropna over all columns removes exactly the rows that dropping
    # column by column would, but in one pass instead of one per column.
    return df.dropna(subset=required)
def convert_to_Date(df, type_df):
    """Convert the 'Date' column of ``df`` to a datetime type, in place.

    Args:
        df: Dataframe holding a 'Date' column.
        type_df: ``"pd"`` to parse with ``pd.to_datetime``; ``"dd"`` to cast
            the column with ``astype('M8[us]')``. Any other value leaves
            ``df`` untouched.

    Returns:
        The same ``df`` object that was passed in.
    """
    if type_df == "dd":
        as_datetime = df['Date'].astype('M8[us]')
    elif type_df == "pd":
        as_datetime = pd.to_datetime(df['Date'])
    else:
        # Unrecognised frame type: hand the frame back unchanged.
        return df
    df['Date'] = as_datetime
    return df
def number_of_trips(df):
    """Print the sum of the 'Number of Trips' column.

    Args:
        df: Dataframe whose column sum exposes ``.compute()``
            (presumably a dask dataframe; verify against callers).
    """
    pending_sum = df['Number of Trips'].sum()
    grand_total = pending_sum.compute()
    print(f"Total number of trips: {grand_total}")
def _plot_weekly_stay_at_home(data_df):
    """Print and bar-chart the per-week mean of 'Population Staying at Home'."""
    # Rows without a stay-at-home figure are dropped before averaging.
    data = clean_dataframe_col(data_df, ['Population Staying at Home'])
    weekly_means = (
        data.groupby('Week')['Population Staying at Home'].mean().compute()
    )

    print("Weekly Means:\n", weekly_means)
    # The "total" is the mean of the weekly means, truncated to an integer.
    print("Total Mean:\n", int(weekly_means.mean()))

    plt.figure(figsize=(10, 6))
    weekly_means.plot(kind='bar')
    plt.title('BarChart of People Staying at Home vs Week')
    plt.xlabel('Week')
    plt.ylabel('Average Population Staying at Home')
    plt.show()


def _plot_average_trips_by_distance(full_data):
    """Bar-chart the column-wise mean of each trip-distance column."""
    trips_columns = [
        'Trips <1 Mile',
        'Trips 1-3 Miles',
        'Trips 3-5 Miles',
        'Trips 5-10 Miles',
        'Trips 10-25 Miles',
        'Trips 25-50 Miles',
        'Trips 50-100 Miles',
        'Trips 100-250 Miles',
        'Trips 250-500 Miles',
        'Trips 500+ Miles',
    ]
    trip_values = full_data[trips_columns].mean()

    plt.figure(figsize=(12, 8))
    sns.barplot(x=trip_values.index, y=trip_values.values, palette='viridis')
    plt.xlabel('Distance Range', fontsize=12)
    plt.ylabel('Average Number of Trips', fontsize=12)
    plt.title('Average Number of Trips for Each Distance Range', fontsize=14)
    # Long range labels are rotated so they stay legible.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


def clean_populations(data_df, full_data):
    """Report and plot stay-at-home and trip-distance averages.

    Produces two figures, in this order:
      1. a bar chart of the weekly mean 'Population Staying at Home' computed
         from ``data_df`` (also printed, together with the overall mean);
      2. a bar chart of the mean of each trip-distance column of
         ``full_data``.

    Args:
        data_df: Dataframe with 'Week' and 'Population Staying at Home'
            columns whose aggregations expose ``.compute()``.
        full_data: Dataframe containing the 'Trips ... Miles' columns.

    Returns:
        None. The function only prints and shows figures.
    """
    _plot_weekly_stay_at_home(data_df)
    _plot_average_trips_by_distance(full_data)
# Run the stay-at-home / trip-distance report on the two loaded datasets.
clean_populations(data_df=data_df, full_data=full_data)