Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
DataProject/1a.py
Go to file
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
101 lines (83 sloc)
3.74 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import numpy as np
import pandas as pd
from IPython.display import display
# NOTE: the next two lines bind the same name `dd`; both are kept as in the
# original file.
from dask import dataframe as dd
import dask.dataframe as dd
import dask.array as da
import dask.bag as db
# Histogram visualisation does not work without this: the Agg backend is
# selected before pyplot is imported so figures can be written to image files.
import matplotlib
matplotlib.use('Agg')  # Use the Agg backend
import seaborn as sns
import matplotlib.pyplot as plt
# Load the CSV files using Dask (lazy: nothing is read until .compute()).
ddf = dd.read_csv('Trips_Full_Data.csv')
# Only the three columns used below are read from the by-distance file. The
# two numeric columns are declared as float64 -- presumably so that rows with
# missing values do not break Dask's dtype inference (TODO confirm).
ddd = dd.read_csv(
    'Trips_by_Distance.csv',
    usecols=['Week', 'Population Staying at Home', 'Number of Trips'],
    dtype={
        'Population Staying at Home': 'float64',
        'Number of Trips': 'float64',
    },
)

# How many people are staying at home: overall mean of the
# 'Population Staying at Home' column of the full-data file.
average = ddf['Population Staying at Home'].mean().compute()
# Truncate to an integer so the value is not printed in standard form.
# int() works whether compute() hands back a NumPy or a plain Python float,
# whereas .astype() only exists on NumPy scalars.
average_int = int(average)
print("Average number of people staying at home:", average_int)
# Data cleaning
# Missing values are replaced with 0 so that the integer cast below cannot
# fail on NaN.
# NOTE(review): zero-filling pulls the weekly means down wherever data is
# missing; mean() would skip NaN on its own. Kept as the author's cleaning
# choice -- confirm this is the intended treatment.
ddd['Population Staying at Home'] = ddd['Population Staying at Home'].fillna(0)
# Round to whole people and store as int64 (mean() itself also works on
# floats; the cast only normalises the column).
ddd['Population Staying at Home'] = ddd['Population Staying at Home'].round().astype('int64')

# Group by 'Week' and calculate the average of 'Population Staying at Home'
# for each week.
average_per_week = ddd.groupby('Week')['Population Staying at Home'].mean()
# sort_index() pins the result to ascending week order, which the positional
# bar chart drawn from this series relies on.
avperweek = average_per_week.compute().sort_index()

# Integer copy so the values are not printed in standard form.
avperweek_int = avperweek.astype('int64')
print("Average number of people staying at home per week", avperweek_int)
# Bar chart: average number of people staying at home for each week.
# The text styling is applied BEFORE the figure is built, because rcParams
# only influence artists created after the update.
plt.rcParams.update({
    'text.color': "black",
    'axes.labelcolor': "black",
    'xtick.color': "black",
    'ytick.color': "black",
    'font.size': 10,
})  # Change text color and size for better readability

fig = plt.figure(figsize=(10, 6))
week_positions = range(len(avperweek_int))
plt.bar(week_positions, avperweek_int, width=0.4, color='orange')
plt.xlabel("Week")
# Label every bar with its actual 'Week' value (the series index) rather than
# with its position in the series.
plt.xticks(week_positions, avperweek_int.index, rotation=45)
plt.ylabel("Average number of people staying at home")
plt.title("Average number of people staying at home per week")
plt.tight_layout()
plt.savefig('bar_plot1.png')
# Release the figure; pyplot otherwise keeps it alive for the whole run.
plt.close(fig)
# Histogram: distribution of the weekly averages computed above.
fig = plt.figure(figsize=(10, 6))
plt.hist(avperweek, bins=44, color='orange', edgecolor='black')
# In a histogram the x axis carries the measured value and the y axis the
# number of observations (here: weeks) that fall into each bin.
plt.xlabel("Average number of people staying at home")
plt.ylabel("Number of weeks")
plt.title("Average number of people staying at home per week")
plt.grid(True)
plt.tight_layout()
plt.savefig('histogram_plot.png')  # Save the plot as an image file
# Second copy under the original alternative name; no extension is given, so
# matplotlib falls back to its default output format.
plt.savefig("histogram1a2")
# Release the figure; pyplot otherwise keeps it alive for the whole run.
plt.close(fig)
# Documentation references: https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.astype.html

# How far are people travelling when they don't stay at home.
# NOTE(review): the quantity averaged here is the 'Number of Trips' column,
# i.e. a trip count per week, not a distance -- confirm this is the intended
# proxy for "how far".
# sort_index() pins the result to ascending week order for the bar chart
# drawn from it.
how_far = ddd.groupby(by="Week")["Number of Trips"].mean().compute().sort_index()

# Integer copy so the values are not printed in standard form.
how_far_int = how_far.astype('int64')

# Print the weekly means.
print("How far are people travelling when they don't stay home on average:", how_far_int)
# Bar plot: weekly mean of 'Number of Trips'.
# The text styling is applied BEFORE the figure is built, because rcParams
# only influence artists created after the update.
plt.rcParams.update({
    'text.color': "black",
    'axes.labelcolor': "black",
    'xtick.color': "black",
    'ytick.color': "black",
    'font.size': 10,
})  # Change text color and size for better readability

fig = plt.figure(figsize=(10, 6))
week_positions = range(len(how_far_int))
plt.bar(week_positions, how_far_int, width=0.4, color='orange')
plt.xlabel("Week")
# Label every bar with its actual 'Week' value (the series index) rather than
# with its position in the series.
plt.xticks(week_positions, how_far_int.index, rotation=45)
# The plotted values are per-week means of 'Number of Trips', so the axis is
# labelled as a trip count rather than a distance.
plt.ylabel("Average number of trips")
plt.title("How far are people travelling when they don't stay home")
plt.tight_layout()
plt.savefig('bar_plot2.png')  # Save the plot as an image file
# Release the figure; pyplot otherwise keeps it alive for the whole run.
plt.close(fig)