import time
from datetime import datetime

import dask
import dask.dataframe as dd
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()  # Lets matplotlib plot datetime values cleanly

# Run the analyses for parts (a) and (b) in parallel under different Dask worker
# counts, and measure the computational efficiency of each run.
n_processors = [10, 20]
n_processors_time = {}
for processor in n_processors:
    start_time = time.time()
    # Tell Dask's local scheduler how many workers to use for this run; without
    # this, every iteration would run with the same default pool and the timings
    # would not reflect the processor count.
    dask.config.set(num_workers=processor)
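    # A hedged alternative for per-run worker control, assuming dask.distributed
    # is installed (not used in this script; sketch only):
    # from dask.distributed import Client
    # client = Client(n_workers=processor, threads_per_worker=1)
    # ...run the computations...
    # client.close()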
    # Load the CSV files using Dask
    ddf = dd.read_csv('Trips_Full_Data.csv')
    ddd = dd.read_csv(
        'Trips_by_Distance.csv',
        usecols=['Week', 'Population Staying at Home', 'Number of Trips'],
        dtype={'Population Staying at Home': 'float64', 'Number of Trips': 'float64'},
    )
    # How many people are staying at home? Average the 'Population Staying at Home'
    # column of Trips_Full_Data using Dask.
    average = ddf['Population Staying at Home'].mean().compute()  # Overall average
    # Convert the average to an int, otherwise it prints in scientific notation
    average_int = average.astype('int64')
    print("Average number of people staying at home:", average_int)
    # Data cleaning: fill null values
    ddd['Population Staying at Home'] = ddd['Population Staying at Home'].fillna(0)
    # Round and cast to int64 so the weekly figures come out as whole numbers
    ddd['Population Staying at Home'] = ddd['Population Staying at Home'].round().astype('int64')
    # Group by 'Week' and calculate the average of 'Population Staying at Home' for each week
    average_per_week = ddd.groupby('Week')['Population Staying at Home'].mean()
    avperweek = average_per_week.compute()
    # Convert to int, otherwise it prints in scientific notation
    avperweek_int = avperweek.astype('int64')
    print("Average number of people staying at home per week:", avperweek_int)
    plt.rcParams.update({
        'text.color': "black",
        'axes.labelcolor': "black",
        'xtick.color': "black",
        'ytick.color': "black",
        'font.size': 10
    })  # Set text colour and size before plotting so every element picks them up
    fig = plt.figure(figsize=(10, 6))
    plt.bar(range(len(avperweek_int)), avperweek_int, width=0.4, color='orange')
    plt.xlabel("Week")
    plt.xticks(range(len(avperweek_int)), rotation=45)  # Display all week numbers
    plt.ylabel("Average number of people staying at home")
    plt.title("Average number of people staying at home per week")
    plt.tight_layout()
    plt.show()  # plt.show() is required here; omitting it raises a runtime error when running under IDLE.
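    # In a non-interactive run the figure could be saved instead of shown
    # (hypothetical filename; sketch only):
    # plt.savefig('avg_staying_home_per_week.png', dpi=150)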
    # Documentation reference: https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.astype.html
    # How far are people travelling when they don't stay at home?
    # Average the trip counts across the distance bands.
    df_data = dd.read_csv('Trips_Full_Data.csv', dtype={
        'Trips 1-25 Miles': 'float64',
        'Trips 1-3 Miles': 'float64',
        'Trips 10-25 Miles': 'float64',
        'Trips 100-250 Miles': 'float64',
        'Trips 25-50 Miles': 'float64',
        'Trips 250-500 Miles': 'float64',
        'Trips 3-5 Miles': 'float64',
        'Trips 5-10 Miles': 'float64',
        'Trips 50-100 Miles': 'float64',
        'Trips <1 Mile': 'float64',
        'Trips >=500 Miles': 'float64',
        'Population Not Staying at Home': 'float64',
        'Population Staying at Home': 'float64',
        'Week': 'float64'
    })
    Trips = [
        'Trips 1-25 Miles',
        'Trips 1-3 Miles',
        'Trips 10-25 Miles',
        'Trips 100-250 Miles',
        'Trips 100+ Miles',
        'Trips 25-100 Miles',
        'Trips 25-50 Miles',
        'Trips 250-500 Miles',
        'Trips 3-5 Miles',
        'Trips 5-10 Miles',
        'Trips 50-100 Miles',
        'Trips 500+ Miles'
    ]
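    # The same selection could be built without hard-coding, assuming every
    # distance-band column name starts with 'Trips' (sketch only):
    # Trips = [c for c in df_data.columns if c.startswith('Trips')]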
    # Sum each distance-band trip column across all rows
    df_merge = df_data[Trips].sum().compute()
    print(df_merge)
    # Number of unique weeks in the dataset
    n_weeks = ddf['Week of Date'].nunique().compute()
    print("Number of unique weeks:", n_weeks)
    how_far = ddf.groupby(by="Week of Date")["Trips"].mean().compute()
    print("Dfmerge", df_merge)
    print("How far", how_far)
    # Convert the mean distance to int64
    how_far_int = how_far.astype('int64')
    # Print the mean number of trips per week
    print("How far are people travelling when they don't stay home on average:", how_far_int)
    # Load the trips-by-distance CSV using Dask
    ddd = dd.read_csv(
        'Trips_by_Distance.csv',
        usecols=['Date', 'Number of Trips 10-25', 'Number of Trips 50-100'],
        dtype={'Number of Trips 10-25': 'float64', 'Number of Trips 50-100': 'float64'},
    )
    # Drop rows with no date; filling dates with 0 would break the date parsing below
    ddd = ddd.dropna(subset=['Date'])
    ddd['Number of Trips 10-25'] = ddd['Number of Trips 10-25'].fillna(0)
    ddd['Number of Trips 50-100'] = ddd['Number of Trips 50-100'].fillna(0)
    # Trips 10-25
    grouped_ddd = ddd.groupby('Date')
    combined_ddd = grouped_ddd['Number of Trips 10-25'].sum().reset_index()  # Combining duplicate dates
    # Identify the dates on which more than 10,000,000 people made 10-25 trips,
    # to compare with the dates on which more than 10,000,000 people made 50-100 trips
    popfilter = combined_ddd[combined_ddd['Number of Trips 10-25'] > 10_000_000]
    dates_list = popfilter['Date'].to_dask_array().compute().tolist()  # Collect the dates into a plain list so they're easier to plot
    dates = [datetime.strptime(date_str, '%m/%d/%Y') for date_str in dates_list]
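    # Date parsing could also stay lazy inside the Dask graph rather than using
    # strptime on a materialised list (sketch, assuming the '%m/%d/%Y' format):
    # popfilter['Date'] = dd.to_datetime(popfilter['Date'], format='%m/%d/%Y')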
    # Plot the scatter plot
    plt.scatter(x=dates, y=popfilter["Number of Trips 10-25"].to_dask_array(lengths=True).compute())
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
    # DayLocator(interval=7) raised "RuntimeError: Locator attempting to generate
    # 1039 ticks from 736991.0 to 738029.0: exceeds Locator.MAXTICKS", so cap the
    # tick count with MaxNLocator instead
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True, nbins=15))
    plt.xticks(rotation=45)
    plt.gcf().autofmt_xdate()  # Reference: https://github.com/matplotlib/matplotlib/issues/20202/
    plt.title('Scatter plot of dates on which > 10,000,000 people made 10-25 trips')
    plt.xlabel('Date')
    plt.ylabel('Number of Trips 10-25')
    plt.show()
    # Trips 50-100
    grouped_ddd_50_100 = ddd.groupby('Date')
    combined_ddd_50_100 = grouped_ddd_50_100['Number of Trips 50-100'].sum().reset_index()
    # Identify the dates on which more than 10,000,000 people made 50-100 trips
    popfilter_50_100 = combined_ddd_50_100[combined_ddd_50_100['Number of Trips 50-100'] > 10_000_000]
    dates_list_50_100 = popfilter_50_100['Date'].to_dask_array().compute().tolist()
    dates_50_100 = [datetime.strptime(date_str, '%m/%d/%Y') for date_str in dates_list_50_100]
    # Plot the scatter plot for Trips 50-100
    plt.scatter(x=dates_50_100, y=popfilter_50_100["Number of Trips 50-100"].to_dask_array(lengths=True).compute())
    plt.ylim(0, max(popfilter_50_100["Number of Trips 50-100"].compute()) * 1.1)  # Without an explicit y-limit a stray line artefact appeared on the plot
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True, nbins=15))
    plt.xticks(rotation=45)
    plt.gcf().autofmt_xdate()
    plt.title('Scatter plot of dates on which > 10,000,000 people made 50-100 trips')
    plt.xlabel('Date')
    plt.ylabel('Number of Trips 50-100')
    plt.show()
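    # The two near-identical plot sections above could be factored into one helper;
    # hypothetical sketch ('plot_filtered_trips' is not part of the original script):
    # def plot_filtered_trips(col, threshold=10_000_000):
    #     combined = ddd.groupby('Date')[col].sum().reset_index()
    #     filtered = combined[combined[col] > threshold].compute()
    #     xs = [datetime.strptime(d, '%m/%d/%Y') for d in filtered['Date']]
    #     plt.scatter(x=xs, y=filtered[col])
    #     plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
    #     plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True, nbins=15))
    #     plt.title(f'Dates on which > {threshold:,} people made {col}')
    #     plt.show()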
    dask_time = time.time() - start_time
    n_processors_time[processor] = dask_time
    print(f"Time taken for {processor} processors: {dask_time} seconds")
# Plot the time taken for each processor count
plt.figure(figsize=(8, 5))
plt.bar(list(n_processors_time.keys()), list(n_processors_time.values()), color=['orange', 'pink'])
plt.xlabel('Number of Processors')
plt.ylabel('Time Taken (seconds)')
plt.title('Time Taken for Processing with Different Numbers of Processors')
plt.xticks(list(n_processors_time.keys()))
plt.grid(True)
plt.tight_layout()
plt.show()
# Example output from one run (timings are machine-dependent):
# Time taken for 10 processors: 17.375105142593384 seconds
# Time taken for 20 processors: 14.823020696640015 seconds
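# From these figures, doubling the workers gives roughly a 1.17x speedup
# (17.375 / 14.823), i.e. far from linear scaling; the work is likely dominated
# by reading the CSVs rather than by computation. A quick check (sketch only):
# print(f"Speedup: {n_processors_time[10] / n_processors_time[20]:.2f}x")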