Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
DataProject/1c.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
197 lines (166 sloc)
9.26 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt
import time

# Parallel computing on (a) and (b) while measuring computational efficiency
# with Dask: each worker count below gets one timed pass of the full analysis.
n_processors = [10, 20]
n_processors_time = {}  # worker count -> elapsed wall-clock seconds
# Run the whole analysis once per worker count, timing each pass.
for processor in n_processors:
    start_time = time.time()

    import csv
    import numpy as np
    import pandas as pd
    import dask
    import dask.dataframe as dd
    import dask.array as da
    import dask.bag as db
    import seaborn as sns
    import matplotlib.pyplot as plt

    # BUG FIX: the loop variable was never used, so every iteration ran with
    # Dask's default settings and the 10-vs-20 "processors" timings were
    # meaningless. Pin the scheduler's worker count to this iteration's value.
    dask.config.set(num_workers=processor)

    # Load the CSV files using Dask.
    ddf = dd.read_csv('Trips_Full_Data.csv')
    ddd = dd.read_csv('Trips_by_Distance.csv',
                      usecols=['Week', 'Population Staying at Home', 'Number of Trips'],
                      dtype={'Population Staying at Home': 'float64',
                             'Number of Trips': 'float64'})

    # How many people are staying at home: overall average of the
    # staying-at-home column in Trips_Full_Data.
    average = ddf['Population Staying at Home'].mean().compute()
    # Cast to int so the value is not printed in scientific notation.
    average_int = average.astype('int64')
    print("Average number of people staying at home:", average_int)

    # Data cleaning: fill nulls, then round floats to ints so mean() behaves.
    ddd['Population Staying at Home'] = ddd['Population Staying at Home'].fillna(0)
    ddd['Population Staying at Home'] = ddd['Population Staying at Home'].round().astype('int64')

    # Group by 'Week' and average 'Population Staying at Home' per week.
    average_per_week = ddd.groupby('Week')['Population Staying at Home'].mean()
    avperweek = average_per_week.compute()
    # Convert to integer, again to avoid scientific-notation output.
    avperweek_int = avperweek.astype('int64')
    print("Average number of people staying at home per week", avperweek_int)

    fig = plt.figure(figsize=(10, 6))
    plt.bar(range(len(avperweek_int)), avperweek_int, width=0.4, color='orange')
    plt.xlabel("Week")
    plt.xticks(range(len(avperweek_int)), rotation=45)  # Display all week numbers
    plt.ylabel("Average number of people staying at home")
    plt.title("Average number of people staying at home per week")
    plt.rcParams.update({
        'text.color': "black",
        'axes.labelcolor': "black",
        'xtick.color': "black",
        'ytick.color': "black",
        'font.size': 10
    })  # Change text color and size for better readability
    plt.tight_layout()
    plt.show()  # plt.show() is required under IDLE, otherwise a runtime error occurs.
    # Documentation references: https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.astype.html

    # How far are people traveling when they don't stay at home: average
    # distance travelled by people who are not staying at home.
    df_data = dd.read_csv('Trips_Full_Data.csv', dtype={'Trips 1-25 Miles': 'float64',
                                                        'Trips 1-3 Miles': 'float64',
                                                        'Trips 10-25 Miles': 'float64',
                                                        'Trips 100-250 Miles': 'float64',
                                                        'Trips 25-50 Miles': 'float64',
                                                        'Trips 250-500 Miles': 'float64',
                                                        'Trips 3-5 Miles': 'float64',
                                                        'Trips 5-10 Miles': 'float64',
                                                        'Trips 50-100 Miles': 'float64',
                                                        'Trips <1 Mile': 'float64',
                                                        'Trips >=500 Miles': 'float64',
                                                        'Population Not Staying at Home': 'float64',
                                                        'Population Staying at Home': 'float64',
                                                        'Week': 'float64'
                                                        })
    Trips = [
        'Trips 1-25 Miles',
        'Trips 1-3 Miles',
        'Trips 10-25 Miles',
        'Trips 100-250 Miles',
        'Trips 100+ Miles',
        'Trips 25-100 Miles',
        'Trips 25-50 Miles',
        'Trips 250-500 Miles',
        'Trips 3-5 Miles',
        'Trips 5-10 Miles',
        'Trips 50-100 Miles',
        'Trips 500+ Miles'
    ]
    # Sum each distance-band column across the whole dataset.
    df_merge = df_data[Trips].sum().compute()
    print(df_merge)
    # NOTE(review): a discarded `ddf['Week of Date'].nunique().compute()` call
    # was removed here — its result was never used and it cost a full data pass.
    how_far = ddf.groupby(by="Week of Date")["Trips"].mean().compute()
    print("Dfmerge", df_merge)
    print("How far", how_far)
    # Convert the mean distance to int64 to avoid scientific-notation output.
    how_far_int = how_far.astype('int64')
    print("How far are people travelling when they don't stay home on average:", how_far_int)

    # Graph visualisation does not work without these.
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MaxNLocator
    import matplotlib.dates as mdates
    from datetime import datetime
    from pandas.plotting import register_matplotlib_converters
    register_matplotlib_converters()

    # Load the CSV file using Dask.
    ddd = dd.read_csv('Trips_by_Distance.csv',
                      usecols=['Date', 'Number of Trips 10-25', 'Number of Trips 50-100'],
                      dtype={'Number of Trips 10-25': 'float64',
                             'Number of Trips 50-100': 'float64'})
    ddd['Date'] = ddd['Date'].fillna(0)
    ddd['Number of Trips 10-25'] = ddd['Number of Trips 10-25'].fillna(0)
    ddd['Number of Trips 50-100'] = ddd['Number of Trips 50-100'].fillna(0)

    # Trips 10-25: combine duplicate dates, then identify the dates on which
    # > 10000000 people conducted 10-25 trips (compared against the
    # 50-100 trips case below).
    grouped_ddd = ddd.groupby('Date')
    combined_ddd = grouped_ddd['Number of Trips 10-25'].sum().reset_index()
    # BUG FIX: the threshold was 100000000 (one extra zero), contradicting the
    # original comment, the plot title, and the 50-100 case below.
    popfilter = combined_ddd[combined_ddd['Number of Trips 10-25'] > 10000000]
    # Pull the dates into a plain list so they are easier to plot.
    dates_list = popfilter['Date'].to_dask_array().compute().tolist()
    dates = [datetime.strptime(date_str, '%m/%d/%Y') for date_str in dates_list]

    # Scatter plot for Trips 10-25.
    plt.scatter(x=dates, y=popfilter["Number of Trips 10-25"].to_dask_array(lengths=True).compute())
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
    # NOTE(review): a DayLocator(interval=7) call was removed here — it was
    # immediately overridden by the MaxNLocator below and raised
    # "exceeds Locator.MAXTICKS" RuntimeErrors.
    plt.xticks(rotation=45)
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True, nbins=15))
    plt.gcf().autofmt_xdate()  # Reference: https://github.com/matplotlib/matplotlib/issues/20202/
    plt.title('Scatter plot of dates that > 10000000 people conducted 10-25 number of trips')
    plt.xlabel('Date')
    plt.ylabel('Number of Trips 10-25')
    plt.show()

    # Trips 50-100: same pipeline for the 50-100 trips band.
    grouped_ddd_50_100 = ddd.groupby('Date')
    combined_ddd_50_100 = grouped_ddd_50_100['Number of Trips 50-100'].sum().reset_index()
    # Identify the dates on which > 10000000 people conducted 50-100 trips.
    popfilter_50_100 = combined_ddd_50_100[combined_ddd_50_100['Number of Trips 50-100'] > 10000000]
    dates_list_50_100 = popfilter_50_100['Date'].to_dask_array().compute().tolist()
    dates_50_100 = [datetime.strptime(date_str, '%m/%d/%Y') for date_str in dates_list_50_100]

    # Scatter plot for Trips 50-100.
    plt.scatter(x=dates_50_100, y=popfilter_50_100["Number of Trips 50-100"].to_dask_array(lengths=True).compute())
    # Clamp the y-axis; without this a stray artifact line appears.
    plt.ylim(0, max(popfilter_50_100["Number of Trips 50-100"].compute()) * 1.1)
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
    plt.xticks(rotation=45)
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True, nbins=15))
    plt.gcf().autofmt_xdate()
    plt.title('Scatter plot of dates that > 10000000 people conducted 50-100 number of trips')
    plt.xlabel('Date')
    plt.ylabel('Number of Trips 50-100')
    plt.show()

    # Record how long this worker count took.
    dask_time = time.time() - start_time
    n_processors_time[processor] = dask_time
    print(f"Time taken for {processor} processors: {dask_time} seconds")
# Compare wall-clock time across the tested worker counts.
plt.figure(figsize=(8, 5))
processor_counts = list(n_processors_time.keys())
elapsed_seconds = [n_processors_time[count] for count in processor_counts]
plt.bar(processor_counts, elapsed_seconds, color=['orange', 'pink'])
plt.xlabel('Number of Processors')
plt.ylabel('Time Taken (seconds)')
plt.title('Time Taken for Processing with Different Numbers of Processors')
plt.xticks(processor_counts)
plt.tight_layout()
plt.grid(True)
plt.show()
# Taken from output
# Time taken for 10 processors: 17.375105142593384 seconds
# Time taken for 20 processors: 14.823020696640015 seconds