# 1b.py

import csv
import numpy as np
import pandas as pd
from dask import dataframe as dd
import dask.dataframe as dd
# Graph visualisation does not work without this.
import matplotlib
matplotlib.use('Agg')  # Use the Agg backend
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib.dates as mdates
from datetime import datetime
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()


# Load the CSV file using Dask; only the three columns used below are read.
ddd = dd.read_csv(
    'Trips_by_Distance.csv',
    usecols=['Date', 'Number of Trips 10-25', 'Number of Trips 50-100'],
    dtype={
        'Number of Trips 10-25': 'float64',
        'Number of Trips 50-100': 'float64',
    },
)

# Data cleaning
# Rows without a date cannot be placed on a time axis. Filling them with the
# integer 0 (as was done before) puts a non-string value into 'Date', which can
# make the later datetime.strptime() call raise TypeError, so drop them instead.
ddd = ddd.dropna(subset=['Date'])
# Missing trip counts are treated as zero trips.
ddd['Number of Trips 10-25'] = ddd['Number of Trips 10-25'].fillna(0)
ddd['Number of Trips 50-100'] = ddd['Number of Trips 50-100'].fillna(0)

# Trips 10-25
# Sum the trip counts per date, combining rows that share the same date.
grouped_ddd = ddd.groupby('Date')
combined_ddd = grouped_ddd['Number of Trips 10-25'].sum().reset_index()

# Identify the dates whose summed 'Number of Trips 10-25' exceeds 10000000, to
# compare them with the dates whose 'Number of Trips 50-100' exceeds 10000000.
# The filter previously used 100000000 (one zero too many), contradicting the
# threshold stated in the plot title and used for the 50-100 filter.
popfilter = combined_ddd[combined_ddd['Number of Trips 10-25'] > 10000000]

# Evaluate the Dask graph once, so the dates and the counts come from the same
# computed result instead of two separate passes over the CSV.
popfilter_pd = popfilter.compute()
dates_list = popfilter_pd['Date'].tolist()
dates = [datetime.strptime(date_str, '%m/%d/%Y') for date_str in dates_list]

# Plot the scatter plot on its own figure so nothing leaks into later plots.
fig, ax = plt.subplots()
ax.scatter(x=dates, y=popfilter_pd['Number of Trips 10-25'].to_numpy())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
# Cap the number of ticks. The weekly DayLocator that used to be set here was
# replaced by this locator straight away and was reported to exceed
# Locator.MAXTICKS, so it has been removed.
ax.xaxis.set_major_locator(MaxNLocator(integer=True, nbins=15))
# autofmt_xdate() applies its own rotation (default 30), so the intended
# 45 degrees is passed explicitly.
# Reference: https://github.com/matplotlib/matplotlib/issues/20202/
fig.autofmt_xdate(rotation=45)
ax.set_title('Scatter plot of dates that > 10000000 people conducted 10-25 number of trips')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Trips 10-25')
fig.savefig('scatter1.png')
plt.close(fig)

# Trips 50-100
# Sum the trip counts per date, combining rows that share the same date.
grouped_ddd_50_100 = ddd.groupby('Date')
combined_ddd_50_100 = grouped_ddd_50_100['Number of Trips 50-100'].sum().reset_index()

# Identify the dates whose summed 'Number of Trips 50-100' exceeds 10000000.
popfilter_50_100 = combined_ddd_50_100[combined_ddd_50_100['Number of Trips 50-100'] > 10000000]

# Evaluate the Dask graph once and reuse the result for dates, counts and the
# y-axis limit.
popfilter_50_100_pd = popfilter_50_100.compute()
dates_list_50_100 = popfilter_50_100_pd['Date'].tolist()
dates_50_100 = [datetime.strptime(date_str, '%m/%d/%Y') for date_str in dates_list_50_100]
trips_50_100 = popfilter_50_100_pd['Number of Trips 50-100'].to_numpy()

# Plot the scatter plot for Trips 50-100 on a fresh figure. Drawing on the
# implicit current figure would add these points on top of whatever an earlier
# plt.scatter() call left there (presumably the cause of the stray orange
# series the previous comment mentioned).
fig, ax = plt.subplots()
ax.scatter(x=dates_50_100, y=trips_50_100)
# Keep the y-axis anchored at zero with 10% headroom; skipped when the filter
# matched nothing, because there is no maximum to scale from.
if len(trips_50_100) > 0:
    ax.set_ylim(0, trips_50_100.max() * 1.1)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
# Cap the number of ticks; the weekly DayLocator that used to be set here was
# replaced by this locator straight away, so it has been removed.
ax.xaxis.set_major_locator(MaxNLocator(integer=True, nbins=15))
# autofmt_xdate() applies its own rotation (default 30), so the intended
# 45 degrees is passed explicitly.
fig.autofmt_xdate(rotation=45)
ax.set_title('Scatter plot of dates that > 10000000 people conducted 50-100 number of trips')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Trips 50-100')
fig.savefig('scatter2.png')
plt.close(fig)
	import csv
	import numpy as np
	import pandas as pd
	from dask import dataframe as dd
	import dask.dataframe as dd
	# Graph visualisation does not work without this.
	import matplotlib
	matplotlib.use('Agg') # Use the Agg backend
	import matplotlib.pyplot as plt
	from matplotlib.ticker import MaxNLocator
	import matplotlib.dates as mdates
	from datetime import datetime
	from pandas.plotting import register_matplotlib_converters
	register_matplotlib_converters()


	# NOTE(review): this tab-indented section looks like a second pasted copy of
	# the top-level script above - confirm and remove it if so.
	# Load the CSV file using Dask; only the three columns used below are read.
	ddd = dd.read_csv(
		'Trips_by_Distance.csv',
		usecols=['Date', 'Number of Trips 10-25', 'Number of Trips 50-100'],
		dtype={
			'Number of Trips 10-25': 'float64',
			'Number of Trips 50-100': 'float64',
		},
	)

	# Data cleaning
	# Rows without a date cannot be placed on a time axis. Filling them with the
	# integer 0 (as was done before) puts a non-string value into 'Date', which can
	# make the later datetime.strptime() call raise TypeError, so drop them instead.
	ddd = ddd.dropna(subset=['Date'])
	# Missing trip counts are treated as zero trips.
	ddd['Number of Trips 10-25'] = ddd['Number of Trips 10-25'].fillna(0)
	ddd['Number of Trips 50-100'] = ddd['Number of Trips 50-100'].fillna(0)

	# Trips 10-25
	# Sum the trip counts per date, combining rows that share the same date.
	grouped_ddd = ddd.groupby('Date')
	combined_ddd = grouped_ddd['Number of Trips 10-25'].sum().reset_index()

	# Identify the dates whose summed 'Number of Trips 10-25' exceeds 10000000, to
	# compare them with the dates whose 'Number of Trips 50-100' exceeds 10000000.
	# The filter previously used 100000000 (one zero too many), contradicting the
	# threshold stated in the plot title and used for the 50-100 filter.
	popfilter = combined_ddd[combined_ddd['Number of Trips 10-25'] > 10000000]

	# Evaluate the Dask graph once, so the dates and the counts come from the same
	# computed result instead of two separate passes over the CSV.
	popfilter_pd = popfilter.compute()
	dates_list = popfilter_pd['Date'].tolist()
	dates = [datetime.strptime(date_str, '%m/%d/%Y') for date_str in dates_list]

	# Plot the scatter plot on its own figure so nothing leaks into later plots.
	fig, ax = plt.subplots()
	ax.scatter(x=dates, y=popfilter_pd['Number of Trips 10-25'].to_numpy())
	ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
	# Cap the number of ticks. The weekly DayLocator that used to be set here was
	# replaced by this locator straight away and was reported to exceed
	# Locator.MAXTICKS, so it has been removed.
	ax.xaxis.set_major_locator(MaxNLocator(integer=True, nbins=15))
	# autofmt_xdate() applies its own rotation (default 30), so the intended
	# 45 degrees is passed explicitly.
	# Reference: https://github.com/matplotlib/matplotlib/issues/20202/
	fig.autofmt_xdate(rotation=45)
	ax.set_title('Scatter plot of dates that > 10000000 people conducted 10-25 number of trips')
	ax.set_xlabel('Date')
	ax.set_ylabel('Number of Trips 10-25')
	fig.savefig('scatter1.png')
	plt.close(fig)

	# Trips 50-100
	# Sum the trip counts per date, combining rows that share the same date.
	grouped_ddd_50_100 = ddd.groupby('Date')
	combined_ddd_50_100 = grouped_ddd_50_100['Number of Trips 50-100'].sum().reset_index()

	# Identify the dates whose summed 'Number of Trips 50-100' exceeds 10000000.
	popfilter_50_100 = combined_ddd_50_100[combined_ddd_50_100['Number of Trips 50-100'] > 10000000]

	# Evaluate the Dask graph once and reuse the result for dates, counts and the
	# y-axis limit.
	popfilter_50_100_pd = popfilter_50_100.compute()
	dates_list_50_100 = popfilter_50_100_pd['Date'].tolist()
	dates_50_100 = [datetime.strptime(date_str, '%m/%d/%Y') for date_str in dates_list_50_100]
	trips_50_100 = popfilter_50_100_pd['Number of Trips 50-100'].to_numpy()

	# Plot the scatter plot for Trips 50-100 on a fresh figure. Drawing on the
	# implicit current figure would add these points on top of whatever an earlier
	# plt.scatter() call left there (presumably the cause of the stray orange
	# series the previous comment mentioned).
	fig, ax = plt.subplots()
	ax.scatter(x=dates_50_100, y=trips_50_100)
	# Keep the y-axis anchored at zero with 10% headroom; skipped when the filter
	# matched nothing, because there is no maximum to scale from.
	if len(trips_50_100) > 0:
		ax.set_ylim(0, trips_50_100.max() * 1.1)
	ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
	# Cap the number of ticks; the weekly DayLocator that used to be set here was
	# replaced by this locator straight away, so it has been removed.
	ax.xaxis.set_major_locator(MaxNLocator(integer=True, nbins=15))
	# autofmt_xdate() applies its own rotation (default 30), so the intended
	# 45 degrees is passed explicitly.
	fig.autofmt_xdate(rotation=45)
	ax.set_title('Scatter plot of dates that > 10000000 people conducted 50-100 number of trips')
	ax.set_xlabel('Date')
	ax.set_ylabel('Number of Trips 50-100')
	fig.savefig('scatter2.png')
	plt.close(fig)