Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
DataProject/1b.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
67 lines (57 sloc)
3.38 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
from datetime import datetime

import dask.dataframe as dd
from dask import dataframe as dd
# Graph visualisation does not work without this.
import matplotlib
matplotlib.use('Agg')  # Use the Agg backend
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import numpy as np
import pandas as pd
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()
# Load the CSV file using Dask, reading only the three columns the plots need.
# The trip counts are read as float64 so that missing values can be held as NaN.
ddd = dd.read_csv(
    'Trips_by_Distance.csv',
    usecols=['Date', 'Number of Trips 10-25', 'Number of Trips 50-100'],
    dtype={'Number of Trips 10-25': 'float64', 'Number of Trips 50-100': 'float64'},
)

# Data cleaning
# A row without a date cannot be placed on the time axis, so drop it. Filling
# the date with 0 instead would create a bogus 0 group in groupby('Date') and
# a value that datetime.strptime cannot parse.
ddd = ddd.dropna(subset=['Date'])
# A missing trip count contributes nothing to the per-date totals.
ddd['Number of Trips 10-25'] = ddd['Number of Trips 10-25'].fillna(0)
ddd['Number of Trips 50-100'] = ddd['Number of Trips 50-100'].fillna(0)
# Trips 10-25
# Identify the dates on which more than 10,000,000 people made 10-25 trips, so
# they can be compared with the dates on which more than 10,000,000 people
# made 50-100 trips.
threshold_10_25 = 10_000_000  # matches the plot title and the 50-100 section

# The same date appears on several rows, so total the trips per date first.
grouped_ddd = ddd.groupby('Date')
combined_ddd = grouped_ddd['Number of Trips 10-25'].sum().reset_index()
popfilter = combined_ddd[combined_ddd['Number of Trips 10-25'] > threshold_10_25]

# Materialise the filtered rows once: every .compute() re-runs the whole Dask
# graph, and taking dates and counts from one result keeps them aligned.
popfilter_pd = popfilter.compute()
dates_list = popfilter_pd['Date'].tolist()
dates = [datetime.strptime(date_str, '%m/%d/%Y') for date_str in dates_list]

# Plot the scatter plot on its own figure so nothing leaks into later plots.
fig_10_25, ax_10_25 = plt.subplots()
ax_10_25.scatter(x=dates, y=popfilter_pd['Number of Trips 10-25'].to_numpy())
ax_10_25.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
# Cap the number of ticks. A weekly DayLocator over this date range fails with
# "RuntimeError: Locator attempting to generate 1039 ticks ... exceeds Locator.MAXTICKS".
ax_10_25.xaxis.set_major_locator(MaxNLocator(integer=True, nbins=15))
# autofmt_xdate applies its own rotation (30 degrees by default), so the
# wanted 45 degrees is passed here instead of through a separate xticks call.
# Reference: https://github.com/matplotlib/matplotlib/issues/20202/
fig_10_25.autofmt_xdate(rotation=45)
ax_10_25.set_title('Scatter plot of dates that > 10000000 people conducted 10-25 number of trips')
ax_10_25.set_xlabel('Date')
ax_10_25.set_ylabel('Number of Trips 10-25')
fig_10_25.savefig('scatter1.png')
plt.close(fig_10_25)  # release the figure once it has been written to disk
# Trips 50-100
# Identify the dates on which more than 10,000,000 people made 50-100 trips.
threshold_50_100 = 10_000_000

# The same date appears on several rows, so total the trips per date first.
grouped_ddd_50_100 = ddd.groupby('Date')
combined_ddd_50_100 = grouped_ddd_50_100['Number of Trips 50-100'].sum().reset_index()
popfilter_50_100 = combined_ddd_50_100[combined_ddd_50_100['Number of Trips 50-100'] > threshold_50_100]

# Materialise the filtered rows once: every .compute() re-runs the whole Dask
# graph, and taking dates and counts from one result keeps them aligned.
popfilter_50_100_pd = popfilter_50_100.compute()
dates_list_50_100 = popfilter_50_100_pd['Date'].tolist()
dates_50_100 = [datetime.strptime(date_str, '%m/%d/%Y') for date_str in dates_list_50_100]
trips_50_100 = popfilter_50_100_pd['Number of Trips 50-100']

# Plot the scatter plot for Trips 50-100 on a fresh figure. Drawing on the
# implicit current figure would add these points on top of whatever was
# plotted before, so the saved image would contain both data sets.
fig_50_100, ax_50_100 = plt.subplots()
ax_50_100.scatter(x=dates_50_100, y=trips_50_100.to_numpy())
# Anchor the y-axis at zero with 10% headroom above the largest value.
# Skipped when no date passes the threshold, as there is no maximum to use.
if not trips_50_100.empty:
    ax_50_100.set_ylim(0, trips_50_100.max() * 1.1)
ax_50_100.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
# Cap the number of ticks; a weekly DayLocator over this date range can
# exceed matplotlib's Locator.MAXTICKS limit.
ax_50_100.xaxis.set_major_locator(MaxNLocator(integer=True, nbins=15))
# autofmt_xdate applies its own rotation (30 degrees by default), so the
# wanted 45 degrees is passed here instead of through a separate xticks call.
fig_50_100.autofmt_xdate(rotation=45)
ax_50_100.set_title('Scatter plot of dates that > 10000000 people conducted 50-100 number of trips')
ax_50_100.set_xlabel('Date')
ax_50_100.set_ylabel('Number of Trips 50-100')
fig_50_100.savefig('scatter2.png')
plt.close(fig_50_100)  # release the figure once it has been written to disk