Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
5011CEM_COURSEWORK_CLINTON-EKWUGHA_13293446/4.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
66 lines (48 sloc)
2.27 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import pandas as pd | |
import dask.dataframe as dd | |
from dask.distributed import Client | |
import multiprocessing | |
import plotly.express as px | |
def process_data(file_path):
    """Summarise the trips CSV at *file_path* using a Dask dataframe.

    The file is read lazily with ``assume_missing=True`` and every row
    containing a missing value is discarded before any statistic is taken.

    Returns:
        A ``(unique_years, mean_trips)`` tuple: the number of distinct
        values in the 'Year of Date' column, and the mean of
        'Trips 1-25 Miles' for each 'Year of Date' group.
    """
    complete_rows = dd.read_csv(file_path, assume_missing=True).dropna()

    # Count of distinct years (nunique yields a count, not the values).
    year_column = complete_rows['Year of Date']
    distinct_year_count = year_column.nunique().compute()

    # Per-year average of the short-distance trip column.
    short_trips_by_year = complete_rows.groupby(by='Year of Date')['Trips 1-25 Miles']
    yearly_average = short_trips_by_year.mean().compute()

    return distinct_year_count, yearly_average
def plot_trips_data(file_path, *, min_trips=10_000_000):
    """Build scatter plots of high-volume rows for two trip-distance bands.

    The CSV at *file_path* is loaded with pandas. For each of the columns
    'Number of Trips 10-25' and 'Number of Trips 50-100', the rows whose
    value exceeds *min_trips* are kept and plotted against 'Date'.

    Args:
        file_path: Path of the CSV file to read.
        min_trips: Exclusive lower bound a row's trip count must exceed to
            be plotted. Defaults to 10,000,000.

    Returns:
        A ``(fig1, fig2)`` tuple of Plotly Express scatter figures: first
        the 10-25 mile band, then the 50-100 mile band.
    """
    trips = pd.read_csv(file_path)

    figures = []
    for column, title in (
        ('Number of Trips 10-25', 'Trips 10-25 Miles'),
        ('Number of Trips 50-100', 'Trips 50-100 Miles'),
    ):
        # Filter, select and plot the SAME column. Previously the 10-25
        # figure was filtered on the 10-25 column but displayed the 50-100
        # values, so its y-axis did not match its title.
        selected = trips.loc[trips[column] > min_trips, ['Date', column]]
        figures.append(px.scatter(selected, x='Date', y=column, title=title))

    fig1, fig2 = figures
    return fig1, fig2
if __name__ == '__main__':
    # Benchmark the full pipeline once per Dask worker count and report the
    # elapsed time of each run.
    multiprocessing.freeze_support()
    n_processors = [10, 20]
    n_processors_time = {}
    for processor in n_processors:
        print(f"\n\n\nStarting computation with {processor} processors...\n\n\n")
        # perf_counter is monotonic, so the measured interval is not affected
        # by system clock adjustments. The timer starts before the client is
        # created, so cluster start-up is part of the measurement.
        start = time.perf_counter()
        # Start a Dask client with the requested number of workers. Using it
        # as a context manager guarantees it is closed even if processing or
        # plotting raises.
        with Client(n_workers=processor):
            # Process data using Dask
            unique_years, mean_trips = process_data("/Users/thecreator/Downloads/Trips_Full Data.csv")
            # Plot data using Plotly Express
            # NOTE(review): plot_trips_data reads its CSV with pandas, so its
            # cost is included in the timing regardless of the worker count.
            fig1, fig2 = plot_trips_data("/Users/thecreator/Downloads/Trips_by_Distance.csv")
            fig1.show()
            fig2.show()
            # Calculate execution time
            dask_time = time.perf_counter() - start
            n_processors_time[processor] = dask_time
            print(f"\n\n\nTime with {processor} processors: {dask_time} seconds\n\n\n")
    print("\n\n\n", n_processors_time, "\n\n\n")