Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import time
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client
import multiprocessing
import plotly.express as px
def process_data(file_path):
    """Summarise the trips CSV at *file_path* using Dask.

    Rows containing any missing value are dropped before aggregating.

    Args:
        file_path: Path of the CSV file to load. The file must contain the
            'Year of Date' and 'Trips 1-25 Miles' columns.

    Returns:
        A ``(unique_years, mean_trips)`` tuple where ``unique_years`` is the
        *number* of distinct 'Year of Date' values and ``mean_trips`` is the
        mean of 'Trips 1-25 Miles' for each 'Year of Date' group.
    """
    # Imported locally: the file only imports `dask.dataframe` at top level.
    import dask

    # assume_missing=True: integer columns without an explicit dtype are
    # read as floats so that missing values can be represented.
    df = dd.read_csv(file_path, assume_missing=True)
    df = df.dropna()

    # Build both lazy results first ...
    lazy_unique_years = df['Year of Date'].nunique()
    lazy_mean_trips = df.groupby(by='Year of Date')['Trips 1-25 Miles'].mean()

    # ... then evaluate them in a single compute() call so the shared
    # read/dropna work is done once instead of once per result.
    unique_years, mean_trips = dask.compute(lazy_unique_years, lazy_mean_trips)
    return unique_years, mean_trips
def plot_trips_data(file_path):
    """Build scatter plots of the rows with more than 10,000,000 trips.

    The CSV is loaded with pandas and filtered twice: once on the
    'Number of Trips 10-25' column and once on 'Number of Trips 50-100'.
    Each filtered set is plotted against 'Date'.

    Args:
        file_path: Path of the CSV file to load. The file must contain the
            'Date', 'Number of Trips 10-25' and 'Number of Trips 50-100'
            columns.

    Returns:
        A ``(fig1, fig2)`` tuple of Plotly Express scatter figures: ``fig1``
        for the 10-25 column and ``fig2`` for the 50-100 column.
    """
    threshold = 10000000
    question2 = pd.read_csv(file_path)

    # Each plot shows the same column it was filtered on. The 10-25 plot
    # previously selected and plotted the 50-100 column by mistake.
    col_10_25 = 'Number of Trips 10-25'
    col_50_100 = 'Number of Trips 50-100'

    trips_10_25 = question2[question2[col_10_25] > threshold][['Date', col_10_25]]
    trips_50_100 = question2[question2[col_50_100] > threshold][['Date', col_50_100]]

    fig1 = px.scatter(trips_10_25, x='Date', y=col_10_25, title='Trips 10-25 Miles')
    fig2 = px.scatter(trips_50_100, x='Date', y=col_50_100, title='Trips 50-100 Miles')
    return fig1, fig2
if __name__ == '__main__':
    # Benchmark driver: run the same workload once per worker count and
    # record the elapsed time of each run.
    multiprocessing.freeze_support()

    n_processors = [10, 20]
    n_processors_time = {}

    for processor in n_processors:
        print(f"\n\n\nStarting computation with {processor} processors...\n\n\n")
        # perf_counter() is monotonic, so the measured interval is not
        # affected by system clock adjustments.
        start = time.perf_counter()

        # The context manager closes the client even if processing or
        # plotting raises, so worker processes are not left behind.
        # Client start-up is deliberately inside the timed region, as before.
        with Client(n_workers=processor):
            # Process data using Dask
            unique_years, mean_trips = process_data("/Users/thecreator/Downloads/Trips_Full Data.csv")

            # Plot data using Plotly Express
            fig1, fig2 = plot_trips_data("/Users/thecreator/Downloads/Trips_by_Distance.csv")
            fig1.show()
            fig2.show()

            # Record the elapsed time before the client is shut down
            dask_time = time.perf_counter() - start
            n_processors_time[processor] = dask_time
            print(f"\n\n\nTime with {processor} processors: {dask_time} seconds\n\n\n")

    # Summary of all runs: {worker count: elapsed seconds}
    print("\n\n\n", n_processors_time, "\n\n\n")