Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import time
def _time_dask_weekly_sum(dask_data, n_workers):
    """Time the per-week trip sum on a temporary local Dask cluster.

    Args:
        dask_data: Dask DataFrame with 'Week' and 'Number of Trips' columns.
        n_workers: Number of workers to start in the LocalCluster.

    Returns:
        Elapsed seconds for the groupby/sum ``compute()`` call only; cluster
        start-up and shutdown are outside the timed region.
    """
    # The context managers close the client and shut the workers down before
    # returning, so a later benchmark never shares the machine with workers
    # left over from an earlier one.
    with LocalCluster(n_workers=n_workers) as cluster, Client(cluster):
        # perf_counter is monotonic, unlike time.time(), so the interval
        # cannot be distorted by system clock adjustments.
        start = time.perf_counter()
        # The result is discarded; only the elapsed time is reported.
        dask_data.groupby('Week')['Number of Trips'].sum().compute()
        return time.perf_counter() - start


def main():
    """Compare Pandas and Dask on summing 'Number of Trips' per 'Week'.

    Reads "trips_by_distance.csv" from the current working directory, runs
    the same groupby/sum with Dask on 10 workers, Dask on 20 workers and
    plain Pandas, then prints the three execution times in seconds.
    """
    # Step 1: Load the data
    # Specify dtype for the 'Number of Trips' column as 'float64' to avoid dtype inference issues
    dask_data = dd.read_csv("trips_by_distance.csv", dtype={'Number of Trips': 'float64'})
    pandas_data = pd.read_csv("trips_by_distance.csv")

    # NOTE: dd.read_csv is lazy, so the Dask timings below include reading and
    # parsing the CSV, whereas the Pandas frame is already in memory when its
    # timer starts. Keep this in mind when comparing the numbers.

    # Step 2: Perform data analysis tasks using Dask with 10 processors
    execution_time_dask_10 = _time_dask_weekly_sum(dask_data, n_workers=10)

    # Step 3: Perform data analysis tasks using Dask with 20 processors
    execution_time_dask_20 = _time_dask_weekly_sum(dask_data, n_workers=20)

    # Step 4: Perform data analysis tasks using Pandas
    start_time_pandas = time.perf_counter()
    pandas_data.groupby('Week')['Number of Trips'].sum()
    execution_time_pandas = time.perf_counter() - start_time_pandas

    # Step 5: Measure execution time
    print("Execution time using Pandas:", execution_time_pandas)
    print("Execution time using Dask with 10 processors:", execution_time_dask_10)
    print("Execution time using Dask with 20 processors:", execution_time_dask_20)
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()