Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
5011-BIG-DATA/number1c.py
Go to file
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
40 lines (34 sloc)
1.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import dask.dataframe as dd | |
from dask.distributed import Client, LocalCluster | |
import time | |
def _time_dask_weekly_sum(dask_data, n_workers):
    """Time the per-week trip sum on a fresh local Dask cluster.

    Starts a ``LocalCluster`` with ``n_workers`` workers, connects a
    ``Client`` to it, and measures how long it takes to group ``dask_data``
    by 'Week' and sum 'Number of Trips'.

    Args:
        dask_data: Dask DataFrame with 'Week' and 'Number of Trips' columns.
        n_workers: Number of workers to start in the local cluster.

    Returns:
        Elapsed seconds for the groupby/sum/compute call only; cluster
        start-up and shutdown are not included.
    """
    # The context managers close the client and cluster on exit, so a later
    # run does not share the machine with workers left over from this one.
    # Creating the Client registers it as the default scheduler for compute().
    with LocalCluster(n_workers=n_workers) as cluster, Client(cluster):
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock moves.
        started = time.perf_counter()
        dask_data.groupby('Week')['Number of Trips'].sum().compute()
        return time.perf_counter() - started


def main(csv_path="trips_by_distance.csv", worker_counts=(10, 20)):
    """Compare Pandas and Dask timings for summing trips per week.

    Loads the CSV once with Dask and once with Pandas, times the same
    groupby('Week')['Number of Trips'].sum() in each, and prints the
    elapsed seconds: Pandas first, then one line per Dask worker count.

    Args:
        csv_path: Path of the CSV file to read. Defaults to
            "trips_by_distance.csv" in the current working directory.
        worker_counts: Worker counts to benchmark Dask with, in order.
            Each count gets its own freshly started local cluster.
    """
    # Step 1: Load the data.
    # 'Number of Trips' is pinned to float64 to avoid dtype inference issues
    # when Dask reads the file in partitions.
    dask_data = dd.read_csv(csv_path, dtype={'Number of Trips': 'float64'})
    pandas_data = pd.read_csv(csv_path)

    # Steps 2-3: Run the Dask benchmark once per requested worker count.
    # A list of pairs (not a dict) keeps order and tolerates repeated counts.
    dask_timings = [
        (n_workers, _time_dask_weekly_sum(dask_data, n_workers))
        for n_workers in worker_counts
    ]

    # Step 4: Run the same aggregation with Pandas.
    started = time.perf_counter()
    pandas_data.groupby('Week')['Number of Trips'].sum()
    execution_time_pandas = time.perf_counter() - started

    # Step 5: Report execution times.
    print("Execution time using Pandas:", execution_time_pandas)
    for n_workers, seconds in dask_timings:
        print(f"Execution time using Dask with {n_workers} processors:", seconds)
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()