Skip to content


Browse files Browse the repository at this point in the history
  • Loading branch information
ullaha18 committed Apr 12, 2024
1 parent c6df5a4 commit 70f4594
Showing 1 changed file with 43 additions and 0 deletions.
43 changes: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
import time
import dask.dataframe as dd
from dask.distributed import Client
from dask.diagnostics import ProgressBar

def main():
n_processors = [10, 20]
n_processors_time = {}

for processor in n_processors:
print(f"\n\nStarting computation with {processor} processors...\n\n")
client = Client(n_workers=min(processor, 4), threads_per_worker=1)

start = time.time()

# Reading only necessary columns from the Trips by Distance data with Dask
df_dask = dd.read_csv(
usecols=['Week', 'Population Staying at Home'],
dtype={'Week': 'int64', 'Population Staying at Home': 'float64'} # Explicitly defining data types

# Task a: Count unique weeks
unique_weeks = df_dask['Week'].nunique().compute()
print(f"Unique weeks in the dataset: {unique_weeks}")

# Task b: Calculate average population staying at home by week
avg_population_staying_home = df_dask.groupby('Week')['Population Staying at Home'].mean().compute()
print("Average Population Staying At Home per week (first few entries):\n", avg_population_staying_home.head())

dask_time = time.time() - start
n_processors_time[processor] = dask_time
print(f"\nTime with {processor} processors: {dask_time:.2f} seconds\n")


print("\n\n", n_processors_time, "\n\n")

if __name__ == '__main__':

0 comments on commit 70f4594

Please sign in to comment.