Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BIG-data/1c.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
43 lines (30 sloc)
1.48 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import dask.dataframe as dd | |
from dask.distributed import Client | |
from dask.diagnostics import ProgressBar | |
def main(n_processors=(10, 20), max_workers=4,
         csv_path='Trips_By_Distance.csv'):
    """Time two Dask aggregations over the trips CSV for several worker settings.

    For every entry in ``n_processors`` a Dask ``Client`` is started, the
    ``Week`` and ``Population Staying at Home`` columns are read from
    ``csv_path``, the number of distinct weeks and the per-week mean of the
    population staying at home are computed and printed, and the elapsed
    time is recorded. A dict mapping each requested processor count to its
    elapsed seconds is printed at the end.

    Args:
        n_processors: Iterable of requested processor counts to benchmark.
        max_workers: Upper bound on the number of workers actually started
            per run. Pass ``None`` to start exactly the requested number.
        csv_path: Path (or glob) of the CSV file handed to
            ``dd.read_csv``.
    """
    n_processors_time = {}

    for processor in n_processors:
        print(f"\n\nStarting computation with {processor} processors...\n\n")

        # NOTE(review): with the default cap of 4, requests of 10 and 20 both
        # run on 4 workers, so their timings compare identical configurations.
        # Raise `max_workers` (or pass None) on a machine with enough cores.
        n_workers = processor if max_workers is None else min(processor, max_workers)

        # The context manager closes the client even when reading or
        # computing raises, so a failed run does not leave workers behind.
        with Client(n_workers=n_workers, threads_per_worker=1):
            # perf_counter is monotonic; wall-clock adjustments cannot skew
            # the measured interval.
            start = time.perf_counter()

            # Reading only necessary columns from the Trips by Distance data
            # with Dask; explicit dtypes avoid per-partition type inference.
            df_dask = dd.read_csv(
                csv_path,
                usecols=['Week', 'Population Staying at Home'],
                dtype={'Week': 'int64', 'Population Staying at Home': 'float64'},
            )

            # Task a: Count unique weeks
            unique_weeks = df_dask['Week'].nunique().compute()
            print(f"Unique weeks in the dataset: {unique_weeks}")

            # Task b: Calculate average population staying at home by week
            avg_population_staying_home = (
                df_dask.groupby('Week')['Population Staying at Home']
                .mean()
                .compute()
            )
            print(
                "Average Population Staying At Home per week (first few entries):\n",
                avg_population_staying_home.head(),
            )

            dask_time = time.perf_counter() - start
            n_processors_time[processor] = dask_time
            print(f"\nTime with {processor} processors: {dask_time:.2f} seconds\n")

    print("\n\n", n_processors_time, "\n\n")
# Run the benchmark only when this file is executed as a script, not when it
# is imported (worker processes may re-import the module).
if __name__ == '__main__':
    main()