"""Travel-behaviour analysis for the 5011 Big Data coursework.

Loads the "Trips_by_Distance" and "Trips_Full Data" datasets, cleans them,
answers coursework questions (a)-(e), and compares sequential vs parallel
(multiprocessing) filtering times.

Reconstructed from the patch "Add files via upload" (5011V1Coursework.py).
Worker functions are defined at module level and all script work is guarded
by ``if __name__ == '__main__'`` so that multiprocessing's *spawn* start
method (the Windows default — note the hard-coded ``C:\\Users`` paths) does
not re-run the CSV loads and analysis in every pool worker.
"""

import multiprocessing as mp
import time

import numpy as np
import pandas as pd

# NOTE(review): machine-specific input paths — consider CLI arguments.
TRIPS_BY_DISTANCE_PATH = r'C:\Users\Alaia\OneDrive\Desktop\BigData5011\Trips_by_Distance.csv'
TRIPS_FULL_DATA_PATH = r'C:\Users\Alaia\OneDrive\Desktop\BigData5011\Trips_Full Data.csv'

# Count columns coerced to int by clean_data() when present.
_INT_COLUMNS = ('Trips', 'Population Staying at Home', 'People Not Staying at Home')

# Threshold used in questions (b)/(c): "> 10,000,000 people".
_PEOPLE_THRESHOLD = 10_000_000


def clean_data(df):
    """Return a cleaned copy of *df*.

    Drops duplicate and NA rows, parses a 'Date' column to datetime, and
    coerces the known count columns to int — each step only when the
    relevant column is present.
    """
    # .copy() so the assignments below never write into a view of the
    # caller's frame (the original risked pandas' SettingWithCopyWarning).
    cleaned = df.drop_duplicates().dropna().copy()
    if 'Date' in cleaned.columns:
        cleaned['Date'] = pd.to_datetime(cleaned['Date'])
    for col in _INT_COLUMNS:
        if col in cleaned.columns:
            cleaned[col] = cleaned[col].astype(int)
    return cleaned


def process_data(df, trips_range):
    """Return the unique dates on which more than 10,000,000 people made a
    number of trips inside the inclusive *trips_range* ``(low, high)``.

    Defined at module level so it is picklable by reference for
    multiprocessing workers.
    """
    low, high = trips_range
    mask = (
        (df['Trips'] >= low)
        & (df['Trips'] <= high)
        & (df['People Not Staying at Home'] > _PEOPLE_THRESHOLD)
    )
    return df.loc[mask, 'Date'].unique()


def parallel_processing(df, ranges, n_processors):
    """Run process_data over every range in *ranges* on a pool of
    *n_processors* workers; return one result array per range."""
    # Context manager guarantees the pool is terminated/joined even when an
    # exception fires (the original leaked the pool between close and join).
    with mp.Pool(processes=n_processors) as pool:
        results = pool.starmap(process_data, [(df, r) for r in ranges])
    return results


def _plot_simulated_frequencies(cleaned_trips_by_distance):
    """(d) Histogram of Poisson-simulated travel frequencies whose mean is
    the mean of the 'Number of Trips' column."""
    import matplotlib.pyplot as plt  # lazy: only needed when plotting

    trip_lengths = cleaned_trips_by_distance['Number of Trips']
    travel_frequencies = np.random.poisson(lam=trip_lengths.mean(), size=len(trip_lengths))
    plt.figure(figsize=(10, 6))
    plt.hist(travel_frequencies, bins=50, alpha=0.75)
    plt.xlabel('Trip Length')
    plt.ylabel('Frequency')
    plt.title('Simulated Travel Frequency by Trip Length')
    plt.show()


def _plot_participants(cleaned_trips_by_distance):
    """(e) Scatter of 'Population Not Staying at Home' against
    'Number of Trips' (column names as in the Trips_by_Distance dataset)."""
    import matplotlib.pyplot as plt  # lazy: only needed when plotting

    plt.figure(figsize=(10, 6))
    plt.scatter(
        cleaned_trips_by_distance['Number of Trips'],
        cleaned_trips_by_distance['Population Not Staying at Home'],
        alpha=0.75,
    )
    plt.xlabel('Number of Trips')
    plt.ylabel('Population Not Staying at Home')
    plt.title('Number of Participants by Distance-Trips')
    plt.show()


def main():
    """Load, clean and analyse both datasets, then show the plots."""
    trips_by_distance = pd.read_csv(TRIPS_BY_DISTANCE_PATH)
    trips_full_data = pd.read_csv(TRIPS_FULL_DATA_PATH)

    # Print the columns of the datasets to check their structure.
    print("Columns in 'Trips_by_Distance':", trips_by_distance.columns)
    print("Columns in 'Trips_Full Data':", trips_full_data.columns)

    cleaned_trips_by_distance = clean_data(trips_by_distance)
    cleaned_trips_full_data = clean_data(trips_full_data)

    # Bail out early if the full dataset is missing any column we rely on.
    required_columns = ['Trips', 'Population Staying at Home',
                        'People Not Staying at Home', 'Date']
    missing_columns = [col for col in required_columns
                       if col not in cleaned_trips_full_data.columns]
    if missing_columns:
        print(f"Missing columns in 'Trips_Full Data': {missing_columns}")
        return

    # (a) How many people are staying at home?
    people_staying_home = cleaned_trips_full_data['Population Staying at Home'].sum()
    print(f"Number of people staying at home: {people_staying_home}")

    # (a) How far are people traveling when they don't stay home?
    trips_distance_columns = [col for col in cleaned_trips_full_data.columns
                              if 'Trips' in col and 'Miles' in col]
    distance_traveled = cleaned_trips_full_data[trips_distance_columns].sum().sum()
    print(f"Total distance traveled by those who don't stay home: {distance_traveled}")

    # (b) Dates with > 10,000,000 people conducting 10-25 and 50-100 trips.
    # Reuses process_data instead of duplicating the same filter inline.
    dates_10_25_trips = process_data(cleaned_trips_full_data, (10, 25))
    dates_50_100_trips = process_data(cleaned_trips_full_data, (50, 100))

    print("Dates with > 10,000,000 people conducting 10-25 trips:")
    print(dates_10_25_trips)

    print("\nDates with > 10,000,000 people conducting 50-100 trips:")
    print(dates_50_100_trips)

    # (c) Sequential processing of the same two filters, timed.
    start_time = time.time()
    process_data(cleaned_trips_full_data, (10, 25))
    process_data(cleaned_trips_full_data, (50, 100))
    sequential_time = time.time() - start_time
    print("Sequential processing time:", sequential_time)

    # (c) Parallel processing with 10, then 20 workers.
    ranges = [(10, 25), (50, 100)]
    start_time = time.time()
    parallel_processing(cleaned_trips_full_data, ranges, 10)
    parallel_time_10_processors = time.time() - start_time
    print("Parallel processing time with 10 processors:", parallel_time_10_processors)

    start_time = time.time()
    parallel_processing(cleaned_trips_full_data, ranges, 20)
    parallel_time_20_processors = time.time() - start_time
    print("Parallel processing time with 20 processors:", parallel_time_20_processors)

    # (d) Simulate travel frequency based on trip length.
    _plot_simulated_frequencies(cleaned_trips_by_distance)

    # (e) Plot the number of participants by distance-trips.
    _plot_participants(cleaned_trips_by_distance)

    # Compare processing times.
    print(f"Sequential processing time: {sequential_time} seconds")
    print(f"Parallel processing time with 10 processors: {parallel_time_10_processors} seconds")
    print(f"Parallel processing time with 20 processors: {parallel_time_20_processors} seconds")


if __name__ == '__main__':
    main()