"""Travel-behaviour analysis for the 5011 Big Data coursework.

Loads the "Trips_by_Distance" and "Trips_Full Data" datasets, cleans them,
answers coursework questions (a)-(e), and compares sequential vs parallel
(multiprocessing) filtering times.

Reconstructed from the patch "Add files via upload" (5011V1Coursework.py).
Worker functions are defined at module level and all script work is guarded
by ``if __name__ == '__main__'`` so that multiprocessing's *spawn* start
method (the Windows default — note the hard-coded ``C:\\Users`` paths) does
not re-run the CSV loads and analysis in every pool worker.
"""

import multiprocessing as mp
import time

import numpy as np
import pandas as pd

# NOTE(review): machine-specific input paths — consider CLI arguments.
TRIPS_BY_DISTANCE_PATH = r'C:\Users\Alaia\OneDrive\Desktop\BigData5011\Trips_by_Distance.csv'
TRIPS_FULL_DATA_PATH = r'C:\Users\Alaia\OneDrive\Desktop\BigData5011\Trips_Full Data.csv'

# Count columns coerced to int by clean_data() when present.
_INT_COLUMNS = ('Trips', 'Population Staying at Home', 'People Not Staying at Home')

# Threshold used in questions (b)/(c): "> 10,000,000 people".
_PEOPLE_THRESHOLD = 10_000_000


def clean_data(df):
    """Return a cleaned copy of *df*.

    Drops duplicate and NA rows, parses a 'Date' column to datetime, and
    coerces the known count columns to int — each step only when the
    relevant column is present.
    """
    # .copy() so the assignments below never write into a view of the
    # caller's frame (the original risked pandas' SettingWithCopyWarning).
    cleaned = df.drop_duplicates().dropna().copy()
    if 'Date' in cleaned.columns:
        cleaned['Date'] = pd.to_datetime(cleaned['Date'])
    for col in _INT_COLUMNS:
        if col in cleaned.columns:
            cleaned[col] = cleaned[col].astype(int)
    return cleaned


def process_data(df, trips_range):
    """Return the unique dates on which more than 10,000,000 people made a
    number of trips inside the inclusive *trips_range* ``(low, high)``.

    Defined at module level so it is picklable by reference for
    multiprocessing workers.
    """
    low, high = trips_range
    mask = (
        (df['Trips'] >= low)
        & (df['Trips'] <= high)
        & (df['People Not Staying at Home'] > _PEOPLE_THRESHOLD)
    )
    return df.loc[mask, 'Date'].unique()


def parallel_processing(df, ranges, n_processors):
    """Run process_data over every range in *ranges* on a pool of
    *n_processors* workers; return one result array per range."""
    # Context manager guarantees the pool is terminated/joined even when an
    # exception fires (the original leaked the pool between close and join).
    with mp.Pool(processes=n_processors) as pool:
        results = pool.starmap(process_data, [(df, r) for r in ranges])
    return results


def _plot_simulated_frequencies(cleaned_trips_by_distance):
    """(d) Histogram of Poisson-simulated travel frequencies whose mean is
    the mean of the 'Number of Trips' column."""
    import matplotlib.pyplot as plt  # lazy: only needed when plotting

    trip_lengths = cleaned_trips_by_distance['Number of Trips']
    travel_frequencies = np.random.poisson(lam=trip_lengths.mean(), size=len(trip_lengths))
    plt.figure(figsize=(10, 6))
    plt.hist(travel_frequencies, bins=50, alpha=0.75)
    plt.xlabel('Trip Length')
    plt.ylabel('Frequency')
    plt.title('Simulated Travel Frequency by Trip Length')
    plt.show()


def _plot_participants(cleaned_trips_by_distance):
    """(e) Scatter of 'Population Not Staying at Home' against
    'Number of Trips' (column names as in the Trips_by_Distance dataset)."""
    import matplotlib.pyplot as plt  # lazy: only needed when plotting

    plt.figure(figsize=(10, 6))
    plt.scatter(
        cleaned_trips_by_distance['Number of Trips'],
        cleaned_trips_by_distance['Population Not Staying at Home'],
        alpha=0.75,
    )
    plt.xlabel('Number of Trips')
    plt.ylabel('Population Not Staying at Home')
    plt.title('Number of Participants by Distance-Trips')
    plt.show()


def main():
    """Load, clean and analyse both datasets, then show the plots."""
    trips_by_distance = pd.read_csv(TRIPS_BY_DISTANCE_PATH)
    trips_full_data = pd.read_csv(TRIPS_FULL_DATA_PATH)

    # Print the columns of the datasets to check their structure.
    print("Columns in 'Trips_by_Distance':", trips_by_distance.columns)
    print("Columns in 'Trips_Full Data':", trips_full_data.columns)

    cleaned_trips_by_distance = clean_data(trips_by_distance)
    cleaned_trips_full_data = clean_data(trips_full_data)

    # Bail out early if the full dataset is missing any column we rely on.
    required_columns = ['Trips', 'Population Staying at Home',
                        'People Not Staying at Home', 'Date']
    missing_columns = [col for col in required_columns
                       if col not in cleaned_trips_full_data.columns]
    if missing_columns:
        print(f"Missing columns in 'Trips_Full Data': {missing_columns}")
        return

    # (a) How many people are staying at home?
    people_staying_home = cleaned_trips_full_data['Population Staying at Home'].sum()
    print(f"Number of people staying at home: {people_staying_home}")

    # (a) How far are people traveling when they don't stay home?
    trips_distance_columns = [col for col in cleaned_trips_full_data.columns
                              if 'Trips' in col and 'Miles' in col]
    distance_traveled = cleaned_trips_full_data[trips_distance_columns].sum().sum()
    print(f"Total distance traveled by those who don't stay home: {distance_traveled}")

    # (b) Dates with > 10,000,000 people conducting 10-25 and 50-100 trips.
    # Reuses process_data instead of duplicating the same filter inline.
    dates_10_25_trips = process_data(cleaned_trips_full_data, (10, 25))
    dates_50_100_trips = process_data(cleaned_trips_full_data, (50, 100))

    print("Dates with > 10,000,000 people conducting 10-25 trips:")
    print(dates_10_25_trips)

    print("\nDates with > 10,000,000 people conducting 50-100 trips:")
    print(dates_50_100_trips)

    # (c) Sequential processing of the same two filters, timed.
    start_time = time.time()
    process_data(cleaned_trips_full_data, (10, 25))
    process_data(cleaned_trips_full_data, (50, 100))
    sequential_time = time.time() - start_time
    print("Sequential processing time:", sequential_time)

    # (c) Parallel processing with 10, then 20 workers.
    ranges = [(10, 25), (50, 100)]
    start_time = time.time()
    parallel_processing(cleaned_trips_full_data, ranges, 10)
    parallel_time_10_processors = time.time() - start_time
    print("Parallel processing time with 10 processors:", parallel_time_10_processors)

    start_time = time.time()
    parallel_processing(cleaned_trips_full_data, ranges, 20)
    parallel_time_20_processors = time.time() - start_time
    print("Parallel processing time with 20 processors:", parallel_time_20_processors)

    # (d) Simulate travel frequency based on trip length.
    _plot_simulated_frequencies(cleaned_trips_by_distance)

    # (e) Plot the number of participants by distance-trips.
    _plot_participants(cleaned_trips_by_distance)

    # Compare processing times.
    print(f"Sequential processing time: {sequential_time} seconds")
    print(f"Parallel processing time with 10 processors: {parallel_time_10_processors} seconds")
    print(f"Parallel processing time with 20 processors: {parallel_time_20_processors} seconds")


if __name__ == '__main__':
    main()