Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing as mp
import time
# Define a function to clean the datasets
def clean_data(df):
    """Return a cleaned copy of *df*.

    Drops duplicate rows and rows containing NaN, parses the 'Date'
    column to datetime, and coerces the known count columns to int.
    Columns that are absent from *df* are simply skipped, so the same
    function works for both input datasets.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset as loaded from CSV.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame (a new object; *df* is not modified).
    """
    # drop_duplicates().dropna() already yields a fresh frame, so the
    # column assignments below do not touch the caller's data.
    cleaned = df.drop_duplicates().dropna()
    if 'Date' in cleaned.columns:
        cleaned['Date'] = pd.to_datetime(cleaned['Date'])
    # The original repeated one identical if/astype branch per column;
    # a single loop over the count columns keeps them consistent.
    int_columns = (
        'Trips',
        'Population Staying at Home',
        'People Not Staying at Home',
    )
    for col in int_columns:
        if col in cleaned.columns:
            cleaned[col] = cleaned[col].astype(int)
    return cleaned
# Load the datasets (hard-coded local paths; raw strings keep the
# Windows backslashes literal).
trips_by_distance_path = r'C:\Users\Alaia\OneDrive\Desktop\BigData5011\Trips_by_Distance.csv'
trips_full_data_path = r'C:\Users\Alaia\OneDrive\Desktop\BigData5011\Trips_Full Data.csv'
trips_by_distance = pd.read_csv(trips_by_distance_path)
trips_full_data = pd.read_csv(trips_full_data_path)
# Print the columns of the datasets to check their structure before
# the column-dependent analysis below relies on specific names.
print("Columns in 'Trips_by_Distance':", trips_by_distance.columns)
print("Columns in 'Trips_Full Data':", trips_full_data.columns)
# Clean both frames (dedupe, drop NaN rows, coerce dtypes).
# NOTE(review): these loads run at import time; on Windows, each
# multiprocessing worker re-imports the module and re-reads both CSVs —
# consider moving the loads under the __main__ guard.
cleaned_trips_by_distance = clean_data(trips_by_distance)
cleaned_trips_full_data = clean_data(trips_full_data)
# Ensure that the required columns exist before running the analysis;
# report what is missing instead of crashing on a KeyError.
required_columns = ['Trips', 'Population Staying at Home', 'People Not Staying at Home', 'Date']
missing_columns = [name for name in required_columns
                   if name not in cleaned_trips_full_data.columns]
if missing_columns:
    print(f"Missing columns in 'Trips_Full Data': {missing_columns}")
else:
    # (a) Total number of people staying at home across the dataset.
    people_staying_home = cleaned_trips_full_data['Population Staying at Home'].sum()
    print(f"Number of people staying at home: {people_staying_home}")

    # (a) Sum every column whose name mentions both 'Trips' and 'Miles'
    # as a proxy for total distance traveled by people who left home.
    trips_distance_columns = [name for name in cleaned_trips_full_data.columns
                              if 'Trips' in name and 'Miles' in name]
    distance_traveled = cleaned_trips_full_data[trips_distance_columns].sum().sum()
    print(f"Total distance traveled by those who don't stay home: {distance_traveled}")

    # (b) Dates on which more than 10,000,000 people were away from home
    # while the trip count sat in the 10-25 / 50-100 band. between() is
    # inclusive on both ends, matching the original >= / <= pair.
    heavy_traffic = cleaned_trips_full_data['People Not Staying at Home'] > 10000000
    trip_counts = cleaned_trips_full_data['Trips']
    dates_10_25_trips = cleaned_trips_full_data.loc[
        trip_counts.between(10, 25) & heavy_traffic, 'Date'].unique()
    dates_50_100_trips = cleaned_trips_full_data.loc[
        trip_counts.between(50, 100) & heavy_traffic, 'Date'].unique()
    print("Dates with > 10,000,000 people conducting 10-25 trips:")
    print(dates_10_25_trips)
    print("\nDates with > 10,000,000 people conducting 50-100 trips:")
    print(dates_50_100_trips)
# (c) Parallel processing functions
def process_data(df, trips_range):
    """Return the unique dates on which the trip count falls inside
    *trips_range* (a (low, high) pair, inclusive at both ends) while
    more than 10,000,000 people were not staying at home."""
    low, high = trips_range
    in_range = df['Trips'].between(low, high)  # inclusive, like >= / <=
    heavy_traffic = df['People Not Staying at Home'] > 10000000
    return df.loc[in_range & heavy_traffic, 'Date'].unique()
def parallel_processing(df, ranges, n_processors):
    """Run process_data over each (low, high) pair in *ranges* using a
    pool of *n_processors* worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset passed unchanged to every process_data call.
    ranges : list of tuple
        Trip-count ranges, one process_data call per range.
    n_processors : int
        Number of worker processes in the pool.

    Returns
    -------
    list
        Per-range results (arrays of unique dates), in the same order
        as *ranges*.
    """
    # The context manager guarantees the pool is torn down even if a
    # worker raises; the original close()/join() pair would leak the
    # worker processes on any exception out of starmap().
    with mp.Pool(processes=n_processors) as pool:
        results = pool.starmap(process_data, [(df, r) for r in ranges])
    return results
if __name__ == '__main__':
    # (c) Sequential baseline: run the two range queries back to back
    # and record the wall-clock time.
    t0 = time.time()
    dates_10_25_trips_seq = process_data(cleaned_trips_full_data, (10, 25))
    dates_50_100_trips_seq = process_data(cleaned_trips_full_data, (50, 100))
    sequential_time = time.time() - t0
    print("Sequential processing time:", sequential_time)

    ranges = [(10, 25), (50, 100)]

    # Same queries fanned out across a 10-process pool.
    t0 = time.time()
    dates_parallel_10 = parallel_processing(cleaned_trips_full_data, ranges, 10)
    parallel_time_10_processors = time.time() - t0
    print("Parallel processing time with 10 processors:", parallel_time_10_processors)

    # ...and again across a 20-process pool.
    t0 = time.time()
    dates_parallel_20 = parallel_processing(cleaned_trips_full_data, ranges, 20)
    parallel_time_20_processors = time.time() - t0
    print("Parallel processing time with 20 processors:", parallel_time_20_processors)

    # (d) Draw Poisson samples whose mean equals the mean trip count and
    # plot their distribution as a simulated travel-frequency histogram.
    trip_lengths = cleaned_trips_by_distance['Number of Trips']
    travel_frequencies = np.random.poisson(lam=trip_lengths.mean(),
                                           size=len(trip_lengths))
    plt.figure(figsize=(10, 6))
    plt.hist(travel_frequencies, bins=50, alpha=0.75)
    plt.xlabel('Trip Length')
    plt.ylabel('Frequency')
    plt.title('Simulated Travel Frequency by Trip Length')
    plt.show()

    # (e) Scatter of participants against number of trips.
    # NOTE(review): this dataset uses 'Population Not Staying at Home'
    # while Trips_Full Data uses 'People Not Staying at Home' — confirm
    # both column names against the actual CSV headers.
    plt.figure(figsize=(10, 6))
    plt.scatter(cleaned_trips_by_distance['Number of Trips'],
                cleaned_trips_by_distance['Population Not Staying at Home'],
                alpha=0.75)
    plt.xlabel('Number of Trips')
    plt.ylabel('Population Not Staying at Home')
    plt.title('Number of Participants by Distance-Trips')
    plt.show()

    # Recap the sequential vs parallel timing comparison.
    print(f"Sequential processing time: {sequential_time} seconds")
    print(f"Parallel processing time with 10 processors: {parallel_time_10_processors} seconds")
    print(f"Parallel processing time with 20 processors: {parallel_time_20_processors} seconds")