Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
aliausea committed Jul 2, 2024
0 parents commit c39359e
Showing 1 changed file with 126 additions and 0 deletions.
126 changes: 126 additions & 0 deletions 5011V1Coursework.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing as mp
import time

# Define a function to clean the datasets
def clean_data(df):
    """Return a cleaned copy of *df*.

    Drops duplicate rows and rows containing any missing value, then
    normalises dtypes: 'Date' is parsed to datetime and the known count
    columns are cast to int — each conversion applied only when the
    column is present, so either dataset can be passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset as loaded from CSV.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy; the input frame is left unmodified.
    """
    # .copy() ensures the column assignments below target an independent
    # frame, never a view (avoids SettingWithCopyWarning and any chance
    # of mutating the caller's data).
    df_cleaned = df.drop_duplicates().dropna().copy()

    if 'Date' in df_cleaned.columns:
        df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'])
    # One loop instead of three copies of the same guard-and-cast branch.
    for col in ('Trips', 'Population Staying at Home', 'People Not Staying at Home'):
        if col in df_cleaned.columns:
            df_cleaned[col] = df_cleaned[col].astype(int)
    return df_cleaned

# Load the datasets
# NOTE(review): hard-coded absolute Windows paths — this script only runs on
# the author's machine; consider taking the paths as command-line arguments.
trips_by_distance_path = r'C:\Users\Alaia\OneDrive\Desktop\BigData5011\Trips_by_Distance.csv'
trips_full_data_path = r'C:\Users\Alaia\OneDrive\Desktop\BigData5011\Trips_Full Data.csv'

trips_by_distance = pd.read_csv(trips_by_distance_path)
trips_full_data = pd.read_csv(trips_full_data_path)

# Print the columns of the datasets to check their structure
print("Columns in 'Trips_by_Distance':", trips_by_distance.columns)
print("Columns in 'Trips_Full Data':", trips_full_data.columns)

# Clean the datasets (drop duplicates/NaNs, normalise dtypes — see clean_data)
cleaned_trips_by_distance = clean_data(trips_by_distance)
cleaned_trips_full_data = clean_data(trips_full_data)

# Ensure that the required columns exist before proceeding
required_columns = ['Trips', 'Population Staying at Home', 'People Not Staying at Home', 'Date']
missing_columns = [col for col in required_columns if col not in cleaned_trips_full_data.columns]

if missing_columns:
    print(f"Missing columns in 'Trips_Full Data': {missing_columns}")
else:
    # (a) How many people are staying at home?
    # NOTE(review): this sums the column over every row (all dates) — confirm
    # a grand total rather than a per-date figure is what the question asks.
    people_staying_home = cleaned_trips_full_data['Population Staying at Home'].sum()
    print(f"Number of people staying at home: {people_staying_home}")

    # (a) How far are people traveling when they don’t stay home?
    # Select every column whose name mentions both 'Trips' and 'Miles'
    # (the distance-bucket columns), sum each, then sum across columns.
    trips_distance_columns = [col for col in cleaned_trips_full_data.columns if 'Trips' in col and 'Miles' in col]
    distance_traveled = cleaned_trips_full_data[trips_distance_columns].sum().sum()
    print(f"Total distance traveled by those who don't stay home: {distance_traveled}")

    # (b) Identify dates with > 10,000,000 people conducting 10-25 and 50-100 trips
    # Both bounds are inclusive; the same filter is reimplemented by
    # process_data below for the timing comparison in part (c).
    dates_10_25_trips = cleaned_trips_full_data[
        (cleaned_trips_full_data['Trips'] >= 10) &
        (cleaned_trips_full_data['Trips'] <= 25) &
        (cleaned_trips_full_data['People Not Staying at Home'] > 10000000)
    ]['Date'].unique()

    dates_50_100_trips = cleaned_trips_full_data[
        (cleaned_trips_full_data['Trips'] >= 50) &
        (cleaned_trips_full_data['Trips'] <= 100) &
        (cleaned_trips_full_data['People Not Staying at Home'] > 10000000)
    ]['Date'].unique()

    print("Dates with > 10,000,000 people conducting 10-25 trips:")
    print(dates_10_25_trips)

    print("\nDates with > 10,000,000 people conducting 50-100 trips:")
    print(dates_50_100_trips)

# (c) Parallel processing functions
def process_data(df, trips_range):
    """Return the unique 'Date' values on which more than 10,000,000
    people who did not stay home made a trip count within *trips_range*
    (an inclusive (low, high) pair)."""
    low, high = trips_range
    in_range = df['Trips'].between(low, high)
    crowded = df['People Not Staying at Home'] > 10000000
    return df.loc[in_range & crowded, 'Date'].unique()

def parallel_processing(df, ranges, n_processors):
    """Run process_data over each trips range in parallel.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned dataset shipped to every worker process.
    ranges : list of (int, int)
        Inclusive (min_trips, max_trips) bounds, one task per pair.
    n_processors : int
        Number of worker processes in the pool.

    Returns
    -------
    list
        One array of unique dates per range, in the same order as *ranges*.
    """
    # Context manager guarantees the pool is torn down even if a worker
    # raises — the original close()/join() pair was skipped on error,
    # leaking worker processes.
    with mp.Pool(processes=n_processors) as pool:
        results = pool.starmap(process_data, [(df, r) for r in ranges])
    return results

if __name__ == '__main__':
    # (c) Sequential processing: run both filters one after the other and
    # time them as the baseline for the parallel comparison below.
    start_time = time.time()
    dates_10_25_trips_seq = process_data(cleaned_trips_full_data, (10, 25))
    dates_50_100_trips_seq = process_data(cleaned_trips_full_data, (50, 100))
    sequential_time = time.time() - start_time
    print("Sequential processing time:", sequential_time)

    # Parallel processing with 10 processors
    start_time = time.time()
    ranges = [(10, 25), (50, 100)]
    dates_parallel_10 = parallel_processing(cleaned_trips_full_data, ranges, 10)
    parallel_time_10_processors = time.time() - start_time
    print("Parallel processing time with 10 processors:", parallel_time_10_processors)

    # Parallel processing with 20 processors
    # NOTE(review): only two tasks are submitted, so workers beyond 2 sit
    # idle — the 10- vs 20-process comparison mostly measures pool start-up
    # overhead, not parallel speed-up.
    start_time = time.time()
    dates_parallel_20 = parallel_processing(cleaned_trips_full_data, ranges, 20)
    parallel_time_20_processors = time.time() - start_time
    print("Parallel processing time with 20 processors:", parallel_time_20_processors)

    # (d) Simulate travel frequency based on trip length
    # Draws one Poisson sample per row, using the mean of 'Number of Trips'
    # as lambda. NOTE(review): the x-axis is labelled 'Trip Length' but the
    # plotted data are simulated trip counts — confirm the intended quantity.
    trip_lengths = cleaned_trips_by_distance['Number of Trips']
    travel_frequencies = np.random.poisson(lam=trip_lengths.mean(), size=len(trip_lengths))

    plt.figure(figsize=(10, 6))
    plt.hist(travel_frequencies, bins=50, alpha=0.75)
    plt.xlabel('Trip Length')
    plt.ylabel('Frequency')
    plt.title('Simulated Travel Frequency by Trip Length')
    plt.show()

    # (e) Plot the number of participants by distance-trips
    # NOTE(review): this dataset is read with 'Population Not Staying at
    # Home' whereas the other file uses 'People Not Staying at Home' —
    # verify the column name against Trips_by_Distance.csv; a mismatch
    # raises KeyError here.
    plt.figure(figsize=(10, 6))
    plt.scatter(cleaned_trips_by_distance['Number of Trips'], cleaned_trips_by_distance['Population Not Staying at Home'], alpha=0.75)
    plt.xlabel('Number of Trips')
    plt.ylabel('Population Not Staying at Home')
    plt.title('Number of Participants by Distance-Trips')
    plt.show()

    # Compare processing times
    print(f"Sequential processing time: {sequential_time} seconds")
    print(f"Parallel processing time with 10 processors: {parallel_time_10_processors} seconds")
    print(f"Parallel processing time with 20 processors: {parallel_time_20_processors} seconds")

0 comments on commit c39359e

Please sign in to comment.