Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing as mp
import time
# Define a function to clean the datasets
def clean_data(df):
    """Return a cleaned copy of *df*.

    Drops duplicate rows and rows containing NaN, parses the 'Date'
    column to datetime, and coerces the known count columns to int.
    Columns that are absent from *df* are simply skipped, so the same
    function works for both input datasets.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset as loaded from CSV.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame (a new object; *df* is not modified).
    """
    # drop_duplicates().dropna() already yields a fresh frame, so the
    # column assignments below do not touch the caller's data.
    cleaned = df.drop_duplicates().dropna()
    if 'Date' in cleaned.columns:
        cleaned['Date'] = pd.to_datetime(cleaned['Date'])
    # The original repeated one identical if/astype branch per column;
    # a single loop over the count columns keeps them consistent.
    int_columns = (
        'Trips',
        'Population Staying at Home',
        'People Not Staying at Home',
    )
    for col in int_columns:
        if col in cleaned.columns:
            cleaned[col] = cleaned[col].astype(int)
    return cleaned
# Load the datasets (hard-coded local paths; raw strings keep the
# Windows backslashes literal).
trips_by_distance_path = r'C:\Users\Alaia\OneDrive\Desktop\BigData5011\Trips_by_Distance.csv'
trips_full_data_path = r'C:\Users\Alaia\OneDrive\Desktop\BigData5011\Trips_Full Data.csv'
trips_by_distance = pd.read_csv(trips_by_distance_path)
trips_full_data = pd.read_csv(trips_full_data_path)
# Print the columns of the datasets to check their structure before
# the column-dependent analysis below relies on specific names.
print("Columns in 'Trips_by_Distance':", trips_by_distance.columns)
print("Columns in 'Trips_Full Data':", trips_full_data.columns)
# Clean both frames (dedupe, drop NaN rows, coerce dtypes).
# NOTE(review): these loads run at import time; on Windows, each
# multiprocessing worker re-imports the module and re-reads both CSVs —
# consider moving the loads under the __main__ guard.
cleaned_trips_by_distance = clean_data(trips_by_distance)
cleaned_trips_full_data = clean_data(trips_full_data)
# Ensure that the required columns exist before running the analysis;
# report what is missing instead of crashing on a KeyError.
required_columns = ['Trips', 'Population Staying at Home', 'People Not Staying at Home', 'Date']
missing_columns = [name for name in required_columns
                   if name not in cleaned_trips_full_data.columns]
if missing_columns:
    print(f"Missing columns in 'Trips_Full Data': {missing_columns}")
else:
    # (a) Total number of people staying at home across the dataset.
    people_staying_home = cleaned_trips_full_data['Population Staying at Home'].sum()
    print(f"Number of people staying at home: {people_staying_home}")

    # (a) Sum every column whose name mentions both 'Trips' and 'Miles'
    # as a proxy for total distance traveled by people who left home.
    trips_distance_columns = [name for name in cleaned_trips_full_data.columns
                              if 'Trips' in name and 'Miles' in name]
    distance_traveled = cleaned_trips_full_data[trips_distance_columns].sum().sum()
    print(f"Total distance traveled by those who don't stay home: {distance_traveled}")

    # (b) Dates on which more than 10,000,000 people were away from home
    # while the trip count sat in the 10-25 / 50-100 band. between() is
    # inclusive on both ends, matching the original >= / <= pair.
    heavy_traffic = cleaned_trips_full_data['People Not Staying at Home'] > 10000000
    trip_counts = cleaned_trips_full_data['Trips']
    dates_10_25_trips = cleaned_trips_full_data.loc[
        trip_counts.between(10, 25) & heavy_traffic, 'Date'].unique()
    dates_50_100_trips = cleaned_trips_full_data.loc[
        trip_counts.between(50, 100) & heavy_traffic, 'Date'].unique()
    print("Dates with > 10,000,000 people conducting 10-25 trips:")
    print(dates_10_25_trips)
    print("\nDates with > 10,000,000 people conducting 50-100 trips:")
    print(dates_50_100_trips)
# (c) Parallel processing functions
def process_data(df, trips_range):
    """Return the unique dates on which the trip count falls inside
    *trips_range* (a (low, high) pair, inclusive at both ends) while
    more than 10,000,000 people were not staying at home."""
    low, high = trips_range
    in_range = df['Trips'].between(low, high)  # inclusive, like >= / <=
    heavy_traffic = df['People Not Staying at Home'] > 10000000
    return df.loc[in_range & heavy_traffic, 'Date'].unique()
def parallel_processing(df, ranges, n_processors):
    """Run process_data over each (low, high) pair in *ranges* using a
    pool of *n_processors* worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset passed unchanged to every process_data call.
    ranges : list of tuple
        Trip-count ranges, one process_data call per range.
    n_processors : int
        Number of worker processes in the pool.

    Returns
    -------
    list
        Per-range results (arrays of unique dates), in the same order
        as *ranges*.
    """
    # The context manager guarantees the pool is torn down even if a
    # worker raises; the original close()/join() pair would leak the
    # worker processes on any exception out of starmap().
    with mp.Pool(processes=n_processors) as pool:
        results = pool.starmap(process_data, [(df, r) for r in ranges])
    return results
if __name__ == '__main__':
    # (c) Sequential baseline: run the two range queries back to back
    # and record the wall-clock time.
    t0 = time.time()
    dates_10_25_trips_seq = process_data(cleaned_trips_full_data, (10, 25))
    dates_50_100_trips_seq = process_data(cleaned_trips_full_data, (50, 100))
    sequential_time = time.time() - t0
    print("Sequential processing time:", sequential_time)

    ranges = [(10, 25), (50, 100)]

    # Same queries fanned out across a 10-process pool.
    t0 = time.time()
    dates_parallel_10 = parallel_processing(cleaned_trips_full_data, ranges, 10)
    parallel_time_10_processors = time.time() - t0
    print("Parallel processing time with 10 processors:", parallel_time_10_processors)

    # ...and again across a 20-process pool.
    t0 = time.time()
    dates_parallel_20 = parallel_processing(cleaned_trips_full_data, ranges, 20)
    parallel_time_20_processors = time.time() - t0
    print("Parallel processing time with 20 processors:", parallel_time_20_processors)

    # (d) Draw Poisson samples whose mean equals the mean trip count and
    # plot their distribution as a simulated travel-frequency histogram.
    trip_lengths = cleaned_trips_by_distance['Number of Trips']
    travel_frequencies = np.random.poisson(lam=trip_lengths.mean(),
                                           size=len(trip_lengths))
    plt.figure(figsize=(10, 6))
    plt.hist(travel_frequencies, bins=50, alpha=0.75)
    plt.xlabel('Trip Length')
    plt.ylabel('Frequency')
    plt.title('Simulated Travel Frequency by Trip Length')
    plt.show()

    # (e) Scatter of participants against number of trips.
    # NOTE(review): this dataset uses 'Population Not Staying at Home'
    # while Trips_Full Data uses 'People Not Staying at Home' — confirm
    # both column names against the actual CSV headers.
    plt.figure(figsize=(10, 6))
    plt.scatter(cleaned_trips_by_distance['Number of Trips'],
                cleaned_trips_by_distance['Population Not Staying at Home'],
                alpha=0.75)
    plt.xlabel('Number of Trips')
    plt.ylabel('Population Not Staying at Home')
    plt.title('Number of Participants by Distance-Trips')
    plt.show()

    # Recap the sequential vs parallel timing comparison.
    print(f"Sequential processing time: {sequential_time} seconds")
    print(f"Parallel processing time with 10 processors: {parallel_time_10_processors} seconds")
    print(f"Parallel processing time with 20 processors: {parallel_time_20_processors} seconds")