"""Load the trips CSV exports and build cleaned DataFrames from them.

Reads ``Trips_by_Distance.csv`` and ``Trips_Full Data.csv`` from the current
working directory. The raw frames stay available as ``trips_by_distance`` and
``trips_full_data``; the cleaned results are ``trips_by_distance_cleaned`` and
``trips_full_data_cleaned``.
"""
import pandas as pd
import numpy as np
import dask.dataframe as dd


def _clean_frame(frame: pd.DataFrame) -> pd.DataFrame:
    """Return *frame* without empty columns, incomplete rows or duplicate rows.

    The steps run in this order on purpose: entirely-NaN columns are removed
    *before* incomplete rows. Doing it the other way round lets a single empty
    column wipe out every row, after which every column of the (now empty)
    frame counts as "all NaN" and is dropped as well.
    """
    # A frame with zero rows has no data to judge its columns by; keep them so
    # that later column lookups (e.g. 'Date') still work.
    if not frame.empty:
        frame = frame.dropna(axis="columns", how="all")

    # Remove rows with any missing value, then exact duplicate rows.
    return frame.dropna().drop_duplicates()


trips_by_distance = pd.read_csv('Trips_by_Distance.csv')
trips_full_data = pd.read_csv('Trips_Full Data.csv')

# Both datasets go through the same cleaning pipeline.
trips_by_distance_cleaned = _clean_frame(trips_by_distance)
trips_full_data_cleaned = _clean_frame(trips_full_data)

# Convert the 'Date' column to datetime. errors='coerce' turns values that
# cannot be parsed into NaT instead of raising, so NaT may appear here even
# though rows with missing values were removed above. `assign` returns a new
# frame, so the write never lands on a frame pandas has flagged as a possible
# copy after the row filtering.
trips_full_data_cleaned = trips_full_data_cleaned.assign(
    Date=pd.to_datetime(trips_full_data_cleaned['Date'], errors='coerce')
)