Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Create data cleaning.py
  • Loading branch information
ullaha18 committed Apr 12, 2024
1 parent 6af0ee9 commit e56ec66
Showing 1 changed file with 23 additions and 0 deletions.
23 changes: 23 additions & 0 deletions data cleaning.py
@@ -0,0 +1,23 @@
import pandas as pd
import numpy as np
import dask.dataframe as dd

# Load the two raw trip datasets from CSV files in the working directory.
trips_by_distance = pd.read_csv('Trips_by_Distance.csv')
trips_full_data = pd.read_csv('Trips_Full Data.csv')

# Find columns that hold no data at all. This has to be computed on the raw
# frames, BEFORE any rows are removed: dropna() deletes every row containing a
# NaN, so a single all-NaN column would otherwise wipe out every row, and on
# the resulting empty frame isnull().all() is vacuously True for every column
# (which would then drop all columns, including 'Date').
columns_with_nan_by_distance = trips_by_distance.columns[trips_by_distance.isnull().all()]
columns_with_nan = trips_full_data.columns[trips_full_data.isnull().all()]

# Drop the all-NaN columns first, then remove rows that still have a missing
# value in any of the remaining columns.
# NOTE: trips_by_distance is deliberately rebound to the row-filtered frame
# (the raw frame is no longer available under this name).
trips_by_distance = trips_by_distance.drop(columns=columns_with_nan_by_distance).dropna()
trips_full_data_cleaned = trips_full_data.drop(columns=columns_with_nan).dropna()

# Remove duplicate rows.
trips_full_data_cleaned = trips_full_data_cleaned.drop_duplicates()
trips_by_distance_cleaned = trips_by_distance.drop_duplicates()

# Convert the 'Date' column to datetime. errors='coerce' turns values that
# cannot be parsed into NaT instead of raising, so NaT entries may appear here
# even though rows with missing values were removed above.
trips_full_data_cleaned['Date'] = pd.to_datetime(trips_full_data_cleaned['Date'], errors='coerce')

0 comments on commit e56ec66

Please sign in to comment.