From 001e83e5131fea17d8279df9ac946348333f7947 Mon Sep 17 00:00:00 2001
From: "Lucas Lopes Oliveira (lopesoll)"
Date: Tue, 28 Feb 2023 11:19:39 +0000
Subject: [PATCH] Undersample data to CSV from original dataset
 https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents

---
 create undersampled data.ipynb | 193 +++++++++++++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 create undersampled data.ipynb

diff --git a/create undersampled data.ipynb b/create undersampled data.ipynb
new file mode 100644
index 0000000..0cb6c71
--- /dev/null
+++ b/create undersampled data.ipynb
@@ -0,0 +1,193 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "54441b41",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import findspark\n",
+    "findspark.init()\n",
+    "import pyspark\n",
+    "\n",
+    "from pyspark.sql.functions import *\n",
+    "from pyspark.sql import SparkSession\n",
+    "from pyspark.sql.types import IntegerType\n",
+    "\n",
+    "\n",
+    "spark = SparkSession \\\n",
+    "    .builder \\\n",
+    "    .appName(\"US_accidents\") \\\n",
+    "    .getOrCreate()\n",
+    "print(spark)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f6198dcd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = spark.read.options(header=\"True\", inferSchema=\"True\", nullValue=\"null\").csv(\"dataset/US_Accidents_Dec21_updated.csv\")\n",
+    "#split the date into its parts and create a Duration variable stored in minutes\n",
+    "#cast boolean columns to int\n",
+    "df = df.withColumn(\"dayofweek\", dayofweek(col(\"Start_Time\")).alias(\"dayofweek\"))\\\n",
+    "       .withColumn(\"year\", year(col(\"Start_Time\")).alias(\"year\"))\\\n",
+    "       .withColumn(\"month\", month(col(\"Start_Time\")).alias(\"month\"))\\\n",
+    "       .withColumn(\"dayofmonth\", dayofmonth(col(\"Start_Time\")).alias(\"dayofmonth\"))\\\n",
+    "       .withColumn(\"hour\", hour(col(\"Start_Time\")).alias(\"hour\"))\\\n",
+    "       .withColumn(\"Duration\", (col(\"End_Time\").cast(\"long\") - col(\"Start_Time\").cast(\"long\"))/60)\\\n",
+    "       .withColumn(\"Amenity\", df.Amenity.cast(IntegerType()))\\\n",
+    "       .withColumn(\"Crossing\", df.Crossing.cast(IntegerType()))\\\n",
+    "       .withColumn(\"Give_Way\", df.Give_Way.cast(IntegerType()))\\\n",
+    "       .withColumn(\"Junction\", df.Junction.cast(IntegerType()))\\\n",
+    "       .withColumn(\"No_Exit\", df.No_Exit.cast(IntegerType()))\\\n",
+    "       .withColumn(\"Railway\", df.Railway.cast(IntegerType()))\\\n",
+    "       .withColumn(\"Roundabout\", df.Roundabout.cast(IntegerType()))\\\n",
+    "       .withColumn(\"Station\", df.Station.cast(IntegerType()))\\\n",
+    "       .withColumn(\"Stop\", df.Stop.cast(IntegerType()))\\\n",
+    "       .withColumn(\"Traffic_Calming\", df.Traffic_Calming.cast(IntegerType()))\\\n",
+    "       .withColumn(\"Traffic_Signal\", df.Traffic_Signal.cast(IntegerType()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "2f991065",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#dropping columns\n",
+    "#drop Start_Time, End_Time, Weather_Timestamp because I no longer need them\n",
+    "#drop Number because it has 1700000 missing values and extremely low correlation with the target variable\n",
+    "#drop Country because it's always the same\n",
+    "#drop ID and Description because they were causing an error when fitting the model in the pipeline; both are unnecessary\n",
+    "#drop End_Lng, End_Lat, Bump, Wind_Chill as concluded from the correlation analysis (high corr with other variables)\n",
+    "#drop Turning_Loop because it has no correlation with other variables\n",
+    "#drop the sunrise/sunset and twilight columns because I don't need them, I have hour\n",
+    "#drop Airport_Code and Timezone because they are not important and have a few missing values\n",
+    "df = df.drop('Start_Time', 'End_Time', 'Weather_Timestamp', 'Number', 'Country', 'Description', 'ID',\n",
+    "             'End_Lng', 'End_Lat', 'Bump', 'Turning_Loop', 'Wind_Chill(F)', 'Sunrise_Sunset', 'Civil_Twilight',\n",
+    "             'Nautical_Twilight', 'Astronomical_Twilight', 'Airport_Code', 'Timezone')\n",
+    "\n",
+    "#instead of dropping Precipitation(in), fill it with 0 because most of the missing data in this column\n",
+    "#is from rows where Weather_Condition wasn't rainy, therefore there are no precipitation measurements\n",
+    "df = df.na.fill(value=0, subset=[\"Precipitation(in)\"])\n",
+    "\n",
+    "#remaining columns with missing values:\n",
+    "#Street:2; City:137; Zipcode:1319; Temperature:69274; Humidity:73092; Pressure:59200; Visibility:70546;\n",
+    "#Wind_Direction:73775; Wind_Speed:157944; Weather_Condition:70636;\n",
+    "df = df.na.drop()\n",
+    "#drop rows with missing values, 187383 rows lost in total"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e0999444",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------+-------+\n",
+      "|Severity|  count|\n",
+      "+--------+-------+\n",
+      "|       3| 135914|\n",
+      "|       4| 112405|\n",
+      "|       2|2384141|\n",
+      "|       1|  25499|\n",
+      "+--------+-------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.groupBy(\"Severity\").count().show()"
+   ]
+  },
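+  {
+   "cell_type": "markdown",
+   "id": "ab12cd34",
+   "metadata": {},
+   "source": [
+    "The commit message promises undersampling, but the remaining cells only shuffle the rows and partition the CSV output by `Severity`.\n",
+    "The next cell is a minimal sketch of one way to undersample, assuming every class is sampled down to roughly the size of the smallest class (Severity 1).\n",
+    "The `sampleBy` fractions, the seed, and the `undersampled` name are illustrative assumptions rather than values from the original notebook, so the cell carries no execution count."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef56ab78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#sketch only: stratified undersampling with sampleBy (sampling is approximate, not exact row counts)\n",
+    "counts = {row[\"Severity\"]: row[\"count\"] for row in df.groupBy(\"Severity\").count().collect()}\n",
+    "minority = min(counts.values())\n",
+    "#sample each class down to about the minority-class size; the fractions and seed are assumptions\n",
+    "fractions = {sev: min(1.0, minority / n) for sev, n in counts.items()}\n",
+    "undersampled = df.sampleBy(\"Severity\", fractions=fractions, seed=42)\n",
+    "undersampled.groupBy(\"Severity\").count().show()"
+   ]
+  },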
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "7d0b3752",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pyspark.sql.functions as F\n",
+    "#order the dataframe by a random value to guarantee that the partitions are not ordered by date,\n",
+    "#this ensures that rows from various dates (years) are included\n",
+    "df = df.select(\"*\").orderBy(F.rand())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "93f7a82e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.repartition(1).write.option(\"header\", True) \\\n",
+    "        .partitionBy(\"Severity\") \\\n",
+    "        .csv(\"dataset/severity\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}