Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Data-Science/diabetes_pred.ipynb
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
899 lines (899 sloc)
31.5 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "diabetes_pred.ipynb", | |
"provenance": [], | |
"collapsed_sections": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "X3O_nO8eZqRq", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"!apt-get install openjdk-8-jdk-headless -qq > /dev/null" | |
], | |
"execution_count": 9, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "pmDnPTPDZ1fo", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 72 | |
}, | |
"outputId": "418b16f4-f2f2-4aa8-da92-ab2ff11795ef" | |
}, | |
"source": [ | |
"!pip install pyspark==2.4.4\n" | |
], | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Requirement already satisfied: pyspark==2.4.4 in /usr/local/lib/python3.6/dist-packages (2.4.4)\n", | |
"Requirement already satisfied: py4j==0.10.7 in /usr/local/lib/python3.6/dist-packages (from pyspark==2.4.4) (0.10.7)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Mp6h5o9iaSq6", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Environment Path\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "nKVK-xcQaU1T", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import os\n", | |
"os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'" | |
], | |
"execution_count": 14, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "9yK6BQfjakoD", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"from pyspark.sql import SparkSession\n", | |
"from pyspark.sql.functions import *\n", | |
"\n", | |
"spark = SparkSession.builder.appName(\"spark\").getOrCreate()" | |
], | |
"execution_count": 49, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "yIjAGHsAcAav", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Clone Diabetes Dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "p7yXof5HcNs9", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 54 | |
}, | |
"outputId": "2a69b808-579d-4a17-c24d-1cc62daf71b6" | |
}, | |
"source": [ | |
"! git clone \"https://github.com/education454/diabetes_dataset\"" | |
], | |
"execution_count": 37, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"fatal: destination path 'diabetes_dataset' already exists and is not an empty directory.\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "naltoltxcVRe", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "51bed9bc-232c-49a6-f3fa-1c1edef165d6" | |
}, | |
"source": [ | |
"!ls" | |
], | |
"execution_count": 38, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"diabetes_dataset sample_data\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "8ihyUeqJcZNB", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"df = spark.read.csv(\"/content/diabetes_dataset/diabetes.csv\",header = True, inferSchema = True)" | |
], | |
"execution_count": 39, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "MyOFjGJxdJVq", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 488 | |
}, | |
"outputId": "886cdd2d-5fab-4277-b2b2-9ee140728d26" | |
}, | |
"source": [ | |
"df.show()" | |
], | |
"execution_count": 40, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+\n", | |
"|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|\n", | |
"+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+\n", | |
"| 2| 138| 62| 35| 0|33.6| 0.127| 47| 1|\n", | |
"| 0| 84| 82| 31| 125|38.2| 0.233| 23| 0|\n", | |
"| 0| 145| 0| 0| 0|44.2| 0.63| 31| 1|\n", | |
"| 0| 135| 68| 42| 250|42.3| 0.365| 24| 1|\n", | |
"| 1| 139| 62| 41| 480|40.7| 0.536| 21| 0|\n", | |
"| 0| 173| 78| 32| 265|46.5| 1.159| 58| 0|\n", | |
"| 4| 99| 72| 17| 0|25.6| 0.294| 28| 0|\n", | |
"| 8| 194| 80| 0| 0|26.1| 0.551| 67| 0|\n", | |
"| 2| 83| 65| 28| 66|36.8| 0.629| 24| 0|\n", | |
"| 2| 89| 90| 30| 0|33.5| 0.292| 42| 0|\n", | |
"| 4| 99| 68| 38| 0|32.8| 0.145| 33| 0|\n", | |
"| 4| 125| 70| 18| 122|28.9| 1.144| 45| 1|\n", | |
"| 3| 80| 0| 0| 0| 0.0| 0.174| 22| 0|\n", | |
"| 6| 166| 74| 0| 0|26.6| 0.304| 66| 0|\n", | |
"| 5| 110| 68| 0| 0|26.0| 0.292| 30| 0|\n", | |
"| 2| 81| 72| 15| 76|30.1| 0.547| 25| 0|\n", | |
"| 7| 195| 70| 33| 145|25.1| 0.163| 55| 1|\n", | |
"| 6| 154| 74| 32| 193|29.3| 0.839| 39| 0|\n", | |
"| 2| 117| 90| 19| 71|25.2| 0.313| 21| 0|\n", | |
"| 3| 84| 72| 32| 0|37.2| 0.267| 28| 0|\n", | |
"+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "jUR6XpQqdMWD", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 208 | |
}, | |
"outputId": "fbc8eb62-c131-4c89-861e-38662304da8c" | |
}, | |
"source": [ | |
"df.printSchema()" | |
], | |
"execution_count": 41, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"root\n", | |
" |-- Pregnancies: integer (nullable = true)\n", | |
" |-- Glucose: integer (nullable = true)\n", | |
" |-- BloodPressure: integer (nullable = true)\n", | |
" |-- SkinThickness: integer (nullable = true)\n", | |
" |-- Insulin: integer (nullable = true)\n", | |
" |-- BMI: double (nullable = true)\n", | |
" |-- DiabetesPedigreeFunction: double (nullable = true)\n", | |
" |-- Age: integer (nullable = true)\n", | |
" |-- Outcome: integer (nullable = true)\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "nv9wfSr5dYcq", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "2c67a49b-43ed-4d27-d986-74ce5be50063" | |
}, | |
"source": [ | |
"print(df.count(), len(df.columns))" | |
], | |
"execution_count": 42, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"2000 9\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ypmXeML5djzX", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 138 | |
}, | |
"outputId": "9f561434-591c-4596-f637-38a3264be220" | |
}, | |
"source": [ | |
"df.groupby(\"Outcome\").count().show()" | |
], | |
"execution_count": 43, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"+-------+-----+\n", | |
"|Outcome|count|\n", | |
"+-------+-----+\n", | |
"| 1| 684|\n", | |
"| 0| 1316|\n", | |
"+-------+-----+\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "p2iEPQx0d9Gd", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 210 | |
}, | |
"outputId": "3adee5b2-10d0-40e3-b584-807b704c27e0" | |
}, | |
"source": [ | |
"df.describe().show()" | |
], | |
"execution_count": 44, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+\n", | |
"|summary| Pregnancies| Glucose| BloodPressure| SkinThickness| Insulin| BMI|DiabetesPedigreeFunction| Age| Outcome|\n", | |
"+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+\n", | |
"| count| 2000| 2000| 2000| 2000| 2000| 2000| 2000| 2000| 2000|\n", | |
"| mean| 3.7035| 121.1825| 69.1455| 20.935| 80.254|32.192999999999984| 0.47092999999999974| 33.0905| 0.342|\n", | |
"| stddev|3.306063032730656|32.068635649902916|19.188314815604098|16.10324290992682|111.1805335457595| 8.149900701279762| 0.3235525586811429|11.786423106049496|0.4744982342297426|\n", | |
"| min| 0| 0| 0| 0| 0| 0.0| 0.078| 21| 0|\n", | |
"| max| 17| 199| 122| 110| 744| 80.6| 2.42| 81| 1|\n", | |
"+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "m8w3T4VJehV4", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Cleaning Data(EDA)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "AYVPyWtdelI1", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 173 | |
}, | |
"outputId": "7f5ca343-0293-4bda-d115-396e8e0d66d6" | |
}, | |
"source": [ | |
"#finding null values\n", | |
"for col in df.columns:\n", | |
" print(col + \":\",df[df[col].isNull()].count())" | |
], | |
"execution_count": 45, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Pregnancies: 0\n", | |
"Glucose: 0\n", | |
"BloodPressure: 0\n", | |
"SkinThickness: 0\n", | |
"Insulin: 0\n", | |
"BMI: 0\n", | |
"DiabetesPedigreeFunction: 0\n", | |
"Age: 0\n", | |
"Outcome: 0\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "r1p5W2RdendZ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"def count_zeros():\n", | |
" columns_list = [\"Glucose\", \"BloodPressure\", \"SkinThickness\", \"Insulin\", \"BMI\"]\n", | |
" for i in columns_list:\n", | |
" print(i+\":\",df[df[i]==0].count())" | |
], | |
"execution_count": 46, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ZHlZ1DyPkd0I", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 104 | |
}, | |
"outputId": "7e3051a7-4fa2-4707-e21e-2dc3a1898422" | |
}, | |
"source": [ | |
"count_zeros()" | |
], | |
"execution_count": 47, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Glucose: 13\n", | |
"BloodPressure: 90\n", | |
"SkinThickness: 573\n", | |
"Insulin: 956\n", | |
"BMI: 28\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "rrzOraMdkf0Q", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 104 | |
}, | |
"outputId": "3fbc6de4-76f8-400e-9707-b5a8c8accebd" | |
}, | |
"source": [ | |
"for i in df.columns[1:6]:\n", | |
" data = df.agg({i:'mean'}).first()[0]\n", | |
" print(\"Mean value for {} is {}\".format(i,int(data)))\n", | |
" df = df.withColumn(i,when(df[i]==0,int(data)).otherwise(df[i]))" | |
], | |
"execution_count": 82, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Mean value for Glucose is 121\n", | |
"Mean value for BloodPressure is 72\n", | |
"Mean value for SkinThickness is 26\n", | |
"Mean value for Insulin is 118\n", | |
"Mean value for BMI is 32\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "AGDhT_tonlma", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 54 | |
}, | |
"outputId": "4afa2a8f-c4b4-48cb-8321-d94f2309ed02" | |
}, | |
"source": [ | |
"df.show" | |
], | |
"execution_count": 65, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<bound method DataFrame.show of DataFrame[Pregnancies: int, Glucose: int, BloodPressure: int, SkinThickness: int, Insulin: int, BMI: double, DiabetesPedigreeFunction: double, Age: int, Outcome: int]>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 65 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "DA0IL4wipG8V", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 173 | |
}, | |
"outputId": "f1ad5686-54a5-4f7c-c73a-aff96d6e6bc4" | |
}, | |
"source": [ | |
"for col in df.columns:\n", | |
" print(\"correlation to outcome for {} is {}\".format(col,df.stat.corr(\"Outcome\",col)))\n" | |
], | |
"execution_count": 56, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"correlation to outcome for Pregnancies is 0.22443699263363961\n", | |
"correlation to outcome for Glucose is 0.48796646527321064\n", | |
"correlation to outcome for BloodPressure is 0.17171333286446713\n", | |
"correlation to outcome for SkinThickness is 0.1659010662889893\n", | |
"correlation to outcome for Insulin is 0.1711763270226193\n", | |
"correlation to outcome for BMI is 0.2827927569760082\n", | |
"correlation to outcome for DiabetesPedigreeFunction is 0.1554590791569403\n", | |
"correlation to outcome for Age is 0.23650924717620253\n", | |
"correlation to outcome for Outcome is 1.0\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "UC-cGFxMzAWW", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"from pyspark.ml.feature import VectorAssembler\n", | |
"assembler = VectorAssembler(inputCols = [\"Pregnancies\",\"Glucose\", \"BloodPressure\", \"SkinThickness\", \"Insulin\", \"BMI\",\"DiabetesPedigreeFunction\",\"Age\"], outputCol = \"Features\")\n", | |
"output_data = assembler.transform(df)" | |
], | |
"execution_count": 84, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Wjvi58ck2T1e", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 225 | |
}, | |
"outputId": "2764a872-b4a0-4a91-bd8d-fbbdd73e37b1" | |
}, | |
"source": [ | |
"output_data.printSchema()" | |
], | |
"execution_count": 87, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"root\n", | |
" |-- Pregnancies: integer (nullable = true)\n", | |
" |-- Glucose: integer (nullable = true)\n", | |
" |-- BloodPressure: integer (nullable = true)\n", | |
" |-- SkinThickness: integer (nullable = true)\n", | |
" |-- Insulin: integer (nullable = true)\n", | |
" |-- BMI: double (nullable = true)\n", | |
" |-- DiabetesPedigreeFunction: double (nullable = true)\n", | |
" |-- Age: integer (nullable = true)\n", | |
" |-- Outcome: integer (nullable = true)\n", | |
" |-- Features: vector (nullable = true)\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ixrVfC6K25Ko", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 488 | |
}, | |
"outputId": "32b041be-951c-42b3-a537-66ae28255d2a" | |
}, | |
"source": [ | |
"output_data.show()" | |
], | |
"execution_count": 88, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+\n", | |
"|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome| Features|\n", | |
"+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+\n", | |
"| 2| 138| 62| 35| 80|33.6| 0.127| 47| 1|[2.0,138.0,62.0,3...|\n", | |
"| 0| 84| 82| 31| 125|38.2| 0.233| 23| 0|[0.0,84.0,82.0,31...|\n", | |
"| 0| 145| 69| 20| 80|44.2| 0.63| 31| 1|[0.0,145.0,69.0,2...|\n", | |
"| 0| 135| 68| 42| 250|42.3| 0.365| 24| 1|[0.0,135.0,68.0,4...|\n", | |
"| 1| 139| 62| 41| 480|40.7| 0.536| 21| 0|[1.0,139.0,62.0,4...|\n", | |
"| 0| 173| 78| 32| 265|46.5| 1.159| 58| 0|[0.0,173.0,78.0,3...|\n", | |
"| 4| 99| 72| 17| 80|25.6| 0.294| 28| 0|[4.0,99.0,72.0,17...|\n", | |
"| 8| 194| 80| 20| 80|26.1| 0.551| 67| 0|[8.0,194.0,80.0,2...|\n", | |
"| 2| 83| 65| 28| 66|36.8| 0.629| 24| 0|[2.0,83.0,65.0,28...|\n", | |
"| 2| 89| 90| 30| 80|33.5| 0.292| 42| 0|[2.0,89.0,90.0,30...|\n", | |
"| 4| 99| 68| 38| 80|32.8| 0.145| 33| 0|[4.0,99.0,68.0,38...|\n", | |
"| 4| 125| 70| 18| 122|28.9| 1.144| 45| 1|[4.0,125.0,70.0,1...|\n", | |
"| 3| 80| 69| 20| 80|32.0| 0.174| 22| 0|[3.0,80.0,69.0,20...|\n", | |
"| 6| 166| 74| 20| 80|26.6| 0.304| 66| 0|[6.0,166.0,74.0,2...|\n", | |
"| 5| 110| 68| 20| 80|26.0| 0.292| 30| 0|[5.0,110.0,68.0,2...|\n", | |
"| 2| 81| 72| 15| 76|30.1| 0.547| 25| 0|[2.0,81.0,72.0,15...|\n", | |
"| 7| 195| 70| 33| 145|25.1| 0.163| 55| 1|[7.0,195.0,70.0,3...|\n", | |
"| 6| 154| 74| 32| 193|29.3| 0.839| 39| 0|[6.0,154.0,74.0,3...|\n", | |
"| 2| 117| 90| 19| 71|25.2| 0.313| 21| 0|[2.0,117.0,90.0,1...|\n", | |
"| 3| 84| 72| 32| 80|37.2| 0.267| 28| 0|[3.0,84.0,72.0,32...|\n", | |
"+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "mGwX6dAb3DJL", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"from pyspark.ml.classification import LogisticRegression\n", | |
"final_data = output_data.select(\"features\",\"Outcome\")" | |
], | |
"execution_count": 90, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "9whLv9iX3Gj-", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 86 | |
}, | |
"outputId": "90bd4030-c500-43ba-f2f5-c02c52f6da56" | |
}, | |
"source": [ | |
"final_data.printSchema()" | |
], | |
"execution_count": 91, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"root\n", | |
" |-- features: vector (nullable = true)\n", | |
" |-- Outcome: integer (nullable = true)\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "52hXBl1l4ZxU", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"train,test = final_data.randomSplit([0.7,0.3])\n", | |
"models = LogisticRegression(labelCol=\"Outcome\")\n", | |
"model = models.fit(train)" | |
], | |
"execution_count": 92, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "OvGCMh0y4xsF", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"summary = model.summary" | |
], | |
"execution_count": 93, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "DtT8pL7743_S", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 190 | |
}, | |
"outputId": "05127843-0367-45a3-87ac-70332aa0ba2c" | |
}, | |
"source": [ | |
"summary.predictions.describe().show()" | |
], | |
"execution_count": 104, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"+-------+------------------+-------------------+\n", | |
"|summary| Outcome| prediction|\n", | |
"+-------+------------------+-------------------+\n", | |
"| count| 1384| 1384|\n", | |
"| mean| 0.342485549132948|0.26372832369942195|\n", | |
"| stddev|0.4747125702730636| 0.4408129952509889|\n", | |
"| min| 0.0| 0.0|\n", | |
"| max| 1.0| 1.0|\n", | |
"+-------+------------------+-------------------+\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "gYpreMQP52z9", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Evaluation and Test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "lTDJe6rU6CWy", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", | |
"predictions = model.evaluate(test)" | |
], | |
"execution_count": 101, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7ZJw8Y7J6NS6", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 661 | |
}, | |
"outputId": "d675ec35-120e-4db2-b060-c3dabd95493a" | |
}, | |
"source": [ | |
"predictions.predictions.show(30)" | |
], | |
"execution_count": 108, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"+--------------------+-------+--------------------+--------------------+----------+\n", | |
"| features|Outcome| rawPrediction| probability|prediction|\n", | |
"+--------------------+-------+--------------------+--------------------+----------+\n", | |
"|[0.0,78.0,88.0,29...| 0|[2.75976400212540...|[0.94046242120211...| 0.0|\n", | |
"|[0.0,86.0,68.0,32...| 0|[2.63434783996648...|[0.93303969972330...| 0.0|\n", | |
"|[0.0,91.0,80.0,20...| 0|[2.36442091330221...|[0.91407367350119...| 0.0|\n", | |
"|[0.0,93.0,60.0,20...| 0|[2.23771089675291...|[0.90358521812295...| 0.0|\n", | |
"|[0.0,93.0,60.0,25...| 0|[2.71605230252559...|[0.93796723488538...| 0.0|\n", | |
"|[0.0,93.0,60.0,25...| 0|[2.71605230252559...|[0.93796723488538...| 0.0|\n", | |
"|[0.0,93.0,100.0,3...| 0|[1.07067297656603...|[0.74472487629355...| 0.0|\n", | |
"|[0.0,93.0,100.0,3...| 0|[1.07067297656603...|[0.74472487629355...| 0.0|\n", | |
"|[0.0,94.0,69.0,20...| 0|[2.57159767261367...|[0.92901113378793...| 0.0|\n", | |
"|[0.0,94.0,70.0,27...| 0|[1.57511304007697...|[0.82851129139262...| 0.0|\n", | |
"|[0.0,95.0,80.0,45...| 0|[2.36264571397709...|[0.91393414150497...| 0.0|\n", | |
"|[0.0,95.0,85.0,25...| 1|[2.16883059345644...|[0.89741535980435...| 0.0|\n", | |
"|[0.0,97.0,64.0,36...| 0|[1.85038788785940...|[0.86417263903592...| 0.0|\n", | |
"|[0.0,100.0,70.0,2...| 0|[2.26519831618682...|[0.90595347246032...| 0.0|\n", | |
"|[0.0,100.0,88.0,6...| 0|[0.78543673487209...|[0.68685067022649...| 0.0|\n", | |
"|[0.0,101.0,64.0,1...| 0|[3.35418392107285...|[0.96624157683580...| 0.0|\n", | |
"|[0.0,101.0,65.0,2...| 0|[3.13408229218634...|[0.95827691793505...| 0.0|\n", | |
"|[0.0,101.0,76.0,2...| 0|[2.05507007352773...|[0.88645891983777...| 0.0|\n", | |
"|[0.0,102.0,64.0,4...| 0|[1.55630594646828...|[0.82582264114614...| 0.0|\n", | |
"|[0.0,102.0,75.0,2...| 0|[2.12554447217875...|[0.89336128734001...| 0.0|\n", | |
"|[0.0,102.0,78.0,4...| 0|[2.34879062018792...|[0.91283805159587...| 0.0|\n", | |
"|[0.0,104.0,64.0,3...| 1|[2.01051283795057...|[0.88189644734275...| 0.0|\n", | |
"|[0.0,104.0,64.0,3...| 1|[2.01051283795057...|[0.88189644734275...| 0.0|\n", | |
"|[0.0,104.0,76.0,2...| 0|[3.20400251717622...|[0.96098462149452...| 0.0|\n", | |
"|[0.0,105.0,84.0,2...| 1|[1.65899947069871...|[0.84010364817703...| 0.0|\n", | |
"|[0.0,105.0,84.0,2...| 1|[1.65899947069871...|[0.84010364817703...| 0.0|\n", | |
"|[0.0,106.0,70.0,3...| 0|[1.40726577385673...|[0.80333432617655...| 0.0|\n", | |
"|[0.0,106.0,70.0,3...| 0|[1.40726577385673...|[0.80333432617655...| 0.0|\n", | |
"|[0.0,107.0,60.0,2...| 0|[2.76995046775626...|[0.94103023793567...| 0.0|\n", | |
"|[0.0,108.0,68.0,2...| 0|[1.91862488493013...|[0.87198501200635...| 0.0|\n", | |
"+--------------------+-------+--------------------+--------------------+----------+\n", | |
"only showing top 30 rows\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ffQPi_sZ6Stk", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"evaluator = BinaryClassificationEvaluator(rawPredictionCol=\"rawPrediction\",labelCol=\"Outcome\")" | |
], | |
"execution_count": 109, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ul-sONZP7auv", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "2828f27d-0e62-4380-ebf3-0223f6824a34" | |
}, | |
"source": [ | |
"evaluator.evaluate(model.transform(test))" | |
], | |
"execution_count": 110, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"0.831280788177339" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 110 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "i527rhzz7j5x", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"model.save(\"diabetes-model\")" | |
], | |
"execution_count": 111, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "JNInrkbp8XWG", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"to use model again\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "oQviGdDo7vix", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"from pyspark.ml.classification import LogisticRegressionModel\n", | |
"model = LogisticRegressionModel.load(\"diabetes-model\")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |