Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "diabetes_pred.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "X3O_nO8eZqRq",
"colab_type": "code",
"colab": {}
},
"source": [
"!apt-get install openjdk-8-jdk-headless -qq > /dev/null"
],
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "pmDnPTPDZ1fo",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 72
},
"outputId": "418b16f4-f2f2-4aa8-da92-ab2ff11795ef"
},
"source": [
"!pip install pyspark==2.4.4\n"
],
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: pyspark==2.4.4 in /usr/local/lib/python3.6/dist-packages (2.4.4)\n",
"Requirement already satisfied: py4j==0.10.7 in /usr/local/lib/python3.6/dist-packages (from pyspark==2.4.4) (0.10.7)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Mp6h5o9iaSq6",
"colab_type": "text"
},
"source": [
"Environment Path\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "nKVK-xcQaU1T",
"colab_type": "code",
"colab": {}
},
"source": [
"import os\n",
"os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'"
],
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "9yK6BQfjakoD",
"colab_type": "code",
"colab": {}
},
"source": [
"from pyspark.sql import SparkSession\n",
"# Import only the name this notebook uses. A star import also injects `col`,\n",
"# `sum`, `min`, `max`, ... which shadow builtins and collide with loop variables.\n",
"from pyspark.sql.functions import when\n",
"\n",
"spark = SparkSession.builder.appName(\"spark\").getOrCreate()"
],
"execution_count": 49,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "yIjAGHsAcAav",
"colab_type": "text"
},
"source": [
"Clone Diabetes Dataset"
]
},
{
"cell_type": "code",
"metadata": {
"id": "p7yXof5HcNs9",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
},
"outputId": "2a69b808-579d-4a17-c24d-1cc62daf71b6"
},
"source": [
"# Clone only when the directory is missing, so re-running the cell does not fail\n",
"# with \"destination path already exists\".\n",
"![ -d diabetes_dataset ] || git clone \"https://github.com/education454/diabetes_dataset\""
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "naltoltxcVRe",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "51bed9bc-232c-49a6-f3fa-1c1edef165d6"
},
"source": [
"!ls"
],
"execution_count": 38,
"outputs": [
{
"output_type": "stream",
"text": [
"diabetes_dataset sample_data\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "8ihyUeqJcZNB",
"colab_type": "code",
"colab": {}
},
"source": [
"df = spark.read.csv(\"/content/diabetes_dataset/diabetes.csv\",header = True, inferSchema = True)"
],
"execution_count": 39,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "MyOFjGJxdJVq",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 488
},
"outputId": "886cdd2d-5fab-4277-b2b2-9ee140728d26"
},
"source": [
"df.show()"
],
"execution_count": 40,
"outputs": [
{
"output_type": "stream",
"text": [
"+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+\n",
"|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|\n",
"+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+\n",
"| 2| 138| 62| 35| 0|33.6| 0.127| 47| 1|\n",
"| 0| 84| 82| 31| 125|38.2| 0.233| 23| 0|\n",
"| 0| 145| 0| 0| 0|44.2| 0.63| 31| 1|\n",
"| 0| 135| 68| 42| 250|42.3| 0.365| 24| 1|\n",
"| 1| 139| 62| 41| 480|40.7| 0.536| 21| 0|\n",
"| 0| 173| 78| 32| 265|46.5| 1.159| 58| 0|\n",
"| 4| 99| 72| 17| 0|25.6| 0.294| 28| 0|\n",
"| 8| 194| 80| 0| 0|26.1| 0.551| 67| 0|\n",
"| 2| 83| 65| 28| 66|36.8| 0.629| 24| 0|\n",
"| 2| 89| 90| 30| 0|33.5| 0.292| 42| 0|\n",
"| 4| 99| 68| 38| 0|32.8| 0.145| 33| 0|\n",
"| 4| 125| 70| 18| 122|28.9| 1.144| 45| 1|\n",
"| 3| 80| 0| 0| 0| 0.0| 0.174| 22| 0|\n",
"| 6| 166| 74| 0| 0|26.6| 0.304| 66| 0|\n",
"| 5| 110| 68| 0| 0|26.0| 0.292| 30| 0|\n",
"| 2| 81| 72| 15| 76|30.1| 0.547| 25| 0|\n",
"| 7| 195| 70| 33| 145|25.1| 0.163| 55| 1|\n",
"| 6| 154| 74| 32| 193|29.3| 0.839| 39| 0|\n",
"| 2| 117| 90| 19| 71|25.2| 0.313| 21| 0|\n",
"| 3| 84| 72| 32| 0|37.2| 0.267| 28| 0|\n",
"+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+\n",
"only showing top 20 rows\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "jUR6XpQqdMWD",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 208
},
"outputId": "fbc8eb62-c131-4c89-861e-38662304da8c"
},
"source": [
"df.printSchema()"
],
"execution_count": 41,
"outputs": [
{
"output_type": "stream",
"text": [
"root\n",
" |-- Pregnancies: integer (nullable = true)\n",
" |-- Glucose: integer (nullable = true)\n",
" |-- BloodPressure: integer (nullable = true)\n",
" |-- SkinThickness: integer (nullable = true)\n",
" |-- Insulin: integer (nullable = true)\n",
" |-- BMI: double (nullable = true)\n",
" |-- DiabetesPedigreeFunction: double (nullable = true)\n",
" |-- Age: integer (nullable = true)\n",
" |-- Outcome: integer (nullable = true)\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "nv9wfSr5dYcq",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "2c67a49b-43ed-4d27-d986-74ce5be50063"
},
"source": [
"print(df.count(), len(df.columns))"
],
"execution_count": 42,
"outputs": [
{
"output_type": "stream",
"text": [
"2000 9\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ypmXeML5djzX",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 138
},
"outputId": "9f561434-591c-4596-f637-38a3264be220"
},
"source": [
"df.groupby(\"Outcome\").count().show()"
],
"execution_count": 43,
"outputs": [
{
"output_type": "stream",
"text": [
"+-------+-----+\n",
"|Outcome|count|\n",
"+-------+-----+\n",
"| 1| 684|\n",
"| 0| 1316|\n",
"+-------+-----+\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "p2iEPQx0d9Gd",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 210
},
"outputId": "3adee5b2-10d0-40e3-b584-807b704c27e0"
},
"source": [
"df.describe().show()"
],
"execution_count": 44,
"outputs": [
{
"output_type": "stream",
"text": [
"+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+\n",
"|summary| Pregnancies| Glucose| BloodPressure| SkinThickness| Insulin| BMI|DiabetesPedigreeFunction| Age| Outcome|\n",
"+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+\n",
"| count| 2000| 2000| 2000| 2000| 2000| 2000| 2000| 2000| 2000|\n",
"| mean| 3.7035| 121.1825| 69.1455| 20.935| 80.254|32.192999999999984| 0.47092999999999974| 33.0905| 0.342|\n",
"| stddev|3.306063032730656|32.068635649902916|19.188314815604098|16.10324290992682|111.1805335457595| 8.149900701279762| 0.3235525586811429|11.786423106049496|0.4744982342297426|\n",
"| min| 0| 0| 0| 0| 0| 0.0| 0.078| 21| 0|\n",
"| max| 17| 199| 122| 110| 744| 80.6| 2.42| 81| 1|\n",
"+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "m8w3T4VJehV4",
"colab_type": "text"
},
"source": [
"Data Cleaning and Exploratory Data Analysis (EDA)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "AYVPyWtdelI1",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 173
},
"outputId": "7f5ca343-0293-4bda-d115-396e8e0d66d6"
},
"source": [
"# Count null values per column. The loop variable is deliberately not named\n",
"# `col`, which would overwrite pyspark.sql.functions.col in the notebook namespace.\n",
"for column_name in df.columns:\n",
"    print(column_name + \":\", df[df[column_name].isNull()].count())"
],
"execution_count": 45,
"outputs": [
{
"output_type": "stream",
"text": [
"Pregnancies: 0\n",
"Glucose: 0\n",
"BloodPressure: 0\n",
"SkinThickness: 0\n",
"Insulin: 0\n",
"BMI: 0\n",
"DiabetesPedigreeFunction: 0\n",
"Age: 0\n",
"Outcome: 0\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "r1p5W2RdendZ",
"colab_type": "code",
"colab": {}
},
"source": [
"def count_zeros(data=None, columns=(\"Glucose\", \"BloodPressure\", \"SkinThickness\", \"Insulin\", \"BMI\")):\n",
"    \"\"\"Print, for each listed column, how many rows hold the value 0.\n",
"\n",
"    Args:\n",
"        data: DataFrame to inspect. Defaults to the notebook-level `df`,\n",
"            looked up at call time, so `count_zeros()` keeps working as before.\n",
"        columns: Names of the columns to check.\n",
"    \"\"\"\n",
"    if data is None:\n",
"        data = df\n",
"    for name in columns:\n",
"        print(name + \":\", data[data[name] == 0].count())"
],
"execution_count": 46,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ZHlZ1DyPkd0I",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 104
},
"outputId": "7e3051a7-4fa2-4707-e21e-2dc3a1898422"
},
"source": [
"count_zeros()"
],
"execution_count": 47,
"outputs": [
{
"output_type": "stream",
"text": [
"Glucose: 13\n",
"BloodPressure: 90\n",
"SkinThickness: 573\n",
"Insulin: 956\n",
"BMI: 28\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "rrzOraMdkf0Q",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 104
},
"outputId": "3fbc6de4-76f8-400e-9707-b5a8c8accebd"
},
"source": [
"# In these columns a 0 is a placeholder for a missing reading, not a real value.\n",
"zero_as_missing_cols = [\"Glucose\", \"BloodPressure\", \"SkinThickness\", \"Insulin\", \"BMI\"]\n",
"for name in zero_as_missing_cols:\n",
"    # Average only the real (non-zero) readings; including the placeholders\n",
"    # would drag the fill value towards zero.\n",
"    mean_value = df.filter(df[name] != 0).agg({name: 'mean'}).first()[0]\n",
"    if mean_value is None:\n",
"        # No non-zero reading exists, so there is nothing to impute from.\n",
"        continue\n",
"    fill_value = int(mean_value)  # truncated so integer columns keep their type\n",
"    print(\"Mean value for {} is {}\".format(name, fill_value))\n",
"    df = df.withColumn(name, when(df[name] == 0, fill_value).otherwise(df[name]))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "AGDhT_tonlma",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
},
"outputId": "4afa2a8f-c4b4-48cb-8321-d94f2309ed02"
},
"source": [
"# Call the method; a bare `df.show` only displays the bound-method repr.\n",
"df.show()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "DA0IL4wipG8V",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 173
},
"outputId": "f1ad5686-54a5-4f7c-c73a-aff96d6e6bc4"
},
"source": [
"# Pearson correlation of every column with the label. The loop variable is\n",
"# `feature` rather than `col`, so pyspark.sql.functions.col is not overwritten.\n",
"for feature in df.columns:\n",
"    print(\"correlation to outcome for {} is {}\".format(feature, df.stat.corr(\"Outcome\", feature)))\n"
],
"execution_count": 56,
"outputs": [
{
"output_type": "stream",
"text": [
"correlation to outcome for Pregnancies is 0.22443699263363961\n",
"correlation to outcome for Glucose is 0.48796646527321064\n",
"correlation to outcome for BloodPressure is 0.17171333286446713\n",
"correlation to outcome for SkinThickness is 0.1659010662889893\n",
"correlation to outcome for Insulin is 0.1711763270226193\n",
"correlation to outcome for BMI is 0.2827927569760082\n",
"correlation to outcome for DiabetesPedigreeFunction is 0.1554590791569403\n",
"correlation to outcome for Age is 0.23650924717620253\n",
"correlation to outcome for Outcome is 1.0\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "UC-cGFxMzAWW",
"colab_type": "code",
"colab": {}
},
"source": [
"from pyspark.ml.feature import VectorAssembler\n",
"assembler = VectorAssembler(inputCols = [\"Pregnancies\",\"Glucose\", \"BloodPressure\", \"SkinThickness\", \"Insulin\", \"BMI\",\"DiabetesPedigreeFunction\",\"Age\"], outputCol = \"Features\")\n",
"output_data = assembler.transform(df)"
],
"execution_count": 84,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Wjvi58ck2T1e",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 225
},
"outputId": "2764a872-b4a0-4a91-bd8d-fbbdd73e37b1"
},
"source": [
"output_data.printSchema()"
],
"execution_count": 87,
"outputs": [
{
"output_type": "stream",
"text": [
"root\n",
" |-- Pregnancies: integer (nullable = true)\n",
" |-- Glucose: integer (nullable = true)\n",
" |-- BloodPressure: integer (nullable = true)\n",
" |-- SkinThickness: integer (nullable = true)\n",
" |-- Insulin: integer (nullable = true)\n",
" |-- BMI: double (nullable = true)\n",
" |-- DiabetesPedigreeFunction: double (nullable = true)\n",
" |-- Age: integer (nullable = true)\n",
" |-- Outcome: integer (nullable = true)\n",
" |-- Features: vector (nullable = true)\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ixrVfC6K25Ko",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 488
},
"outputId": "32b041be-951c-42b3-a537-66ae28255d2a"
},
"source": [
"output_data.show()"
],
"execution_count": 88,
"outputs": [
{
"output_type": "stream",
"text": [
"+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+\n",
"|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome| Features|\n",
"+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+\n",
"| 2| 138| 62| 35| 80|33.6| 0.127| 47| 1|[2.0,138.0,62.0,3...|\n",
"| 0| 84| 82| 31| 125|38.2| 0.233| 23| 0|[0.0,84.0,82.0,31...|\n",
"| 0| 145| 69| 20| 80|44.2| 0.63| 31| 1|[0.0,145.0,69.0,2...|\n",
"| 0| 135| 68| 42| 250|42.3| 0.365| 24| 1|[0.0,135.0,68.0,4...|\n",
"| 1| 139| 62| 41| 480|40.7| 0.536| 21| 0|[1.0,139.0,62.0,4...|\n",
"| 0| 173| 78| 32| 265|46.5| 1.159| 58| 0|[0.0,173.0,78.0,3...|\n",
"| 4| 99| 72| 17| 80|25.6| 0.294| 28| 0|[4.0,99.0,72.0,17...|\n",
"| 8| 194| 80| 20| 80|26.1| 0.551| 67| 0|[8.0,194.0,80.0,2...|\n",
"| 2| 83| 65| 28| 66|36.8| 0.629| 24| 0|[2.0,83.0,65.0,28...|\n",
"| 2| 89| 90| 30| 80|33.5| 0.292| 42| 0|[2.0,89.0,90.0,30...|\n",
"| 4| 99| 68| 38| 80|32.8| 0.145| 33| 0|[4.0,99.0,68.0,38...|\n",
"| 4| 125| 70| 18| 122|28.9| 1.144| 45| 1|[4.0,125.0,70.0,1...|\n",
"| 3| 80| 69| 20| 80|32.0| 0.174| 22| 0|[3.0,80.0,69.0,20...|\n",
"| 6| 166| 74| 20| 80|26.6| 0.304| 66| 0|[6.0,166.0,74.0,2...|\n",
"| 5| 110| 68| 20| 80|26.0| 0.292| 30| 0|[5.0,110.0,68.0,2...|\n",
"| 2| 81| 72| 15| 76|30.1| 0.547| 25| 0|[2.0,81.0,72.0,15...|\n",
"| 7| 195| 70| 33| 145|25.1| 0.163| 55| 1|[7.0,195.0,70.0,3...|\n",
"| 6| 154| 74| 32| 193|29.3| 0.839| 39| 0|[6.0,154.0,74.0,3...|\n",
"| 2| 117| 90| 19| 71|25.2| 0.313| 21| 0|[2.0,117.0,90.0,1...|\n",
"| 3| 84| 72| 32| 80|37.2| 0.267| 28| 0|[3.0,84.0,72.0,32...|\n",
"+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+\n",
"only showing top 20 rows\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "mGwX6dAb3DJL",
"colab_type": "code",
"colab": {}
},
"source": [
"from pyspark.ml.classification import LogisticRegression\n",
"# The assembler wrote the vector to \"Features\"; rename it explicitly to the\n",
"# lower-case \"features\" that LogisticRegression reads by default, instead of\n",
"# relying on Spark's case-insensitive column lookup.\n",
"final_data = output_data.select(output_data[\"Features\"].alias(\"features\"), \"Outcome\")"
],
"execution_count": 90,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "9whLv9iX3Gj-",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 86
},
"outputId": "90bd4030-c500-43ba-f2f5-c02c52f6da56"
},
"source": [
"final_data.printSchema()"
],
"execution_count": 91,
"outputs": [
{
"output_type": "stream",
"text": [
"root\n",
" |-- features: vector (nullable = true)\n",
" |-- Outcome: integer (nullable = true)\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "52hXBl1l4ZxU",
"colab_type": "code",
"colab": {}
},
"source": [
"# Fixed seed so the train/test partition, and every metric computed from it,\n",
"# is reproducible across runs.\n",
"train,test = final_data.randomSplit([0.7,0.3], seed=42)\n",
"models = LogisticRegression(labelCol=\"Outcome\")\n",
"model = models.fit(train)"
],
"execution_count": 92,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "OvGCMh0y4xsF",
"colab_type": "code",
"colab": {}
},
"source": [
"summary = model.summary"
],
"execution_count": 93,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "DtT8pL7743_S",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 190
},
"outputId": "05127843-0367-45a3-87ac-70332aa0ba2c"
},
"source": [
"summary.predictions.describe().show()"
],
"execution_count": 104,
"outputs": [
{
"output_type": "stream",
"text": [
"+-------+------------------+-------------------+\n",
"|summary| Outcome| prediction|\n",
"+-------+------------------+-------------------+\n",
"| count| 1384| 1384|\n",
"| mean| 0.342485549132948|0.26372832369942195|\n",
"| stddev|0.4747125702730636| 0.4408129952509889|\n",
"| min| 0.0| 0.0|\n",
"| max| 1.0| 1.0|\n",
"+-------+------------------+-------------------+\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "gYpreMQP52z9",
"colab_type": "text"
},
"source": [
"Evaluation and Test"
]
},
{
"cell_type": "code",
"metadata": {
"id": "lTDJe6rU6CWy",
"colab_type": "code",
"colab": {}
},
"source": [
"from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
"predictions = model.evaluate(test)"
],
"execution_count": 101,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "7ZJw8Y7J6NS6",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 661
},
"outputId": "d675ec35-120e-4db2-b060-c3dabd95493a"
},
"source": [
"predictions.predictions.show(30)"
],
"execution_count": 108,
"outputs": [
{
"output_type": "stream",
"text": [
"+--------------------+-------+--------------------+--------------------+----------+\n",
"| features|Outcome| rawPrediction| probability|prediction|\n",
"+--------------------+-------+--------------------+--------------------+----------+\n",
"|[0.0,78.0,88.0,29...| 0|[2.75976400212540...|[0.94046242120211...| 0.0|\n",
"|[0.0,86.0,68.0,32...| 0|[2.63434783996648...|[0.93303969972330...| 0.0|\n",
"|[0.0,91.0,80.0,20...| 0|[2.36442091330221...|[0.91407367350119...| 0.0|\n",
"|[0.0,93.0,60.0,20...| 0|[2.23771089675291...|[0.90358521812295...| 0.0|\n",
"|[0.0,93.0,60.0,25...| 0|[2.71605230252559...|[0.93796723488538...| 0.0|\n",
"|[0.0,93.0,60.0,25...| 0|[2.71605230252559...|[0.93796723488538...| 0.0|\n",
"|[0.0,93.0,100.0,3...| 0|[1.07067297656603...|[0.74472487629355...| 0.0|\n",
"|[0.0,93.0,100.0,3...| 0|[1.07067297656603...|[0.74472487629355...| 0.0|\n",
"|[0.0,94.0,69.0,20...| 0|[2.57159767261367...|[0.92901113378793...| 0.0|\n",
"|[0.0,94.0,70.0,27...| 0|[1.57511304007697...|[0.82851129139262...| 0.0|\n",
"|[0.0,95.0,80.0,45...| 0|[2.36264571397709...|[0.91393414150497...| 0.0|\n",
"|[0.0,95.0,85.0,25...| 1|[2.16883059345644...|[0.89741535980435...| 0.0|\n",
"|[0.0,97.0,64.0,36...| 0|[1.85038788785940...|[0.86417263903592...| 0.0|\n",
"|[0.0,100.0,70.0,2...| 0|[2.26519831618682...|[0.90595347246032...| 0.0|\n",
"|[0.0,100.0,88.0,6...| 0|[0.78543673487209...|[0.68685067022649...| 0.0|\n",
"|[0.0,101.0,64.0,1...| 0|[3.35418392107285...|[0.96624157683580...| 0.0|\n",
"|[0.0,101.0,65.0,2...| 0|[3.13408229218634...|[0.95827691793505...| 0.0|\n",
"|[0.0,101.0,76.0,2...| 0|[2.05507007352773...|[0.88645891983777...| 0.0|\n",
"|[0.0,102.0,64.0,4...| 0|[1.55630594646828...|[0.82582264114614...| 0.0|\n",
"|[0.0,102.0,75.0,2...| 0|[2.12554447217875...|[0.89336128734001...| 0.0|\n",
"|[0.0,102.0,78.0,4...| 0|[2.34879062018792...|[0.91283805159587...| 0.0|\n",
"|[0.0,104.0,64.0,3...| 1|[2.01051283795057...|[0.88189644734275...| 0.0|\n",
"|[0.0,104.0,64.0,3...| 1|[2.01051283795057...|[0.88189644734275...| 0.0|\n",
"|[0.0,104.0,76.0,2...| 0|[3.20400251717622...|[0.96098462149452...| 0.0|\n",
"|[0.0,105.0,84.0,2...| 1|[1.65899947069871...|[0.84010364817703...| 0.0|\n",
"|[0.0,105.0,84.0,2...| 1|[1.65899947069871...|[0.84010364817703...| 0.0|\n",
"|[0.0,106.0,70.0,3...| 0|[1.40726577385673...|[0.80333432617655...| 0.0|\n",
"|[0.0,106.0,70.0,3...| 0|[1.40726577385673...|[0.80333432617655...| 0.0|\n",
"|[0.0,107.0,60.0,2...| 0|[2.76995046775626...|[0.94103023793567...| 0.0|\n",
"|[0.0,108.0,68.0,2...| 0|[1.91862488493013...|[0.87198501200635...| 0.0|\n",
"+--------------------+-------+--------------------+--------------------+----------+\n",
"only showing top 30 rows\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ffQPi_sZ6Stk",
"colab_type": "code",
"colab": {}
},
"source": [
"evaluator = BinaryClassificationEvaluator(rawPredictionCol=\"rawPrediction\",labelCol=\"Outcome\")"
],
"execution_count": 109,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ul-sONZP7auv",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "2828f27d-0e62-4380-ebf3-0223f6824a34"
},
"source": [
"evaluator.evaluate(model.transform(test))"
],
"execution_count": 110,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.831280788177339"
]
},
"metadata": {
"tags": []
},
"execution_count": 110
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "i527rhzz7j5x",
"colab_type": "code",
"colab": {}
},
"source": [
"# overwrite() lets the cell be re-run; a plain save() raises when the path exists.\n",
"model.write().overwrite().save(\"diabetes-model\")"
],
"execution_count": 111,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "JNInrkbp8XWG",
"colab_type": "text"
},
"source": [
"Reload the saved model to use it again\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "oQviGdDo7vix",
"colab_type": "code",
"colab": {}
},
"source": [
"from pyspark.ml.classification import LogisticRegressionModel\n",
"model = LogisticRegressionModel.load(\"diabetes-model\")"
],
"execution_count": null,
"outputs": []
}
]
}