diabetes_pred.ipynb

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "diabetes_pred.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "X3O_nO8eZqRq",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!apt-get install openjdk-8-jdk-headless -qq > /dev/null"
      ],
      "execution_count": 9,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "pmDnPTPDZ1fo",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 72
        },
        "outputId": "418b16f4-f2f2-4aa8-da92-ab2ff11795ef"
      },
      "source": [
        "!pip install pyspark==2.4.4\n"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Requirement already satisfied: pyspark==2.4.4 in /usr/local/lib/python3.6/dist-packages (2.4.4)\n",
            "Requirement already satisfied: py4j==0.10.7 in /usr/local/lib/python3.6/dist-packages (from pyspark==2.4.4) (0.10.7)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Mp6h5o9iaSq6",
        "colab_type": "text"
      },
      "source": [
        "Environment Path\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nKVK-xcQaU1T",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import os\n",
        "os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'"
      ],
      "execution_count": 14,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9yK6BQfjakoD",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from pyspark.sql import SparkSession\n",
        "from pyspark.sql.functions import *\n",
        "\n",
        "spark = SparkSession.builder.appName(\"spark\").getOrCreate()"
      ],
      "execution_count": 49,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yIjAGHsAcAav",
        "colab_type": "text"
      },
      "source": [
        "Clone Diabetes Dataset"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "p7yXof5HcNs9",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 54
        },
        "outputId": "2a69b808-579d-4a17-c24d-1cc62daf71b6"
      },
      "source": [
        "! git clone \"https://github.com/education454/diabetes_dataset\""
      ],
      "execution_count": 37,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "fatal: destination path 'diabetes_dataset' already exists and is not an empty directory.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "naltoltxcVRe",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "51bed9bc-232c-49a6-f3fa-1c1edef165d6"
      },
      "source": [
        "!ls"
      ],
      "execution_count": 38,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "diabetes_dataset  sample_data\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8ihyUeqJcZNB",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "df = spark.read.csv(\"/content/diabetes_dataset/diabetes.csv\",header = True, inferSchema = True)"
      ],
      "execution_count": 39,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MyOFjGJxdJVq",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 488
        },
        "outputId": "886cdd2d-5fab-4277-b2b2-9ee140728d26"
      },
      "source": [
        "df.show()"
      ],
      "execution_count": 40,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+\n",
            "|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|\n",
            "+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+\n",
            "|          2|    138|           62|           35|      0|33.6|                   0.127| 47|      1|\n",
            "|          0|     84|           82|           31|    125|38.2|                   0.233| 23|      0|\n",
            "|          0|    145|            0|            0|      0|44.2|                    0.63| 31|      1|\n",
            "|          0|    135|           68|           42|    250|42.3|                   0.365| 24|      1|\n",
            "|          1|    139|           62|           41|    480|40.7|                   0.536| 21|      0|\n",
            "|          0|    173|           78|           32|    265|46.5|                   1.159| 58|      0|\n",
            "|          4|     99|           72|           17|      0|25.6|                   0.294| 28|      0|\n",
            "|          8|    194|           80|            0|      0|26.1|                   0.551| 67|      0|\n",
            "|          2|     83|           65|           28|     66|36.8|                   0.629| 24|      0|\n",
            "|          2|     89|           90|           30|      0|33.5|                   0.292| 42|      0|\n",
            "|          4|     99|           68|           38|      0|32.8|                   0.145| 33|      0|\n",
            "|          4|    125|           70|           18|    122|28.9|                   1.144| 45|      1|\n",
            "|          3|     80|            0|            0|      0| 0.0|                   0.174| 22|      0|\n",
            "|          6|    166|           74|            0|      0|26.6|                   0.304| 66|      0|\n",
            "|          5|    110|           68|            0|      0|26.0|                   0.292| 30|      0|\n",
            "|          2|     81|           72|           15|     76|30.1|                   0.547| 25|      0|\n",
            "|          7|    195|           70|           33|    145|25.1|                   0.163| 55|      1|\n",
            "|          6|    154|           74|           32|    193|29.3|                   0.839| 39|      0|\n",
            "|          2|    117|           90|           19|     71|25.2|                   0.313| 21|      0|\n",
            "|          3|     84|           72|           32|      0|37.2|                   0.267| 28|      0|\n",
            "+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+\n",
            "only showing top 20 rows\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jUR6XpQqdMWD",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 208
        },
        "outputId": "fbc8eb62-c131-4c89-861e-38662304da8c"
      },
      "source": [
        "df.printSchema()"
      ],
      "execution_count": 41,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "root\n",
            " |-- Pregnancies: integer (nullable = true)\n",
            " |-- Glucose: integer (nullable = true)\n",
            " |-- BloodPressure: integer (nullable = true)\n",
            " |-- SkinThickness: integer (nullable = true)\n",
            " |-- Insulin: integer (nullable = true)\n",
            " |-- BMI: double (nullable = true)\n",
            " |-- DiabetesPedigreeFunction: double (nullable = true)\n",
            " |-- Age: integer (nullable = true)\n",
            " |-- Outcome: integer (nullable = true)\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nv9wfSr5dYcq",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "2c67a49b-43ed-4d27-d986-74ce5be50063"
      },
      "source": [
        "print(df.count(), len(df.columns))"
      ],
      "execution_count": 42,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "2000 9\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ypmXeML5djzX",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 138
        },
        "outputId": "9f561434-591c-4596-f637-38a3264be220"
      },
      "source": [
        "df.groupby(\"Outcome\").count().show()"
      ],
      "execution_count": 43,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "+-------+-----+\n",
            "|Outcome|count|\n",
            "+-------+-----+\n",
            "|      1|  684|\n",
            "|      0| 1316|\n",
            "+-------+-----+\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "p2iEPQx0d9Gd",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 210
        },
        "outputId": "3adee5b2-10d0-40e3-b584-807b704c27e0"
      },
      "source": [
        "df.describe().show()"
      ],
      "execution_count": 44,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+\n",
            "|summary|      Pregnancies|           Glucose|     BloodPressure|    SkinThickness|          Insulin|               BMI|DiabetesPedigreeFunction|               Age|           Outcome|\n",
            "+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+\n",
            "|  count|             2000|              2000|              2000|             2000|             2000|              2000|                    2000|              2000|              2000|\n",
            "|   mean|           3.7035|          121.1825|           69.1455|           20.935|           80.254|32.192999999999984|     0.47092999999999974|           33.0905|             0.342|\n",
            "| stddev|3.306063032730656|32.068635649902916|19.188314815604098|16.10324290992682|111.1805335457595| 8.149900701279762|      0.3235525586811429|11.786423106049496|0.4744982342297426|\n",
            "|    min|                0|                 0|                 0|                0|                0|               0.0|                   0.078|                21|                 0|\n",
            "|    max|               17|               199|               122|              110|              744|              80.6|                    2.42|                81|                 1|\n",
            "+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "m8w3T4VJehV4",
        "colab_type": "text"
      },
      "source": [
        "Cleaning Data(EDA)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "AYVPyWtdelI1",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 173
        },
        "outputId": "7f5ca343-0293-4bda-d115-396e8e0d66d6"
      },
      "source": [
        "#finding null values\n",
        "for col in df.columns:\n",
        "  print(col + \":\",df[df[col].isNull()].count())"
      ],
      "execution_count": 45,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Pregnancies: 0\n",
            "Glucose: 0\n",
            "BloodPressure: 0\n",
            "SkinThickness: 0\n",
            "Insulin: 0\n",
            "BMI: 0\n",
            "DiabetesPedigreeFunction: 0\n",
            "Age: 0\n",
            "Outcome: 0\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "r1p5W2RdendZ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def count_zeros():\n",
        "  columns_list = [\"Glucose\", \"BloodPressure\", \"SkinThickness\", \"Insulin\", \"BMI\"]\n",
        "  for i in columns_list:\n",
        "    print(i+\":\",df[df[i]==0].count())"
      ],
      "execution_count": 46,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ZHlZ1DyPkd0I",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 104
        },
        "outputId": "7e3051a7-4fa2-4707-e21e-2dc3a1898422"
      },
      "source": [
        "count_zeros()"
      ],
      "execution_count": 47,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Glucose: 13\n",
            "BloodPressure: 90\n",
            "SkinThickness: 573\n",
            "Insulin: 956\n",
            "BMI: 28\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "rrzOraMdkf0Q",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 104
        },
        "outputId": "3fbc6de4-76f8-400e-9707-b5a8c8accebd"
      },
      "source": [
        "for i in df.columns[1:6]:\n",
        "  data = df.agg({i:'mean'}).first()[0]\n",
        "  print(\"Mean value for {} is {}\".format(i,int(data)))\n",
        "  df = df.withColumn(i,when(df[i]==0,int(data)).otherwise(df[i]))"
      ],
      "execution_count": 82,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Mean value for Glucose is 121\n",
            "Mean value for BloodPressure is 72\n",
            "Mean value for SkinThickness is 26\n",
            "Mean value for Insulin is 118\n",
            "Mean value for BMI is 32\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "AGDhT_tonlma",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 54
        },
        "outputId": "4afa2a8f-c4b4-48cb-8321-d94f2309ed02"
      },
      "source": [
        "df.show"
      ],
      "execution_count": 65,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<bound method DataFrame.show of DataFrame[Pregnancies: int, Glucose: int, BloodPressure: int, SkinThickness: int, Insulin: int, BMI: double, DiabetesPedigreeFunction: double, Age: int, Outcome: int]>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 65
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DA0IL4wipG8V",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 173
        },
        "outputId": "f1ad5686-54a5-4f7c-c73a-aff96d6e6bc4"
      },
      "source": [
        "for col in df.columns:\n",
        "  print(\"correlation to outcome for {} is {}\".format(col,df.stat.corr(\"Outcome\",col)))\n"
      ],
      "execution_count": 56,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "correlation to outcome for Pregnancies is 0.22443699263363961\n",
            "correlation to outcome for Glucose is 0.48796646527321064\n",
            "correlation to outcome for BloodPressure is 0.17171333286446713\n",
            "correlation to outcome for SkinThickness is 0.1659010662889893\n",
            "correlation to outcome for Insulin is 0.1711763270226193\n",
            "correlation to outcome for BMI is 0.2827927569760082\n",
            "correlation to outcome for DiabetesPedigreeFunction is 0.1554590791569403\n",
            "correlation to outcome for Age is 0.23650924717620253\n",
            "correlation to outcome for Outcome is 1.0\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "UC-cGFxMzAWW",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from pyspark.ml.feature import VectorAssembler\n",
        "assembler = VectorAssembler(inputCols = [\"Pregnancies\",\"Glucose\", \"BloodPressure\", \"SkinThickness\", \"Insulin\", \"BMI\",\"DiabetesPedigreeFunction\",\"Age\"], outputCol = \"Features\")\n",
        "output_data = assembler.transform(df)"
      ],
      "execution_count": 84,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Wjvi58ck2T1e",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 225
        },
        "outputId": "2764a872-b4a0-4a91-bd8d-fbbdd73e37b1"
      },
      "source": [
        "output_data.printSchema()"
      ],
      "execution_count": 87,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "root\n",
            " |-- Pregnancies: integer (nullable = true)\n",
            " |-- Glucose: integer (nullable = true)\n",
            " |-- BloodPressure: integer (nullable = true)\n",
            " |-- SkinThickness: integer (nullable = true)\n",
            " |-- Insulin: integer (nullable = true)\n",
            " |-- BMI: double (nullable = true)\n",
            " |-- DiabetesPedigreeFunction: double (nullable = true)\n",
            " |-- Age: integer (nullable = true)\n",
            " |-- Outcome: integer (nullable = true)\n",
            " |-- Features: vector (nullable = true)\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ixrVfC6K25Ko",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 488
        },
        "outputId": "32b041be-951c-42b3-a537-66ae28255d2a"
      },
      "source": [
        "output_data.show()"
      ],
      "execution_count": 88,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+\n",
            "|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            Features|\n",
            "+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+\n",
            "|          2|    138|           62|           35|     80|33.6|                   0.127| 47|      1|[2.0,138.0,62.0,3...|\n",
            "|          0|     84|           82|           31|    125|38.2|                   0.233| 23|      0|[0.0,84.0,82.0,31...|\n",
            "|          0|    145|           69|           20|     80|44.2|                    0.63| 31|      1|[0.0,145.0,69.0,2...|\n",
            "|          0|    135|           68|           42|    250|42.3|                   0.365| 24|      1|[0.0,135.0,68.0,4...|\n",
            "|          1|    139|           62|           41|    480|40.7|                   0.536| 21|      0|[1.0,139.0,62.0,4...|\n",
            "|          0|    173|           78|           32|    265|46.5|                   1.159| 58|      0|[0.0,173.0,78.0,3...|\n",
            "|          4|     99|           72|           17|     80|25.6|                   0.294| 28|      0|[4.0,99.0,72.0,17...|\n",
            "|          8|    194|           80|           20|     80|26.1|                   0.551| 67|      0|[8.0,194.0,80.0,2...|\n",
            "|          2|     83|           65|           28|     66|36.8|                   0.629| 24|      0|[2.0,83.0,65.0,28...|\n",
            "|          2|     89|           90|           30|     80|33.5|                   0.292| 42|      0|[2.0,89.0,90.0,30...|\n",
            "|          4|     99|           68|           38|     80|32.8|                   0.145| 33|      0|[4.0,99.0,68.0,38...|\n",
            "|          4|    125|           70|           18|    122|28.9|                   1.144| 45|      1|[4.0,125.0,70.0,1...|\n",
            "|          3|     80|           69|           20|     80|32.0|                   0.174| 22|      0|[3.0,80.0,69.0,20...|\n",
            "|          6|    166|           74|           20|     80|26.6|                   0.304| 66|      0|[6.0,166.0,74.0,2...|\n",
            "|          5|    110|           68|           20|     80|26.0|                   0.292| 30|      0|[5.0,110.0,68.0,2...|\n",
            "|          2|     81|           72|           15|     76|30.1|                   0.547| 25|      0|[2.0,81.0,72.0,15...|\n",
            "|          7|    195|           70|           33|    145|25.1|                   0.163| 55|      1|[7.0,195.0,70.0,3...|\n",
            "|          6|    154|           74|           32|    193|29.3|                   0.839| 39|      0|[6.0,154.0,74.0,3...|\n",
            "|          2|    117|           90|           19|     71|25.2|                   0.313| 21|      0|[2.0,117.0,90.0,1...|\n",
            "|          3|     84|           72|           32|     80|37.2|                   0.267| 28|      0|[3.0,84.0,72.0,32...|\n",
            "+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+\n",
            "only showing top 20 rows\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "mGwX6dAb3DJL",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from pyspark.ml.classification import LogisticRegression\n",
        "final_data = output_data.select(\"features\",\"Outcome\")"
      ],
      "execution_count": 90,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9whLv9iX3Gj-",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 86
        },
        "outputId": "90bd4030-c500-43ba-f2f5-c02c52f6da56"
      },
      "source": [
        "final_data.printSchema()"
      ],
      "execution_count": 91,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "root\n",
            " |-- features: vector (nullable = true)\n",
            " |-- Outcome: integer (nullable = true)\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "52hXBl1l4ZxU",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "train,test = final_data.randomSplit([0.7,0.3])\n",
        "models = LogisticRegression(labelCol=\"Outcome\")\n",
        "model = models.fit(train)"
      ],
      "execution_count": 92,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "OvGCMh0y4xsF",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "summary = model.summary"
      ],
      "execution_count": 93,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DtT8pL7743_S",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 190
        },
        "outputId": "05127843-0367-45a3-87ac-70332aa0ba2c"
      },
      "source": [
        "summary.predictions.describe().show()"
      ],
      "execution_count": 104,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "+-------+------------------+-------------------+\n",
            "|summary|           Outcome|         prediction|\n",
            "+-------+------------------+-------------------+\n",
            "|  count|              1384|               1384|\n",
            "|   mean| 0.342485549132948|0.26372832369942195|\n",
            "| stddev|0.4747125702730636| 0.4408129952509889|\n",
            "|    min|               0.0|                0.0|\n",
            "|    max|               1.0|                1.0|\n",
            "+-------+------------------+-------------------+\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gYpreMQP52z9",
        "colab_type": "text"
      },
      "source": [
        "Evaluation and Test"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "lTDJe6rU6CWy",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
        "predictions = model.evaluate(test)"
      ],
      "execution_count": 101,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7ZJw8Y7J6NS6",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 661
        },
        "outputId": "d675ec35-120e-4db2-b060-c3dabd95493a"
      },
      "source": [
        "predictions.predictions.show(30)"
      ],
      "execution_count": 108,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "+--------------------+-------+--------------------+--------------------+----------+\n",
            "|            features|Outcome|       rawPrediction|         probability|prediction|\n",
            "+--------------------+-------+--------------------+--------------------+----------+\n",
            "|[0.0,78.0,88.0,29...|      0|[2.75976400212540...|[0.94046242120211...|       0.0|\n",
            "|[0.0,86.0,68.0,32...|      0|[2.63434783996648...|[0.93303969972330...|       0.0|\n",
            "|[0.0,91.0,80.0,20...|      0|[2.36442091330221...|[0.91407367350119...|       0.0|\n",
            "|[0.0,93.0,60.0,20...|      0|[2.23771089675291...|[0.90358521812295...|       0.0|\n",
            "|[0.0,93.0,60.0,25...|      0|[2.71605230252559...|[0.93796723488538...|       0.0|\n",
            "|[0.0,93.0,60.0,25...|      0|[2.71605230252559...|[0.93796723488538...|       0.0|\n",
            "|[0.0,93.0,100.0,3...|      0|[1.07067297656603...|[0.74472487629355...|       0.0|\n",
            "|[0.0,93.0,100.0,3...|      0|[1.07067297656603...|[0.74472487629355...|       0.0|\n",
            "|[0.0,94.0,69.0,20...|      0|[2.57159767261367...|[0.92901113378793...|       0.0|\n",
            "|[0.0,94.0,70.0,27...|      0|[1.57511304007697...|[0.82851129139262...|       0.0|\n",
            "|[0.0,95.0,80.0,45...|      0|[2.36264571397709...|[0.91393414150497...|       0.0|\n",
            "|[0.0,95.0,85.0,25...|      1|[2.16883059345644...|[0.89741535980435...|       0.0|\n",
            "|[0.0,97.0,64.0,36...|      0|[1.85038788785940...|[0.86417263903592...|       0.0|\n",
            "|[0.0,100.0,70.0,2...|      0|[2.26519831618682...|[0.90595347246032...|       0.0|\n",
            "|[0.0,100.0,88.0,6...|      0|[0.78543673487209...|[0.68685067022649...|       0.0|\n",
            "|[0.0,101.0,64.0,1...|      0|[3.35418392107285...|[0.96624157683580...|       0.0|\n",
            "|[0.0,101.0,65.0,2...|      0|[3.13408229218634...|[0.95827691793505...|       0.0|\n",
            "|[0.0,101.0,76.0,2...|      0|[2.05507007352773...|[0.88645891983777...|       0.0|\n",
            "|[0.0,102.0,64.0,4...|      0|[1.55630594646828...|[0.82582264114614...|       0.0|\n",
            "|[0.0,102.0,75.0,2...|      0|[2.12554447217875...|[0.89336128734001...|       0.0|\n",
            "|[0.0,102.0,78.0,4...|      0|[2.34879062018792...|[0.91283805159587...|       0.0|\n",
            "|[0.0,104.0,64.0,3...|      1|[2.01051283795057...|[0.88189644734275...|       0.0|\n",
            "|[0.0,104.0,64.0,3...|      1|[2.01051283795057...|[0.88189644734275...|       0.0|\n",
            "|[0.0,104.0,76.0,2...|      0|[3.20400251717622...|[0.96098462149452...|       0.0|\n",
            "|[0.0,105.0,84.0,2...|      1|[1.65899947069871...|[0.84010364817703...|       0.0|\n",
            "|[0.0,105.0,84.0,2...|      1|[1.65899947069871...|[0.84010364817703...|       0.0|\n",
            "|[0.0,106.0,70.0,3...|      0|[1.40726577385673...|[0.80333432617655...|       0.0|\n",
            "|[0.0,106.0,70.0,3...|      0|[1.40726577385673...|[0.80333432617655...|       0.0|\n",
            "|[0.0,107.0,60.0,2...|      0|[2.76995046775626...|[0.94103023793567...|       0.0|\n",
            "|[0.0,108.0,68.0,2...|      0|[1.91862488493013...|[0.87198501200635...|       0.0|\n",
            "+--------------------+-------+--------------------+--------------------+----------+\n",
            "only showing top 30 rows\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ffQPi_sZ6Stk",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "evaluator =  BinaryClassificationEvaluator(rawPredictionCol=\"rawPrediction\",labelCol=\"Outcome\")"
      ],
      "execution_count": 109,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ul-sONZP7auv",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "2828f27d-0e62-4380-ebf3-0223f6824a34"
      },
      "source": [
        "evaluator.evaluate(model.transform(test))"
      ],
      "execution_count": 110,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.831280788177339"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 110
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "i527rhzz7j5x",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "model.save(\"diabetes-model\")"
      ],
      "execution_count": 111,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JNInrkbp8XWG",
        "colab_type": "text"
      },
      "source": [
        "to use model again\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oQviGdDo7vix",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from pyspark.ml.classification import LogisticRegressionModel\n",
        "model =  LogisticRegressionModel.load(\"diabetes-model\")"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}