{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "81fcdbff",
"metadata": {},
"outputs": [],
"source": [
"#importing spark to jupyter notebook\n",
"import findspark\n",
"findspark.init()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "fa028e34",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.ml import Pipeline, Estimator, Transformer, Model, PipelineModel\n",
"import pyspark.ml.feature as MFT\n",
"import pyspark.ml.functions as MF\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.sql.functions import split, count, when, isnan, col, regexp_replace"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "6541040d",
"metadata": {},
"outputs": [],
"source": [
"# Build the Spark configuration, then create (or reuse) the SparkSession.\n",
"from pyspark.conf import SparkConf\n",
"\n",
"settings_h = SparkConf().setAppName(\"Cancer APP\").set(\"spark.executor.memory\", \"4g\")\n",
"\n",
"# `spark` is the conventional name for a SparkSession.\n",
"spark = SparkSession.builder.config(conf=settings_h).getOrCreate()\n",
"\n",
"# `sc` is kept as an alias of the same session for cells that use that name.\n",
"# Note that it is a SparkSession, not a SparkContext.\n",
"sc = spark"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "01d46235",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <p><b>SparkSession - in-memory</b></p>\n",
" \n",
" <div>\n",
" <p><b>SparkContext</b></p>\n",
"\n",
" <p><a href=\"http://DESKTOP-QMD8AF3:4040\">Spark UI</a></p>\n",
"\n",
" <dl>\n",
" <dt>Version</dt>\n",
" <dd><code>v3.2.4</code></dd>\n",
" <dt>Master</dt>\n",
" <dd><code>local[*]</code></dd>\n",
" <dt>AppName</dt>\n",
" <dd><code>Cancer APP</code></dd>\n",
" </dl>\n",
" </div>\n",
" \n",
" </div>\n",
" "
],
"text/plain": [
"<pyspark.sql.session.SparkSession at 0x1d2480b0c70>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sc"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "80bef2e1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- id: string (nullable = true)\n",
" |-- diagnosis: string (nullable = true)\n",
" |-- radius_mean: string (nullable = true)\n",
" |-- texture_mean: string (nullable = true)\n",
" |-- perimeter_mean: string (nullable = true)\n",
" |-- area_mean: string (nullable = true)\n",
" |-- smoothness_mean: string (nullable = true)\n",
" |-- compactness_mean: string (nullable = true)\n",
" |-- concavity_mean: string (nullable = true)\n",
" |-- concave_points_mean: string (nullable = true)\n",
" |-- symmetry_mean: string (nullable = true)\n",
" |-- fractal_dimension_mean: string (nullable = true)\n",
" |-- radius_se: string (nullable = true)\n",
" |-- texture_se: string (nullable = true)\n",
" |-- perimeter_se: string (nullable = true)\n",
" |-- area_se: string (nullable = true)\n",
" |-- smoothness_se: string (nullable = true)\n",
" |-- compactness_se: string (nullable = true)\n",
" |-- concavity_se: string (nullable = true)\n",
" |-- concave_points_se: string (nullable = true)\n",
" |-- symmetry_se: string (nullable = true)\n",
" |-- fractal_dimension_se: string (nullable = true)\n",
" |-- radius_worst: string (nullable = true)\n",
" |-- texture_worst: string (nullable = true)\n",
" |-- perimeter_worst: string (nullable = true)\n",
" |-- area_worst: string (nullable = true)\n",
" |-- smoothness_worst: string (nullable = true)\n",
" |-- compactness_worst: string (nullable = true)\n",
" |-- concavity_worst: string (nullable = true)\n",
" |-- concave_points_worst: string (nullable = true)\n",
" |-- symmetry_worst: string (nullable = true)\n",
" |-- fractal_dimension_worst: string (nullable = true)\n",
"\n"
]
}
],
"source": [
"# `sc` is the SparkSession created earlier in this notebook.\n",
"# inferSchema=True makes Spark parse numeric columns as numbers instead of\n",
"# strings (at the cost of one extra pass over the file), so that describe(),\n",
"# min and max compare values numerically rather than lexicographically.\n",
"df = (\n",
"    sc.read\n",
"    .option(\"header\", True)\n",
"    .option(\"inferSchema\", True)\n",
"    .csv(\"cancer_data.csv\")\n",
")\n",
"df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "0c9ec57e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+\n",
"| id|diagnosis|radius_mean|texture_mean|perimeter_mean|area_mean|smoothness_mean|compactness_mean|concavity_mean|concave_points_mean|symmetry_mean|fractal_dimension_mean|radius_se|texture_se|perimeter_se|area_se|smoothness_se|compactness_se|concavity_se|concave_points_se|symmetry_se|fractal_dimension_se|radius_worst|texture_worst|perimeter_worst|area_worst|smoothness_worst|compactness_worst|concavity_worst|concave_points_worst|symmetry_worst|fractal_dimension_worst|\n",
"+--------+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+\n",
"| 842302| M| 17.99| 10.38| 122.8| 1001| 0.1184| 0.2776| 0.3001| 0.1471| 0.2419| 0.07871| 1.095| 0.9053| 8.589| 153.4| 0.006399| 0.04904| 0.05373| 0.01587| 0.03003| 0.006193| 25.38| 17.33| 184.6| 2019| 0.1622| 0.6656| 0.7119| 0.2654| 0.4601| 0.1189|\n",
"| 842517| M| 20.57| 17.77| 132.9| 1326| 0.08474| 0.07864| 0.0869| 0.07017| 0.1812| 0.05667| 0.5435| 0.7339| 3.398| 74.08| 0.005225| 0.01308| 0.0186| 0.0134| 0.01389| 0.003532| 24.99| 23.41| 158.8| 1956| 0.1238| 0.1866| 0.2416| 0.186| 0.275| 0.08902|\n",
"|84300903| M| 19.69| 21.25| 130| 1203| 0.1096| 0.1599| 0.1974| 0.1279| 0.2069| 0.05999| 0.7456| 0.7869| 4.585| 94.03| 0.00615| 0.04006| 0.03832| 0.02058| 0.0225| 0.004571| 23.57| 25.53| 152.5| 1709| 0.1444| 0.4245| 0.4504| 0.243| 0.3613| 0.08758|\n",
"|84348301| M| 11.42| 20.38| 77.58| 386.1| 0.1425| 0.2839| 0.2414| 0.1052| 0.2597| 0.09744| 0.4956| 1.156| 3.445| 27.23| 0.00911| 0.07458| 0.05661| 0.01867| 0.05963| 0.009208| 14.91| 26.5| 98.87| 567.7| 0.2098| 0.8663| 0.6869| 0.2575| 0.6638| 0.173|\n",
"|84358402| M| 20.29| 14.34| 135.1| 1297| 0.1003| 0.1328| 0.198| 0.1043| 0.1809| 0.05883| 0.7572| 0.7813| 5.438| 94.44| 0.01149| 0.02461| 0.05688| 0.01885| 0.01756| 0.005115| 22.54| 16.67| 152.2| 1575| 0.1374| 0.205| 0.4| 0.1625| 0.2364| 0.07678|\n",
"| 843786| M| 12.45| 15.7| 82.57| 477.1| 0.1278| 0.17| 0.1578| 0.08089| 0.2087| 0.07613| 0.3345| 0.8902| 2.217| 27.19| 0.00751| 0.03345| 0.03672| 0.01137| 0.02165| 0.005082| 15.47| 23.75| 103.4| 741.6| 0.1791| 0.5249| 0.5355| 0.1741| 0.3985| 0.1244|\n",
"| 844359| M| 18.25| 19.98| 119.6| 1040| 0.09463| 0.109| 0.1127| 0.074| 0.1794| 0.05742| 0.4467| 0.7732| 3.18| 53.91| 0.004314| 0.01382| 0.02254| 0.01039| 0.01369| 0.002179| 22.88| 27.66| 153.2| 1606| 0.1442| 0.2576| 0.3784| 0.1932| 0.3063| 0.08368|\n",
"|84458202| M| 13.71| 20.83| 90.2| 577.9| 0.1189| 0.1645| 0.09366| 0.05985| 0.2196| 0.07451| 0.5835| 1.377| 3.856| 50.96| 0.008805| 0.03029| 0.02488| 0.01448| 0.01486| 0.005412| 17.06| 28.14| 110.6| 897| 0.1654| 0.3682| 0.2678| 0.1556| 0.3196| 0.1151|\n",
"| 844981| M| 13| 21.82| 87.5| 519.8| 0.1273| 0.1932| 0.1859| 0.09353| 0.235| 0.07389| 0.3063| 1.002| 2.406| 24.32| 0.005731| 0.03502| 0.03553| 0.01226| 0.02143| 0.003749| 15.49| 30.73| 106.2| 739.3| 0.1703| 0.5401| 0.539| 0.206| 0.4378| 0.1072|\n",
"|84501001| M| 12.46| 24.04| 83.97| 475.9| 0.1186| 0.2396| 0.2273| 0.08543| 0.203| 0.08243| 0.2976| 1.599| 2.039| 23.94| 0.007149| 0.07217| 0.07743| 0.01432| 0.01789| 0.01008| 15.09| 40.68| 97.65| 711.4| 0.1853| 1.058| 1.105| 0.221| 0.4366| 0.2075|\n",
"| 845636| M| 16.02| 23.24| 102.7| 797.8| 0.08206| 0.06669| 0.03299| 0.03323| 0.1528| 0.05697| 0.3795| 1.187| 2.466| 40.51| 0.004029| 0.009269| 0.01101| 0.007591| 0.0146| 0.003042| 19.19| 33.88| 123.8| 1150| 0.1181| 0.1551| 0.1459| 0.09975| 0.2948| 0.08452|\n",
"|84610002| M| 15.78| 17.89| 103.6| 781| 0.0971| 0.1292| 0.09954| 0.06606| 0.1842| 0.06082| 0.5058| 0.9849| 3.564| 54.16| 0.005771| 0.04061| 0.02791| 0.01282| 0.02008| 0.004144| 20.42| 27.28| 136.5| 1299| 0.1396| 0.5609| 0.3965| 0.181| 0.3792| 0.1048|\n",
"| 846226| M| 19.17| 24.8| 132.4| 1123| 0.0974| 0.2458| 0.2065| 0.1118| 0.2397| 0.078| 0.9555| 3.568| 11.07| 116.2| 0.003139| 0.08297| 0.0889| 0.0409| 0.04484| 0.01284| 20.96| 29.94| 151.7| 1332| 0.1037| 0.3903| 0.3639| 0.1767| 0.3176| 0.1023|\n",
"| 846381| M| 15.85| 23.95| 103.7| 782.7| 0.08401| 0.1002| 0.09938| 0.05364| 0.1847| 0.05338| 0.4033| 1.078| 2.903| 36.58| 0.009769| 0.03126| 0.05051| 0.01992| 0.02981| 0.003002| 16.84| 27.66| 112| 876.5| 0.1131| 0.1924| 0.2322| 0.1119| 0.2809| 0.06287|\n",
"|84667401| M| 13.73| 22.61| 93.6| 578.3| 0.1131| 0.2293| 0.2128| 0.08025| 0.2069| 0.07682| 0.2121| 1.169| 2.061| 19.21| 0.006429| 0.05936| 0.05501| 0.01628| 0.01961| 0.008093| 15.03| 32.01| 108.8| 697.7| 0.1651| 0.7725| 0.6943| 0.2208| 0.3596| 0.1431|\n",
"|84799002| M| 14.54| 27.54| 96.73| 658.8| 0.1139| 0.1595| 0.1639| 0.07364| 0.2303| 0.07077| 0.37| 1.033| 2.879| 32.55| 0.005607| 0.0424| 0.04741| 0.0109| 0.01857| 0.005466| 17.46| 37.13| 124.1| 943.2| 0.1678| 0.6577| 0.7026| 0.1712| 0.4218| 0.1341|\n",
"| 848406| M| 14.68| 20.13| 94.74| 684.5| 0.09867| 0.072| 0.07395| 0.05259| 0.1586| 0.05922| 0.4727| 1.24| 3.195| 45.4| 0.005718| 0.01162| 0.01998| 0.01109| 0.0141| 0.002085| 19.07| 30.88| 123.4| 1138| 0.1464| 0.1871| 0.2914| 0.1609| 0.3029| 0.08216|\n",
"|84862001| M| 16.13| 20.68| 108.1| 798.8| 0.117| 0.2022| 0.1722| 0.1028| 0.2164| 0.07356| 0.5692| 1.073| 3.854| 54.18| 0.007026| 0.02501| 0.03188| 0.01297| 0.01689| 0.004142| 20.96| 31.48| 136.8| 1315| 0.1789| 0.4233| 0.4784| 0.2073| 0.3706| 0.1142|\n",
"| 849014| M| 19.81| 22.15| 130| 1260| 0.09831| 0.1027| 0.1479| 0.09498| 0.1582| 0.05395| 0.7582| 1.017| 5.865| 112.4| 0.006494| 0.01893| 0.03391| 0.01521| 0.01356| 0.001997| 27.32| 30.88| 186.8| 2398| 0.1512| 0.315| 0.5372| 0.2388| 0.2768| 0.07615|\n",
"| 8510426| B| 13.54| 14.36| 87.46| 566.3| 0.09779| 0.08129| 0.06664| 0.04781| 0.1885| 0.05766| 0.2699| 0.7886| 2.058| 23.56| 0.008462| 0.0146| 0.02387| 0.01315| 0.0198| 0.0023| 15.11| 19.26| 99.7| 711.2| 0.144| 0.1773| 0.239| 0.1288| 0.2977| 0.07259|\n",
"+--------+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df.show() "
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "6e3a6525",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataFrame[summary: string, texture_mean: string]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# describe() only builds a DataFrame; show() computes and prints the statistics.\n",
"df.describe(['texture_mean']).show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "bdab98ae",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------+-----+\n",
"|diagnosis|count|\n",
"+---------+-----+\n",
"| B| 357|\n",
"| M| 212|\n",
"+---------+-----+\n",
"\n"
]
}
],
"source": [
"# Rows per diagnosis label, counted after dropping exact duplicate rows.\n",
"(\n",
"    df.dropDuplicates()\n",
"    .groupBy('diagnosis')\n",
"    .count()\n",
"    .show()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a904e584",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataFrame[summary: string, radius_mean: string]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# describe() only builds a DataFrame; show() computes and prints the statistics.\n",
"df.describe(['radius_mean']).show()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "42d089bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+\n",
"| id|diagnosis|radius_mean|texture_mean|perimeter_mean|area_mean|smoothness_mean|compactness_mean|concavity_mean|concave_points_mean|symmetry_mean|fractal_dimension_mean|radius_se|texture_se|perimeter_se|area_se|smoothness_se|compactness_se|concavity_se|concave_points_se|symmetry_se|fractal_dimension_se|radius_worst|texture_worst|perimeter_worst|area_worst|smoothness_worst|compactness_worst|concavity_worst|concave_points_worst|symmetry_worst|fractal_dimension_worst|\n",
"+---+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+\n",
"| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0|\n",
"+---+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+\n",
"\n"
]
}
],
"source": [
"def check_missing(dataframe):\n",
"    \"\"\"Print, for every column, how many values are NaN or null.\n",
"\n",
"    One aggregate expression is built per column of ``dataframe``; the\n",
"    resulting single-row table is printed with ``show()``, whose return\n",
"    value (``None``) is handed back to the caller.\n",
"    \"\"\"\n",
"    missing_counts = []\n",
"    for name in dataframe.columns:\n",
"        is_missing = isnan(name) | col(name).isNull()\n",
"        # when() without otherwise() yields null for rows that do not match,\n",
"        # and count() ignores nulls, so only the missing rows are counted.\n",
"        missing_counts.append(count(when(is_missing, name)).alias(name))\n",
"    return dataframe.select(missing_counts).show()\n",
"\n",
"\n",
"check_missing(df)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "11272daa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['id',\n",
" 'diagnosis',\n",
" 'radius_mean',\n",
" 'texture_mean',\n",
" 'perimeter_mean',\n",
" 'area_mean',\n",
" 'smoothness_mean',\n",
" 'compactness_mean',\n",
" 'concavity_mean',\n",
" 'concave_points_mean',\n",
" 'symmetry_mean',\n",
" 'fractal_dimension_mean',\n",
" 'radius_se',\n",
" 'texture_se',\n",
" 'perimeter_se',\n",
" 'area_se',\n",
" 'smoothness_se',\n",
" 'compactness_se',\n",
" 'concavity_se',\n",
" 'concave_points_se',\n",
" 'symmetry_se',\n",
" 'fractal_dimension_se',\n",
" 'radius_worst',\n",
" 'texture_worst',\n",
" 'perimeter_worst',\n",
" 'area_worst',\n",
" 'smoothness_worst',\n",
" 'compactness_worst',\n",
" 'concavity_worst',\n",
" 'concave_points_worst',\n",
" 'symmetry_worst',\n",
" 'fractal_dimension_worst']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Check column names\n",
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "49e1caa1",
"metadata": {},
"outputs": [],
"source": [
"df_pandas = df.toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "8b8b8ce0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>diagnosis</th>\n",
" <th>radius_mean</th>\n",
" <th>texture_mean</th>\n",
" <th>perimeter_mean</th>\n",
" <th>area_mean</th>\n",
" <th>smoothness_mean</th>\n",
" <th>compactness_mean</th>\n",
" <th>concavity_mean</th>\n",
" <th>concave_points_mean</th>\n",
" <th>...</th>\n",
" <th>radius_worst</th>\n",
" <th>texture_worst</th>\n",
" <th>perimeter_worst</th>\n",
" <th>area_worst</th>\n",
" <th>smoothness_worst</th>\n",
" <th>compactness_worst</th>\n",
" <th>concavity_worst</th>\n",
" <th>concave_points_worst</th>\n",
" <th>symmetry_worst</th>\n",
" <th>fractal_dimension_worst</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>842302</td>\n",
" <td>M</td>\n",
" <td>17.99</td>\n",
" <td>10.38</td>\n",
" <td>122.8</td>\n",
" <td>1001</td>\n",
" <td>0.1184</td>\n",
" <td>0.2776</td>\n",
" <td>0.3001</td>\n",
" <td>0.1471</td>\n",
" <td>...</td>\n",
" <td>25.38</td>\n",
" <td>17.33</td>\n",
" <td>184.6</td>\n",
" <td>2019</td>\n",
" <td>0.1622</td>\n",
" <td>0.6656</td>\n",
" <td>0.7119</td>\n",
" <td>0.2654</td>\n",
" <td>0.4601</td>\n",
" <td>0.1189</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>842517</td>\n",
" <td>M</td>\n",
" <td>20.57</td>\n",
" <td>17.77</td>\n",
" <td>132.9</td>\n",
" <td>1326</td>\n",
" <td>0.08474</td>\n",
" <td>0.07864</td>\n",
" <td>0.0869</td>\n",
" <td>0.07017</td>\n",
" <td>...</td>\n",
" <td>24.99</td>\n",
" <td>23.41</td>\n",
" <td>158.8</td>\n",
" <td>1956</td>\n",
" <td>0.1238</td>\n",
" <td>0.1866</td>\n",
" <td>0.2416</td>\n",
" <td>0.186</td>\n",
" <td>0.275</td>\n",
" <td>0.08902</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>84300903</td>\n",
" <td>M</td>\n",
" <td>19.69</td>\n",
" <td>21.25</td>\n",
" <td>130</td>\n",
" <td>1203</td>\n",
" <td>0.1096</td>\n",
" <td>0.1599</td>\n",
" <td>0.1974</td>\n",
" <td>0.1279</td>\n",
" <td>...</td>\n",
" <td>23.57</td>\n",
" <td>25.53</td>\n",
" <td>152.5</td>\n",
" <td>1709</td>\n",
" <td>0.1444</td>\n",
" <td>0.4245</td>\n",
" <td>0.4504</td>\n",
" <td>0.243</td>\n",
" <td>0.3613</td>\n",
" <td>0.08758</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>84348301</td>\n",
" <td>M</td>\n",
" <td>11.42</td>\n",
" <td>20.38</td>\n",
" <td>77.58</td>\n",
" <td>386.1</td>\n",
" <td>0.1425</td>\n",
" <td>0.2839</td>\n",
" <td>0.2414</td>\n",
" <td>0.1052</td>\n",
" <td>...</td>\n",
" <td>14.91</td>\n",
" <td>26.5</td>\n",
" <td>98.87</td>\n",
" <td>567.7</td>\n",
" <td>0.2098</td>\n",
" <td>0.8663</td>\n",
" <td>0.6869</td>\n",
" <td>0.2575</td>\n",
" <td>0.6638</td>\n",
" <td>0.173</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>84358402</td>\n",
" <td>M</td>\n",
" <td>20.29</td>\n",
" <td>14.34</td>\n",
" <td>135.1</td>\n",
" <td>1297</td>\n",
" <td>0.1003</td>\n",
" <td>0.1328</td>\n",
" <td>0.198</td>\n",
" <td>0.1043</td>\n",
" <td>...</td>\n",
" <td>22.54</td>\n",
" <td>16.67</td>\n",
" <td>152.2</td>\n",
" <td>1575</td>\n",
" <td>0.1374</td>\n",
" <td>0.205</td>\n",
" <td>0.4</td>\n",
" <td>0.1625</td>\n",
" <td>0.2364</td>\n",
" <td>0.07678</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>564</th>\n",
" <td>926424</td>\n",
" <td>M</td>\n",
" <td>21.56</td>\n",
" <td>22.39</td>\n",
" <td>142</td>\n",
" <td>1479</td>\n",
" <td>0.111</td>\n",
" <td>0.1159</td>\n",
" <td>0.2439</td>\n",
" <td>0.1389</td>\n",
" <td>...</td>\n",
" <td>25.45</td>\n",
" <td>26.4</td>\n",
" <td>166.1</td>\n",
" <td>2027</td>\n",
" <td>0.141</td>\n",
" <td>0.2113</td>\n",
" <td>0.4107</td>\n",
" <td>0.2216</td>\n",
" <td>0.206</td>\n",
" <td>0.07115</td>\n",
" </tr>\n",
" <tr>\n",
" <th>565</th>\n",
" <td>926682</td>\n",
" <td>M</td>\n",
" <td>20.13</td>\n",
" <td>28.25</td>\n",
" <td>131.2</td>\n",
" <td>1261</td>\n",
" <td>0.0978</td>\n",
" <td>0.1034</td>\n",
" <td>0.144</td>\n",
" <td>0.09791</td>\n",
" <td>...</td>\n",
" <td>23.69</td>\n",
" <td>38.25</td>\n",
" <td>155</td>\n",
" <td>1731</td>\n",
" <td>0.1166</td>\n",
" <td>0.1922</td>\n",
" <td>0.3215</td>\n",
" <td>0.1628</td>\n",
" <td>0.2572</td>\n",
" <td>0.06637</td>\n",
" </tr>\n",
" <tr>\n",
" <th>566</th>\n",
" <td>926954</td>\n",
" <td>M</td>\n",
" <td>16.6</td>\n",
" <td>28.08</td>\n",
" <td>108.3</td>\n",
" <td>858.1</td>\n",
" <td>0.08455</td>\n",
" <td>0.1023</td>\n",
" <td>0.09251</td>\n",
" <td>0.05302</td>\n",
" <td>...</td>\n",
" <td>18.98</td>\n",
" <td>34.12</td>\n",
" <td>126.7</td>\n",
" <td>1124</td>\n",
" <td>0.1139</td>\n",
" <td>0.3094</td>\n",
" <td>0.3403</td>\n",
" <td>0.1418</td>\n",
" <td>0.2218</td>\n",
" <td>0.0782</td>\n",
" </tr>\n",
" <tr>\n",
" <th>567</th>\n",
" <td>927241</td>\n",
" <td>M</td>\n",
" <td>20.6</td>\n",
" <td>29.33</td>\n",
" <td>140.1</td>\n",
" <td>1265</td>\n",
" <td>0.1178</td>\n",
" <td>0.277</td>\n",
" <td>0.3514</td>\n",
" <td>0.152</td>\n",
" <td>...</td>\n",
" <td>25.74</td>\n",
" <td>39.42</td>\n",
" <td>184.6</td>\n",
" <td>1821</td>\n",
" <td>0.165</td>\n",
" <td>0.8681</td>\n",
" <td>0.9387</td>\n",
" <td>0.265</td>\n",
" <td>0.4087</td>\n",
" <td>0.124</td>\n",
" </tr>\n",
" <tr>\n",
" <th>568</th>\n",
" <td>92751</td>\n",
" <td>B</td>\n",
" <td>7.76</td>\n",
" <td>24.54</td>\n",
" <td>47.92</td>\n",
" <td>181</td>\n",
" <td>0.05263</td>\n",
" <td>0.04362</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>9.456</td>\n",
" <td>30.37</td>\n",
" <td>59.16</td>\n",
" <td>268.6</td>\n",
" <td>0.08996</td>\n",
" <td>0.06444</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.2871</td>\n",
" <td>0.07039</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>569 rows × 32 columns</p>\n",
"</div>"
],
"text/plain": [
" id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n",
"0 842302 M 17.99 10.38 122.8 1001 \n",
"1 842517 M 20.57 17.77 132.9 1326 \n",
"2 84300903 M 19.69 21.25 130 1203 \n",
"3 84348301 M 11.42 20.38 77.58 386.1 \n",
"4 84358402 M 20.29 14.34 135.1 1297 \n",
".. ... ... ... ... ... ... \n",
"564 926424 M 21.56 22.39 142 1479 \n",
"565 926682 M 20.13 28.25 131.2 1261 \n",
"566 926954 M 16.6 28.08 108.3 858.1 \n",
"567 927241 M 20.6 29.33 140.1 1265 \n",
"568 92751 B 7.76 24.54 47.92 181 \n",
"\n",
" smoothness_mean compactness_mean concavity_mean concave_points_mean ... \\\n",
"0 0.1184 0.2776 0.3001 0.1471 ... \n",
"1 0.08474 0.07864 0.0869 0.07017 ... \n",
"2 0.1096 0.1599 0.1974 0.1279 ... \n",
"3 0.1425 0.2839 0.2414 0.1052 ... \n",
"4 0.1003 0.1328 0.198 0.1043 ... \n",
".. ... ... ... ... ... \n",
"564 0.111 0.1159 0.2439 0.1389 ... \n",
"565 0.0978 0.1034 0.144 0.09791 ... \n",
"566 0.08455 0.1023 0.09251 0.05302 ... \n",
"567 0.1178 0.277 0.3514 0.152 ... \n",
"568 0.05263 0.04362 0 0 ... \n",
"\n",
" radius_worst texture_worst perimeter_worst area_worst smoothness_worst \\\n",
"0 25.38 17.33 184.6 2019 0.1622 \n",
"1 24.99 23.41 158.8 1956 0.1238 \n",
"2 23.57 25.53 152.5 1709 0.1444 \n",
"3 14.91 26.5 98.87 567.7 0.2098 \n",
"4 22.54 16.67 152.2 1575 0.1374 \n",
".. ... ... ... ... ... \n",
"564 25.45 26.4 166.1 2027 0.141 \n",
"565 23.69 38.25 155 1731 0.1166 \n",
"566 18.98 34.12 126.7 1124 0.1139 \n",
"567 25.74 39.42 184.6 1821 0.165 \n",
"568 9.456 30.37 59.16 268.6 0.08996 \n",
"\n",
" compactness_worst concavity_worst concave_points_worst symmetry_worst \\\n",
"0 0.6656 0.7119 0.2654 0.4601 \n",
"1 0.1866 0.2416 0.186 0.275 \n",
"2 0.4245 0.4504 0.243 0.3613 \n",
"3 0.8663 0.6869 0.2575 0.6638 \n",
"4 0.205 0.4 0.1625 0.2364 \n",
".. ... ... ... ... \n",
"564 0.2113 0.4107 0.2216 0.206 \n",
"565 0.1922 0.3215 0.1628 0.2572 \n",
"566 0.3094 0.3403 0.1418 0.2218 \n",
"567 0.8681 0.9387 0.265 0.4087 \n",
"568 0.06444 0 0 0.2871 \n",
"\n",
" fractal_dimension_worst \n",
"0 0.1189 \n",
"1 0.08902 \n",
"2 0.08758 \n",
"3 0.173 \n",
"4 0.07678 \n",
".. ... \n",
"564 0.07115 \n",
"565 0.06637 \n",
"566 0.0782 \n",
"567 0.124 \n",
"568 0.07039 \n",
"\n",
"[569 rows x 32 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_pandas"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "38c52a07",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+--------------------+---------+------------------+-----------------+-----------------+-----------------+--------------------+-------------------+-------------------+--------------------+--------------------+----------------------+------------------+------------------+------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+-----------------+--------------------+-------------------+-------------------+--------------------+-------------------+-----------------------+\n",
"|summary| id|diagnosis| radius_mean| texture_mean| perimeter_mean| area_mean| smoothness_mean| compactness_mean| concavity_mean| concave_points_mean| symmetry_mean|fractal_dimension_mean| radius_se| texture_se| perimeter_se| area_se| smoothness_se| compactness_se| concavity_se| concave_points_se| symmetry_se|fractal_dimension_se| radius_worst| texture_worst| perimeter_worst| area_worst| smoothness_worst| compactness_worst| concavity_worst|concave_points_worst| symmetry_worst|fractal_dimension_worst|\n",
"+-------+--------------------+---------+------------------+-----------------+-----------------+-----------------+--------------------+-------------------+-------------------+--------------------+--------------------+----------------------+------------------+------------------+------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+-----------------+--------------------+-------------------+-------------------+--------------------+-------------------+-----------------------+\n",
"| count| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569| 569|\n",
"| mean|3.0371831432337433E7| null|14.127291739894563|19.28964850615117|91.96903339191566|654.8891036906857| 0.096360281195079|0.10434098418277686|0.08879931581722322|0.048919145869947236| 0.181161862917399| 0.06279760984182778|0.4051720562390161|1.2168534270650269|2.8660592267135288|40.33707908611603|0.007040978910369071|0.025478138840070306|0.031893716344463946|0.011796137082601056|0.020542298769771532|0.003794903866432...|16.269189806678394|25.677223198594014| 107.2612126537786|880.5831282952545| 0.13236859402460469|0.25426504393673144|0.27218848330404205| 0.11460622319859404|0.29007557117750454| 0.08394581722319855|\n",
"| stddev|1.2502058561222367E8| null|3.5240488262120793|4.301035768166948| 24.2989810387549|351.9141291816529|0.014064128137673616| 0.0528127579325122|0.07971980870789354| 0.03880284485915361|0.027414281336035712| 0.007060362795084457|0.2773127329861039|0.5516483926172022|2.0218545540421085|45.49100551613178|0.003002517943839066| 0.01790817932567739|0.030186060322988394|0.006170285174046867|0.008266371528798402|0.002646070967089195| 4.833241580469323|6.1462576230383235|33.602542269036356|569.3569926699494|0.022832429404835465| 0.157336488913742|0.20862428060813235| 0.06573234119594208|0.06186746753751871| 0.01806126734889399|\n",
"| min| 842302| B| 10.03| 10.38| 100| 1001| 0.05263| 0.01938| 0| 0| 0.106| 0.04996| 0.1115| 0.3602| 0.757| 10.08| 0.001713| 0.002252| 0| 0| 0.007882| 0.0008948| 10.01| 12.02| 100.2| 1009| 0.07117| 0.02729| 0| 0| 0.1565| 0.05504|\n",
"| max| 92751| M| 9.904| 9.71| 99.58| 998.9| 0.1634| 0.3454| 0.4268| 0.2012| 0.304| 0.09744| 2.873| 4.885| 9.807| 99.04| 0.03113| 0.1354| 0.396| 0.05279| 0.07895| 0.02984| 9.981| 49.54| 99.71| 993.6| 0.2226| 1.058| 1.252| 0.291| 0.6638| 0.2075|\n",
"+-------+--------------------+---------+------------------+-----------------+-----------------+-----------------+--------------------+-------------------+-------------------+--------------------+--------------------+----------------------+------------------+------------------+------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+------------------+-----------------+--------------------+-------------------+-------------------+--------------------+-------------------+-----------------------+\n",
"\n"
]
}
],
"source": [
"# Describe every column (no column subset is passed)\n",
"df.describe().show()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "d134fb0d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+---------+\n",
"|summary|diagnosis|\n",
"+-------+---------+\n",
"| count| 569|\n",
"| mean| null|\n",
"| stddev| null|\n",
"| min| B|\n",
"| max| M|\n",
"+-------+---------+\n",
"\n"
]
}
],
"source": [
"#describe with specific variables\n",
"df.describe(['diagnosis']).show()"
]
},
{
"cell_type": "code",
"execution_count": 119,
"id": "9ae0e058",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+------------------+---------+--------------+------------------+---------------+----------------+------------------+----------------+-------------+-----------------+-----------------------+------+------------------+------------------+------------------+--------------------+------------------+-------------+-----------------+------------------+------------------+------+--------+------------------+-------------------+------------------------+-------------+------------------+------------------+---------------------+------------------+------------------+------------------+-----------------------+--------------------+\n",
"|summary| Age|Attrition|BusinessTravel| DailyRate| Department|DistanceFromHome| Education| EducationField|EmployeeCount| EmployeeNumber|EnvironmentSatisfaction|Gender| HourlyRate| JobInvolvement| JobLevel| JobRole| JobSatisfaction|MaritalStatus| MonthlyIncome| MonthlyRate|NumCompaniesWorked|Over18|OverTime| PercentSalaryHike| PerformanceRating|RelationshipSatisfaction|StandardHours| StockOptionLevel| TotalWorkingYears|TrainingTimesLastYear| WorkLifeBalance| YearsAtCompany|YearsInCurrentRole|YearsSinceLastPromotion|YearsWithCurrManager|\n",
"+-------+------------------+---------+--------------+------------------+---------------+----------------+------------------+----------------+-------------+-----------------+-----------------------+------+------------------+------------------+------------------+--------------------+------------------+-------------+-----------------+------------------+------------------+------+--------+------------------+-------------------+------------------------+-------------+------------------+------------------+---------------------+------------------+------------------+------------------+-----------------------+--------------------+\n",
"| count| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470| 1470|\n",
"| mean|36.923809523809524| null| null| 802.4857142857143| null|9.19251700680272| 2.912925170068027| null| 1.0|1024.865306122449| 2.721768707482993| null| 65.89115646258503|2.7299319727891156|2.0639455782312925| null|2.7285714285714286| null|6502.931292517007|14313.103401360544|2.6931972789115646| null| null|15.209523809523809| 3.1537414965986397| 2.7122448979591836| 80.0|0.7938775510204081|11.279591836734694| 2.7993197278911564|2.7612244897959184|7.0081632653061225| 4.229251700680272| 2.1877551020408164| 4.12312925170068|\n",
"| stddev| 9.135373489136729| null| null|403.50909994352804| null|8.10686443566608|1.0241649445978718| null| 0.0|602.0243348474752| 1.0930822146350003| null|20.329427593996176|0.7115611429632297|1.1069398989351202| null|1.1028461230547213| null|4707.956783097992| 7117.786044059972|2.4980090060707463| null| null|3.6599377165396385|0.36082352460434397| 1.0812088864403517| 0.0|0.8520766679308381| 7.780781675514995| 1.2892706207958466|0.7064758297141507| 6.126525152403571| 3.623137034670627| 3.2224302791379693| 3.5681361205404363|\n",
"| min| 18| No| Non-Travel| 1001|Human Resources| 1| 1| Human Resources| 1| 1| 1|Female| 100| 1| 1|Healthcare Repres...| 1| Divorced| 10008| 10007| 0| Y| No| 11| 3| 1| 80| 0| 0| 0| 1| 0| 0| 0| 0|\n",
"| max| 60| Yes| Travel_Rarely| 999| Sales| 9| 5|Technical Degree| 1| 999| 4| Male| 99| 4| 5|Sales Representative| 4| Single| 9998| 9983| 9| Y| Yes| 25| 4| 4| 80| 3| 9| 6| 4| 9| 9| 9| 9|\n",
"+-------+------------------+---------+--------------+------------------+---------------+----------------+------------------+----------------+-------------+-----------------+-----------------------+------+------------------+------------------+------------------+--------------------+------------------+-------------+-----------------+------------------+------------------+------+--------+------------------+-------------------+------------------------+-------------+------------------+------------------+---------------------+------------------+------------------+------------------+-----------------------+--------------------+\n",
"\n"
]
}
],
"source": [
"#describe with numerical columns\n",
"def get_num_cols(dataframe):\n",
" \n",
" num_cols = [col for col in dataframe.columns if dataframe.select(col). \\\n",
" dtypes[0][1] in ['double', 'int']]\n",
" \n",
" return num_cols\n",
"\n",
"num_cols = get_num_cols(df)\n",
" \n",
"df.describe(num_cols).show()"
]
},
{
"cell_type": "code",
"execution_count": 155,
"id": "597ca79e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 21,
"id": "7722c3fa",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import functions as F\n",
"\n",
"df2 = df.withColumn(\"diagnosis\",\n",
"(when(col(\"diagnosis\") == 'B', 0)\n",
".when(col(\"diagnosis\") == 'M', 1)\n",
".otherwise(0)))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "e02086c1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+\n",
"| id|diagnosis|radius_mean|texture_mean|perimeter_mean|area_mean|smoothness_mean|compactness_mean|concavity_mean|concave_points_mean|symmetry_mean|fractal_dimension_mean|radius_se|texture_se|perimeter_se|area_se|smoothness_se|compactness_se|concavity_se|concave_points_se|symmetry_se|fractal_dimension_se|radius_worst|texture_worst|perimeter_worst|area_worst|smoothness_worst|compactness_worst|concavity_worst|concave_points_worst|symmetry_worst|fractal_dimension_worst|\n",
"+--------+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+\n",
"| 842302| 1| 17.99| 10.38| 122.8| 1001| 0.1184| 0.2776| 0.3001| 0.1471| 0.2419| 0.07871| 1.095| 0.9053| 8.589| 153.4| 0.006399| 0.04904| 0.05373| 0.01587| 0.03003| 0.006193| 25.38| 17.33| 184.6| 2019| 0.1622| 0.6656| 0.7119| 0.2654| 0.4601| 0.1189|\n",
"| 842517| 1| 20.57| 17.77| 132.9| 1326| 0.08474| 0.07864| 0.0869| 0.07017| 0.1812| 0.05667| 0.5435| 0.7339| 3.398| 74.08| 0.005225| 0.01308| 0.0186| 0.0134| 0.01389| 0.003532| 24.99| 23.41| 158.8| 1956| 0.1238| 0.1866| 0.2416| 0.186| 0.275| 0.08902|\n",
"|84300903| 1| 19.69| 21.25| 130| 1203| 0.1096| 0.1599| 0.1974| 0.1279| 0.2069| 0.05999| 0.7456| 0.7869| 4.585| 94.03| 0.00615| 0.04006| 0.03832| 0.02058| 0.0225| 0.004571| 23.57| 25.53| 152.5| 1709| 0.1444| 0.4245| 0.4504| 0.243| 0.3613| 0.08758|\n",
"|84348301| 1| 11.42| 20.38| 77.58| 386.1| 0.1425| 0.2839| 0.2414| 0.1052| 0.2597| 0.09744| 0.4956| 1.156| 3.445| 27.23| 0.00911| 0.07458| 0.05661| 0.01867| 0.05963| 0.009208| 14.91| 26.5| 98.87| 567.7| 0.2098| 0.8663| 0.6869| 0.2575| 0.6638| 0.173|\n",
"|84358402| 1| 20.29| 14.34| 135.1| 1297| 0.1003| 0.1328| 0.198| 0.1043| 0.1809| 0.05883| 0.7572| 0.7813| 5.438| 94.44| 0.01149| 0.02461| 0.05688| 0.01885| 0.01756| 0.005115| 22.54| 16.67| 152.2| 1575| 0.1374| 0.205| 0.4| 0.1625| 0.2364| 0.07678|\n",
"| 843786| 1| 12.45| 15.7| 82.57| 477.1| 0.1278| 0.17| 0.1578| 0.08089| 0.2087| 0.07613| 0.3345| 0.8902| 2.217| 27.19| 0.00751| 0.03345| 0.03672| 0.01137| 0.02165| 0.005082| 15.47| 23.75| 103.4| 741.6| 0.1791| 0.5249| 0.5355| 0.1741| 0.3985| 0.1244|\n",
"| 844359| 1| 18.25| 19.98| 119.6| 1040| 0.09463| 0.109| 0.1127| 0.074| 0.1794| 0.05742| 0.4467| 0.7732| 3.18| 53.91| 0.004314| 0.01382| 0.02254| 0.01039| 0.01369| 0.002179| 22.88| 27.66| 153.2| 1606| 0.1442| 0.2576| 0.3784| 0.1932| 0.3063| 0.08368|\n",
"|84458202| 1| 13.71| 20.83| 90.2| 577.9| 0.1189| 0.1645| 0.09366| 0.05985| 0.2196| 0.07451| 0.5835| 1.377| 3.856| 50.96| 0.008805| 0.03029| 0.02488| 0.01448| 0.01486| 0.005412| 17.06| 28.14| 110.6| 897| 0.1654| 0.3682| 0.2678| 0.1556| 0.3196| 0.1151|\n",
"| 844981| 1| 13| 21.82| 87.5| 519.8| 0.1273| 0.1932| 0.1859| 0.09353| 0.235| 0.07389| 0.3063| 1.002| 2.406| 24.32| 0.005731| 0.03502| 0.03553| 0.01226| 0.02143| 0.003749| 15.49| 30.73| 106.2| 739.3| 0.1703| 0.5401| 0.539| 0.206| 0.4378| 0.1072|\n",
"|84501001| 1| 12.46| 24.04| 83.97| 475.9| 0.1186| 0.2396| 0.2273| 0.08543| 0.203| 0.08243| 0.2976| 1.599| 2.039| 23.94| 0.007149| 0.07217| 0.07743| 0.01432| 0.01789| 0.01008| 15.09| 40.68| 97.65| 711.4| 0.1853| 1.058| 1.105| 0.221| 0.4366| 0.2075|\n",
"| 845636| 1| 16.02| 23.24| 102.7| 797.8| 0.08206| 0.06669| 0.03299| 0.03323| 0.1528| 0.05697| 0.3795| 1.187| 2.466| 40.51| 0.004029| 0.009269| 0.01101| 0.007591| 0.0146| 0.003042| 19.19| 33.88| 123.8| 1150| 0.1181| 0.1551| 0.1459| 0.09975| 0.2948| 0.08452|\n",
"|84610002| 1| 15.78| 17.89| 103.6| 781| 0.0971| 0.1292| 0.09954| 0.06606| 0.1842| 0.06082| 0.5058| 0.9849| 3.564| 54.16| 0.005771| 0.04061| 0.02791| 0.01282| 0.02008| 0.004144| 20.42| 27.28| 136.5| 1299| 0.1396| 0.5609| 0.3965| 0.181| 0.3792| 0.1048|\n",
"| 846226| 1| 19.17| 24.8| 132.4| 1123| 0.0974| 0.2458| 0.2065| 0.1118| 0.2397| 0.078| 0.9555| 3.568| 11.07| 116.2| 0.003139| 0.08297| 0.0889| 0.0409| 0.04484| 0.01284| 20.96| 29.94| 151.7| 1332| 0.1037| 0.3903| 0.3639| 0.1767| 0.3176| 0.1023|\n",
"| 846381| 1| 15.85| 23.95| 103.7| 782.7| 0.08401| 0.1002| 0.09938| 0.05364| 0.1847| 0.05338| 0.4033| 1.078| 2.903| 36.58| 0.009769| 0.03126| 0.05051| 0.01992| 0.02981| 0.003002| 16.84| 27.66| 112| 876.5| 0.1131| 0.1924| 0.2322| 0.1119| 0.2809| 0.06287|\n",
"|84667401| 1| 13.73| 22.61| 93.6| 578.3| 0.1131| 0.2293| 0.2128| 0.08025| 0.2069| 0.07682| 0.2121| 1.169| 2.061| 19.21| 0.006429| 0.05936| 0.05501| 0.01628| 0.01961| 0.008093| 15.03| 32.01| 108.8| 697.7| 0.1651| 0.7725| 0.6943| 0.2208| 0.3596| 0.1431|\n",
"|84799002| 1| 14.54| 27.54| 96.73| 658.8| 0.1139| 0.1595| 0.1639| 0.07364| 0.2303| 0.07077| 0.37| 1.033| 2.879| 32.55| 0.005607| 0.0424| 0.04741| 0.0109| 0.01857| 0.005466| 17.46| 37.13| 124.1| 943.2| 0.1678| 0.6577| 0.7026| 0.1712| 0.4218| 0.1341|\n",
"| 848406| 1| 14.68| 20.13| 94.74| 684.5| 0.09867| 0.072| 0.07395| 0.05259| 0.1586| 0.05922| 0.4727| 1.24| 3.195| 45.4| 0.005718| 0.01162| 0.01998| 0.01109| 0.0141| 0.002085| 19.07| 30.88| 123.4| 1138| 0.1464| 0.1871| 0.2914| 0.1609| 0.3029| 0.08216|\n",
"|84862001| 1| 16.13| 20.68| 108.1| 798.8| 0.117| 0.2022| 0.1722| 0.1028| 0.2164| 0.07356| 0.5692| 1.073| 3.854| 54.18| 0.007026| 0.02501| 0.03188| 0.01297| 0.01689| 0.004142| 20.96| 31.48| 136.8| 1315| 0.1789| 0.4233| 0.4784| 0.2073| 0.3706| 0.1142|\n",
"| 849014| 1| 19.81| 22.15| 130| 1260| 0.09831| 0.1027| 0.1479| 0.09498| 0.1582| 0.05395| 0.7582| 1.017| 5.865| 112.4| 0.006494| 0.01893| 0.03391| 0.01521| 0.01356| 0.001997| 27.32| 30.88| 186.8| 2398| 0.1512| 0.315| 0.5372| 0.2388| 0.2768| 0.07615|\n",
"| 8510426| 0| 13.54| 14.36| 87.46| 566.3| 0.09779| 0.08129| 0.06664| 0.04781| 0.1885| 0.05766| 0.2699| 0.7886| 2.058| 23.56| 0.008462| 0.0146| 0.02387| 0.01315| 0.0198| 0.0023| 15.11| 19.26| 99.7| 711.2| 0.144| 0.1773| 0.239| 0.1288| 0.2977| 0.07259|\n",
"+--------+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df2.show()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "314494af",
"metadata": {},
"outputs": [],
"source": [
"columns_to_cast = [\n",
" 'diagnosis',\n",
" 'radius_mean',\n",
" 'texture_mean',\n",
" 'perimeter_mean',\n",
" 'area_mean',\n",
" 'smoothness_mean',\n",
" 'compactness_mean',\n",
" 'concavity_mean',\n",
" 'concave_points_mean',\n",
" 'symmetry_mean',\n",
" 'fractal_dimension_mean',\n",
" 'radius_se',\n",
" 'texture_se',\n",
" 'perimeter_se',\n",
" 'area_se',\n",
" 'smoothness_se',\n",
" 'compactness_se',\n",
" 'concavity_se',\n",
" 'concave_points_se',\n",
" 'symmetry_se',\n",
" 'fractal_dimension_se',\n",
" 'radius_worst',\n",
" 'texture_worst',\n",
" 'perimeter_worst',\n",
" 'area_worst',\n",
" 'smoothness_worst',\n",
" 'compactness_worst',\n",
" 'concavity_worst',\n",
" 'concave_points_worst',\n",
" 'symmetry_worst',\n",
" 'fractal_dimension_worst']\n",
"df3 = (\n",
" df2\n",
" .select(\n",
" *(c for c in df2.columns if c not in columns_to_cast),\n",
" *(col(c).cast(\"float\").alias(c) for c in columns_to_cast)\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "c87e2cec",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['id',\n",
" 'diagnosis',\n",
" 'radius_mean',\n",
" 'texture_mean',\n",
" 'perimeter_mean',\n",
" 'area_mean',\n",
" 'smoothness_mean',\n",
" 'compactness_mean',\n",
" 'concavity_mean',\n",
" 'concave_points_mean',\n",
" 'symmetry_mean',\n",
" 'fractal_dimension_mean',\n",
" 'radius_se',\n",
" 'texture_se',\n",
" 'perimeter_se',\n",
" 'area_se',\n",
" 'smoothness_se',\n",
" 'compactness_se',\n",
" 'concavity_se',\n",
" 'concave_points_se',\n",
" 'symmetry_se',\n",
" 'fractal_dimension_se',\n",
" 'radius_worst',\n",
" 'texture_worst',\n",
" 'perimeter_worst',\n",
" 'area_worst',\n",
" 'smoothness_worst',\n",
" 'compactness_worst',\n",
" 'concavity_worst',\n",
" 'concave_points_worst',\n",
" 'symmetry_worst',\n",
" 'fractal_dimension_worst']"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3.columns"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "8336d258",
"metadata": {},
"outputs": [],
"source": [
"df3= df3.drop('id')"
]
},
{
"cell_type": "markdown",
"id": "c1fe5804",
"metadata": {},
"source": [
"# MACHINE LEARNING"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "613ae07a",
"metadata": {},
"outputs": [],
"source": [
"\n",
"from pyspark.ml.feature import OneHotEncoder, StringIndexer\n",
"from pyspark.ml.linalg import Vectors\n",
"from pyspark.ml.feature import VectorAssembler\n",
"from pyspark.ml.classification import LogisticRegression\n",
"from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit\n",
"from pyspark.ml import Pipeline\n",
"from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
"from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n",
"from sklearn.metrics import roc_curve,auc\n",
"from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "6f0cf43c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------+--------------------+\n",
"|diagnosis| features|\n",
"+---------+--------------------+\n",
"| 1.0|[17.9899997711181...|\n",
"| 1.0|[20.5699996948242...|\n",
"| 1.0|[19.6900005340576...|\n",
"| 1.0|[11.4200000762939...|\n",
"| 1.0|[20.2900009155273...|\n",
"+---------+--------------------+\n",
"only showing top 5 rows\n",
"\n"
]
}
],
"source": [
"def vector_assembler(dataframe, indep_cols):\n",
" \n",
" assembler = VectorAssembler(inputCols = indep_cols,\n",
" outputCol = 'features')\n",
"\n",
" output = assembler.transform(dataframe).drop(*indep_cols)\n",
" \n",
" return output\n",
"\n",
"df3 = vector_assembler(df3, indep_cols = df3.drop('diagnosis').columns)\n",
"df3.show(5)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "c3eb9f77",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"453"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df, test_df = df3.randomSplit([0.8, 0.2])\n",
"\n",
"train_df.count()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "0bc9ae30",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"116"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df.count()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "9c3fef5f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 47,
"id": "d2cde4c2",
"metadata": {},
"outputs": [],
"source": [
"lr = LogisticRegression(labelCol=\"diagnosis\", featuresCol='features')"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "be648eea",
"metadata": {},
"outputs": [],
"source": [
"\n",
"lr = lr.fit(train_df)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "00df9942",
"metadata": {},
"outputs": [],
"source": [
"preds= lr.transform(test_df)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "d64ab8c7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9741379310344828\n",
"Precision: 0.9756592292089248\n"
]
}
],
"source": [
"\n",
"print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='diagnosis',\n",
" metricName='accuracy').evaluate(preds))\n",
"print('Precision: ',MulticlassClassificationEvaluator(labelCol='diagnosis',\n",
" metricName='weightedPrecision').evaluate(preds))"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "6f5825ea",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.ml.classification import RandomForestClassifier\n",
"rf = RandomForestClassifier(labelCol=\n",
" 'diagnosis')\n",
"\n",
"model = rf.fit(train_df)\n",
"preds= model.transform(test_df)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "cba1563d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9655172413793104\n",
"Precision: 0.9655172413793104\n"
]
}
],
"source": [
"\n",
"\n",
"print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='diagnosis',\n",
" metricName='accuracy').evaluate(preds))\n",
"print('Precision: ',MulticlassClassificationEvaluator(labelCol='diagnosis',\n",
" metricName='weightedPrecision').evaluate(preds)) "
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "2cbe8f74",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.ml.classification import DecisionTreeClassifier\n"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "f6e79d3c",
"metadata": {},
"outputs": [],
"source": [
"dt_classifier = DecisionTreeClassifier(labelCol=\"diagnosis\", featuresCol=\"features\")\n",
"\n",
"model = dt_classifier.fit(train_df)\n",
"preds= model.transform(test_df)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "9917b43e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9224137931034483\n",
"Precision: 0.9227787872740449\n"
]
}
],
"source": [
"print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='diagnosis',\n",
" metricName='accuracy').evaluate(preds))\n",
"print('Precision: ',MulticlassClassificationEvaluator(labelCol='diagnosis',\n",
" metricName='weightedPrecision').evaluate(preds)) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42073c6b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}