{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Importing the libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd \n",
"\n",
"import numpy as np\n",
"\n",
"import seaborn as sns\n",
"\n",
"from matplotlib import pyplot as plt\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"from sklearn import svm\n",
"from sklearn.metrics import confusion_matrix\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Reading the data from the CSV file"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the income-evaluation CSV into a DataFrame.\n",
"# The path is relative to the notebook's working directory rather than a\n",
"# user-specific absolute Windows path, so the notebook can be re-run on any\n",
"# machine: place income_evaluation.csv next to this notebook (or edit DATA_PATH).\n",
"DATA_PATH = \"income_evaluation.csv\"\n",
"Read_data = pd.read_csv(DATA_PATH)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>workclass</th>\n",
" <th>fnlwgt</th>\n",
" <th>education</th>\n",
" <th>education-num</th>\n",
" <th>marital-status</th>\n",
" <th>occupation</th>\n",
" <th>relationship</th>\n",
" <th>race</th>\n",
" <th>sex</th>\n",
" <th>capital-gain</th>\n",
" <th>capital-loss</th>\n",
" <th>hours-per-week</th>\n",
" <th>native-country</th>\n",
" <th>income</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>39</td>\n",
" <td>State-gov</td>\n",
" <td>77516</td>\n",
" <td>Bachelors</td>\n",
" <td>13</td>\n",
" <td>Never-married</td>\n",
" <td>Adm-clerical</td>\n",
" <td>Not-in-family</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>2174</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>50</td>\n",
" <td>Self-emp-not-inc</td>\n",
" <td>83311</td>\n",
" <td>Bachelors</td>\n",
" <td>13</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Exec-managerial</td>\n",
" <td>Husband</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>13</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>38</td>\n",
" <td>Private</td>\n",
" <td>215646</td>\n",
" <td>HS-grad</td>\n",
" <td>9</td>\n",
" <td>Divorced</td>\n",
" <td>Handlers-cleaners</td>\n",
" <td>Not-in-family</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>53</td>\n",
" <td>Private</td>\n",
" <td>234721</td>\n",
" <td>11th</td>\n",
" <td>7</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Handlers-cleaners</td>\n",
" <td>Husband</td>\n",
" <td>Black</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>28</td>\n",
" <td>Private</td>\n",
" <td>338409</td>\n",
" <td>Bachelors</td>\n",
" <td>13</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Prof-specialty</td>\n",
" <td>Wife</td>\n",
" <td>Black</td>\n",
" <td>Female</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>Cuba</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32556</th>\n",
" <td>27</td>\n",
" <td>Private</td>\n",
" <td>257302</td>\n",
" <td>Assoc-acdm</td>\n",
" <td>12</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Tech-support</td>\n",
" <td>Wife</td>\n",
" <td>White</td>\n",
" <td>Female</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>38</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32557</th>\n",
" <td>40</td>\n",
" <td>Private</td>\n",
" <td>154374</td>\n",
" <td>HS-grad</td>\n",
" <td>9</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Machine-op-inspct</td>\n",
" <td>Husband</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&gt;50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32558</th>\n",
" <td>58</td>\n",
" <td>Private</td>\n",
" <td>151910</td>\n",
" <td>HS-grad</td>\n",
" <td>9</td>\n",
" <td>Widowed</td>\n",
" <td>Adm-clerical</td>\n",
" <td>Unmarried</td>\n",
" <td>White</td>\n",
" <td>Female</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32559</th>\n",
" <td>22</td>\n",
" <td>Private</td>\n",
" <td>201490</td>\n",
" <td>HS-grad</td>\n",
" <td>9</td>\n",
" <td>Never-married</td>\n",
" <td>Adm-clerical</td>\n",
" <td>Own-child</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>20</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32560</th>\n",
" <td>52</td>\n",
" <td>Self-emp-inc</td>\n",
" <td>287927</td>\n",
" <td>HS-grad</td>\n",
" <td>9</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Exec-managerial</td>\n",
" <td>Wife</td>\n",
" <td>White</td>\n",
" <td>Female</td>\n",
" <td>15024</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&gt;50K</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>32561 rows × 15 columns</p>\n",
"</div>"
],
"text/plain": [
" age workclass fnlwgt education education-num \\\n",
"0 39 State-gov 77516 Bachelors 13 \n",
"1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
"2 38 Private 215646 HS-grad 9 \n",
"3 53 Private 234721 11th 7 \n",
"4 28 Private 338409 Bachelors 13 \n",
"... ... ... ... ... ... \n",
"32556 27 Private 257302 Assoc-acdm 12 \n",
"32557 40 Private 154374 HS-grad 9 \n",
"32558 58 Private 151910 HS-grad 9 \n",
"32559 22 Private 201490 HS-grad 9 \n",
"32560 52 Self-emp-inc 287927 HS-grad 9 \n",
"\n",
" marital-status occupation relationship race \\\n",
"0 Never-married Adm-clerical Not-in-family White \n",
"1 Married-civ-spouse Exec-managerial Husband White \n",
"2 Divorced Handlers-cleaners Not-in-family White \n",
"3 Married-civ-spouse Handlers-cleaners Husband Black \n",
"4 Married-civ-spouse Prof-specialty Wife Black \n",
"... ... ... ... ... \n",
"32556 Married-civ-spouse Tech-support Wife White \n",
"32557 Married-civ-spouse Machine-op-inspct Husband White \n",
"32558 Widowed Adm-clerical Unmarried White \n",
"32559 Never-married Adm-clerical Own-child White \n",
"32560 Married-civ-spouse Exec-managerial Wife White \n",
"\n",
" sex capital-gain capital-loss hours-per-week native-country \\\n",
"0 Male 2174 0 40 United-States \n",
"1 Male 0 0 13 United-States \n",
"2 Male 0 0 40 United-States \n",
"3 Male 0 0 40 United-States \n",
"4 Female 0 0 40 Cuba \n",
"... ... ... ... ... ... \n",
"32556 Female 0 0 38 United-States \n",
"32557 Male 0 0 40 United-States \n",
"32558 Female 0 0 40 United-States \n",
"32559 Male 0 0 20 United-States \n",
"32560 Female 15024 0 40 United-States \n",
"\n",
" income \n",
"0 <=50K \n",
"1 <=50K \n",
"2 <=50K \n",
"3 <=50K \n",
"4 <=50K \n",
"... ... \n",
"32556 <=50K \n",
"32557 >50K \n",
"32558 <=50K \n",
"32559 <=50K \n",
"32560 >50K \n",
"\n",
"[32561 rows x 15 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Read_data\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cleaning the datasets"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Male</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32556</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32557</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32558</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32559</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32560</th>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>32561 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" Male\n",
"0 1\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 0\n",
"... ...\n",
"32556 0\n",
"32557 1\n",
"32558 0\n",
"32559 1\n",
"32560 0\n",
"\n",
"[32561 rows x 1 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Normalise the header first: strip surrounding whitespace from every column name.\n",
"Read_data.columns = list(map(str.strip, Read_data.columns))\n",
"\n",
"# One-hot encode `sex`; drop_first=True drops the first level so a single indicator column remains.\n",
"Gender = pd.get_dummies(\n",
"    data=Read_data[\"sex\"],\n",
"    drop_first=True,\n",
")\n",
"Gender\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Adm-clerical</th>\n",
" <th>Armed-Forces</th>\n",
" <th>Craft-repair</th>\n",
" <th>Exec-managerial</th>\n",
" <th>Farming-fishing</th>\n",
" <th>Handlers-cleaners</th>\n",
" <th>Machine-op-inspct</th>\n",
" <th>Other-service</th>\n",
" <th>Priv-house-serv</th>\n",
" <th>Prof-specialty</th>\n",
" <th>Protective-serv</th>\n",
" <th>Sales</th>\n",
" <th>Tech-support</th>\n",
" <th>Transport-moving</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32556</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32557</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32558</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32559</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32560</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>32561 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" Adm-clerical Armed-Forces Craft-repair Exec-managerial \\\n",
"0 1 0 0 0 \n",
"1 0 0 0 1 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 0 0 0 0 \n",
"... ... ... ... ... \n",
"32556 0 0 0 0 \n",
"32557 0 0 0 0 \n",
"32558 1 0 0 0 \n",
"32559 1 0 0 0 \n",
"32560 0 0 0 1 \n",
"\n",
" Farming-fishing Handlers-cleaners Machine-op-inspct \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 1 0 \n",
"3 0 1 0 \n",
"4 0 0 0 \n",
"... ... ... ... \n",
"32556 0 0 0 \n",
"32557 0 0 1 \n",
"32558 0 0 0 \n",
"32559 0 0 0 \n",
"32560 0 0 0 \n",
"\n",
" Other-service Priv-house-serv Prof-specialty Protective-serv \\\n",
"0 0 0 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 0 0 1 0 \n",
"... ... ... ... ... \n",
"32556 0 0 0 0 \n",
"32557 0 0 0 0 \n",
"32558 0 0 0 0 \n",
"32559 0 0 0 0 \n",
"32560 0 0 0 0 \n",
"\n",
" Sales Tech-support Transport-moving \n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"3 0 0 0 \n",
"4 0 0 0 \n",
"... ... ... ... \n",
"32556 0 1 0 \n",
"32557 0 0 0 \n",
"32558 0 0 0 \n",
"32559 0 0 0 \n",
"32560 0 0 0 \n",
"\n",
"[32561 rows x 14 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-hot encode `occupation`; drop_first=True omits the first level, which acts as the baseline.\n",
"# NOTE(review): verify that the dropped first level is a sensible baseline (e.g. not a missing-value placeholder).\n",
"Occupation = pd.get_dummies(\n",
"    data=Read_data[\"occupation\"],\n",
"    drop_first=True,\n",
")\n",
"Occupation"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Cambodia</th>\n",
" <th>Canada</th>\n",
" <th>China</th>\n",
" <th>Columbia</th>\n",
" <th>Cuba</th>\n",
" <th>Dominican-Republic</th>\n",
" <th>Ecuador</th>\n",
" <th>El-Salvador</th>\n",
" <th>England</th>\n",
" <th>France</th>\n",
" <th>...</th>\n",
" <th>Portugal</th>\n",
" <th>Puerto-Rico</th>\n",
" <th>Scotland</th>\n",
" <th>South</th>\n",
" <th>Taiwan</th>\n",
" <th>Thailand</th>\n",
" <th>Trinadad&amp;Tobago</th>\n",
" <th>United-States</th>\n",
" <th>Vietnam</th>\n",
" <th>Yugoslavia</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32556</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32557</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32558</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32559</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32560</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>32561 rows × 41 columns</p>\n",
"</div>"
],
"text/plain": [
" Cambodia Canada China Columbia Cuba Dominican-Republic \\\n",
"0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 \n",
"4 0 0 0 0 1 0 \n",
"... ... ... ... ... ... ... \n",
"32556 0 0 0 0 0 0 \n",
"32557 0 0 0 0 0 0 \n",
"32558 0 0 0 0 0 0 \n",
"32559 0 0 0 0 0 0 \n",
"32560 0 0 0 0 0 0 \n",
"\n",
" Ecuador El-Salvador England France ... Portugal \\\n",
"0 0 0 0 0 ... 0 \n",
"1 0 0 0 0 ... 0 \n",
"2 0 0 0 0 ... 0 \n",
"3 0 0 0 0 ... 0 \n",
"4 0 0 0 0 ... 0 \n",
"... ... ... ... ... ... ... \n",
"32556 0 0 0 0 ... 0 \n",
"32557 0 0 0 0 ... 0 \n",
"32558 0 0 0 0 ... 0 \n",
"32559 0 0 0 0 ... 0 \n",
"32560 0 0 0 0 ... 0 \n",
"\n",
" Puerto-Rico Scotland South Taiwan Thailand Trinadad&Tobago \\\n",
"0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 \n",
"... ... ... ... ... ... ... \n",
"32556 0 0 0 0 0 0 \n",
"32557 0 0 0 0 0 0 \n",
"32558 0 0 0 0 0 0 \n",
"32559 0 0 0 0 0 0 \n",
"32560 0 0 0 0 0 0 \n",
"\n",
" United-States Vietnam Yugoslavia \n",
"0 1 0 0 \n",
"1 1 0 0 \n",
"2 1 0 0 \n",
"3 1 0 0 \n",
"4 0 0 0 \n",
"... ... ... ... \n",
"32556 1 0 0 \n",
"32557 1 0 0 \n",
"32558 1 0 0 \n",
"32559 1 0 0 \n",
"32560 1 0 0 \n",
"\n",
"[32561 rows x 41 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-hot encode `native-country` (one indicator column per country); drop_first=True omits the first level.\n",
"# NOTE(review): verify that the dropped first level is a sensible baseline (e.g. not a missing-value placeholder).\n",
"Countries = pd.get_dummies(\n",
"    data=Read_data[\"native-country\"],\n",
"    drop_first=True,\n",
")\n",
"Countries"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>&gt;50K</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32556</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32557</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32558</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32559</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32560</th>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>32561 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" >50K\n",
"0 0\n",
"1 0\n",
"2 0\n",
"3 0\n",
"4 0\n",
"... ...\n",
"32556 0\n",
"32557 1\n",
"32558 0\n",
"32559 0\n",
"32560 1\n",
"\n",
"[32561 rows x 1 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-hot encode the `income` label; drop_first=True drops the first level so a single indicator column remains.\n",
"Income = pd.get_dummies(\n",
"    data=Read_data[\"income\"],\n",
"    drop_first=True,\n",
")\n",
"Income"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Married-AF-spouse</th>\n",
" <th>Married-civ-spouse</th>\n",
" <th>Married-spouse-absent</th>\n",
" <th>Never-married</th>\n",
" <th>Separated</th>\n",
" <th>Widowed</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32556</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32557</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32558</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32559</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32560</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>32561 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" Married-AF-spouse Married-civ-spouse Married-spouse-absent \\\n",
"0 0 0 0 \n",
"1 0 1 0 \n",
"2 0 0 0 \n",
"3 0 1 0 \n",
"4 0 1 0 \n",
"... ... ... ... \n",
"32556 0 1 0 \n",
"32557 0 1 0 \n",
"32558 0 0 0 \n",
"32559 0 0 0 \n",
"32560 0 1 0 \n",
"\n",
" Never-married Separated Widowed \n",
"0 1 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"3 0 0 0 \n",
"4 0 0 0 \n",
"... ... ... ... \n",
"32556 0 0 0 \n",
"32557 0 0 0 \n",
"32558 0 0 1 \n",
"32559 1 0 0 \n",
"32560 0 0 0 \n",
"\n",
"[32561 rows x 6 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-hot encode `marital-status`; drop_first=True omits the first level, which acts as the baseline.\n",
"Marital_status = pd.get_dummies(\n",
"    data=Read_data[\"marital-status\"],\n",
"    drop_first=True,\n",
")\n",
"Marital_status"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Not-in-family</th>\n",
" <th>Other-relative</th>\n",
" <th>Own-child</th>\n",
" <th>Unmarried</th>\n",
" <th>Wife</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32556</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32557</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32558</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32559</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32560</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>32561 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" Not-in-family Other-relative Own-child Unmarried Wife\n",
"0 1 0 0 0 0\n",
"1 0 0 0 0 0\n",
"2 1 0 0 0 0\n",
"3 0 0 0 0 0\n",
"4 0 0 0 0 1\n",
"... ... ... ... ... ...\n",
"32556 0 0 0 0 1\n",
"32557 0 0 0 0 0\n",
"32558 0 0 0 1 0\n",
"32559 0 0 1 0 0\n",
"32560 0 0 0 0 1\n",
"\n",
"[32561 rows x 5 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-hot encode `relationship`; drop_first=True omits the first level, which acts as the baseline.\n",
"Relationship = pd.get_dummies(\n",
"    data=Read_data[\"relationship\"],\n",
"    drop_first=True,\n",
")\n",
"Relationship"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Federal-gov</th>\n",
" <th>Local-gov</th>\n",
" <th>Never-worked</th>\n",
" <th>Private</th>\n",
" <th>Self-emp-inc</th>\n",
" <th>Self-emp-not-inc</th>\n",
" <th>State-gov</th>\n",
" <th>Without-pay</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32556</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32557</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32558</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32559</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32560</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>32561 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" Federal-gov Local-gov Never-worked Private Self-emp-inc \\\n",
"0 0 0 0 0 0 \n",
"1 0 0 0 0 0 \n",
"2 0 0 0 1 0 \n",
"3 0 0 0 1 0 \n",
"4 0 0 0 1 0 \n",
"... ... ... ... ... ... \n",
"32556 0 0 0 1 0 \n",
"32557 0 0 0 1 0 \n",
"32558 0 0 0 1 0 \n",
"32559 0 0 0 1 0 \n",
"32560 0 0 0 0 1 \n",
"\n",
" Self-emp-not-inc State-gov Without-pay \n",
"0 0 1 0 \n",
"1 1 0 0 \n",
"2 0 0 0 \n",
"3 0 0 0 \n",
"4 0 0 0 \n",
"... ... ... ... \n",
"32556 0 0 0 \n",
"32557 0 0 0 \n",
"32558 0 0 0 \n",
"32559 0 0 0 \n",
"32560 0 0 0 \n",
"\n",
"[32561 rows x 8 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Work_class = pd.get_dummies(Read_data[\"workclass\"],drop_first = True)\n",
"\n",
"Work_class ## cleaning the work class data "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Asian-Pac-Islander</th>\n",
" <th>Black</th>\n",
" <th>Other</th>\n",
" <th>White</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32556</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32557</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32558</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32559</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32560</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>32561 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" Asian-Pac-Islander Black Other White\n",
"0 0 0 0 1\n",
"1 0 0 0 1\n",
"2 0 0 0 1\n",
"3 0 1 0 0\n",
"4 0 1 0 0\n",
"... ... ... ... ...\n",
"32556 0 0 0 1\n",
"32557 0 0 0 1\n",
"32558 0 0 0 1\n",
"32559 0 0 0 1\n",
"32560 0 0 0 1\n",
"\n",
"[32561 rows x 4 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Race = pd.get_dummies(Read_data[\"race\"],drop_first = True)\n",
"\n",
"Race ## cleaning the race data "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# drope all uncleaned data from the dataset "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"Read_data.drop([\"workclass\",\"race\",\"relationship\",\"sex\",\"marital-status\",\"race\",\"native-country\",\"occupation\",\"native-country\",\"education\",\"income\"],axis=1, inplace= True)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>fnlwgt</th>\n",
" <th>education-num</th>\n",
" <th>capital-gain</th>\n",
" <th>capital-loss</th>\n",
" <th>hours-per-week</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>39</td>\n",
" <td>77516</td>\n",
" <td>13</td>\n",
" <td>2174</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>50</td>\n",
" <td>83311</td>\n",
" <td>13</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>38</td>\n",
" <td>215646</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>53</td>\n",
" <td>234721</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>28</td>\n",
" <td>338409</td>\n",
" <td>13</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32556</th>\n",
" <td>27</td>\n",
" <td>257302</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32557</th>\n",
" <td>40</td>\n",
" <td>154374</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32558</th>\n",
" <td>58</td>\n",
" <td>151910</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32559</th>\n",
" <td>22</td>\n",
" <td>201490</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32560</th>\n",
" <td>52</td>\n",
" <td>287927</td>\n",
" <td>9</td>\n",
" <td>15024</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>32561 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" age fnlwgt education-num capital-gain capital-loss hours-per-week\n",
"0 39 77516 13 2174 0 40\n",
"1 50 83311 13 0 0 13\n",
"2 38 215646 9 0 0 40\n",
"3 53 234721 7 0 0 40\n",
"4 28 338409 13 0 0 40\n",
"... ... ... ... ... ... ...\n",
"32556 27 257302 12 0 0 38\n",
"32557 40 154374 9 0 0 40\n",
"32558 58 151910 9 0 0 40\n",
"32559 22 201490 9 0 0 20\n",
"32560 52 287927 9 15024 0 40\n",
"\n",
"[32561 rows x 6 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Read_data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Adding the all cleaned data to the dataset "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"\n",
"Read_data = pd.concat([Gender,Occupation,Countries,Relationship,Work_class,Marital_status,Race,Read_data,Income],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Male</th>\n",
" <th>Adm-clerical</th>\n",
" <th>Armed-Forces</th>\n",
" <th>Craft-repair</th>\n",
" <th>Exec-managerial</th>\n",
" <th>Farming-fishing</th>\n",
" <th>Handlers-cleaners</th>\n",
" <th>Machine-op-inspct</th>\n",
" <th>Other-service</th>\n",
" <th>Priv-house-serv</th>\n",
" <th>...</th>\n",
" <th>Black</th>\n",
" <th>Other</th>\n",
" <th>White</th>\n",
" <th>age</th>\n",
" <th>fnlwgt</th>\n",
" <th>education-num</th>\n",
" <th>capital-gain</th>\n",
" <th>capital-loss</th>\n",
" <th>hours-per-week</th>\n",
" <th>&gt;50K</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>39</td>\n",
" <td>77516</td>\n",
" <td>13</td>\n",
" <td>2174</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>83311</td>\n",
" <td>13</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>13</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>38</td>\n",
" <td>215646</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>53</td>\n",
" <td>234721</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>28</td>\n",
" <td>338409</td>\n",
" <td>13</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32556</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>27</td>\n",
" <td>257302</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>38</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32557</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>40</td>\n",
" <td>154374</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32558</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>58</td>\n",
" <td>151910</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32559</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>22</td>\n",
" <td>201490</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>20</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32560</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>52</td>\n",
" <td>287927</td>\n",
" <td>9</td>\n",
" <td>15024</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>32561 rows × 86 columns</p>\n",
"</div>"
],
"text/plain": [
" Male Adm-clerical Armed-Forces Craft-repair Exec-managerial \\\n",
"0 1 1 0 0 0 \n",
"1 1 0 0 0 1 \n",
"2 1 0 0 0 0 \n",
"3 1 0 0 0 0 \n",
"4 0 0 0 0 0 \n",
"... ... ... ... ... ... \n",
"32556 0 0 0 0 0 \n",
"32557 1 0 0 0 0 \n",
"32558 0 1 0 0 0 \n",
"32559 1 1 0 0 0 \n",
"32560 0 0 0 0 1 \n",
"\n",
" Farming-fishing Handlers-cleaners Machine-op-inspct \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 1 0 \n",
"3 0 1 0 \n",
"4 0 0 0 \n",
"... ... ... ... \n",
"32556 0 0 0 \n",
"32557 0 0 1 \n",
"32558 0 0 0 \n",
"32559 0 0 0 \n",
"32560 0 0 0 \n",
"\n",
" Other-service Priv-house-serv ... Black Other White age \\\n",
"0 0 0 ... 0 0 1 39 \n",
"1 0 0 ... 0 0 1 50 \n",
"2 0 0 ... 0 0 1 38 \n",
"3 0 0 ... 1 0 0 53 \n",
"4 0 0 ... 1 0 0 28 \n",
"... ... ... ... ... ... ... ... \n",
"32556 0 0 ... 0 0 1 27 \n",
"32557 0 0 ... 0 0 1 40 \n",
"32558 0 0 ... 0 0 1 58 \n",
"32559 0 0 ... 0 0 1 22 \n",
"32560 0 0 ... 0 0 1 52 \n",
"\n",
" fnlwgt education-num capital-gain capital-loss hours-per-week \\\n",
"0 77516 13 2174 0 40 \n",
"1 83311 13 0 0 13 \n",
"2 215646 9 0 0 40 \n",
"3 234721 7 0 0 40 \n",
"4 338409 13 0 0 40 \n",
"... ... ... ... ... ... \n",
"32556 257302 12 0 0 38 \n",
"32557 154374 9 0 0 40 \n",
"32558 151910 9 0 0 40 \n",
"32559 201490 9 0 0 20 \n",
"32560 287927 9 15024 0 40 \n",
"\n",
" >50K \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"... ... \n",
"32556 0 \n",
"32557 1 \n",
"32558 0 \n",
"32559 0 \n",
"32560 1 \n",
"\n",
"[32561 rows x 86 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Read_data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get the correlation between the data"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Male</th>\n",
" <th>Adm-clerical</th>\n",
" <th>Armed-Forces</th>\n",
" <th>Craft-repair</th>\n",
" <th>Exec-managerial</th>\n",
" <th>Farming-fishing</th>\n",
" <th>Handlers-cleaners</th>\n",
" <th>Machine-op-inspct</th>\n",
" <th>Other-service</th>\n",
" <th>Priv-house-serv</th>\n",
" <th>...</th>\n",
" <th>Black</th>\n",
" <th>Other</th>\n",
" <th>White</th>\n",
" <th>age</th>\n",
" <th>fnlwgt</th>\n",
" <th>education-num</th>\n",
" <th>capital-gain</th>\n",
" <th>capital-loss</th>\n",
" <th>hours-per-week</th>\n",
" <th>&gt;50K</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Male</th>\n",
" <td>1.00</td>\n",
" <td>-0.26</td>\n",
" <td>0.01</td>\n",
" <td>0.22</td>\n",
" <td>0.04</td>\n",
" <td>0.10</td>\n",
" <td>0.09</td>\n",
" <td>0.03</td>\n",
" <td>-0.15</td>\n",
" <td>-0.09</td>\n",
" <td>...</td>\n",
" <td>-0.12</td>\n",
" <td>-0.01</td>\n",
" <td>0.10</td>\n",
" <td>0.09</td>\n",
" <td>0.03</td>\n",
" <td>0.01</td>\n",
" <td>0.05</td>\n",
" <td>0.05</td>\n",
" <td>0.23</td>\n",
" <td>0.22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Adm-clerical</th>\n",
" <td>-0.26</td>\n",
" <td>1.00</td>\n",
" <td>-0.01</td>\n",
" <td>-0.14</td>\n",
" <td>-0.14</td>\n",
" <td>-0.06</td>\n",
" <td>-0.08</td>\n",
" <td>-0.09</td>\n",
" <td>-0.12</td>\n",
" <td>-0.02</td>\n",
" <td>...</td>\n",
" <td>0.04</td>\n",
" <td>-0.01</td>\n",
" <td>-0.04</td>\n",
" <td>-0.04</td>\n",
" <td>0.01</td>\n",
" <td>0.00</td>\n",
" <td>-0.03</td>\n",
" <td>-0.02</td>\n",
" <td>-0.08</td>\n",
" <td>-0.09</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Armed-Forces</th>\n",
" <td>0.01</td>\n",
" <td>-0.01</td>\n",
" <td>1.00</td>\n",
" <td>-0.01</td>\n",
" <td>-0.01</td>\n",
" <td>-0.00</td>\n",
" <td>-0.00</td>\n",
" <td>-0.00</td>\n",
" <td>-0.01</td>\n",
" <td>-0.00</td>\n",
" <td>...</td>\n",
" <td>0.00</td>\n",
" <td>-0.00</td>\n",
" <td>-0.00</td>\n",
" <td>-0.01</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>-0.00</td>\n",
" <td>0.01</td>\n",
" <td>0.00</td>\n",
" <td>-0.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Craft-repair</th>\n",
" <td>0.22</td>\n",
" <td>-0.14</td>\n",
" <td>-0.01</td>\n",
" <td>1.00</td>\n",
" <td>-0.14</td>\n",
" <td>-0.07</td>\n",
" <td>-0.08</td>\n",
" <td>-0.10</td>\n",
" <td>-0.13</td>\n",
" <td>-0.03</td>\n",
" <td>...</td>\n",
" <td>-0.05</td>\n",
" <td>-0.01</td>\n",
" <td>0.05</td>\n",
" <td>0.01</td>\n",
" <td>0.01</td>\n",
" <td>-0.14</td>\n",
" <td>-0.02</td>\n",
" <td>0.00</td>\n",
" <td>0.06</td>\n",
" <td>-0.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Exec-managerial</th>\n",
" <td>0.04</td>\n",
" <td>-0.14</td>\n",
" <td>-0.01</td>\n",
" <td>-0.14</td>\n",
" <td>1.00</td>\n",
" <td>-0.07</td>\n",
" <td>-0.08</td>\n",
" <td>-0.10</td>\n",
" <td>-0.13</td>\n",
" <td>-0.03</td>\n",
" <td>...</td>\n",
" <td>-0.05</td>\n",
" <td>-0.02</td>\n",
" <td>0.05</td>\n",
" <td>0.10</td>\n",
" <td>-0.02</td>\n",
" <td>0.20</td>\n",
" <td>0.06</td>\n",
" <td>0.05</td>\n",
" <td>0.14</td>\n",
" <td>0.21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>education-num</th>\n",
" <td>0.01</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>-0.14</td>\n",
" <td>0.20</td>\n",
" <td>-0.10</td>\n",
" <td>-0.13</td>\n",
" <td>-0.16</td>\n",
" <td>-0.17</td>\n",
" <td>-0.07</td>\n",
" <td>...</td>\n",
" <td>-0.08</td>\n",
" <td>-0.04</td>\n",
" <td>0.05</td>\n",
" <td>0.04</td>\n",
" <td>-0.04</td>\n",
" <td>1.00</td>\n",
" <td>0.12</td>\n",
" <td>0.08</td>\n",
" <td>0.15</td>\n",
" <td>0.34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>capital-gain</th>\n",
" <td>0.05</td>\n",
" <td>-0.03</td>\n",
" <td>-0.00</td>\n",
" <td>-0.02</td>\n",
" <td>0.06</td>\n",
" <td>-0.01</td>\n",
" <td>-0.02</td>\n",
" <td>-0.03</td>\n",
" <td>-0.04</td>\n",
" <td>-0.01</td>\n",
" <td>...</td>\n",
" <td>-0.02</td>\n",
" <td>-0.00</td>\n",
" <td>0.01</td>\n",
" <td>0.08</td>\n",
" <td>0.00</td>\n",
" <td>0.12</td>\n",
" <td>1.00</td>\n",
" <td>-0.03</td>\n",
" <td>0.08</td>\n",
" <td>0.22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>capital-loss</th>\n",
" <td>0.05</td>\n",
" <td>-0.02</td>\n",
" <td>0.01</td>\n",
" <td>0.00</td>\n",
" <td>0.05</td>\n",
" <td>-0.01</td>\n",
" <td>-0.02</td>\n",
" <td>-0.02</td>\n",
" <td>-0.04</td>\n",
" <td>-0.01</td>\n",
" <td>...</td>\n",
" <td>-0.02</td>\n",
" <td>-0.01</td>\n",
" <td>0.02</td>\n",
" <td>0.06</td>\n",
" <td>-0.01</td>\n",
" <td>0.08</td>\n",
" <td>-0.03</td>\n",
" <td>1.00</td>\n",
" <td>0.05</td>\n",
" <td>0.15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>hours-per-week</th>\n",
" <td>0.23</td>\n",
" <td>-0.08</td>\n",
" <td>0.00</td>\n",
" <td>0.06</td>\n",
" <td>0.14</td>\n",
" <td>0.09</td>\n",
" <td>-0.04</td>\n",
" <td>0.01</td>\n",
" <td>-0.16</td>\n",
" <td>-0.04</td>\n",
" <td>...</td>\n",
" <td>-0.05</td>\n",
" <td>-0.01</td>\n",
" <td>0.05</td>\n",
" <td>0.07</td>\n",
" <td>-0.02</td>\n",
" <td>0.15</td>\n",
" <td>0.08</td>\n",
" <td>0.05</td>\n",
" <td>1.00</td>\n",
" <td>0.23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>&gt;50K</th>\n",
" <td>0.22</td>\n",
" <td>-0.09</td>\n",
" <td>-0.01</td>\n",
" <td>-0.01</td>\n",
" <td>0.21</td>\n",
" <td>-0.05</td>\n",
" <td>-0.09</td>\n",
" <td>-0.07</td>\n",
" <td>-0.16</td>\n",
" <td>-0.04</td>\n",
" <td>...</td>\n",
" <td>-0.09</td>\n",
" <td>-0.03</td>\n",
" <td>0.09</td>\n",
" <td>0.23</td>\n",
" <td>-0.01</td>\n",
" <td>0.34</td>\n",
" <td>0.22</td>\n",
" <td>0.15</td>\n",
" <td>0.23</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>86 rows × 86 columns</p>\n",
"</div>"
],
"text/plain": [
" Male Adm-clerical Armed-Forces Craft-repair \\\n",
" Male 1.00 -0.26 0.01 0.22 \n",
" Adm-clerical -0.26 1.00 -0.01 -0.14 \n",
" Armed-Forces 0.01 -0.01 1.00 -0.01 \n",
" Craft-repair 0.22 -0.14 -0.01 1.00 \n",
" Exec-managerial 0.04 -0.14 -0.01 -0.14 \n",
"... ... ... ... ... \n",
"education-num 0.01 0.00 0.00 -0.14 \n",
"capital-gain 0.05 -0.03 -0.00 -0.02 \n",
"capital-loss 0.05 -0.02 0.01 0.00 \n",
"hours-per-week 0.23 -0.08 0.00 0.06 \n",
" >50K 0.22 -0.09 -0.01 -0.01 \n",
"\n",
" Exec-managerial Farming-fishing Handlers-cleaners \\\n",
" Male 0.04 0.10 0.09 \n",
" Adm-clerical -0.14 -0.06 -0.08 \n",
" Armed-Forces -0.01 -0.00 -0.00 \n",
" Craft-repair -0.14 -0.07 -0.08 \n",
" Exec-managerial 1.00 -0.07 -0.08 \n",
"... ... ... ... \n",
"education-num 0.20 -0.10 -0.13 \n",
"capital-gain 0.06 -0.01 -0.02 \n",
"capital-loss 0.05 -0.01 -0.02 \n",
"hours-per-week 0.14 0.09 -0.04 \n",
" >50K 0.21 -0.05 -0.09 \n",
"\n",
" Machine-op-inspct Other-service Priv-house-serv ... \\\n",
" Male 0.03 -0.15 -0.09 ... \n",
" Adm-clerical -0.09 -0.12 -0.02 ... \n",
" Armed-Forces -0.00 -0.01 -0.00 ... \n",
" Craft-repair -0.10 -0.13 -0.03 ... \n",
" Exec-managerial -0.10 -0.13 -0.03 ... \n",
"... ... ... ... ... \n",
"education-num -0.16 -0.17 -0.07 ... \n",
"capital-gain -0.03 -0.04 -0.01 ... \n",
"capital-loss -0.02 -0.04 -0.01 ... \n",
"hours-per-week 0.01 -0.16 -0.04 ... \n",
" >50K -0.07 -0.16 -0.04 ... \n",
"\n",
" Black Other White age fnlwgt education-num \\\n",
" Male -0.12 -0.01 0.10 0.09 0.03 0.01 \n",
" Adm-clerical 0.04 -0.01 -0.04 -0.04 0.01 0.00 \n",
" Armed-Forces 0.00 -0.00 -0.00 -0.01 0.00 0.00 \n",
" Craft-repair -0.05 -0.01 0.05 0.01 0.01 -0.14 \n",
" Exec-managerial -0.05 -0.02 0.05 0.10 -0.02 0.20 \n",
"... ... ... ... ... ... ... \n",
"education-num -0.08 -0.04 0.05 0.04 -0.04 1.00 \n",
"capital-gain -0.02 -0.00 0.01 0.08 0.00 0.12 \n",
"capital-loss -0.02 -0.01 0.02 0.06 -0.01 0.08 \n",
"hours-per-week -0.05 -0.01 0.05 0.07 -0.02 0.15 \n",
" >50K -0.09 -0.03 0.09 0.23 -0.01 0.34 \n",
"\n",
" capital-gain capital-loss hours-per-week >50K \n",
" Male 0.05 0.05 0.23 0.22 \n",
" Adm-clerical -0.03 -0.02 -0.08 -0.09 \n",
" Armed-Forces -0.00 0.01 0.00 -0.01 \n",
" Craft-repair -0.02 0.00 0.06 -0.01 \n",
" Exec-managerial 0.06 0.05 0.14 0.21 \n",
"... ... ... ... ... \n",
"education-num 0.12 0.08 0.15 0.34 \n",
"capital-gain 1.00 -0.03 0.08 0.22 \n",
"capital-loss -0.03 1.00 0.05 0.15 \n",
"hours-per-week 0.08 0.05 1.00 0.23 \n",
" >50K 0.22 0.15 0.23 1.00 \n",
"\n",
"[86 rows x 86 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Correlation = Read_data.corr().round(2)\n",
"Correlation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get the high correlation feature only "
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"Correlation.columns = [x.strip() for x in Read_data.columns]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[' Male',\n",
" ' Exec-managerial',\n",
" ' Prof-specialty',\n",
" ' Married-civ-spouse',\n",
" 'age',\n",
" 'education-num',\n",
" 'capital-gain',\n",
" 'hours-per-week',\n",
" ' >50K']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"High_Correlation = Correlation[Correlation[\">50K\"]>0.15].index.tolist()\n",
"High_Correlation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Represent the Correlation on heatmap"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 900x900 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"Heatmap_Correlation= Read_data[High_Correlation].corr().round(2)\n",
"\n",
"plt.figure(figsize=(12.5,12.5))\n",
"sns.heatmap(Heatmap_Correlation, annot=True)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Split the high correlation data"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"X = Read_data[High_Correlation].iloc[:,:-1]\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"Y = Read_data[High_Correlation].iloc[:,-1:]\n",
"Y = np.ravel(Y)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3,random_state = 0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Apply the SVM Classification Model"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"SVC_Model = svm.SVC()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SVC()"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"SVC_Model.fit(X_train, Y_train)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"Y_train_pred = SVC_Model.predict(X_train)\n",
"Y_test_pred = SVC_Model.predict(X_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Model accuracy result"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8002369252369252"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"SVC_Model.score(X_train,Y_train)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7998771624526564"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"SVC_Model.score(X_test, Y_test)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"confusion_matrix [[7385 22]\n",
" [1933 429]]\n"
]
}
],
"source": [
"print(\"confusion_matrix\",confusion_matrix(Y_test, Y_test_pred))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import plot_confusion_matrix"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plot_confusion_matrix(SVC_Model, X_test, Y_test) \n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}