Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/env python
# coding: utf-8
# # Importing the libraries
# In[1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import confusion_matrix
# # Reading the data from the CSV file
# In[2]:
# Load the income_evaluation CSV from an absolute local Windows path.
Read_data = pd.read_csv(
    filepath_or_buffer=r"C:\Users\baejr\OneDrive\Desktop\web dev\income_evaluation.csv",
)
# In[ ]:
# In[3]:
# Bare expression: a notebook renders the frame here; run as a script it has
# no visible effect.
Read_data
# # Cleaning the dataset
# In[4]:
# Strip surrounding whitespace from every column label so the plain names
# used in the lookups below match.
Read_data.columns = list(map(str.strip, Read_data.columns))


def _one_hot(column):
    """Return indicator columns for ``Read_data[column]``, minus the first level."""
    return pd.get_dummies(Read_data[column], drop_first=True)


Gender = _one_hot("sex")
Gender  # one-hot encoded sex column
# In[5]:
Occupation = _one_hot("occupation")
Occupation  # one-hot encoded occupation column
# In[6]:
Countries = _one_hot("native-country")  # one indicator column per remaining country
Countries
# In[7]:
Income = _one_hot("income")
Income  # one-hot encoded income column
# In[8]:
Marital_status = _one_hot("marital-status")
# one-hot encoded marital-status column
Marital_status
# In[9]:
Relationship = _one_hot("relationship")
Relationship  # one-hot encoded relationship column
# In[10]:
Work_class = _one_hot("workclass")
Work_class  # one-hot encoded workclass column
# In[11]:
Race = _one_hot("race")
Race  # one-hot encoded race column
# # Drop the original, un-encoded columns from the dataset
# In[12]:
# Every categorical column that was one-hot encoded above is removed here.
# "education" is removed as well, although no encoded replacement for it is
# built in the cells above.
Read_data = Read_data.drop(
    columns=[
        "workclass",
        "race",
        "relationship",
        "sex",
        "marital-status",
        "native-country",
        "occupation",
        "education",
        "income",
    ],
)
# In[13]:
Read_data
# # Adding all the cleaned data to the dataset
# In[14]:
# Column-wise concatenation. Income is deliberately last: the later X/Y split
# treats the final selected column as the target.
Read_data = pd.concat(
    [
        Gender,
        Occupation,
        Countries,
        Relationship,
        Work_class,
        Marital_status,
        Race,
        Read_data,
        Income,
    ],
    axis="columns",
)
# In[15]:
Read_data
# # Get the correlation between the data
# In[16]:
# Pairwise correlation of every column, rounded to two decimals.
Correlation = Read_data.corr()
Correlation = Correlation.round(2)
Correlation
# # Get the high-correlation features only
# In[17]:
# Only the column labels are stripped; the row index keeps the labels exactly
# as they appear in Read_data, so the names collected below can still be used
# to select columns from Read_data.
Correlation.columns = list(map(str.strip, Read_data.columns))
# In[18]:
# Keep the rows whose (rounded) correlation with the ">50K" column is above
# 0.15. Negatively correlated columns are not selected by this test.
High_Correlation = Correlation.loc[Correlation[">50K"].gt(0.15)].index.tolist()
High_Correlation
# # Represent the correlation on a heatmap
# In[ ]:
# In[19]:
# Correlation matrix restricted to the selected columns, rounded for display.
Heatmap_Correlation = Read_data.loc[:, High_Correlation].corr().round(decimals=2)
plt.figure(figsize=(12.5, 12.5))
# annot=True writes each coefficient inside its cell.
sns.heatmap(data=Heatmap_Correlation, annot=True)
plt.show()
# # Split the high-correlation data
# In[20]:
# Features: every selected column except the last one.
X = Read_data[High_Correlation[:-1]]
# In[21]:
# Target: the last selected column, flattened to a 1-D array.
Y = np.ravel(Read_data[High_Correlation[-1:]])
# In[22]:
# 70/30 train/test split with a fixed seed for repeatable results.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)
# # Apply the SVM classification model
# In[23]:
# Support-vector classifier constructed with no arguments (library defaults).
SVC_Model = svm.SVC()
# In[24]:
# fit() returns the estimator itself, so rebinding keeps the same object.
SVC_Model = SVC_Model.fit(X_train, Y_train)
# In[25]:
# Predictions for the training split first, then the held-out split.
Y_train_pred, Y_test_pred = (
    SVC_Model.predict(features) for features in (X_train, X_test)
)
# # Model accuracy result
# In[26]:
# Bare expressions: a notebook displays these scores; run as a script the
# values are computed and discarded.
SVC_Model.score(X=X_train, y=Y_train)
# In[27]:
SVC_Model.score(X=X_test, y=Y_test)
# In[28]:
print("confusion_matrix", confusion_matrix(y_true=Y_test, y_pred=Y_test_pred))
# In[29]:
# ``sklearn.metrics.plot_confusion_matrix`` was deprecated in scikit-learn 1.0
# and removed in 1.2, so importing it unconditionally raises ImportError on
# current releases. Prefer its replacement, ConfusionMatrixDisplay.from_estimator,
# and fall back to the legacy function on older installations. The resolved
# callable keeps the original name, so any later use of it still works.
try:
    from sklearn.metrics import ConfusionMatrixDisplay

    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
except (ImportError, AttributeError):
    # Either ConfusionMatrixDisplay is missing or it predates from_estimator.
    from sklearn.metrics import plot_confusion_matrix
# In[30]:
# Draw the confusion matrix of the fitted model on the held-out split.
plot_confusion_matrix(SVC_Model, X_test, Y_test)
plt.show()
# In[ ]: