Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Classification_Project/Untitled.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
258 lines (101 sloc)
3.4 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# # Importing the libraries
# In[1]: | |
import pandas as pd | |
import numpy as np | |
import seaborn as sns | |
from matplotlib import pyplot as plt | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.model_selection import train_test_split | |
from sklearn import svm | |
from sklearn.metrics import confusion_matrix | |
# # Reading the data from the CSV file
# In[2]:
# Absolute location of the income-evaluation CSV that the rest of the script
# works on.
DATASET_PATH = r"C:\Users\baejr\OneDrive\Desktop\web dev\income_evaluation.csv"
Read_data = pd.read_csv(DATASET_PATH)
# In[ ]:
# In[3]:
# Notebook-cell echo of the loaded frame; it displays nothing when the file
# is executed as a plain script.
Read_data
# # Cleaning the datasets
# In[4]:
# Strip surrounding whitespace from the header names so the columns can be
# addressed by their plain names below.
Read_data.columns = Read_data.columns.str.strip()


def _encode_column(column_name):
    """Return indicator columns for ``Read_data[column_name]``.

    The first category level is dropped (``drop_first=True``), so a column
    with k distinct values yields k - 1 indicator columns.
    """
    return pd.get_dummies(Read_data[column_name], drop_first=True)


Gender = _encode_column("sex")
Gender  # indicator columns for sex
# In[5]:
Occupation = _encode_column("occupation")
Occupation  # indicator columns for occupation
# In[6]:
Countries = _encode_column("native-country")  # one indicator column per remaining country
Countries
# In[7]:
# With drop_first=True the income column is reduced to the indicator(s) for
# the non-first level(s); later cells look this up under the label ">50K".
Income = _encode_column("income")
Income  # indicator columns for income
# In[8]:
Marital_status = _encode_column("marital-status")
# indicator columns for marital status
Marital_status
# In[9]:
Relationship = _encode_column("relationship")
Relationship  # indicator columns for relationship
# In[10]:
Work_class = _encode_column("workclass")
Work_class  # indicator columns for work class
# In[11]:
Race = _encode_column("race")
Race  # indicator columns for race
# # Drop the raw categorical columns from the dataset
# In[12]:
# Raw text columns to remove: the ones that are one-hot encoded into separate
# frames earlier in the script, plus "education", which is removed without
# being encoded. Every label appears exactly once.
Raw_Columns = [
    "workclass",
    "race",
    "relationship",
    "sex",
    "marital-status",
    "native-country",
    "occupation",
    "education",
    "income",
]
# Rebind instead of mutating in place; the resulting frame is the same.
Read_data = Read_data.drop(columns=Raw_Columns)
# In[13]:
Read_data
# # Adding all the cleaned data to the dataset
# In[14]:
# Column-wise join (axis=1): the encoded categorical frames come first, then
# the columns still left in Read_data, and the encoded income frame goes last.
Encoded_Frames = [
    Gender,
    Occupation,
    Countries,
    Relationship,
    Work_class,
    Marital_status,
    Race,
]
Read_data = pd.concat([*Encoded_Frames, Read_data, Income], axis=1)
# In[15]:
Read_data
# # Get the correlation between the data
# In[16]:
# Pairwise correlation of all columns, rounded to two decimals.
Pairwise_Correlation = Read_data.corr()
Correlation = Pairwise_Correlation.round(2)
Correlation
# # Get the high correlation features only
# In[17]:
# Only the column labels are replaced with stripped names; the row index keeps
# the labels exactly as they appear in Read_data, so the names collected below
# can be used to select Read_data columns.
# NOTE(review): presumably the encoded labels carry surrounding whitespace
# (hence the strip) - confirm against the CSV.
Correlation.columns = Read_data.columns.str.strip()
# In[18]:
# Keep every row whose correlation with the ">50K" column exceeds 0.15.
# NOTE(review): the cut-off is applied to values already rounded to two
# decimals, so a raw correlation slightly above 0.15 that rounds to 0.15 is
# excluded.
Income_Correlation = Correlation[">50K"]
High_Correlation = Correlation.loc[Income_Correlation > 0.15].index.tolist()
High_Correlation
# # Represent the correlation on a heatmap
# In[ ]:
# In[19]:
# Correlation matrix restricted to the selected columns, rounded to two
# decimals for readable cell annotations.
Selected_Columns = Read_data[High_Correlation]
Heatmap_Correlation = Selected_Columns.corr().round(2)
# Square 12.5 x 12.5 figure so the annotated cells stay legible.
Figure_Side = 12.5
plt.figure(figsize=(Figure_Side, Figure_Side))
sns.heatmap(Heatmap_Correlation, annot=True)
plt.show()
# # Split the high correlation data
# In[20]:
# NOTE(review): the positional slicing below treats the last selected column
# as the target. This relies on the income indicator being the final entry of
# High_Correlation (the income frame is concatenated last) - confirm.
Model_Frame = Read_data[High_Correlation]
# Features: every selected column except the last one.
X = Model_Frame.iloc[:, :-1]
# In[21]:
# Target: the last selected column, flattened to a 1-D array.
Y = np.ravel(Model_Frame.iloc[:, -1:])
# In[22]:
# 70 % / 30 % train/test split with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)
# # Apply the SVM Classification Model
# In[23]:
from sklearn.pipeline import make_pipeline

# svm.SVC() uses an RBF kernel by default, which is not scale invariant, so
# the features are standardised before they reach the classifier.
# NOTE(review): presumably X mixes 0/1 indicator columns with raw numeric
# columns of much larger magnitude - confirm against the data.
# Bundling scaler and classifier in one pipeline means the scaler is fitted on
# the training data only, and the same transformation is applied automatically
# whenever SVC_Model.predict() or SVC_Model.score() is called later.
SVC_Model = make_pipeline(StandardScaler(), svm.SVC())
# In[24]:
SVC_Model.fit(X_train, Y_train)
# In[25]:
# Predictions on both partitions; the test predictions feed the confusion
# matrix printed below.
Y_train_pred = SVC_Model.predict(X_train)
Y_test_pred = SVC_Model.predict(X_test)
# # Model accuracy result
# In[26]:
# score() returns the mean accuracy on the given data. The value is printed
# explicitly because a bare expression shows nothing when this file is run as
# a script.
Train_Accuracy = SVC_Model.score(X_train, Y_train)
print("train accuracy", Train_Accuracy)
# In[27]:
Test_Accuracy = SVC_Model.score(X_test, Y_test)
print("test accuracy", Test_Accuracy)
# In[28]:
print("confusion_matrix",confusion_matrix(Y_test, Y_test_pred))
# In[29]:
from sklearn.metrics import ConfusionMatrixDisplay
# In[30]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement. The fallback keeps
# the script working on releases older than 1.0, where from_estimator does not
# exist yet.
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    ConfusionMatrixDisplay.from_estimator(SVC_Model, X_test, Y_test)
else:
    from sklearn.metrics import plot_confusion_matrix

    plot_confusion_matrix(SVC_Model, X_test, Y_test)
plt.show()
# In[ ]: | |