# Untitled.py

#!/usr/bin/env python
# coding: utf-8

# # Importing the libraries

# In[1]:


import pandas as pd

import numpy as np

import seaborn as sns

from matplotlib import pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn import svm
from sklearn.metrics import confusion_matrix


# # Reading the data from the CSV file

# In[2]:


# Location of the income-evaluation CSV; kept as a raw string so the Windows
# backslashes are taken literally.
_CSV_PATH = r"C:\Users\baejr\OneDrive\Desktop\web dev\income_evaluation.csv"
Read_data = pd.read_csv(_CSV_PATH)


# In[ ]:


# In[3]:


# Notebook preview of the loaded frame; as a bare expression it has no effect
# when the file is run as a plain script.
Read_data


# # Cleaning the datasets

# In[4]:


# Strip surrounding whitespace from the header names so that every column can
# be addressed by its bare name below.
Read_data.columns = [header.strip() for header in Read_data.columns]


def _one_hot(column_name):
    """Return indicator columns for ``Read_data[column_name]``.

    ``drop_first=True`` leaves out the indicator of the first category level,
    so a column with k levels yields k-1 indicator columns.
    """
    return pd.get_dummies(Read_data[column_name], drop_first=True)


# The bare names after each assignment are notebook previews; they do nothing
# when the file is executed as a script.
Gender = _one_hot("sex")
Gender


# In[5]:


Occupation = _one_hot("occupation")
Occupation


# In[6]:


# One indicator column per native country (minus the dropped first level).
Countries = _one_hot("native-country")
Countries


# In[7]:


# Encoded target; the correlation cell further down looks it up as ">50K".
Income = _one_hot("income")
Income


# In[8]:


Marital_status = _one_hot("marital-status")
Marital_status


# In[9]:


Relationship = _one_hot("relationship")
Relationship


# In[10]:


Work_class = _one_hot("workclass")
Work_class


# In[11]:


Race = _one_hot("race")
Race


# # Drop all uncleaned columns from the dataset

# In[12]:


# Raw categorical columns to remove now that their one-hot encodings exist.
# Each label is listed once (the earlier list repeated "race" and
# "native-country"). "education" is dropped without being encoded —
# presumably the numeric "education-num" column already carries it; confirm
# against the CSV.
_RAW_COLUMNS = [
    "workclass",
    "race",
    "relationship",
    "sex",
    "marital-status",
    "native-country",
    "occupation",
    "education",
    "income",
]
# Rebind instead of mutating in place so the result of the drop is explicit.
Read_data = Read_data.drop(columns=_RAW_COLUMNS)


# In[13]:


Read_data  # notebook preview; no effect when run as a script


# # Add all the encoded columns to the dataset

# In[14]:


# Income must stay last: the train/test split further down treats the final
# selected column as the target.
Read_data = pd.concat(
    [
        Gender,
        Occupation,
        Countries,
        Relationship,
        Work_class,
        Marital_status,
        Race,
        Read_data,
        Income,
    ],
    axis=1,
)


# In[15]:


Read_data  # notebook preview; no effect when run as a script


# # Get the correlation between the data

# In[16]:


# Pairwise correlation of every column, rounded to two decimals.
Correlation = Read_data.corr().round(2)
Correlation  # notebook preview; no effect when run as a script


# # Keep only the features that correlate with the >50K indicator

# In[17]:


# Only the column labels are stripped; the row labels keep the original names,
# which is what lets them index Read_data again in the cells below.
Correlation.columns = [label.strip() for label in Read_data.columns]


# In[18]:


# Row labels whose rounded correlation with ">50K" exceeds 0.15.
_income_corr = Correlation[">50K"]
_is_high = _income_corr > 0.15
High_Correlation = Correlation.loc[_is_high].index.tolist()
High_Correlation  # notebook preview; no effect when run as a script


# # Represent the Correlation on heatmap

# In[ ]:


# In[19]:


# Correlation matrix restricted to the selected features, rounded for display.
_selected_features = Read_data[High_Correlation]
Heatmap_Correlation = _selected_features.corr().round(2)

# Square canvas, with each cell annotated with its correlation value.
_FIGURE_SIDE = 12.5
plt.figure(figsize=(_FIGURE_SIDE, _FIGURE_SIDE))
sns.heatmap(Heatmap_Correlation, annot=True)
plt.show()


# # Split the high correlation data

# In[20]:


# Work on the selected columns only: everything except the last one is a
# feature, and the last one is the target.
_model_frame = Read_data[High_Correlation]
X = _model_frame.iloc[:, :-1]


# In[21]:


# Take the target as a one-column frame, then flatten it to a 1-D array.
Y = np.ravel(_model_frame.iloc[:, -1:])


# In[22]:


# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)


# # Apply the SVM Classification Model

# In[23]:


# Support-vector classifier with scikit-learn's default settings.
# NOTE(review): the features go into the SVM unscaled although StandardScaler
# is imported at the top of the file and never used in the visible code —
# confirm whether scaling was intended before relying on these scores.
SVC_Model = svm.SVC()


# In[24]:


SVC_Model.fit(X_train, Y_train)


# In[25]:


Y_train_pred = SVC_Model.predict(X_train)
Y_test_pred = SVC_Model.predict(X_test)


# # Model accuracy result

# In[26]:


# Accuracy is the share of predictions that match the true labels. It is
# computed from the predictions made above rather than via SVC_Model.score(),
# which would run the (slow) SVM prediction a second time. The values are
# printed because a bare expression shows nothing when run as a script.
Train_accuracy = float(np.mean(Y_train_pred == Y_train))
print("train accuracy", Train_accuracy)


# In[27]:


Test_accuracy = float(np.mean(Y_test_pred == Y_test))
print("test accuracy", Test_accuracy)


# In[28]:


# Rows are true classes, columns are predicted classes.
print("confusion_matrix",confusion_matrix(Y_test, Y_test_pred))


# In[29]:


# scikit-learn deprecated `plot_confusion_matrix` in 1.0 and removed it in 1.2,
# so the plain import fails on current releases. Prefer its replacement,
# `ConfusionMatrixDisplay.from_estimator`, and fall back to the old function
# only on releases that predate it. Binding the result to the old name keeps
# the call below, and any later use of `plot_confusion_matrix`, unchanged.
try:
    from sklearn.metrics import ConfusionMatrixDisplay

    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_confusion_matrix


# In[30]:


# Draw the confusion matrix of the fitted model on the held-out split.
plot_confusion_matrix(SVC_Model, X_test, Y_test)
plt.show()


# In[ ]:


# NOTE: a tab-indented, verbatim second copy of every cell above (imports,
# CSV load, encoding, correlation, SVM fit and evaluation) used to follow here.
# Its indented top-level statements raised IndentationError, so the module
# could not even be compiled, and it added nothing the cells above do not
# already do. The script therefore ends after the confusion-matrix plot.