# ML.py

#!/usr/bin/env python
# coding: utf-8

# # importing useful libs 

# In[1]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Enable inline matplotlib output when running under IPython/Jupyter.
# ``get_ipython`` is only defined inside an IPython session, so a plain
# ``python`` run skips the magic instead of failing with NameError.
try:
    _ipython = get_ipython()
except NameError:
    _ipython = None
if _ipython is not None:
    _ipython.run_line_magic('matplotlib', 'inline')
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# # IMPORT DATA from CSV FILES 

# In[ ]:


# In[2]:


# Load the training CSV into a DataFrame and summarise every column.
# NOTE(review): the path is an absolute, machine-specific location.
train_csv_path = r"C:\Users\44742\Desktop\ML\train.csv"
df = pd.read_csv(train_csv_path)
df.describe(include="all")


# In[3]:


# Count the missing values in each column.
missing_mask = pd.isnull(df)
missing_mask.sum()


# # checking the data (graphics)

# In[4]:


# Bar plot of "Survived" against "Sex".
sns.barplot(data=df, x="Sex", y="Survived")

# For each sex, print the percentage of rows whose "Survived" value is 1.
survived = df["Survived"]
for caption, sex in (("  survived females:", 'female'), (" survived males :", 'male')):
    shares = survived[df["Sex"] == sex].value_counts(normalize=True)
    print(caption, shares[1] * 100)


# In[ ]:


# In[5]:


# Bar plot of "Survived" against "Pclass".
sns.barplot(data=df, x="Pclass", y="Survived")

# For each passenger class, print the percentage of rows whose "Survived"
# value is 1.
survived = df["Survived"]
for caption, pclass in (("first class:", 1), ("second class:", 2), ("third class:", 3)):
    shares = survived[df["Pclass"] == pclass].value_counts(normalize=True)
    print(caption, shares[1] * 100)


# In[ ]:


# In[6]:


# Bar plot of "Survived" against "Parch", rendered immediately.
sns.barplot(data=df, x="Parch", y="Survived")
plt.show()


# In[7]:


# Bucket every passenger into a named age group.
# Missing ages are replaced by the sentinel -0.5 so they fall into their own
# 'Unknown' bin. That bin has to end at 0: with an upper edge of 1, real
# infants aged up to 1 year would be labelled 'Unknown' as well.
df["Age"] = df["Age"].fillna(-0.5)
bluk = [-1, 0, 6, 13, 18, 21, 35, 50, np.inf]
labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
# pd.cut builds right-closed intervals by default:
# (-1, 0] -> 'Unknown', (0, 6] -> 'Baby', ..., (50, inf] -> 'Senior'.
df['peopleCategory'] = pd.cut(df["Age"], bluk, labels=labels)


# In[8]:


# Bar plot of "Survived" against the derived "peopleCategory" age group.
sns.barplot(data=df, x="peopleCategory", y="Survived")
plt.show()


# # cleaning the data 

# In[9]:


# Remove the "Cabin" and "Ticket" columns, which are not used further on.
df = df.drop(columns=['Cabin', 'Ticket'])


# In[ ]:


# In[10]:


# Fill missing "Embarked" values with "S" and print how many rows carry each
# port code.
# NOTE: the "S" count is taken before the fill, so it does not include the
# rows that were filled in; the "C" and "Q" counts are taken afterwards.
southampton = len(df[df["Embarked"] == "S"])
df = df.fillna(value={"Embarked": "S"})
cherbourg = len(df[df["Embarked"] == "C"])
queenstown = len(df[df["Embarked"] == "Q"])
print("queenstown = ", queenstown, "southampton = ", southampton, "cherbourg = ", cherbourg)


# In[11]:


# Encode the "Embarked" port letters as integers.
embarked_codes = dict(S=1, C=2, Q=3)
df['Embarked'] = df['Embarked'].map(embarked_codes)

df.head()


# In[12]:


# Replace the age-group labels with the integer codes 1..7.
# NOTE(review): 'Unknown' has no entry in the mapping, so rows with that label
# presumably end up as NaN - confirm this is intended.
age_group_names = ['Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
age_group_codes = {name: code for code, name in enumerate(age_group_names, start=1)}
df['peopleCategory'] = df['peopleCategory'].map(age_group_codes)
df.head()


# In[13]:


# Remove the "Age", "Name" and "peopleCategory" columns so that they are not
# part of the data used from here on.
df = df.drop(columns=['Age', 'Name', 'peopleCategory'])


# In[14]:


# Encode "Sex" as an integer: male -> 0, female -> 1.
sex_codes = dict(male=0, female=1)
df['Sex'] = df['Sex'].map(sex_codes)

df.head()


# In[15]:


# List the columns that still contain at least one NaN value.
has_nan = np.isnan(df).any()
df.columns[has_nan]


# # correlations

# In[16]:


# Pairwise correlation between the DataFrame columns (pandas' default
# Pearson method, spelled out explicitly).
DfCorr = df.corr(method="pearson")
DfCorr


# In[17]:


# Column-by-column correlation matrix, rounded to two decimals.
# np.corrcoef treats each *row* as a variable by default, which on this
# DataFrame would correlate the observations (rows) with one another and
# return an n_rows x n_rows matrix. rowvar=False makes the columns the
# variables instead.
corr_matrix = np.corrcoef(df, rowvar=False).round(decimals=2)
corr_matrix


# # data splitting

# In[18]:


# Separate the predictor columns from the "Survived" target ("PassengerId"
# is left out of the predictors), then hold out 22% of the rows for testing
# with a fixed random_state.
target = df["Survived"]
pred = df.drop(columns=['Survived', 'PassengerId'])
x_train, x_test, y_train, y_test = train_test_split(
    pred,
    target,
    test_size=0.22,
    random_state=0,
)


# # Apply Support Vector Machines Model

# In[19]:


# Support Vector Machines
# Fit an SVC with its default settings on the training split, predict the
# test split and print the accuracy as a percentage rounded to two decimals.

svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
# accuracy_score takes (y_true, y_pred); pass the true labels first, the same
# order the confusion_matrix / classification_report calls in this file use.
acc_svc = round(accuracy_score(y_test, y_pred) * 100, 2)
print(acc_svc)


# # confusion matrix for Support Vector Machines Model

# In[20]:


# Confusion matrix for the current y_pred, with label 1 listed first so the
# flattened order is: true positives, false negatives, false positives,
# true negatives.
cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
print('Confusion matrix : \n', cm)

tp, fn, fp, tn = cm.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# Text classification report for the same predictions.
matrix = classification_report(y_test, y_pred, labels=[1, 0])
print('Classification report : \n', matrix)


# # Apply Logistic Regression Model

# In[21]:


# Logistic Regression
# Fit a LogisticRegression with its default settings on the training split,
# predict the test split and print the accuracy as a percentage rounded to
# two decimals.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
# accuracy_score takes (y_true, y_pred); pass the true labels first, the same
# order the confusion_matrix / classification_report calls in this file use.
acc_logreg = round(accuracy_score(y_test, y_pred) * 100, 2)
print(acc_logreg)


# In[ ]:


# # confusion matrix for Logistic Regression Model

# In[22]:


# Confusion matrix for the current y_pred, with label 1 listed first so the
# flattened order is: true positives, false negatives, false positives,
# true negatives.
cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
print('Confusion matrix : \n', cm)

tp, fn, fp, tn = cm.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# Text classification report for the same predictions.
matrix = classification_report(y_test, y_pred, labels=[1, 0])
print('Classification report : \n', matrix)