#!/usr/bin/env python
# coding: utf-8
# # Importing useful libraries
# In[1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# # Import data from CSV file
# In[2]:
# import the training CSV file
df = pd.read_csv(r"C:\Users\44742\Desktop\ML\train.csv")
df.describe(include="all")
# In[3]:
# count missing values per column
df.isnull().sum()
# # Checking the data (plots)
# In[4]:
# survival rate by gender, bar plot
sns.barplot(x="Sex", y="Survived", data=df)
print("survived females (%):", df["Survived"][df["Sex"] == 'female'].value_counts(normalize=True)[1]*100)
print("survived males (%):", df["Survived"][df["Sex"] == 'male'].value_counts(normalize=True)[1]*100)
# In[5]:
# survival rate by passenger class, bar plot
sns.barplot(x="Pclass", y="Survived", data=df)
print("first class (%):", df["Survived"][df["Pclass"] == 1].value_counts(normalize=True)[1]*100)
print("second class (%):", df["Survived"][df["Pclass"] == 2].value_counts(normalize=True)[1]*100)
print("third class (%):", df["Survived"][df["Pclass"] == 3].value_counts(normalize=True)[1]*100)
# In[6]:
# Parch (parents/children aboard) vs. survival bar plot
sns.barplot(x="Parch", y="Survived", data=df)
plt.show()
# In[7]:
# bin passenger ages into categories
df["Age"] = df["Age"].fillna(-0.5)
bins = [-1, 1, 6, 13, 18, 21, 35, 50, np.inf]
labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
df['peopleCategory'] = pd.cut(df["Age"], bins, labels=labels)
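# Optional check (not in the original notebook): how the age bins are distributed,
# to confirm the cut points make sense.
print(df['peopleCategory'].value_counts().sort_index())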
# In[8]:
# survival rate by age category, bar plot
sns.barplot(x="peopleCategory", y="Survived", data=df)
plt.show()
# # Cleaning the data
# In[9]:
# drop columns that are not used in this analysis
df = df.drop(['Cabin', 'Ticket'], axis=1)
# In[10]:
# fill the missing Embarked values with "S" (Southampton), the most common port
southampton = df[df["Embarked"] == "S"].shape[0]
df = df.fillna({"Embarked": "S"})
cherbourg = df[df["Embarked"] == "C"].shape[0]
queenstown = df[df["Embarked"] == "Q"].shape[0]
print("queenstown = ", queenstown , "southampton = ", southampton,"cherbourg = ", cherbourg )
# In[11]:
# map Embarked values to numerical values
Emapping = {"S": 1, "C": 2, "Q": 3}
df['Embarked'] = df['Embarked'].map(Emapping)
df.head()
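# Alternative encoding sketch (an assumption, not the author's approach): one-hot
# encoding avoids implying an order between ports. Embarked already holds the numeric
# codes 1/2/3 at this point, so the dummy columns come out as Embarked_1/2/3; the idea
# is the same for the raw 'S'/'C'/'Q' strings. Shown on a throwaway frame so the
# notebook's df is unchanged.
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
print(embarked_dummies.head())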
# In[12]:
# map age categories to numerical values
Amapping = {'Baby': 1, 'Child': 2, 'Teenager': 3, 'Student': 4, 'Young Adult': 5, 'Adult': 6, 'Senior': 7}
df['peopleCategory'] = df['peopleCategory'].map(Amapping)
df.head()
# In[13]:
# drop features that are not fed to the models
df = df.drop(['Age'], axis = 1)
df = df.drop(['Name'], axis = 1)
df = df.drop(['peopleCategory'], axis = 1)
# In[14]:
# map Sex values to numerical values
Smapping = {"male": 0, "female": 1}
df['Sex'] = df['Sex'].map(Smapping)
df.head()
# In[15]:
# check which columns still contain NaN values
df.columns[df.isna().any()]
# # Correlations
# In[16]:
# pairwise feature correlations (pandas)
DfCorr = df.corr()
DfCorr
# In[17]:
# the same correlations via NumPy; rowvar=False treats columns as variables
corr_matrix = np.corrcoef(df, rowvar=False).round(decimals=2)
corr_matrix
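# Optional visual sketch: the same correlation matrix as a seaborn heatmap, which is
# often easier to read than the raw table above.
sns.heatmap(DfCorr, annot=True, cmap='coolwarm', fmt='.2f')
plt.show()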
# # Data splitting
# In[18]:
# features (drop the target and the passenger identifier) and target
pred = df.drop(['Survived', 'PassengerId'], axis=1)
target = df["Survived"]
x_train, x_test, y_train, y_test = train_test_split(pred, target, test_size=0.22, random_state=0)
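# Optional sketch (assumption: 5 folds are a reasonable choice here): cross-validation
# gives a less split-dependent accuracy estimate than the single 22% hold-out above.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(SVC(), pred, target, cv=5)
print("5-fold CV accuracy (SVC):", cv_scores.mean().round(3))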
# # Apply Support Vector Machines Model
# In[19]:
# Support Vector Machines
svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
acc_svc = round(accuracy_score(y_test, y_pred) * 100, 2)
print(acc_svc)
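# Optional sketch (an assumption, not part of the original model): SVC is sensitive to
# feature scales, so standardising the inputs in a Pipeline may change the score.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_svc = make_pipeline(StandardScaler(), SVC())
scaled_svc.fit(x_train, y_train)
print("scaled SVC accuracy:", round(accuracy_score(y_test, scaled_svc.predict(x_test)) * 100, 2))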
# # confusion matrix for Support Vector Machines Model
# In[20]:
# confusion matrix
matrix = confusion_matrix(y_test, y_pred, labels=[1, 0])
print('Confusion matrix:\n', matrix)
# with labels=[1, 0] the flattened matrix reads [tp, fn, fp, tn]
tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels=[1, 0]).reshape(-1)
print('Outcome values (tp, fn, fp, tn):\n', tp, fn, fp, tn)
# classification report
report = classification_report(y_test, y_pred, labels=[1, 0])
print('Classification report:\n', report)
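# Sketch: the class-1 row of the classification report can be reproduced directly from
# the confusion-matrix counts unpacked above (assumes at least one positive prediction).
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print("precision:", round(precision, 2), "recall:", round(recall, 2), "f1:", round(f1, 2))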
# # Apply Logistic Regression Model
# In[21]:
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
acc_logreg = round(accuracy_score(y_test, y_pred) * 100, 2)
print(acc_logreg)
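# Optional sketch: the logistic-regression coefficients show the direction and relative
# strength of each feature's influence on the predicted survival log-odds.
coef_table = pd.Series(logreg.coef_[0], index=x_train.columns).sort_values()
print(coef_table)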
# # confusion matrix for Logistic Regression Model
# In[22]:
# confusion matrix
matrix = confusion_matrix(y_test, y_pred, labels=[1, 0])
print('Confusion matrix:\n', matrix)
# with labels=[1, 0] the flattened matrix reads [tp, fn, fp, tn]
tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels=[1, 0]).reshape(-1)
print('Outcome values (tp, fn, fp, tn):\n', tp, fn, fp, tn)
# classification report
report = classification_report(y_test, y_pred, labels=[1, 0])
print('Classification report:\n', report)
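# Summary sketch: the two hold-out accuracies computed above, side by side.
models = pd.DataFrame({'Model': ['Support Vector Machines', 'Logistic Regression'],
                       'Accuracy (%)': [acc_svc, acc_logreg]})
print(models.sort_values(by='Accuracy (%)', ascending=False))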