Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ML/ML.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
275 lines (138 sloc)
4.76 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: utf-8
# # importing useful libs
# In[1]:
# Standard library
import warnings

# Third-party: numerics, data frames and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Third-party: scikit-learn splitting helper, models and metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# `get_ipython` is only defined inside an IPython/Jupyter session. Guard the
# magic so this exported script also runs under a plain `python` interpreter
# instead of stopping here with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Suppress every warning for the rest of the run.
warnings.filterwarnings('ignore')
# # IMPORT DATA from CSV FILES
# In[ ]:
# In[2]:
# Read the training CSV into `df` and summarise every column.
train_csv_path = r"C:\Users\44742\Desktop\ML\train.csv"
df = pd.read_csv(train_csv_path)
df.describe(include="all")
# In[3]:
# Number of missing values per column.
df.isnull().sum()
# # checking the data (graphics)
# In[4]:
# Bar plot of Survived against Sex, plus the survivor percentage per sex.
sns.barplot(x="Sex", y="Survived", data=df)
# value_counts(normalize=True)[1] is the fraction of rows with Survived == 1.
for caption, sex in ((" survived females:", 'female'), (" survived males :", 'male')):
    print(caption, df["Survived"][df["Sex"] == sex].value_counts(normalize = True)[1]*100)
# Display this chart before the next cell draws, as the Parch and
# age-category cells already do; otherwise a plain-script run keeps drawing
# the following barplot onto the same axes.
plt.show()
# In[ ]:
# In[5]:
# Bar plot of Survived against Pclass, plus the survivor percentage per class.
sns.barplot(x="Pclass", y="Survived", data=df)
# value_counts(normalize=True)[1] is the fraction of rows with Survived == 1.
for caption, pclass in (("first class:", 1), ("second class:", 2), ("third class:", 3)):
    print(caption, df["Survived"][df["Pclass"] == pclass].value_counts(normalize = True)[1]*100)
# Display this chart before the next cell draws, as the Parch and
# age-category cells already do; otherwise a plain-script run keeps drawing
# the following barplot onto the same axes.
plt.show()
# In[ ]:
# In[6]:
# Bar plot of Survived against Parch.
sns.barplot(data=df, x="Parch", y="Survived")
plt.show()
# In[7]:
# Bucket passengers into named age groups (column 'peopleCategory').
# Missing ages are replaced by the sentinel -0.5 so they fall into their own
# 'Unknown' bin.
df["Age"] = df["Age"].fillna(-0.5)
# pd.cut builds right-closed intervals, so the first bin is (-1, 0] and holds
# only the -0.5 sentinel. The first inner edge has to be 0: with an edge at 1,
# every real age in (0, 1] (infants) was labelled 'Unknown' together with the
# missing values instead of 'Baby'.
bluk = [-1, 0, 6, 13, 18, 21, 35, 50, np.inf]
labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
df['peopleCategory'] = pd.cut(df["Age"], bluk, labels = labels)
# In[8]:
# Bar plot of Survived against the age group.
sns.barplot(x="peopleCategory", y="Survived", data=df)
plt.show()
# # cleaning the data
# In[9]:
# Remove the Cabin and Ticket columns in a single call.
df = df.drop(columns=['Cabin', 'Ticket'])
# In[ ]:
# In[10]:
# Count the rows per Embarked value, then fill missing Embarked with "S".
# All counts are taken before the fill, so the filled rows are not included
# in the printed "southampton" figure.
embarked = df["Embarked"]
southampton = int((embarked == "S").sum())
cherbourg = int((embarked == "C").sum())
queenstown = int((embarked == "Q").sum())
df = df.fillna({"Embarked": "S"})
print("queenstown = ", queenstown, "southampton = ", southampton, "cherbourg = ", cherbourg)
# In[11]:
# Replace the Embarked letters with integer codes.
Emapping = dict(S=1, C=2, Q=3)
embarked_codes = df['Embarked'].map(Emapping)
df['Embarked'] = embarked_codes
df.head()
# In[12]:
# Replace the age-group labels with integer codes 1..7.
# 'Unknown' has no entry in the mapping.
age_groups = ['Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
Amapping = {group: code for code, group in enumerate(age_groups, start=1)}
df['peopleCategory'] = df['peopleCategory'].map(Amapping)
df.head()
# In[13]:
# Remove the Age, Name and peopleCategory columns in a single call.
df = df.drop(columns=['Age', 'Name', 'peopleCategory'])
# In[14]:
# Replace the Sex strings with integer codes: male -> 0, female -> 1.
Smapping = dict(male=0, female=1)
sex_codes = df['Sex'].map(Smapping)
df['Sex'] = sex_codes
df.head()
# In[15]:
# List the columns that still contain missing values.
# DataFrame.isna() works for every dtype, whereas np.isnan() raises TypeError
# as soon as the frame holds a non-numeric (object) column.
df.columns[df.isna().any()]
# # correlations
# In[16]:
# Pairwise correlation between the columns of df (pandas default method).
DfCorr = df.corr()
DfCorr
# In[17]:
# Column-wise correlation coefficients via NumPy, rounded to two decimals.
# np.corrcoef treats each ROW as a variable by default, which would correlate
# individual passengers with each other; rowvar=False makes the columns
# (features) the variables.
corr_matrix = np.corrcoef(df, rowvar=False).round(decimals=2)
corr_matrix
# # data splitting
# In[18]:
# The label is Survived; the features are every remaining column except
# PassengerId.
target = df["Survived"]
pred = df.drop(columns=['Survived', 'PassengerId'])
# Hold out 22% of the rows for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    pred,
    target,
    test_size=0.22,
    random_state=0,
)
# # Apply Support Vector Machines Model
# In[19]:
# Fit a support vector classifier with default settings and print its accuracy
# on the held-out split, as a percentage rounded to two decimals.
svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth
# first, the same order the confusion_matrix calls in this file use.
acc_svc = round(accuracy_score(y_test, y_pred) * 100, 2)
print(acc_svc)
# # confusion matrix for Support Vector Machines Model
# In[20]:
# Confusion matrix with label 1 listed first, so the flattened order is
# tp, fn, fp, tn.
label_order = [1, 0]
matrix = confusion_matrix(y_test, y_pred, labels=label_order)
print('Confusion matrix : \n', matrix)
tp, fn, fp, tn = matrix.ravel()
print('Outcome values : \n', tp, fn, fp, tn)
# Per-class precision / recall / f1 text report; `matrix` is rebound to the
# report string, as before.
matrix = classification_report(y_test, y_pred, labels=label_order)
print('Classification report : \n', matrix)
# # Apply Logistic Regression Model
# In[21]:
# Fit a logistic regression with default settings and print its accuracy on
# the held-out split, as a percentage rounded to two decimals.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth
# first, the same order the confusion_matrix calls in this file use.
acc_logreg = round(accuracy_score(y_test, y_pred) * 100, 2)
print(acc_logreg)
# In[ ]:
# # confusion matrix for Logistic Regression Model
# In[22]:
# Confusion matrix with label 1 listed first, so the flattened order is
# tp, fn, fp, tn.
label_order = [1, 0]
matrix = confusion_matrix(y_test, y_pred, labels=label_order)
print('Confusion matrix : \n', matrix)
tp, fn, fp, tn = matrix.ravel()
print('Outcome values : \n', tp, fn, fp, tn)
# Per-class precision / recall / f1 text report; `matrix` is rebound to the
# report string, as before.
matrix = classification_report(y_test, y_pred, labels=label_order)
print('Classification report : \n', matrix)