Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
{
"cells": [
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(r'D:Fiverr/Firasal_Task/data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"(149757, 21)"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender',\n",
" 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job',\n",
" 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud'],\n",
" dtype='object')"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"cc_num float64\n",
"merchant category\n",
"category category\n",
"amt float64\n",
"first object\n",
"last object\n",
"gender category\n",
"street object\n",
"city object\n",
"state category\n",
"zip int64\n",
"lat float64\n",
"long float64\n",
"city_pop int64\n",
"job category\n",
"dob object\n",
"trans_num object\n",
"unix_time int64\n",
"merch_lat float64\n",
"merch_long float64\n",
"is_fraud int64\n",
"dtype: object"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['state'] = df['state'].astype('category')\n",
"df['merchant'] = df['merchant'].astype('category')\n",
"df['category'] = df['category'].astype('category')\n",
"df['gender'] = df['gender'].astype('category')\n",
"df['job'] = df['job'].astype('category')\n",
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"df['state'] = df['state'].cat.codes\n",
"df['merchant'] = df['merchant'].cat.codes\n",
"df['category'] = df['category'].cat.codes\n",
"df['gender'] = df['gender'].cat.codes\n",
"df['job'] = df['job'].cat.codes"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().values.any()"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"columns = ['merchant', 'category', 'amt', 'gender', 'state', 'city_pop', 'job', 'is_fraud']"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"train_df = pd.DataFrame(df, columns= columns)"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"#train_df.to_csv(r'D:Fiverr/Firasal_Task/train_df.csv')"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"correlation = train_df.corr()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>merchant</th>\n",
" <th>category</th>\n",
" <th>amt</th>\n",
" <th>gender</th>\n",
" <th>state</th>\n",
" <th>city_pop</th>\n",
" <th>job</th>\n",
" <th>is_fraud</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>510</td>\n",
" <td>11</td>\n",
" <td>1160.47</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>47119</td>\n",
" <td>141</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>386</td>\n",
" <td>11</td>\n",
" <td>985.49</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>47119</td>\n",
" <td>141</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>575</td>\n",
" <td>12</td>\n",
" <td>713.24</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>1565</td>\n",
" <td>124</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>379</td>\n",
" <td>8</td>\n",
" <td>978.91</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>47119</td>\n",
" <td>141</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>648</td>\n",
" <td>1</td>\n",
" <td>103.90</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>47119</td>\n",
" <td>141</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>560</td>\n",
" <td>8</td>\n",
" <td>799.11</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>1565</td>\n",
" <td>124</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>510</td>\n",
" <td>11</td>\n",
" <td>890.87</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>47119</td>\n",
" <td>141</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>539</td>\n",
" <td>3</td>\n",
" <td>11.28</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>47119</td>\n",
" <td>141</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>272</td>\n",
" <td>2</td>\n",
" <td>9.07</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>47119</td>\n",
" <td>141</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>545</td>\n",
" <td>12</td>\n",
" <td>829.05</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>47119</td>\n",
" <td>141</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>52</td>\n",
" <td>8</td>\n",
" <td>733.51</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>47119</td>\n",
" <td>141</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>487</td>\n",
" <td>11</td>\n",
" <td>860.53</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>47119</td>\n",
" <td>141</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>6</td>\n",
" <td>11</td>\n",
" <td>927.78</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>47119</td>\n",
" <td>141</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>36</td>\n",
" <td>8</td>\n",
" <td>859.70</td>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>47119</td>\n",
" <td>141</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>563</td>\n",
" <td>4</td>\n",
" <td>264.87</td>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>79008</td>\n",
" <td>381</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" merchant category amt gender state city_pop job is_fraud\n",
"0 510 11 1160.47 1 43 47119 141 1\n",
"1 386 11 985.49 1 43 47119 141 1\n",
"2 575 12 713.24 1 17 1565 124 1\n",
"3 379 8 978.91 1 43 47119 141 1\n",
"4 648 1 103.90 1 43 47119 141 1\n",
"5 560 8 799.11 1 17 1565 124 1\n",
"6 510 11 890.87 1 43 47119 141 1\n",
"7 539 3 11.28 1 43 47119 141 1\n",
"8 272 2 9.07 1 43 47119 141 1\n",
"9 545 12 829.05 1 43 47119 141 1\n",
"10 52 8 733.51 1 43 47119 141 1\n",
"11 487 11 860.53 1 43 47119 141 1\n",
"12 6 11 927.78 1 43 47119 141 1\n",
"13 36 8 859.70 1 43 47119 141 1\n",
"14 563 4 264.87 1 9 79008 381 1"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.head(15)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1008x576 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize= (14, 8))\n",
"sns.heatmap(correlation, annot = True, cmap = 'vlag', annot_kws={'size':12}, square= True, fmt = '.1g', vmin= -1, vmax=1)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Random Forest"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import ensemble, metrics\n",
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"X = train_df.iloc[:, 0:7]\n",
"y = train_df.iloc[:, 7:8]"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 144056\n",
"1 5701\n",
"Name: is_fraud, dtype: int64"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y['is_fraud'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"params = {'n_estimators': 1000, 'max_depth': 4, \"min_samples_split\": 5, 'learning_rate': 0.01}"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"gb = ensemble.GradientBoostingClassifier(**params)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Anaconda\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
" return f(*args, **kwargs)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 4min 51s\n"
]
},
{
"data": {
"text/plain": [
"GradientBoostingClassifier(learning_rate=0.01, max_depth=4, min_samples_split=5,\n",
" n_estimators=1000)"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"gb.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 555 ms\n"
]
}
],
"source": [
"%%time\n",
"y_pred = gb.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy : 0.979\n",
"F1-Score : 0.699\n",
"Precision : 0.777\n",
"Recall : 0.636\n"
]
}
],
"source": [
"print('Accuracy : ', round(metrics.accuracy_score(y_test, y_pred), 3))\n",
"print('F1-Score : ', round(metrics.f1_score(y_test, y_pred), 3))\n",
"print('Precision : ', round(metrics.precision_score(y_test, y_pred), 3))\n",
"print('Recall : ', round(metrics.recall_score(y_test, y_pred), 3))"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Confusion matrix : \n",
" [[ 719 412]\n",
" [ 206 28615]]\n"
]
}
],
"source": [
"matrix = confusion_matrix(y_test,y_pred, labels=[1,0])\n",
"print('Confusion matrix : \\n',matrix)"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" TP: 719 \n",
" FN: 412 \n",
" FP: 206 \n",
" TN: 28615\n"
]
}
],
"source": [
"tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels=[1,0]).reshape(-1)\n",
"print(' TP:', tp, '\\n', 'FN:', fn, '\\n', 'FP:', fp, '\\n', 'TN:', tn)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"SMOTE"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"smote = SMOTE(sampling_strategy='not majority')\n",
"X_sm, y_sm = smote.fit_resample(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 144056\n",
"0 144056\n",
"Name: is_fraud, dtype: int64"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_sm['is_fraud'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size = 0.2)"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"params = {'n_estimators': 1000, 'max_depth': 4, \"min_samples_split\": 5, 'learning_rate': 0.01}"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 0 ns\n"
]
}
],
"source": [
"%%time\n",
"gb = ensemble.GradientBoostingClassifier(**params)"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Anaconda\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
" return f(*args, **kwargs)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 16min 45s\n"
]
},
{
"data": {
"text/plain": [
"GradientBoostingClassifier(learning_rate=0.01, max_depth=4, min_samples_split=5,\n",
" n_estimators=1000)"
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"gb.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 2.42 s\n"
]
}
],
"source": [
"%%time\n",
"y_pred = gb.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy : 0.856\n",
"F1-Score : 0.842\n",
"Precision : 0.924\n",
"Recall : 0.774\n"
]
}
],
"source": [
"print('Accuracy : ', round(metrics.accuracy_score(y_test, y_pred), 3))\n",
"print('F1-Score : ', round(metrics.f1_score(y_test, y_pred), 3))\n",
"print('Precision : ', round(metrics.precision_score(y_test, y_pred), 3))\n",
"print('Recall : ', round(metrics.recall_score(y_test, y_pred), 3))"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Confusion matrix : \n",
" [[22252 6492]\n",
" [ 1831 27048]]\n"
]
}
],
"source": [
"matrix = confusion_matrix(y_test,y_pred, labels=[1,0])\n",
"print('Confusion matrix : \\n',matrix)"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" TP: 22252 \n",
" FN: 6492 \n",
" FP: 1831 \n",
" TN: 27048\n"
]
}
],
"source": [
"tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels=[1,0]).reshape(-1)\n",
"print(' TP:', tp, '\\n', 'FN:', fn, '\\n', 'FP:', fp, '\\n', 'TN:', tn)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}