Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import numpy as np  # linear algebra\n",
"import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)\n",
"\n",
"# Load the raw transaction data and normalise every column name to\n",
"# lower case so later cells can reference columns without casing issues.\n",
"df = pd.read_csv('transaction_dataset.csv')\n",
"df.columns = df.columns.str.lower()"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Inspect the class balance of the target before doing anything else;\n",
"# fraud datasets are typically heavily skewed toward valid transactions.\n",
"import seaborn as sns\n",
"\n",
"classes = df.groupby('flag')\n",
"\n",
"print(f\"Valid transactions: {len(classes.groups[0])}\")\n",
"print(f\"Fraud transactions: {len(classes.groups[1])}\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Columns with no predictive signal: identifiers and free-text token types.\n",
"# (The leading spaces in the ERC20 names match the raw CSV headers.)\n",
"cols_to_drop = {\n",
"    ' erc20 most sent token type',\n",
"    ' erc20_most_rec_token_type',\n",
"    'address',\n",
"    'index',\n",
"    'unnamed: 0'\n",
"}\n",
"\n",
"# Keep every column except the target and the dropped identifiers,\n",
"# preserving the original column order of the dataframe.\n",
"features = [col for col in df.columns if col != 'flag' and col not in cols_to_drop]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Drop constant columns: a feature with a single distinct value cannot\n",
"# help the model discriminate between classes.\n",
"unique_values = df.nunique()\n",
"multi_valued = set(unique_values[unique_values > 1].index)\n",
"\n",
"features = [col for col in features if col in multi_valued]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer\n",
"\n",
"class BasePipeStep(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Base pipeline step: stores a column list and passes data through unchanged.\"\"\"\n",
"\n",
"    def __init__(self, columns=None):\n",
"        # BUGFIX: the original default was a mutable list (columns=[]),\n",
"        # which is created once and shared by every instance relying on\n",
"        # the default; use None as the sentinel instead.\n",
"        self.columns = [] if columns is None else columns\n",
"\n",
"    def fit(self, X, y=None):\n",
"        # Stateless by default; subclasses override to learn parameters.\n",
"        return self\n",
"\n",
"    def transform(self, X):\n",
"        return X.copy()\n",
"\n",
"class SelectColumns(BasePipeStep):\n",
"    \"\"\"Keep only the configured feature columns.\"\"\"\n",
"\n",
"    def transform(self, X):\n",
"        return X[self.columns].copy()\n",
"\n",
"class FillNumericData(BasePipeStep):\n",
"    \"\"\"Impute missing numeric values with per-column means learned at fit time.\"\"\"\n",
"\n",
"    def fit(self, X, y=None):\n",
"        self.means = {col: X[col].mean() for col in self.columns}\n",
"        return self\n",
"\n",
"    def transform(self, X):\n",
"        X = X.copy()\n",
"        for col in self.columns:\n",
"            X[col] = X[col].fillna(self.means[col])\n",
"        return X\n",
"\n",
"class ScaleNumeric(BasePipeStep):\n",
"    \"\"\"Standard-scale the configured columns (zero mean, unit variance).\"\"\"\n",
"\n",
"    def fit(self, X, y=None):\n",
"        self.scaler = StandardScaler()\n",
"        self.scaler.fit(X[self.columns])\n",
"        return self\n",
"\n",
"    def transform(self, X):\n",
"        X = X.copy()\n",
"        X[self.columns] = self.scaler.transform(X[self.columns])\n",
"        return X\n",
"\n",
"class GetValues(BasePipeStep):\n",
"    \"\"\"Convert the DataFrame to a plain numpy array for Keras.\"\"\"\n",
"\n",
"    def transform(self, X):\n",
"        return X.copy().values"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline\n",
"\n",
"# select -> impute -> scale -> ndarray; fitted on train, reused on test.\n",
"steps = [\n",
"    ('feature_selection', SelectColumns(features)),\n",
"    ('fill_missing', FillNumericData(features)),\n",
"    ('standard_scaling', ScaleNumeric(features)),\n",
"    ('returnValues', GetValues())\n",
"]\n",
"preprocessing = Pipeline(steps)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from keras.utils import to_categorical\n",
"\n",
"X = df[features]\n",
"# One-hot encode the binary target for the 2-unit softmax output layer.\n",
"# NOTE(review): encoding *before* the split means SMOTE later receives a\n",
"# 2-D target -- confirm imblearn handles that as intended.\n",
"y = to_categorical(df['flag'])\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.33, random_state=42)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Fit the imputation means and scaler on the training split only, then\n",
"# apply the learned transforms to the test split (no test-set leakage).\n",
"X_train = preprocessing.fit_transform(X_train)\n",
"X_test = preprocessing.transform(X_test)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from keras.models import Sequential\n",
"from keras.layers import Dense\n",
"from keras import Input\n",
"\n",
"\n",
"def _build_model(hidden_units):\n",
"    \"\"\"Build and compile a dense binary classifier.\n",
"\n",
"    hidden_units: iterable of hidden Dense layer sizes (all relu).\n",
"    Input width is len(features); output is a 2-unit softmax.\n",
"    \"\"\"\n",
"    model = Sequential()\n",
"    model.add(Input(shape=(len(features),)))\n",
"    for units in hidden_units:\n",
"        model.add(Dense(units, activation='relu'))\n",
"    model.add(Dense(2, activation='softmax'))\n",
"    # NOTE(review): binary_crossentropy with a 2-unit softmax is an odd\n",
"    # pairing (categorical_crossentropy is conventional); kept as-is to\n",
"    # preserve the original training behaviour.\n",
"    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n",
"    return model\n",
"\n",
"\n",
"def generate_3L_model():\n",
"    \"\"\"3 hidden layers: len(features) -> 20 -> 5.\"\"\"\n",
"    return _build_model([len(features), 20, 5])\n",
"\n",
"\n",
"def generate_4L_model():\n",
"    \"\"\"4 hidden layers: len(features) -> 20 -> 10 -> 5.\"\"\"\n",
"    return _build_model([len(features), 20, 10, 5])\n",
"\n",
"\n",
"def generate_5L_model():\n",
"    \"\"\"5 hidden layers: len(features) twice -> 20 -> 10 -> 5.\"\"\"\n",
"    return _build_model([len(features), len(features), 20, 10, 5])"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"def plotViz(history, epochs, out_path=\"plot1\"):\n",
"    \"\"\"Plot train/validation loss and accuracy curves and save the figure.\n",
"\n",
"    history: Keras History object from model.fit; must contain the keys\n",
"        'loss', 'val_loss', 'accuracy' and 'val_accuracy'.\n",
"    epochs: number of epochs plotted on the x axis.\n",
"    out_path: filename for the saved figure; the default keeps the\n",
"        originally hard-coded \"plot1\" for backward compatibility.\n",
"    \"\"\"\n",
"    xs = np.arange(0, epochs)\n",
"    plt.style.use(\"ggplot\")\n",
"    plt.figure()\n",
"    plt.plot(xs, history.history[\"loss\"], label=\"train_loss\")\n",
"    plt.plot(xs, history.history[\"val_loss\"], label=\"val_loss\")\n",
"    plt.plot(xs, history.history[\"accuracy\"], label=\"train_acc\")\n",
"    plt.plot(xs, history.history[\"val_accuracy\"], label=\"val_acc\")\n",
"    plt.title(\"Training Loss and Accuracy\")\n",
"    plt.xlabel(\"Epoch #\")\n",
"    plt.ylabel(\"Loss/Accuracy\")\n",
"    plt.legend(loc=\"lower left\")\n",
"    plt.savefig(out_path)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import seaborn as sns\n",
"from imblearn.over_sampling import SMOTE\n",
"\n",
"# Oversample the minority (fraud) class so both classes are balanced in\n",
"# the *training* set only; the test set keeps the real-world skew.\n",
"sm = SMOTE(random_state=42)\n",
"X_train_smote, Y_train_smote = sm.fit_resample(X_train, y_train)\n",
"\n",
"df_new = pd.DataFrame(Y_train_smote)\n",
"\n",
"# seaborn >= 0.12 requires data to be passed by keyword (x=...);\n",
"# the old positional call sns.countplot(series) raises a TypeError there.\n",
"plot = sns.countplot(x=df_new[0])\n",
"plot.figure.savefig(\"oversampling-proof\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Train the deepest of the three architectures.\n",
"model = generate_5L_model()\n",
"model.summary()"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"n_epochs = 20\n",
"\n",
"# Re-encode the resampled labels for the 2-unit softmax output.\n",
"# NOTE(review): y_train was already one-hot encoded before SMOTE; confirm\n",
"# that Y_train_smote comes back 1-D, otherwise this second to_categorical\n",
"# call would change its shape unexpectedly.\n",
"categorical_smote = to_categorical(Y_train_smote)\n",
"history = model.fit(X_train_smote, categorical_smote, validation_data=(X_test, y_test), epochs=n_epochs)\n",
"plotViz(history, n_epochs)\n",
"\n",
"test_loss, test_acc = model.evaluate(X_test, y_test)\n",
"print(\"Evaluation result on Test Data : Loss = {}, accuracy = {}\".format(test_loss, test_acc))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sklearn import metrics\n",
"\n",
"# Run inference once and reuse the result: probabilities feed ROC-AUC,\n",
"# argmax gives the hard class labels. (The original called\n",
"# model.predict(X_test) twice, doubling the inference cost.)\n",
"test_proba = model.predict(X_test)\n",
"test_prediction = [np.argmax(p) for p in test_proba]\n",
"y_true = [np.argmax(y) for y in y_test]\n",
"\n",
"# sklearn convention is accuracy_score(y_true, y_pred); the original had\n",
"# the arguments swapped (harmless for accuracy, which is symmetric, but\n",
"# misleading to readers).\n",
"acc = metrics.accuracy_score(y_true, test_prediction)\n",
"print(f'Accuracy: {acc:,.2%}')\n",
"\n",
"score = metrics.roc_auc_score(y_true, test_proba[:, 1])\n",
"print(f'Area under ROC of Model On Test Set - {score:,.2%}')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Confusion matrix on the test set, rendered as a labelled heatmap.\n",
"disp = metrics.confusion_matrix([np.argmax(y) for y in y_test], test_prediction)\n",
"\n",
"df_cm = pd.DataFrame(disp, index=['Valid', 'Fraud'],\n",
"                     columns=['Valid', 'Fraud'])\n",
"\n",
"plt.figure(figsize=(10, 7))\n",
"# fmt='d' renders the integer counts directly; the original float format\n",
"# ('.1f') displayed counts with a spurious '.0' suffix.\n",
"cf_matrix = sns.heatmap(df_cm, annot=True, fmt='d')\n",
"cf_matrix.figure.savefig(\"confusion_matrix\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"name": "pycharm-14024ecd",
"language": "python",
"display_name": "PyCharm (ethereum-fraud-detection)"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}