import datetime
import glob
import os
import re
import numpy
import pandas
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Gather every csv under the dataset directory (recursive)
csvpath = "C:/Users/thetr/Downloads/MachineLearningCVE"
csv = glob.glob(os.path.join(csvpath, "**", "*.csv"), recursive=True)
# Read each csv into a pandas DataFrame
dataset = [pandas.read_csv(f) for f in csv]
# Combine all tables into one; note keep=False drops every copy of a
# duplicated row, rather than keeping a single copy
dataset = pandas.concat(dataset).drop_duplicates(keep=False)
dataset.reset_index(drop=True, inplace=True)
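# Optional sanity check on the combined frame
print(dataset.shape)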
# Strip spaces from the column names (the raw headers contain stray spaces)
col_names = [col.replace(' ', '') for col in dataset.columns]
dataset.columns = col_names
# Normalise the label strings: strip unusual characters with a regex, then
# replace whitespace with underscores and collapse any doubled underscores
label_names = dataset['Label'].unique()
label_names = [re.sub(r"[^a-zA-Z ]+", "", l) for l in label_names]
label_names = [re.sub(r"\s", "_", l) for l in label_names]
label_names = [lab.replace("__", "_") for lab in label_names]
# Map the original labels in the dataset onto the cleaned names
labels = dataset['Label'].unique()
dataset['Label'] = dataset['Label'].replace(dict(zip(labels, label_names)))
# Remove null values
dataset.dropna(inplace=True)
# Cast the features to float, replace non-finite values with NaN, then drop
# the affected rows
labl = dataset['Label']
dataset = dataset.loc[:, dataset.columns != 'Label'].astype('float64')
dataset = dataset.replace([numpy.inf, -numpy.inf], numpy.nan)
dataset['Label'] = labl
dataset.dropna(inplace=True)
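# (Background: in these CSVs the inf values typically come from the
# Flow Bytes/s and Flow Packets/s rate columns, where zero-duration flows
# make the rate undefined.)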
# Separating dataset into labels and features
labels = dataset['Label']
features = dataset.loc[:, dataset.columns != 'Label'].astype('float64')
# Scale the features with StandardScaler (zero mean, unit variance)
scaler = StandardScaler()
features = scaler.fit_transform(features)
# Label encoding
labelenc = LabelEncoder()
labelenc.fit(labels)
labels = labelenc.transform(labels)
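# Suggested addition (not in the original script): persist the fitted scaler
# and label encoder alongside the model, since both are needed to prepare new
# traffic for inference; joblib, which ships with scikit-learn, is one option
import joblib
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(labelenc, 'label_encoder.joblib')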
# Split training and testing data
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=.2)
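# Note: given the heavy class imbalance in the CICIDS2017 labels, passing
# stratify=labels (and a fixed random_state) to train_test_split would keep
# the rare attack classes represented in both splits; left unchanged here to
# preserve the original behaviour.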
# Free the large intermediates
del dataset, labl, features, labels
# Input layer takes the 78 features, one neuron per feature
# Hidden layer has 67 neurons: two-thirds of the input neurons plus the
# output neurons (52 + 15), a common rule-of-thumb sizing
# Output layer has 15 neurons, one for each class
# Dropout of 0.2 to reduce overfitting
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(78,)),
    tf.keras.layers.Dense(67, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(15, activation='softmax')
])
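# Optional: print a layer-by-layer summary with parameter counts
model.summary()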
# Compile the model with the Adam optimiser and sparse_categorical_crossentropy
# loss (the labels are integer-encoded)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Log training to TensorBoard
log_dir = "train_logs"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
# TensorFlow callback that stops training when the loss is no longer
# improving; the best weights are restored on early termination
early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
# Train the model
model.fit(features_train, labels_train, epochs=100, callbacks=[tensorboard_callback, early_stop_callback])
# Evaluate model accuracy
model.evaluate(features_test, labels_test, verbose=2)
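# Overall accuracy can mask poor recall on the rare attack classes; a minimal
# per-class report, assuming labelenc is still in scope:
from sklearn.metrics import classification_report
predictions = model.predict(features_test).argmax(axis=1)
print(classification_report(labels_test, predictions,
                            labels=numpy.arange(len(labelenc.classes_)),
                            target_names=labelenc.classes_))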
# Save the trained model with a timestamped filename (the target directory
# must exist before saving in h5 format)
os.makedirs('saved_models', exist_ok=True)
model.save('saved_models/IDS_model_' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + '.h5')
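# The saved model can later be reloaded for inference; a minimal sketch, with
# the timestamped path left as a placeholder rather than a real filename:
# reloaded = tf.keras.models.load_model('saved_models/IDS_model_<timestamp>.h5')
# scores = reloaded.predict(features_test)
# predicted = labelenc.inverse_transform(scores.argmax(axis=1))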