diff --git a/Scripts/TransformerExample.ipynb b/Scripts/TransformerExample.ipynb deleted file mode 100644 index e6c6dce..0000000 --- a/Scripts/TransformerExample.ipynb +++ /dev/null @@ -1,656 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "import pandas as pd\n", - "import numpy as np\n", - "from numpy import argmax\n", - "\n", - "import string \n", - "import nltk\n", - "import re\n", - "import demoji\n", - "\n", - "from nltk.tokenize import sent_tokenize, word_tokenize\n", - "from nltk.corpus import stopwords\n", - "from nltk.stem import WordNetLemmatizer\n", - "from nltk.corpus import wordnet\n", - "from spellchecker import SpellChecker\n", - "\n", - "from copy import deepcopy\n", - "from sklearn.base import clone\n", - "from sklearn.metrics import brier_score_loss\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.pipeline import Pipeline, FeatureUnion\n", - "from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.base import BaseEstimator,TransformerMixin\n", - "\n", - "from imblearn.under_sampling import RandomUnderSampler\n", - "from imblearn.over_sampling import RandomOverSampler\n", - "\n", - "from joblib import parallel_backend\n", - "\n", - "from sklearn.svm import SVC, LinearSVC\n", - "from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB\n", - "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier\n", - "from sklearn.tree import DecisionTreeClassifier\n", - "from sklearn.neighbors import KNeighborsClassifier\n", - "\n", - "from sklearn.model_selection import RandomizedSearchCV\n", - "from skopt import BayesSearchCV\n", - "\n", - "from sklearn.calibration import CalibratedClassifierCV,calibration_curve\n", - "\n", - "from sklearn.metrics import confusion_matrix,f1_score,classification_report,precision_recall_curve\n", - "\n", - "from Modules.Transformers import *" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 1000 entries, 0 to 999\n", - "Data columns (total 26 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 BIO_AUT 1000 non-null bool \n", - " 1 BIO_FOL 1000 non-null int64 \n", - " 2 BIO_PER 1000 non-null bool \n", - " 3 BIO_PIC 1000 non-null int64 \n", - " 4 BIO_POL 1000 non-null bool \n", - " 5 COMMENT 1000 non-null object \n", - " 6 INTERACTION_COM 1000 non-null int64 \n", - " 7 INTERACTION_NEM 1000 non-null int64 \n", - " 8 INTERACTION_OEM 1000 non-null float64\n", - " 9 INTERACTION_PEM 1000 non-null int64 \n", - " 10 STATUS_ART 1000 non-null float64\n", - " 11 STATUS_AVG 1000 non-null float64\n", - " 12 STATUS_COM 1000 non-null float64\n", - " 13 STATUS_EXT_PIC 1000 non-null float64\n", - " 14 STATUS_EXT_VID 1000 non-null float64\n", - " 15 STATUS_INT_PIC 1000 non-null float64\n", - " 16 STATUS_INT_VID 1000 non-null float64\n", - " 17 STATUS_NEM 1000 non-null float64\n", - " 18 STATUS_OEM 1000 non-null float64\n", - " 19 STATUS_OFF 1000 non-null float64\n", - " 20 STATUS_PEM 1000 non-null float64\n", - " 21 STATUS_POL 1000 non-null float64\n", - " 22 STATUS_REM 1000 non-null float64\n", - " 23 STATUS_SHA 1000 non-null float64\n", - " 24 STATUS_STD 1000 non-null float64\n", - " 25 TROLL 1000 non-null bool \n", - "dtypes: bool(4), float64(16), int64(5), object(1)\n", - "memory usage: 175.9+ KB\n" - ] - } - ], - "source": [ - "df = pd.read_csv(\"./Datasets/Data.csv\", delimiter=\";\") # load data\n", - "df.info() " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "X = df.iloc[:,1:] # comments\n", - "y = df.iloc[:,:1] # labels" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
BIO_AUTBIO_FOLBIO_PERBIO_PICBIO_POLCOMMENTINTERACTION_COMINTERACTION_NEMINTERACTION_OEMINTERACTION_PEM...STATUS_STDTEXT_AVG_SENTENCE_LENGTHTEXT_AVG_WORD_LENGTHTEXT_FREQWORDSTEXT_STOPWORDSTEXT_VERBSTEXT_WORDSTOXIC_PHRASESTOXIC_WORDSTROLL
0False820False135Truetime truth need exposed regardless messenger001.05...0.0000007.04.1428572421400True
1False0True6Falsebroke every promise made trump problem100.08...0.06836716.03.5000001621600False
2False0True3Falsejoe please please please pay attention happeni...400.06...0.40162838.03.95614011592211400False
3False0True2Falsejoe color u voter matter buy gas food hurt ec...100.015...0.09703322.03.2272731922201False
4False0True7Truediesel almost gallon end site yeah joe s beauty200.010...0.10788910.52.8095241602100False
\n", - "

5 rows × 43 columns

\n", - "
" - ], - "text/plain": [ - " BIO_AUT BIO_FOL BIO_PER BIO_PIC BIO_POL \\\n", - "0 False 820 False 135 True \n", - "1 False 0 True 6 False \n", - "2 False 0 True 3 False \n", - "3 False 0 True 2 False \n", - "4 False 0 True 7 True \n", - "\n", - " COMMENT INTERACTION_COM \\\n", - "0 time truth need exposed regardless messenger 0 \n", - "1 broke every promise made trump problem 1 \n", - "2 joe please please please pay attention happeni... 4 \n", - "3 joe color u voter matter buy gas food hurt ec... 1 \n", - "4 diesel almost gallon end site yeah joe s beauty 2 \n", - "\n", - " INTERACTION_NEM INTERACTION_OEM INTERACTION_PEM ... STATUS_STD \\\n", - "0 0 1.0 5 ... 0.000000 \n", - "1 0 0.0 8 ... 0.068367 \n", - "2 0 0.0 6 ... 0.401628 \n", - "3 0 0.0 15 ... 0.097033 \n", - "4 0 0.0 10 ... 0.107889 \n", - "\n", - " TEXT_AVG_SENTENCE_LENGTH TEXT_AVG_WORD_LENGTH TEXT_FREQWORDS \\\n", - "0 7.0 4.142857 2 \n", - "1 16.0 3.500000 1 \n", - "2 38.0 3.956140 11 \n", - "3 22.0 3.227273 1 \n", - "4 10.5 2.809524 1 \n", - "\n", - " TEXT_STOPWORDS TEXT_VERBS TEXT_WORDS TOXIC_PHRASES TOXIC_WORDS TROLL \n", - "0 4 2 14 0 0 True \n", - "1 6 2 16 0 0 False \n", - "2 59 22 114 0 0 False \n", - "3 9 2 22 0 1 False \n", - "4 6 0 21 0 0 False \n", - "\n", - "[5 rows x 43 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dfFreqWords = pd.read_csv(\"./Datasets/FreqWords.csv\") # load data\n", - "dfBadPhrases = pd.read_csv(\"./Datasets/BadPhrases.csv\") # load data\n", - "dfBadWords = pd.read_csv(\"./Datasets/BadWords.csv\") # load data\n", - "\n", - "example = SentimentalTransformer(\"COMMENT\").transform(df[:100]) # Create example using 100 entries \n", - "example = TextTransformer(\"COMMENT\",dfFreqWords[\"word\"].to_list()).transform(example) \n", - "example = ToxicTransformer(dfBadWords, dfBadPhrases, \"COMMENT\").transform(example) # Apply preprocessor class on comment\n", - "\n", - "example = example.reindex(sorted(example.columns), axis=1)\n", - "example.head() " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['BIO_AUT',\n", - " 'BIO_FOL',\n", - " 'BIO_PER',\n", - " 'BIO_PIC',\n", - " 'BIO_POL',\n", - " 'COMMENT',\n", - " 'INTERACTION_COM',\n", - " 'INTERACTION_NEM',\n", - " 'INTERACTION_OEM',\n", - " 'INTERACTION_PEM',\n", - " 'SENT_DIGITS',\n", - " 'SENT_EMOJIS',\n", - " 'SENT_GIFS',\n", - " 'SENT_HASHTAGS',\n", - " 'SENT_NAMES',\n", - " 'SENT_PICTURES',\n", - " 'SENT_SPECIAL_UNICODES',\n", - " 'SENT_UPPERCASE',\n", - " 'SENT_URLS',\n", - " 'STATUS_ART',\n", - " 'STATUS_AVG',\n", - " 'STATUS_COM',\n", - " 'STATUS_EXT_PIC',\n", - " 'STATUS_EXT_VID',\n", - " 'STATUS_INT_PIC',\n", - " 'STATUS_INT_VID',\n", - " 'STATUS_NEM',\n", - " 'STATUS_OEM',\n", - " 'STATUS_OFF',\n", - " 'STATUS_PEM',\n", - " 'STATUS_POL',\n", - " 'STATUS_REM',\n", - " 'STATUS_SHA',\n", - " 'STATUS_STD',\n", - " 'TEXT_AVG_SENTENCE_LENGTH',\n", - " 'TEXT_AVG_WORD_LENGTH',\n", - " 'TEXT_FREQWORDS',\n", - " 'TEXT_STOPWORDS',\n", - " 'TEXT_VERBS',\n", - " 'TEXT_WORDS',\n", - " 'TOXIC_PHRASES',\n", - " 'TOXIC_WORDS',\n", - " 'TROLL']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "[\"BIO_AUT\",\n", - " \"BIO_FOL\",\n", - " \"BIO_PER\",\n", - " \"BIO_PIC\",\n", - " \"BIO_POL\",\n", - " \"COMMENT\",\n", - " \"INTERACTION_COM\",\n", - " \"INTERACTION_NEM\",\n", - " \"INTERACTION_OEM\",\n", - " \"INTERACTION_PEM\",\n", - " \"SENT_DIGITS\",\n", - " \"SENT_EMOJIS\",\n", - " \"SENT_GIFS\",\n", - " \"SENT_HASHTAGS\",\n", - " \"SENT_NAMES\",\n", - " \"SENT_PICTURES\",\n", - " \"SENT_SPECIAL_UNICODES\",\n", - " \"SENT_UPPERCASE\",\n", - " \"SENT_URLS\",\n", - " \"STATUS_ART\",\n", - " \"STATUS_AVG\",\n", - " \"STATUS_COM\",\n", - " \"STATUS_EXT_PIC\",\n", - " \"STATUS_EXT_VID\",\n", - " \"STATUS_INT_PIC\",\n", - " \"STATUS_INT_VID\",\n", - " \"STATUS_NEM\",\n", - " \"STATUS_OEM\",\n", - " \"STATUS_OFF\",\n", - " \"STATUS_PEM\",\n", - " \"STATUS_POL\",\n", - " \"STATUS_REM\",\n", - " \"STATUS_SHA\",\n", - " \"STATUS_STD\",\n", - " \"TEXT_AVG_SENTENCE_LENGTH\",\n", - " \"TEXT_AVG_WORD_LENGTH\",\n", - " \"TEXT_FREQWORDS\",\n", - " \"TEXT_STOPWORDS\",\n", - " \"TEXT_VERBS\",\n", - " \"TEXT_WORDS\",\n", - " \"TOXIC_PHRASES\",\n", - " \"TOXIC_WORDS\",\n", - " \"TROLL\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 100 entries, 0 to 99\n", - "Data columns (total 43 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 BIO_AUT 100 non-null bool \n", - " 1 BIO_FOL 100 non-null int64 \n", - " 2 BIO_PER 100 non-null bool \n", - " 3 BIO_PIC 100 non-null int64 \n", - " 4 BIO_POL 100 non-null bool \n", - " 5 COMMENT 100 non-null object \n", - " 6 INTERACTION_COM 100 non-null int64 \n", - " 7 INTERACTION_NEM 100 non-null int64 \n", - " 8 INTERACTION_OEM 100 non-null float64\n", - " 9 INTERACTION_PEM 100 non-null int64 \n", - " 10 SENT_DIGITS 100 non-null int64 \n", - " 11 SENT_EMOJIS 100 non-null int64 \n", - " 12 SENT_GIFS 100 non-null int64 \n", - " 13 SENT_HASHTAGS 100 non-null int64 \n", - " 14 SENT_NAMES 100 non-null int64 \n", - " 15 SENT_PICTURES 100 non-null int64 \n", - " 16 SENT_SPECIAL_UNICODES 100 non-null int64 \n", - " 17 SENT_UPPERCASE 100 non-null int64 \n", - " 18 SENT_URLS 100 non-null int64 \n", - " 19 STATUS_ART 100 non-null float64\n", - " 20 STATUS_AVG 100 non-null float64\n", - " 21 STATUS_COM 100 non-null float64\n", - " 22 STATUS_EXT_PIC 100 non-null float64\n", - " 23 STATUS_EXT_VID 100 non-null float64\n", - " 24 STATUS_INT_PIC 100 non-null float64\n", - " 25 STATUS_INT_VID 100 non-null float64\n", - " 26 STATUS_NEM 100 non-null float64\n", - " 27 STATUS_OEM 100 non-null float64\n", - " 28 STATUS_OFF 100 non-null float64\n", - " 29 STATUS_PEM 100 non-null float64\n", - " 30 STATUS_POL 100 non-null float64\n", - " 31 STATUS_REM 100 non-null float64\n", - " 32 STATUS_SHA 100 non-null float64\n", - " 33 STATUS_STD 100 non-null float64\n", - " 34 TEXT_AVG_SENTENCE_LENGTH 100 non-null float64\n", - " 35 TEXT_AVG_WORD_LENGTH 100 non-null float64\n", - " 36 TEXT_FREQWORDS 100 non-null int64 \n", - " 37 TEXT_STOPWORDS 100 non-null int64 \n", - " 38 TEXT_VERBS 100 non-null int64 \n", - " 39 TEXT_WORDS 100 non-null int64 \n", - " 40 TOXIC_PHRASES 100 non-null int64 \n", - " 41 TOXIC_WORDS 100 non-null int64 \n", - " 42 TROLL 100 non-null bool \n", - "dtypes: bool(4), float64(18), int64(20), object(1)\n", - "memory usage: 31.0+ KB\n" - ] - } - ], - "source": [ - "example.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dfComment = df[:1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def FreqWordCount(text):\n", - " sentenceToken = nltk.sent_tokenize(text)\n", - " counter = 0\n", - " for sentence in sentenceToken:\n", - " wordToken = nltk.word_tokenize(sentence)\n", - " for _ , pos in nltk.pos_tag(wordToken):\n", - " if pos in [\"VB\",\"VBG\",\"VBD\",\"VBN\",\"VBP\"]:\n", - " counter += 1\n", - " return counter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "text = \"this is my text. and you are gnoomed. lool yes you are dog told\"\n", - "FreqWordCount(text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "F = FrequentWords(10,200)\n", - "example[\"COMMENT\"] = F.transform(example[\"COMMENT\"]) # Remove frequent words from comments\n", - "F.Show() " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cv = CountVectorizer(ngram_range = (2,3), max_features = 5000) # Using ngrams (bigrams and trigrams)\n", - "countMatrix = cv.fit_transform(example[\"COMMENT\"]) # Bag of words\n", - "countMatrix" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "countMatrix.toarray()[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cv.vocabulary_ # Display corpus " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Display bag of words\n", - "countTokens = cv.get_feature_names()\n", - "dfCount = pd.DataFrame(data=countMatrix.toarray(),columns=countTokens)\n", - "dfCount.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Convert a collection of raw documents to a matrix of TF-IDF features\n", - "# Reflect how important a word is to a document in a collection or corpus.\n", - "tfid = TfidfTransformer() \n", - "tfidMatrix = tfid.fit_transform(countMatrix)\n", - "tfidMatrix" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create and display TF-IDF features\n", - "tfidCount = pd.DataFrame(data=tfidMatrix.toarray(),columns=countTokens) \n", - "tfidCount.head(10)" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "43f9e24b6ed1bb06037d4a0b70c9bd285081bf18f02f4b6444a4c7a3e23233e5" - }, - "kernelspec": { - "display_name": "Python 3.8.8 ('base')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -}