diff --git a/CW1.ipynb b/CW1.ipynb new file mode 100644 index 0000000..898cebc --- /dev/null +++ b/CW1.ipynb @@ -0,0 +1,812 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Installing Required Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pandas in /opt/anaconda3/lib/python3.12/site-packages (2.2.2)\n", + "Requirement already satisfied: nltk in /opt/anaconda3/lib/python3.12/site-packages (3.8.1)\n", + "Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.12/site-packages (1.4.2)\n", + "Requirement already satisfied: numpy>=1.26.0 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (1.26.4)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2024.1)\n", + "Requirement already satisfied: tzdata>=2022.7 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2023.3)\n", + "Requirement already satisfied: click in /opt/anaconda3/lib/python3.12/site-packages (from nltk) (8.1.7)\n", + "Requirement already satisfied: joblib in /opt/anaconda3/lib/python3.12/site-packages (from nltk) (1.4.2)\n", + "Requirement already satisfied: regex>=2021.8.3 in /opt/anaconda3/lib/python3.12/site-packages (from nltk) (2023.10.3)\n", + "Requirement already satisfied: tqdm in /opt/anaconda3/lib/python3.12/site-packages (from nltk) (4.66.4)\n", + "Requirement already satisfied: scipy>=1.6.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.14.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (2.2.0)\n", + "Requirement already satisfied: six>=1.5 in 
/opt/anaconda3/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install pandas nltk scikit-learn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "df = pd.read_csv(\"rct_data.txt\", delimiter=\"\\t\",header=None)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysing the data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of the dataset: (500068, 5)\n", + "--------\n", + "Index([0, 1, 2, 3, 4], dtype='int64')\n", + "--------\n", + "Missing values: 0 0\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(\"Shape of the dataset:\",df.shape)\n", + "print(\"--------\")\n", + "\n", + "print(df.columns)\n", + "print(\"--------\")\n", + "\n", + "# to find the missing values\n", + "print(\"Missing values: \",df.isnull().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01234
01843978102011Two patients subdued with a TASER® device: cas...In the United States, an increasing number of ...
11846883302011A case of Takayasu arteritis causing subclavia...The American Heart Association website defines...
21848118102012Pathophysiology of hypopituitarism in the sett...The complex pathophysiology of traumatic brain...
31872805612011The cardiovascular risk factor, soluble CD40 l...[BACKGROUND] Soluble CD40 ligand (sCD40L) is a...
41879059002011Horner syndrome due to carotid dissection.[BACKGROUND] Internal carotid artery dissectio...
# %%
# Preview the first five rows of the freshly loaded frame
df.head()

# %%
# Replace the positional column labels with descriptive names that follow
# the layout of the dataset.
df.columns = pd.Index(['ID', 'label', 'year', 'title', 'abstract'])
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDlabelyeartitleabstract
01843978102011Two patients subdued with a TASER® device: cas...In the United States, an increasing number of ...
11846883302011A case of Takayasu arteritis causing subclavia...The American Heart Association website defines...
21848118102012Pathophysiology of hypopituitarism in the sett...The complex pathophysiology of traumatic brain...
31872805612011The cardiovascular risk factor, soluble CD40 l...[BACKGROUND] Soluble CD40 ligand (sCD40L) is a...
41879059002011Horner syndrome due to carotid dissection.[BACKGROUND] Internal carotid artery dissectio...
# %%
# Checking first five rows and headers
df.head()

# %% Data Preprocessing
import os
import re

import nltk
from nltk.corpus import stopwords

# %%
nltk_data_path = "/tmp/nltk_data"  # Path to download stopwords
os.makedirs(nltk_data_path, exist_ok=True)

# Register the directory with NLTK's search path only once, so re-running
# this cell does not keep appending duplicate entries.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Now download stopwords (the call's return value is the cell's output)
nltk.download('stopwords', download_dir=nltk_data_path)
def clean_text(text):
    """Reduce ``text`` to ASCII letters separated by single spaces.

    Punctuation, symbols and underscores are treated as word boundaries and
    replaced by a space, so hyphen- or slash-joined terms stay separate
    ("double-blind" -> "double blind") instead of being fused into one token.
    Digits and any other remaining non-letter characters are deleted in
    place, exactly as before ("cd40" -> "cd"). Runs of whitespace are
    collapsed and leading/trailing whitespace is dropped.

    Args:
        text: The string to clean. Letter case is left untouched.

    Returns:
        The cleaned string; empty if ``text`` contains no ASCII letters.
    """
    # Step 1: punctuation/symbols/underscore become a separator, not nothing.
    text = re.sub(r'(?:[^\w\s]|_)+', ' ', text)
    # Step 2: the original rule - drop everything that is not a letter or space.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Step 3: normalise the whitespace left behind by the two substitutions.
    return ' '.join(text.split())
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDlabelyeartitleabstractcleaned_abstract
01843978102011Two patients subdued with a TASER® device: cas...in the united states, an increasing number of ...in the united states an increasing number of l...
11846883302011A case of Takayasu arteritis causing subclavia...the american heart association website defines...the american heart association website defines...
21848118102012Pathophysiology of hypopituitarism in the sett...the complex pathophysiology of traumatic brain...the complex pathophysiology of traumatic brain...
31872805612011The cardiovascular risk factor, soluble CD40 l...[background] soluble cd40 ligand (scd40l) is a...background soluble cd ligand scdl is a powerfu...
41879059002011Horner syndrome due to carotid dissection.[background] internal carotid artery dissectio...background internal carotid artery dissection ...
# %%
df.head()

# %%
# Drop the columns that are not used from here on (ID, year, title) and
# keep the lower-cased abstract, the label and the cleaned abstract.
df = df[[
    'abstract',
    'label',
    'cleaned_abstract',
]]
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abstractlabelcleaned_abstract
0in the united states, an increasing number of ...0in the united states an increasing number of l...
1the american heart association website defines...0the american heart association website defines...
2the complex pathophysiology of traumatic brain...0the complex pathophysiology of traumatic brain...
3[background] soluble cd40 ligand (scd40l) is a...1background soluble cd ligand scdl is a powerfu...
4[background] internal carotid artery dissectio...0background internal carotid artery dissection ...
def remove_stopwords(text, custom_stop_words=None):
    """Drop stop words from a whitespace-separated string.

    Args:
        text: The string to filter. It is split on whitespace and tokens are
            compared as-is (case-sensitively).
        custom_stop_words: Optional container of tokens to remove. When
            omitted, the notebook-level ``stop_words`` set (built from NLTK's
            English list) is used, which is the original behaviour.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.
    """
    # `is None` (not truthiness) so an explicitly empty set means "remove nothing".
    if custom_stop_words is None:
        custom_stop_words = stop_words
    return " ".join(
        word for word in text.split() if word not in custom_stop_words
    )
# %% Convert Text into Numerical Format Using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Target variable (labels)
y = df['label']

# Split the raw text *before* vectorising, so the vocabulary and IDF weights
# are learned from the training documents only and nothing about the test
# documents leaks into the features. 80% training / 20% testing; stratify
# keeps the label proportions the same in both parts.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_abstract'], y, test_size=0.2, random_state=42, stratify=y
)

# Fit TF-IDF on the training text, then apply the fitted vocabulary to the
# test text. The matrices are deliberately left sparse: turning the full
# 500068 x 5000 matrix into a dense float64 array needs roughly 20 GB, and
# both classifiers below accept sparse input directly.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Check shape
print("Training Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)


# %%
def report_metrics(model_name, y_true, y_pred):
    """Print accuracy, precision, recall and F1 for one model's predictions.

    Args:
        model_name: Label printed in the "Model:" header line.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    print(f"\nModel: {model_name}")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))


# %%
# Initialize and train Naïve Bayes model
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Prediction on test data
y_pred_nb = nb.predict(X_test)
report_metrics("Naive Bayes", y_test, y_pred_nb)

# %%
# Initialize and train Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predict on test data
y_pred_log = log_reg.predict(X_test)
report_metrics("Logistic Regression", y_test, y_pred_log)