diff --git a/Week2/.ipynb_checkpoints/exercise4-checkpoint.ipynb b/Week2/.ipynb_checkpoints/exercise4-checkpoint.ipynb new file mode 100644 index 0000000..067253a --- /dev/null +++ b/Week2/.ipynb_checkpoints/exercise4-checkpoint.ipynb @@ -0,0 +1,516 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook showing data visualisation methods\n", + "#### by Salih MSA" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Importing libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import statistics # mean, median, etc.\n", + "\n", + "# Data visualisation functionality\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "from sklearn.model_selection import train_test_split # method to split dataset into 4\n", + "from sklearn.linear_model import LinearRegression # linear regression algorithm\n", + "from sklearn.metrics import mean_squared_error, mean_absolute_error # accuracy testing method" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Importing data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# note: below shows manually creating a data frame (ie no external imports) (though technically from a programmers standpoint Python reads from a bloody external file / stream, therefore what possible optimisation is already squandered. 
but I digress)\n", + "days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']\n", + "row_names = [\"max_temp\", \"min_temp\", \"avg_temp\", \"wind\"]\n", + "vals = [ # list of values\n", + " [30,31,32,28,27,29,26], # max_temp\n", + " [23,22,20,24,18,19,10], # min_temp\n", + " [25,28,28,26,23,25,24], # avg_temp\n", + " [50,100,40,65,80,75,50] # wind\n", + "]\n", + " \n", + "data = pd.DataFrame(vals, columns=days, index=row_names) # specifiying column names as each given day" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data exploration & Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 4 entries, max_temp to wind\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 Mon 4 non-null int64\n", + " 1 Tue 4 non-null int64\n", + " 2 Wed 4 non-null int64\n", + " 3 Thu 4 non-null int64\n", + " 4 Fri 4 non-null int64\n", + " 5 Sat 4 non-null int64\n", + " 6 Sun 4 non-null int64\n", + "dtypes: int64(7)\n", + "memory usage: 256.0+ bytes\n" + ] + } + ], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MonTueWedThuFriSatSun
max_temp30313228272926
min_temp23222024181910
avg_temp25282826232524
wind501004065807550
\n", + "
" + ], + "text/plain": [ + " Mon Tue Wed Thu Fri Sat Sun\n", + "max_temp 30 31 32 28 27 29 26\n", + "min_temp 23 22 20 24 18 19 10\n", + "avg_temp 25 28 28 26 23 25 24\n", + "wind 50 100 40 65 80 75 50" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1) Create scatterplot of wind vs each day" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] :: [ 50 100 40 65 80 75 50]\n" + ] + } + ], + "source": [ + "# We want to use the correlation of the an independant variables to determine our model\n", + "y = data.iloc[-1,:].values\n", + "x = days\n", + "print(x, \"::\", y)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Create scatterplot\n", + "plt.title('day x wind')\n", + "plt.xlabel('day')\n", + "plt.ylabel('wind')\n", + "plt.scatter(x, y, alpha=0.5) # ...where alpha is size of points\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2) Create scatterplot of average temperature vs each day, where each point varies in size based on value's magnitude" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "x = days\n", + "y = data.iloc[-2,:].values # avg_tmp row" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Create scatterplot\n", + "plt.title('day x temperature')\n", + "plt.xlabel('day')\n", + "plt.ylabel('temperature')\n", + "\n", + "def calculatePointSize(values, default_size=250):\n", + " \"\"\"\n", + " @brief Function to calculate size based on points given\n", + " Essentially, a standard size is set, and depending on how a given point deviates from the mean, each points respective size is altered\n", + " This allows for our values, which are quite similar, to be distinguishable on a plot\n", + " @param values - list containing values of points\n", + " @return list of point sizes\n", + " \"\"\"\n", + " mean = statistics.mean(values)\n", + " sizes = [default_size for i in range(0, len(values), 1)]\n", + " for i in range(0, len(values), 1):\n", + " if values[i] < mean:\n", + " sizes[i] -= (mean - values) * 100\n", + " else:\n", + " sizes[i] += (values - mean) * 100\n", + " return sizes[0]\n", + "\n", + "plt.scatter(x, y, s=calculatePointSize(y)) \n", + "# provide additional argument 's(ize)'\n", + "# since all points are very similar, the size of each is with regards to how it varies from the mean\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3) Create lineplot of average temperature vs each day" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# same x&y as Q2\n", + "# Create lineplot\n", + "plt.title('day x temperature')\n", + "plt.xlabel('day')\n", + "plt.ylabel('temperature (degrees C)')\n", + "plt.plot(x, y)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4) Create lineplot of average temperature vs each day, where a) every point is pinpointed on graph, b) area under line is highlighted" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# same x&y as Q3\n", + "# Create lineplot\n", + "plt.title('day x temperature')\n", + "plt.xlabel('day')\n", + "plt.ylabel('temperature (degrees C)')\n", + "plt.plot(x, y, marker=\"*\", linestyle=\"dashed\")\n", + "plt.fill_between(x, # range of x \n", + " y, # pointSSSS of y (to fill from...)\n", + " min(y)-1, # bottom point of y (...to fill until)\n", + " color='blue', # The outline color\n", + " alpha=0.2) # Transparency of the fill\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4) Create lineplot of max, average & min temperatures of each day" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "x = days\n", + "y1 = data.iloc[-2,:].values # avg_tmp row\n", + "y2 = data.iloc[-3,:].values # min_tmp row\n", + "y3 = data.iloc[-4,:].values # max_tmp row" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Create lineplot\n", + "plt.title('day x temperature')\n", + "plt.xlabel('day')\n", + "plt.ylabel('temperature (degrees C)')\n", + "plt.plot(x, y1)\n", + "plt.plot(x, y2)\n", + "plt.plot(x, y3)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5) Create lineplot of max, average & min temperatures of each day (as Q4), with proper legend and other labelling" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Create lineplot\n", + "plt.title('day x temperature')\n", + "plt.xlabel('day')\n", + "plt.ylabel('temperature (degrees C)')\n", + "plt.plot(x, y1, label=\"avg. temp\")\n", + "plt.plot(x, y2, label=\"min. temp\")\n", + "plt.plot(x, y3, label=\"max. temp\")\n", + "plt.legend()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Week3/Week3.md b/Week3/Week3.md new file mode 100644 index 0000000..aefefd3 --- /dev/null +++ b/Week3/Week3.md @@ -0,0 +1,25 @@ +# 6006CEM_notes +## Week 3 - Logistic Regression (classification) + +### Basic logistic regression + +* Despite its name, it is NOT a regression algorithm, rather a classification algorithm + * ie regression algorithms *return continuous variables* (ie numbers of any range), whilst classification algorithms *return discrete (ie fixed) values* + +* It draws a line which *seperates* values into distinct groups, such that the values fall either side of a drawn line + +* Accuracy of the model is determined by a form of error / loss calculation - in essence it's how many points are correctly classified + * note that it is more complicatd in nature than the one used for Lin.R. 
+ +* when generating a model, we start with a random line and move it by a (specifiable) number of small increments for a (specifiable) large number of times, until a model with high(est) accuracy is produced + +* note: many classification algorithms only accept numeric labels, therefore conversion (a type of preprocessing) is required beforehand + +### Multi-class classification + +* essentially, you create multiple models and you see which class is identified with the highest overall probability + +* there are two methods of / approaches to classification when the dependent variable (the output) has more than two possible classes: + * one-vs-one - a model is produced using the data of two classes at a time (eg. 3 colours: model for red vs green, model for green vs blue, model for blue vs red). generates k(k-1)/2 models (eg. 3 models for k = 3 classes). this approach is less prone to problems caused by an imbalanced / undersampled dataset + * one-vs-rest - models are generated involving the whole dataset, where one target class is positive and the rest are negative (eg. 3 colours: model for red vs blue&green, model for blue vs red&green, model for green vs red&blue). generates k models. this approach has fewer classifiers, so finding the most likely class is quicker +