Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"id": "M8Pr5Yv65ctj"
},
"outputs": [],
"source": [
"# Importing Libraries\n",
"\n",
"import pandas as pd\n",
"import numpy\n",
"\n",
"# Machine Learning\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.cross_decomposition import PLSRegression\n",
"from sklearn import linear_model\n",
"\n",
"# Pipeline\n",
"from imblearn.pipeline import Pipeline\n",
"\n",
"# Ignore Warnings\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"source": [
"# Read the yearly advertising-cost / sales table from the CSV file\n",
"\n",
"csv_path = 'advertisement_budget.csv'\n",
"data = pd.read_csv(csv_path)\n",
"\n",
"# Leave the frame as the last expression so the notebook renders it\n",
"data"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 332
},
"id": "Odoj24d_5tNv",
"outputId": "dfcbca8b-c676-4cdc-d5fc-d562889cdc77"
},
"execution_count": 76,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Year Advertising Cost Yearly Sale\n",
"0 2014 222 12023\n",
"1 2015 240 12543\n",
"2 2016 246 12756\n",
"3 2017 253 12900\n",
"4 2018 271 13087\n",
"5 2019 285 13256\n",
"6 2020 288 13600\n",
"7 2021 298 13920\n",
"8 2022 319 14121"
],
"text/html": [
"\n",
" <div id=\"df-4f0b181a-5ee7-40a8-8fd6-ed9d8d6bf447\">\n",
" <div class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Year</th>\n",
" <th>Advertising Cost</th>\n",
" <th>Yearly Sale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2014</td>\n",
" <td>222</td>\n",
" <td>12023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2015</td>\n",
" <td>240</td>\n",
" <td>12543</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2016</td>\n",
" <td>246</td>\n",
" <td>12756</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2017</td>\n",
" <td>253</td>\n",
" <td>12900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2018</td>\n",
" <td>271</td>\n",
" <td>13087</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2019</td>\n",
" <td>285</td>\n",
" <td>13256</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2020</td>\n",
" <td>288</td>\n",
" <td>13600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>2021</td>\n",
" <td>298</td>\n",
" <td>13920</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2022</td>\n",
" <td>319</td>\n",
" <td>14121</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4f0b181a-5ee7-40a8-8fd6-ed9d8d6bf447')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
" \n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
" </svg>\n",
" </button>\n",
" \n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" flex-wrap:wrap;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-4f0b181a-5ee7-40a8-8fd6-ed9d8d6bf447 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-4f0b181a-5ee7-40a8-8fd6-ed9d8d6bf447');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
" </div>\n",
" "
]
},
"metadata": {},
"execution_count": 76
}
]
},
{
"cell_type": "code",
"source": [
"# Separate the regression target from the single predictor column.\n",
"# NOTE: `data` is rebound to the predictor-only frame, so this cell cannot be\n",
"# re-run without first reloading the CSV.\n",
"\n",
"target = data.loc[:, ['Yearly Sale']]\n",
"data = data.loc[:, ['Advertising Cost']]"
],
"metadata": {
"id": "5FieMQe452uh"
},
"execution_count": 77,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    data,\n",
"    target,\n",
"    test_size=0.1,\n",
"    random_state=42,\n",
")"
],
"metadata": {
"id": "5i-DNGmA6LzQ"
},
"execution_count": 78,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Regression Models\n",
"\n",
"linreg = LinearRegression()  # Linear Regression (ordinary least squares)\n",
"\n",
"# PLS cannot extract more components than there are predictor columns, and the\n",
"# models are trained on a single column ('Advertising Cost'), so use 1.\n",
"# Current scikit-learn releases reject n_components=2 here with a ValueError.\n",
"pls2 = PLSRegression(n_components=1)  # Partial least squares (PLS) Regression\n",
"\n",
"lasso = linear_model.Lasso(alpha=0.1)  # Lasso Regression\n",
"\n",
"# Each estimator is wrapped in a single-step pipeline; the step name is the model label\n",
"linreg_pipe = Pipeline([('Linear Regression', linreg)])\n",
"pls_pipe = Pipeline([('PLS Regression', pls2)])\n",
"lasso_pipe = Pipeline([('Lasso Regression', lasso)])\n"
],
"metadata": {
"id": "YABYFVl06h_h"
},
"execution_count": 79,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Helper that reports prediction accuracy as a percentage\n",
"\n",
"def accuracy(y_test, y_pred):\n",
"    \"\"\"Print and return the prediction accuracy in percent.\n",
"\n",
"    Accuracy is defined as 100 minus the mean absolute percentage error\n",
"    between the actual values and the predictions. With a single test row\n",
"    this is 100 - |actual - predicted| / |actual| * 100.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    y_test : object with a to_numpy() method (pandas DataFrame or Series)\n",
"        Actual target values.\n",
"    y_pred : array-like\n",
"        Predicted values, one per actual value. Flat and (n, 1) shaped\n",
"        inputs are both accepted.\n",
"\n",
"    Returns\n",
"    -------\n",
"    float\n",
"        Accuracy in percent. Actual values equal to zero are not handled\n",
"        specially and lead to a non-finite result.\n",
"    \"\"\"\n",
"    actual = numpy.ravel(y_test.to_numpy()).astype(float)\n",
"    predicted = numpy.ravel(y_pred).astype(float)\n",
"    # The relative error is a fraction, so it must be scaled by 100 before it\n",
"    # is subtracted from 100; abs() stops over-predictions from pushing the\n",
"    # result above 100 %.\n",
"    pct_err = numpy.mean(numpy.abs(actual - predicted) / numpy.abs(actual)) * 100\n",
"    acc = float(100 - pct_err)\n",
"    print(\"Accuracy is: \", acc, \"%\")\n",
"    return acc\n"
],
"metadata": {
"id": "tc_6KSeLDjyi"
},
"execution_count": 80,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"The goal of a machine learning model is to generalize patterns in training data so that you can correctly predict new data that has never been presented to the model. Overfitting occurs when a model adjusts excessively to the training data, seeing patterns that do not exist, and consequently performing poorly in predicting new data.\n",
"\n",
"As our data set is very small, complex models can produce highly flexible curves that explain the training data almost perfectly but are likely to perform poorly on the test data.\n",
"\n",
"Hence we choose 3 simple regression models:\n",
"\n",
"\n",
"\n",
"1. Linear Regression\n",
"\n",
"Linear regression, also known as ordinary least squares (OLS) and linear least squares, is used to understand the mean change in a dependent variable given a one-unit change in each independent variable. This analysis estimates parameters by minimizing the sum of the squared errors (SSE). Linear models are the most common and most straightforward to use. For a continuous dependent variable, linear regression is probably the first type one should consider.\n",
"\n",
"2. Lasso Regression\n",
"\n",
"Lasso regression (least absolute shrinkage and selection operator) performs variable selection that aims to increase prediction accuracy by identifying a simpler model. It is similar to Ridge regression but with variable selection.\n",
"\n",
"3. Partial least squares (PLS) Regression\n",
"\n",
"PLS is useful when there are very few observations compared to the number of independent variables or when the independent variables are highly correlated. PLS decreases the independent variables down to a smaller number of uncorrelated components, similar to Principal Components Analysis. Then, the procedure performs linear regression on these components rather than the original data. PLS emphasizes developing predictive models and is not used for screening variables. PLS uses the correlation structure to identify smaller effects and model multivariate patterns in the dependent variables.\n",
"\n",
"\n"
],
"metadata": {
"id": "Khstw0ZZHUF_"
}
},
{
"cell_type": "code",
"source": [
"# Fit every model on the training split, then compare its prediction for the\n",
"# first held-out row with the actual value\n",
"\n",
"classifiers = [linreg_pipe, pls_pipe, lasso_pipe]\n",
"classifier_names = ['Linear Regression', 'Partial least squares (PLS) regression', 'Lasso Regression']\n",
"\n",
"# The actual values do not depend on the model, so convert them only once\n",
"y = y_test.to_numpy()\n",
"\n",
"for clf, label in zip(classifiers, classifier_names):\n",
"    clf.fit(X_train, y_train)\n",
"    y_pred = clf.predict(X_test)\n",
"    # Some estimators return an (n, 1) array and others a flat one; flatten it\n",
"    # so every model prints a plain number rather than a one-element array\n",
"    first_pred = numpy.ravel(y_pred)[0]\n",
"    print(label, \"model:\")\n",
"    print(\"Actual Yearly Sale\", y[0,0],\"\\t\", \"Predicted Yearly Sale\", first_pred)\n",
"    accuracy(y_test,y_pred)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Hxl0QQE4Bl6-",
"outputId": "77730b10-bc71-4ba8-d038-34dbe7113c3f"
},
"execution_count": 81,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Linear Regression model:\n",
"Actual Yearly Sale 13920 \t Predicted Yearly Sale [13694.96733742]\n",
"Accuracy is: [99.98383386] %\n",
"Partial least squares (PLS) regression model:\n",
"Actual Yearly Sale 13920 \t Predicted Yearly Sale [13694.96733742]\n",
"Accuracy is: [99.98383386] %\n",
"Lasso Regression model:\n",
"Actual Yearly Sale 13920 \t Predicted Yearly Sale 13694.96354622339\n",
"Accuracy is: 99.98383358809076 %\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"The next step is to predict the yearly sale for 2023 if the advertising budget is increased by 15% compared to the previous year. The advertising cost of 2022 is $319, so the advertising cost of 2023 will be 15% higher, i.e. $366.85.\n",
"\n",
"Since all the models have practically equal accuracy, linear regression is used to predict the sale for 2023."
],
"metadata": {
"id": "8QUqIax4JDP1"
}
},
{
"cell_type": "code",
"source": [
"# Calculation of 2023 Yearly Sale\n",
"\n",
"# 2022 advertising cost (319) raised by 15 %\n",
"cost_2023 = 319 * (15 / 100) + 319\n",
"\n",
"# Use the column name the model was trained on ('Advertising Cost') so that\n",
"# scikit-learn sees a matching feature name instead of an unnamed column 0\n",
"new_adv_cost = pd.DataFrame({'Advertising Cost': [cost_2023]})\n",
"_2023_sale = linreg.predict(new_adv_cost)\n",
"\n",
"# predict() returns an array; flatten it so a plain number is printed\n",
"print(\"Using Linear Regression, The Predicted sale for 2023 is with 15% increase in Advertising Cost is: \", numpy.ravel(_2023_sale)[0])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8UfKwEhq7tPd",
"outputId": "327a84e0-b85c-49ff-9594-ec18045540ea"
},
"execution_count": 83,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Using Linear Regression, The Predicted sale for 2023 is with 15% increase in Advertising Cost is: [15091.49391222]\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"Using linear regression, the predicted sale for 2023 with a 15% increase in advertising cost is $15091.49."
],
"metadata": {
"id": "7Y4wM_oiJuGg"
}
}
]
}