Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
{
"cells": [
{
"cell_type": "markdown",
"id": "american-florida",
"metadata": {},
"source": [
"Python Notebook\n",
"This Python Notebook is Property of Premobowei Miriki and was created as part of a movie recommendation project.\n",
"Creating a Movie Recommendation system using a Neural Collaborative Filtering approach\n",
"I will be using the Movielens 25M dataset which consists of 25 million user rating applied to 62,000 movies by 162,000 users.\n",
"The dataset can be found at https://grouplens.org/datasets/movielens/"
]
},
{
"cell_type": "markdown",
"id": "overhead-arlington",
"metadata": {},
"source": [
"# Load the data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "dress-employee",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n",
"/Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:523: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
"/Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:524: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
"/Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
"/Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
"/Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
"/Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:532: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n"
]
}
],
"source": [
"# Load required models\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"\n",
"from keras.layers import Input, Embedding, Flatten, Dot, Dense, Multiply, Concatenate\n",
"from keras.models import Model\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "eastern-thousand",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ['KMP_DUPLICATE_LIB_OK']='True'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "funny-investor",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Load dataset\n",
"dataset = pd.read_csv(\"ratings.csv\")"
]
},
{
"cell_type": "markdown",
"id": "shaped-oxygen",
"metadata": {},
"source": [
"# Data Pre-Processing"
]
},
{
"cell_type": "markdown",
"id": "amended-mattress",
"metadata": {},
"source": [
"## Data Analysis"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "median-retirement",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>296</td>\n",
" <td>5.0</td>\n",
" <td>1147880044</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>306</td>\n",
" <td>3.5</td>\n",
" <td>1147868817</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>307</td>\n",
" <td>5.0</td>\n",
" <td>1147868828</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>665</td>\n",
" <td>5.0</td>\n",
" <td>1147878820</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>899</td>\n",
" <td>3.5</td>\n",
" <td>1147868510</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" userId movieId rating timestamp\n",
"0 1 296 5.0 1147880044\n",
"1 1 306 3.5 1147868817\n",
"2 1 307 5.0 1147868828\n",
"3 1 665 5.0 1147878820\n",
"4 1 899 3.5 1147868510"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Explore the dataset\n",
"dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "comparative-campbell",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 25000095 entries, 0 to 25000094\n",
"Data columns (total 4 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 userId int64 \n",
" 1 movieId int64 \n",
" 2 rating float64\n",
" 3 timestamp int64 \n",
"dtypes: float64(1), int64(3)\n",
"memory usage: 762.9 MB\n"
]
}
],
"source": [
"# This gives information the data being worked with. It shows the datatype of each field which is helpful when\n",
"# collecting new user input. \n",
"dataset.info()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "enclosed-making",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check for missing user Id's\n",
"dataset['userId'].isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "elder-abuse",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check for missing movie Id's\n",
"dataset['movieId'].isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "proud-antibody",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check for missing ratings\n",
"dataset['rating'].isnull().sum()"
]
},
{
"cell_type": "markdown",
"id": "afraid-division",
"metadata": {},
"source": [
"Since al the null checks come back as 0. This means that all fields in the data are filled up and the data is clean \n",
"to use"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ignored-crisis",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Find the minimum and maximum rating given to the movie by the users\n",
"dataset['rating'].min()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "aggregate-contrast",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5.0"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"dataset['rating'].max()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "clean-pottery",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"59047"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create two variable to define the number of users and number of movies in the dataset\n",
"movies_len = len(dataset.movieId.unique())\n",
"movies_len"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "correct-rainbow",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2001185 1\n",
"10627899 1\n",
"4075778 1\n",
"19245863 1\n",
"21816622 1\n",
" ... \n",
"18457961 209157\n",
"17864443 209159\n",
"1036618 209163\n",
"18457962 209169\n",
"18457963 209171\n",
"Name: movieId, Length: 25000095, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.movieId.sort_values(ascending = True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "permanent-general",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"162541"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"users_len = len(dataset.userId.unique())\n",
"users_len"
]
},
{
"cell_type": "markdown",
"id": "bronze-savannah",
"metadata": {},
"source": [
"Note: As seen direcly above the shape of the ratings dataset given is (25000095, 4). Meaning it has 25,000,095 columns\n",
"and 4 rows. The dataset contains 62,000 movies and 162,000 users. This means that the maximum amount of rating we can have is 1,004,000,000 ratings. However as seen in the analysis above we only have 25,000,095 ratings which makes up only 0.25% of all the total possible values. Therefore it is safe to say that the data is quite sparse."
]
},
{
"cell_type": "markdown",
"id": "functioning-recycling",
"metadata": {},
"source": [
"## Change rating scale"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "pleased-holly",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>296</td>\n",
" <td>1</td>\n",
" <td>1147880044</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>306</td>\n",
" <td>1</td>\n",
" <td>1147868817</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>307</td>\n",
" <td>1</td>\n",
" <td>1147868828</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>665</td>\n",
" <td>1</td>\n",
" <td>1147878820</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>899</td>\n",
" <td>1</td>\n",
" <td>1147868510</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" userId movieId rating timestamp\n",
"0 1 296 1 1147880044\n",
"1 1 306 1 1147868817\n",
"2 1 307 1 1147868828\n",
"3 1 665 1 1147878820\n",
"4 1 899 1 1147868510"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# For this project i have opted to change the rating values from a 1-5 rating to a 0/1 rating.\n",
"# Making classification easier with fewer responses\n",
"# Using lambda i am able to automate this process using one line. the lambda function returns a boolean value\n",
"# I used the int function to change its type\n",
"\n",
"dataset[\"rating\"] = dataset[\"rating\"].apply(lambda x: int(x > 3))\n",
"dataset.head()"
]
},
{
"cell_type": "markdown",
"id": "amended-electricity",
"metadata": {},
"source": [
"## Get Training and testing data"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "aad9a034",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: scikit-learn in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (0.24.2)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from scikit-learn) (2.1.0)\n",
"Requirement already satisfied: joblib>=0.11 in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from scikit-learn) (1.0.1)\n",
"Requirement already satisfied: numpy>=1.13.3 in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from scikit-learn) (1.19.2)\n",
"Requirement already satisfied: scipy>=0.19.1 in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from scikit-learn) (1.5.2)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install -U scikit-learn"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "lined-invasion",
"metadata": {},
"outputs": [],
"source": [
"# Using sklears train test split to split the data into random subsets \n",
"# I specified the test size to be 0.35 (35% of the original data) \n",
"# And a random state of 28 to make sure i get the same split each time \n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"train, test = train_test_split(dataset, test_size = 0.35, random_state = 28)"
]
},
{
"cell_type": "markdown",
"id": "objective-outside",
"metadata": {},
"source": [
"### Explore train and test data"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "great-columbia",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>15558276</th>\n",
" <td>100737</td>\n",
" <td>30793</td>\n",
" <td>1</td>\n",
" <td>1246805953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7435598</th>\n",
" <td>48267</td>\n",
" <td>7143</td>\n",
" <td>1</td>\n",
" <td>1305372348</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15963881</th>\n",
" <td>103498</td>\n",
" <td>48043</td>\n",
" <td>1</td>\n",
" <td>1294403268</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17990831</th>\n",
" <td>116544</td>\n",
" <td>4262</td>\n",
" <td>1</td>\n",
" <td>1177293460</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15962036</th>\n",
" <td>103487</td>\n",
" <td>2615</td>\n",
" <td>1</td>\n",
" <td>959990693</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" userId movieId rating timestamp\n",
"15558276 100737 30793 1 1246805953\n",
"7435598 48267 7143 1 1305372348\n",
"15963881 103498 48043 1 1294403268\n",
"17990831 116544 4262 1 1177293460\n",
"15962036 103487 2615 1 959990693"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Explore training data\n",
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "impressive-jaguar",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userId</th>\n",
" <th>movieId</th>\n",
" <th>rating</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>10660080</th>\n",
" <td>69224</td>\n",
" <td>181545</td>\n",
" <td>1</td>\n",
" <td>1525157229</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8479980</th>\n",
" <td>55195</td>\n",
" <td>132480</td>\n",
" <td>1</td>\n",
" <td>1446660403</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21694756</th>\n",
" <td>141028</td>\n",
" <td>3977</td>\n",
" <td>0</td>\n",
" <td>1061333069</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13359258</th>\n",
" <td>86443</td>\n",
" <td>27846</td>\n",
" <td>1</td>\n",
" <td>1173055252</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20095339</th>\n",
" <td>130627</td>\n",
" <td>1722</td>\n",
" <td>1</td>\n",
" <td>1218986817</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" userId movieId rating timestamp\n",
"10660080 69224 181545 1 1525157229\n",
"8479980 55195 132480 1 1446660403\n",
"21694756 141028 3977 0 1061333069\n",
"13359258 86443 27846 1 1173055252\n",
"20095339 130627 1722 1 1218986817"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Explore testing data\n",
"test.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "breeding-contribution",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(8750034, 4) (16250061, 4)\n"
]
}
],
"source": [
"# Explore the shape of both traing and test data\n",
"print(test.shape, train.shape)"
]
},
{
"cell_type": "markdown",
"id": "honest-triumph",
"metadata": {},
"source": [
"# Model Creation"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "bizarre-breeding",
"metadata": {},
"outputs": [],
"source": [
"# Define input layers\n",
"# Both inputs for the movie and the user will expect a one dimensional array with one element as input\n",
"movie_input = Input(shape=[1], name = 'Movies')\n",
"user_input = Input(shape=[1], name = 'Users')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "special-tractor",
"metadata": {},
"outputs": [],
"source": [
"# Define embedding layers\n",
"# Where MF stands for Matrix Factorization\n",
"# Define the input dimensions as the number of movies and users respectively and their output dimension as 4\n",
"\n",
"\n",
"movies_mf_embedding = Embedding(209171 + 1, 4, name= 'Embedded_Movies_MF', input_length=1)\n",
"\n",
"users_mf_embedding = Embedding(users_len + 1, 4, name = 'Embedded_Users_MF', input_length=1,)\n",
"\n",
"movies_mlp_embedding = Embedding(209171 + 1, 4, name = 'Embeded_Movies_MLP',input_length=1,)\n",
"\n",
"users_mlp_embedding = Embedding(users_len + 1, 4 , name = 'Embeded_Users_MLP', input_length=1,)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "crazy-bloom",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Matrix factorization function\n",
"# The purpose of this function is to hold the data for the matrix factorization part of my code\n",
"# I have used the flatten fuction to flatten the data and the dot product to calcuate the movie rating\n",
"def MatrixFactorizationNN(movies_embedding_MF, users_embeding_MF):\n",
" movies_mf_flatten = Flatten(name = 'Flatten_Movies_MF')(movies_mf_embedding(movie_input))\n",
" users_mf_flatten = Flatten(name = 'Flatten_Users_MF')(users_mf_embedding(user_input))\n",
" matrix_vect = Dot(name = 'Dot_Product_MF', axes = 1)([movies_mf_flatten, users_mf_flatten])\n",
" return matrix_vect"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "stone-certificate",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"__________________________________________________________________________________________________\n",
"Layer (type) Output Shape Param # Connected to \n",
"==================================================================================================\n",
"Movies (InputLayer) (None, 1) 0 \n",
"__________________________________________________________________________________________________\n",
"Users (InputLayer) (None, 1) 0 \n",
"__________________________________________________________________________________________________\n",
"Embedded_Movies_MF (Embedding) (None, 1, 4) 836688 Movies[0][0] \n",
"__________________________________________________________________________________________________\n",
"Embedded_Users_MF (Embedding) (None, 1, 4) 650168 Users[0][0] \n",
"__________________________________________________________________________________________________\n",
"Flatten_Movies_MF (Flatten) (None, 4) 0 Embedded_Movies_MF[0][0] \n",
"__________________________________________________________________________________________________\n",
"Flatten_Users_MF (Flatten) (None, 4) 0 Embedded_Users_MF[0][0] \n",
"__________________________________________________________________________________________________\n",
"Dot_Product_MF (Dot) (None, 1) 0 Flatten_Movies_MF[0][0] \n",
" Flatten_Users_MF[0][0] \n",
"==================================================================================================\n",
"Total params: 1,486,856\n",
"Trainable params: 1,486,856\n",
"Non-trainable params: 0\n",
"__________________________________________________________________________________________________\n"
]
}
],
"source": [
"# Create model using the output of the matrix factorization\n",
"model = Model([movie_input, user_input], MatrixFactorizationNN(movies_mf_embedding, users_mf_embedding))\n",
"model.compile(optimizer = 'adam', loss = 'mean_squared_error')\n",
"model.summary()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "9ba2f50e",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: GraphViz in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (0.16)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install GraphViz"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "ae6714cc",
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"<svg height=\"304pt\" viewBox=\"0.00 0.00 745.50 304.00\" width=\"746pt\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g class=\"graph\" id=\"graph0\" transform=\"scale(1 1) rotate(0) translate(4 300)\">\n",
"<title>G</title>\n",
"<polygon fill=\"white\" points=\"-4,4 -4,-300 741.5,-300 741.5,4 -4,4\" stroke=\"transparent\"/>\n",
"<!-- 140679661934688 -->\n",
"<g class=\"node\" id=\"node1\">\n",
"<title>140679661934688</title>\n",
"<polygon fill=\"none\" points=\"55.5,-249.5 55.5,-295.5 309.5,-295.5 309.5,-249.5 55.5,-249.5\" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"119.5\" y=\"-268.8\">Movies: InputLayer</text>\n",
"<polyline fill=\"none\" points=\"183.5,-249.5 183.5,-295.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"211.5\" y=\"-280.3\">input:</text>\n",
"<polyline fill=\"none\" points=\"183.5,-272.5 239.5,-272.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"211.5\" y=\"-257.3\">output:</text>\n",
"<polyline fill=\"none\" points=\"239.5,-249.5 239.5,-295.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"274.5\" y=\"-280.3\">(None, 1)</text>\n",
"<polyline fill=\"none\" points=\"239.5,-272.5 309.5,-272.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"274.5\" y=\"-257.3\">(None, 1)</text>\n",
"</g>\n",
"<!-- 140679662064920 -->\n",
"<g class=\"node\" id=\"node3\">\n",
"<title>140679662064920</title>\n",
"<polygon fill=\"none\" points=\"0,-166.5 0,-212.5 365,-212.5 365,-166.5 0,-166.5\" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"112.5\" y=\"-185.8\">Embedded_Movies_MF: Embedding</text>\n",
"<polyline fill=\"none\" points=\"225,-166.5 225,-212.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"253\" y=\"-197.3\">input:</text>\n",
"<polyline fill=\"none\" points=\"225,-189.5 281,-189.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"253\" y=\"-174.3\">output:</text>\n",
"<polyline fill=\"none\" points=\"281,-166.5 281,-212.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"323\" y=\"-197.3\">(None, 1)</text>\n",
"<polyline fill=\"none\" points=\"281,-189.5 365,-189.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"323\" y=\"-174.3\">(None, 1, 4)</text>\n",
"</g>\n",
"<!-- 140679661934688&#45;&gt;140679662064920 -->\n",
"<g class=\"edge\" id=\"edge1\">\n",
"<title>140679661934688-&gt;140679662064920</title>\n",
"<path d=\"M182.5,-249.37C182.5,-241.15 182.5,-231.66 182.5,-222.73\" fill=\"none\" stroke=\"black\"/>\n",
"<polygon fill=\"black\" points=\"186,-222.61 182.5,-212.61 179,-222.61 186,-222.61\" stroke=\"black\"/>\n",
"</g>\n",
"<!-- 140679661933176 -->\n",
"<g class=\"node\" id=\"node2\">\n",
"<title>140679661933176</title>\n",
"<polygon fill=\"none\" points=\"438.5,-249.5 438.5,-295.5 682.5,-295.5 682.5,-249.5 438.5,-249.5\" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"497.5\" y=\"-268.8\">Users: InputLayer</text>\n",
"<polyline fill=\"none\" points=\"556.5,-249.5 556.5,-295.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"584.5\" y=\"-280.3\">input:</text>\n",
"<polyline fill=\"none\" points=\"556.5,-272.5 612.5,-272.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"584.5\" y=\"-257.3\">output:</text>\n",
"<polyline fill=\"none\" points=\"612.5,-249.5 612.5,-295.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"647.5\" y=\"-280.3\">(None, 1)</text>\n",
"<polyline fill=\"none\" points=\"612.5,-272.5 682.5,-272.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"647.5\" y=\"-257.3\">(None, 1)</text>\n",
"</g>\n",
"<!-- 140679662064864 -->\n",
"<g class=\"node\" id=\"node4\">\n",
"<title>140679662064864</title>\n",
"<polygon fill=\"none\" points=\"383.5,-166.5 383.5,-212.5 737.5,-212.5 737.5,-166.5 383.5,-166.5\" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"490.5\" y=\"-185.8\">Embedded_Users_MF: Embedding</text>\n",
"<polyline fill=\"none\" points=\"597.5,-166.5 597.5,-212.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"625.5\" y=\"-197.3\">input:</text>\n",
"<polyline fill=\"none\" points=\"597.5,-189.5 653.5,-189.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"625.5\" y=\"-174.3\">output:</text>\n",
"<polyline fill=\"none\" points=\"653.5,-166.5 653.5,-212.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"695.5\" y=\"-197.3\">(None, 1)</text>\n",
"<polyline fill=\"none\" points=\"653.5,-189.5 737.5,-189.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"695.5\" y=\"-174.3\">(None, 1, 4)</text>\n",
"</g>\n",
"<!-- 140679661933176&#45;&gt;140679662064864 -->\n",
"<g class=\"edge\" id=\"edge2\">\n",
"<title>140679661933176-&gt;140679662064864</title>\n",
"<path d=\"M560.5,-249.37C560.5,-241.15 560.5,-231.66 560.5,-222.73\" fill=\"none\" stroke=\"black\"/>\n",
"<polygon fill=\"black\" points=\"564,-222.61 560.5,-212.61 557,-222.61 564,-222.61\" stroke=\"black\"/>\n",
"</g>\n",
"<!-- 140679661934240 -->\n",
"<g class=\"node\" id=\"node5\">\n",
"<title>140679661934240</title>\n",
"<polygon fill=\"none\" points=\"46.5,-83.5 46.5,-129.5 364.5,-129.5 364.5,-83.5 46.5,-83.5\" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"135.5\" y=\"-102.8\">Flatten_Movies_MF: Flatten</text>\n",
"<polyline fill=\"none\" points=\"224.5,-83.5 224.5,-129.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"252.5\" y=\"-114.3\">input:</text>\n",
"<polyline fill=\"none\" points=\"224.5,-106.5 280.5,-106.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"252.5\" y=\"-91.3\">output:</text>\n",
"<polyline fill=\"none\" points=\"280.5,-83.5 280.5,-129.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"322.5\" y=\"-114.3\">(None, 1, 4)</text>\n",
"<polyline fill=\"none\" points=\"280.5,-106.5 364.5,-106.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"322.5\" y=\"-91.3\">(None, 4)</text>\n",
"</g>\n",
"<!-- 140679662064920&#45;&gt;140679661934240 -->\n",
"<g class=\"edge\" id=\"edge3\">\n",
"<title>140679662064920-&gt;140679661934240</title>\n",
"<path d=\"M188.78,-166.37C191.14,-158.06 193.87,-148.45 196.43,-139.43\" fill=\"none\" stroke=\"black\"/>\n",
"<polygon fill=\"black\" points=\"199.86,-140.18 199.22,-129.61 193.12,-138.27 199.86,-140.18\" stroke=\"black\"/>\n",
"</g>\n",
"<!-- 140679661935528 -->\n",
"<g class=\"node\" id=\"node6\">\n",
"<title>140679661935528</title>\n",
"<polygon fill=\"none\" points=\"394.5,-83.5 394.5,-129.5 702.5,-129.5 702.5,-83.5 394.5,-83.5\" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"478.5\" y=\"-102.8\">Flatten_Users_MF: Flatten</text>\n",
"<polyline fill=\"none\" points=\"562.5,-83.5 562.5,-129.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"590.5\" y=\"-114.3\">input:</text>\n",
"<polyline fill=\"none\" points=\"562.5,-106.5 618.5,-106.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"590.5\" y=\"-91.3\">output:</text>\n",
"<polyline fill=\"none\" points=\"618.5,-83.5 618.5,-129.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"660.5\" y=\"-114.3\">(None, 1, 4)</text>\n",
"<polyline fill=\"none\" points=\"618.5,-106.5 702.5,-106.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"660.5\" y=\"-91.3\">(None, 4)</text>\n",
"</g>\n",
"<!-- 140679662064864&#45;&gt;140679661935528 -->\n",
"<g class=\"edge\" id=\"edge4\">\n",
"<title>140679662064864-&gt;140679661935528</title>\n",
"<path d=\"M557.22,-166.37C556,-158.15 554.6,-148.66 553.27,-139.73\" fill=\"none\" stroke=\"black\"/>\n",
"<polygon fill=\"black\" points=\"556.7,-138.99 551.78,-129.61 549.78,-140.01 556.7,-138.99\" stroke=\"black\"/>\n",
"</g>\n",
"<!-- 140679661935696 -->\n",
"<g class=\"node\" id=\"node7\">\n",
"<title>140679661935696</title>\n",
"<polygon fill=\"none\" points=\"200.5,-0.5 200.5,-46.5 540.5,-46.5 540.5,-0.5 200.5,-0.5\" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"272.5\" y=\"-19.8\">Dot_Product_MF: Dot</text>\n",
"<polyline fill=\"none\" points=\"344.5,-0.5 344.5,-46.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"372.5\" y=\"-31.3\">input:</text>\n",
"<polyline fill=\"none\" points=\"344.5,-23.5 400.5,-23.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"372.5\" y=\"-8.3\">output:</text>\n",
"<polyline fill=\"none\" points=\"400.5,-0.5 400.5,-46.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"470.5\" y=\"-31.3\">[(None, 4), (None, 4)]</text>\n",
"<polyline fill=\"none\" points=\"400.5,-23.5 540.5,-23.5 \" stroke=\"black\"/>\n",
"<text font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"470.5\" y=\"-8.3\">(None, 1)</text>\n",
"</g>\n",
"<!-- 140679661934240&#45;&gt;140679661935696 -->\n",
"<g class=\"edge\" id=\"edge5\">\n",
"<title>140679661934240-&gt;140679661935696</title>\n",
"<path d=\"M250.59,-83.37C270.87,-73.41 294.97,-61.58 316.24,-51.14\" fill=\"none\" stroke=\"black\"/>\n",
"<polygon fill=\"black\" points=\"318.03,-54.16 325.47,-46.61 314.95,-47.87 318.03,-54.16\" stroke=\"black\"/>\n",
"</g>\n",
"<!-- 140679661935528&#45;&gt;140679661935696 -->\n",
"<g class=\"edge\" id=\"edge6\">\n",
"<title>140679661935528-&gt;140679661935696</title>\n",
"<path d=\"M499.86,-83.37C477.79,-73.32 451.51,-61.36 428.42,-50.86\" fill=\"none\" stroke=\"black\"/>\n",
"<polygon fill=\"black\" points=\"429.63,-47.56 419.08,-46.61 426.73,-53.93 429.63,-47.56\" stroke=\"black\"/>\n",
"</g>\n",
"</g>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.SVG object>"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"import keras\n",
"from IPython.display import SVG\n",
"from keras.optimizers import Adam\n",
"from keras.utils.vis_utils import model_to_dot\n",
"SVG(model_to_dot(model, show_shapes=True, show_layer_names=True, rankdir='HB').create(prog='dot', format='svg'))"
]
},
{
"cell_type": "markdown",
"id": "legislative-vault",
"metadata": {},
"source": [
"# Train the model"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "abandoned-myrtle",
"metadata": {},
"outputs": [],
"source": [
"# Beacsue the raining process take a long time especially when dealing with a large dataset such as this one \n",
"# I used the load model from keras to store the trained model as a .h5 file and load it when needed\n",
"\n",
"from keras.models import load_model\n",
"\n",
"if os.path.exists('recommender_model2.h5'):\n",
" model = load_model('recommender_model2.h5')\n",
"else:\n",
" history = model.fit([train.movieId, train.userId], train.rating, epochs= 4, verbose=1)\n",
" model.save('recommender_model2.h5')\n",
" plt.plot(history.history['loss'])\n",
" plt.xlabel = ('Epochs')\n",
" plt.ylabel = (\"Training_Error\")"
]
},
{
"cell_type": "markdown",
"id": "phantom-louisville",
"metadata": {},
"source": [
"# Evaluate the Model"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "excess-quantity",
"metadata": {},
"outputs": [],
"source": [
"#loss, accuracy = model.evaluate([test.movieId, test.userId], test.rating)\n",
"y_hat = np.round(model.predict([test.movieId, test.userId]),0)\n",
"y_true = test.rating"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "2fa542d4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.2689119836562921"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.metrics import mean_absolute_error\n",
"mean_absolute_error(y_true, y_hat)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "otherwise-creativity",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0.7927092] 1\n",
"[1.1115079] 1\n",
"[0.0020586] 0\n",
"[1.0242506] 1\n",
"[0.22690105] 1\n",
"[0.50635934] 1\n",
"[0.65502155] 0\n",
"[0.37845236] 0\n",
"[0.7152109] 1\n",
"[0.07030999] 0\n",
"[0.6284604] 0\n",
"[-0.17061862] 0\n"
]
},
{
"data": {
"text/plain": [
"[None, None, None, None, None, None, None, None, None, None, None, None]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions = model.predict([test.movieId.head(12), test.userId.head(12)])\n",
"[print(predictions[i], test.rating.iloc[i]) for i in range (0,12)]"
]
},
{
"cell_type": "markdown",
"id": "average-estonia",
"metadata": {},
"source": [
"### Mean Squared Error"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "antique-complement",
"metadata": {},
"outputs": [],
"source": [
"y_hat = np.round(model.predict([test.movieId, test.userId]),0)\n",
"y_true = test.rating"
]
},
{
"cell_type": "markdown",
"id": "perfect-tension",
"metadata": {},
"source": [
"# Make recommendations"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "following-appendix",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([131072, 1, 2, ..., 131066, 131068, 131070])"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"moviedata = np.array(list(set(dataset.movieId)))\n",
"moviedata"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "economic-aruba",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([10, 10, 10, ..., 10, 10, 10])"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"user = np.array([10 for i in range(len(moviedata))])\n",
"user"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "documentary-coating",
"metadata": {},
"outputs": [],
"source": [
"predictions = model.predict([moviedata, user])\n",
"predictions = np.array([a[0] for a in predictions])"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "single-harbor",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([40511, 13673, 49893, 35302, 58824, 37541, 42699, 28908, 34748,\n",
" 50747])"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"recommendedmovies = (predictions).argsort()[:10]\n",
"recommendedmovies"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "recovered-snapshot",
"metadata": {},
"outputs": [],
"source": [
"movies = pd.read_csv(\"movies.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "italic-allowance",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [movieId, title, genres]\n",
"Index: []"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies[movies['movieId'].isin (recommendedmovies)]"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "opposite-police",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 10 11 17 32 34 36 39 47 50 95 110 111\n",
" 141 145 150 153 161 165 185 215 253 260 277 288\n",
" 292 296 300 318 339 344 345 349 356 357 364 372\n",
" 377 380 412 434 454 457 474 480 500 527 539 541\n",
" 586 587 588 589 590 592 593 595 597 608 647 648\n",
" 733 736 750 780 788 838 858 899 909 912 924 926\n",
" 932 940 965 969 1036 1078 1079 1089 1092 1097 1101 1172\n",
" 1183 1193 1196 1198 1200 1203 1210 1213 1214 1220 1221 1225\n",
" 1230 1240 1246 1258 1259 1265 1269 1270 1287 1291 1292 1307\n",
" 1339 1377 1387 1391 1393 1453 1459 1479 1485 1517 1541 1573\n",
" 1580 1608 1610 1617 1625 1641 1653 1693 1694 1704 1721 1784\n",
" 1917 1923 1939 1953 1961 1968 2000 2002 2011 2012 2018 2028\n",
" 2072 2081 2115 2133 2141 2174 2248 2262 2314 2321 2329 2353\n",
" 2384 2396 2406 2427 2445 2502 2505 2520 2521 2522 2539 2571\n",
" 2617 2628 2640 2683 2706 2707 2710 2712 2723 2731 2753 2762\n",
" 2791 2797 2819 2858 2870 2871 2915 2918 2959 2967 2987 2997\n",
" 3004 3081 3105 3107 3147 3175 3206 3251 3253 3255 3364 3370\n",
" 3401 3408 3420 3471 3481 3525 3534 3578 3599 3623 3649 3736\n",
" 3742 3751 3755 3793 3801 3824 3841 3897 3911 3948 3977 3994\n",
" 3996 4022 4027 4031 4034 4080 4084 4085 4159 4167 4174 4191\n",
" 4226 4238 4246 4305 4306 4370 4392 4393 4419 4462 4464 4465\n",
" 4474 4564 4639 4844 4963 4973 4979 4995 5049 5096 5114 5151\n",
" 5226 5299 5349 5377 5404 5418 5445 5459 5472 5508 5581 5610\n",
" 5618 5669 5677 5680 5707 5902 5945 5954 5960 5989 6218 6254\n",
" 6287 6333 6377 6378 6440 6493 6502 6539 6548 6573 6586 6620\n",
" 6711 6750 6787 6874 6953 6954 6957 7121 7147 7173 7293 7315\n",
" 7361 7438 7444 8094 8120 8360 8464 8638 8665 8695 8784 8838\n",
" 8874 8943 8948 8949 8961 8968 8973 25868 26078 27317 27724 30812\n",
" 32584 32587 33166 33794 33836 34048 34338 35836 42938 43936 44191 44195\n",
" 44759 45106 46578 46723 47640 48304 48326 48394 48516 48774 48780 49007\n",
" 49272 50514 50802 50923 51255 51471 51662 51834 52967 53953 54286 54503\n",
" 54997 55052 55069 55363 55765 55805 55820 55830 56367 56949 57243 58295\n",
" 58559 58803 59258 59315 59910 60069 60103 61986 63082 63853 64957 66665\n",
" 69243 70932 71460]\n"
]
}
],
"source": [
"relationship = dataset.loc[dataset['userId'] == 31]\n",
"relationship = relationship['movieId'].to_numpy()\n",
"print(relationship)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "perceived-devil",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>GoldenEye (1995)</td>\n",
" <td>Action|Adventure|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>11</td>\n",
" <td>American President, The (1995)</td>\n",
" <td>Comedy|Drama|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>17</td>\n",
" <td>Sense and Sensibility (1995)</td>\n",
" <td>Drama|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>32</td>\n",
" <td>Twelve Monkeys (a.k.a. 12 Monkeys) (1995)</td>\n",
" <td>Mystery|Sci-Fi|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>34</td>\n",
" <td>Babe (1995)</td>\n",
" <td>Children|Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12901</th>\n",
" <td>64957</td>\n",
" <td>Curious Case of Benjamin Button, The (2008)</td>\n",
" <td>Drama|Fantasy|Mystery|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13103</th>\n",
" <td>66665</td>\n",
" <td>Away We Go (2009)</td>\n",
" <td>Comedy|Drama|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13402</th>\n",
" <td>69243</td>\n",
" <td>Before the Rains (2007)</td>\n",
" <td>Drama|Romance|Thriller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13694</th>\n",
" <td>70932</td>\n",
" <td>My Life in Ruins (2009)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13804</th>\n",
" <td>71460</td>\n",
" <td>Wanted (2009)</td>\n",
" <td>Action|Romance</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>399 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" movieId title \\\n",
"9 10 GoldenEye (1995) \n",
"10 11 American President, The (1995) \n",
"16 17 Sense and Sensibility (1995) \n",
"31 32 Twelve Monkeys (a.k.a. 12 Monkeys) (1995) \n",
"33 34 Babe (1995) \n",
"... ... ... \n",
"12901 64957 Curious Case of Benjamin Button, The (2008) \n",
"13103 66665 Away We Go (2009) \n",
"13402 69243 Before the Rains (2007) \n",
"13694 70932 My Life in Ruins (2009) \n",
"13804 71460 Wanted (2009) \n",
"\n",
" genres \n",
"9 Action|Adventure|Thriller \n",
"10 Comedy|Drama|Romance \n",
"16 Drama|Romance \n",
"31 Mystery|Sci-Fi|Thriller \n",
"33 Children|Drama \n",
"... ... \n",
"12901 Drama|Fantasy|Mystery|Romance \n",
"13103 Comedy|Drama|Romance \n",
"13402 Drama|Romance|Thriller \n",
"13694 Comedy \n",
"13804 Action|Romance \n",
"\n",
"[399 rows x 3 columns]"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies[movies['movieId'].isin (relationship)]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "diagnostic-damage",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: coremltools in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (4.1)\n",
"Requirement already satisfied: six>=1.10.0 in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from coremltools) (1.15.0)\n",
"Requirement already satisfied: scipy in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from coremltools) (1.5.2)\n",
"Requirement already satisfied: numpy<1.20,>=1.14.5 in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from coremltools) (1.19.2)\n",
"Requirement already satisfied: attr in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from coremltools) (0.3.1)\n",
"Requirement already satisfied: protobuf>=3.1.0 in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from coremltools) (3.14.0)\n",
"Requirement already satisfied: attrs in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from coremltools) (20.3.0)\n",
"Requirement already satisfied: tqdm in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from coremltools) (4.59.0)\n",
"Requirement already satisfied: sympy in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from coremltools) (1.8)\n",
"Requirement already satisfied: packaging in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from coremltools) (20.9)\n",
"Requirement already satisfied: pyparsing>=2.0.2 in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from packaging->coremltools) (2.4.7)\n",
"Requirement already satisfied: mpmath>=0.19 in /Users/premoboweimiriki/opt/anaconda3/envs/RecSys2/lib/python3.6/site-packages (from sympy->coremltools) (1.2.1)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install coremltools"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "noted-stations",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:scikit-learn version 0.24.2 is not supported. Minimum required version: 0.17. Maximum required version: 0.19.2. Disabling scikit-learn conversion API.\n"
]
}
],
"source": [
"from keras.models import load_model\n",
"import coremltools"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "planned-infrared",
"metadata": {},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "EOL while scanning string literal (<ipython-input-42-d4cef054d021>, line 4)",
"output_type": "error",
"traceback": [
"\u001b[0;36m File \u001b[0;32m\"<ipython-input-42-d4cef054d021>\"\u001b[0;36m, line \u001b[0;32m4\u001b[0m\n\u001b[0;31m your_model.output_description['output'] = 'Prediction of Digit\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m EOL while scanning string literal\n"
]
}
],
"source": [
"your_model.author = 'Premo Miriki'\n",
"your_model.short_description = 'Movie Recommendation with Movielens'\n",
"your_model.input_description['image'] = ''\n",
"your_model.output_description['output'] = 'Prediction of Digit"
]
},
{
"cell_type": "markdown",
"id": "94316417",
"metadata": {},
"source": [
"### Convert model to CoreML"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "32aaa29b",
"metadata": {},
"outputs": [],
"source": [
"import coremltools"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f10cd7a",
"metadata": {},
"outputs": [],
"source": [
"coreml_model = coremltools.converters.keras.convert(model)\n",
"coreml_model.save('recSys.mlmodel')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea5b60f4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}