diff --git a/nlp.ipynb b/nlp.ipynb
new file mode 100644
index 0000000..32bdebe
--- /dev/null
+++ b/nlp.ipynb
@@ -0,0 +1,79 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "\n",
+    "# Fixed seed so the train/test split (and the reported accuracy) is reproducible\n",
+    "SEED = 42\n",
+    "\n",
+    "# Load the data\n",
+    "df = pd.read_csv('data.csv')\n",
+    "\n",
+    "# Preprocess the descriptions by lowercasing and removing punctuation.\n",
+    "# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by\n",
+    "# default, which would silently leave all punctuation in place.\n",
+    "df['description'] = df['description'].str.lower()\n",
+    "df['description'] = df['description'].str.replace(r'[^\\w\\s]', '', regex=True)\n",
+    "\n",
+    "# Split the dataset into training and test sets\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    df['description'], df['timeS'], test_size=0.2, random_state=SEED\n",
+    ")\n",
+    "\n",
+    "# Extract features using a bag-of-words approach\n",
+    "vectorizer = CountVectorizer()\n",
+    "X_train_features = vectorizer.fit_transform(X_train)\n",
+    "X_test_features = vectorizer.transform(X_test)\n",
+    "\n",
+    "# Train a support vector machine classifier\n",
+    "classifier = SVC()\n",
+    "classifier.fit(X_train_features, y_train)\n",
+    "\n",
+    "# Persist the fitted model. scikit-learn estimators have no .save() method,\n",
+    "# so serialize with pickle. Only unpickle this file from a trusted source.\n",
+    "# NOTE: the fitted vectorizer is not persisted; it is needed to featurize new text.\n",
+    "with open('model.m5', 'wb') as model_file:\n",
+    "    pickle.dump(classifier, model_file)\n",
+    "\n",
+    "# Make predictions on the test set\n",
+    "y_pred = classifier.predict(X_test_features)\n",
+    "\n",
+    "# Evaluate the model's performance\n",
+    "accuracy = accuracy_score(y_test, y_pred)\n",
+    "print(f'Model accuracy: {accuracy:.2f}')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (Pyodide)",
+   "language": "python",
+   "name": "python"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "python",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}