{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# ReelRating: CS 4804 Mini-Project\n", "## Jeff Suliga, Tanya Acharya, Rishi Patel, Parth Mittal" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# All the imports for the project:\n", "\n", "import numpy as np \n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import classification_report\n", "from sklearn.svm import SVR\n", "import re" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Read the data from the csv file:\n", "\n", "file_path = 'IMDBDataset.csv'\n", "data = pd.read_csv(file_path)\n", "\n", "# data.info()\n", "\n", "#data.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "Classification metrics can't handle a mix of binary and continuous targets", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\Users\\SC\\OneDrive - Virginia Tech\\Fall 2023\\Intro to AI\\reelrating\\reelrating_training.ipynb Cell 4\u001b[0m line \u001b[0;36m2\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=18'>19</a>\u001b[0m regression_model\u001b[39m.\u001b[39mfit(X_train_tfidf, Y_train)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=20'>21</a>\u001b[0m predictions \u001b[39m=\u001b[39m regression_model\u001b[39m.\u001b[39mpredict(X_test_tfidf)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=21'>22</a>\u001b[0m report \u001b[39m=\u001b[39m classification_report(Y_test, predictions, output_dict\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=22'>23</a>\u001b[0m report\n", "File \u001b[1;32mc:\\Users\\SC\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:211\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 206\u001b[0m \u001b[39mwith\u001b[39;00m config_context(\n\u001b[0;32m 207\u001b[0m skip_parameter_validation\u001b[39m=\u001b[39m(\n\u001b[0;32m 208\u001b[0m prefer_skip_nested_validation \u001b[39mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 209\u001b[0m )\n\u001b[0;32m 210\u001b[0m ):\n\u001b[1;32m--> 211\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 212\u001b[0m \u001b[39mexcept\u001b[39;00m InvalidParameterError \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 213\u001b[0m \u001b[39m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 214\u001b[0m \u001b[39m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 215\u001b[0m \u001b[39m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[39m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 217\u001b[0m msg \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39msub(\n\u001b[0;32m 218\u001b[0m \u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mparameter of \u001b[39m\u001b[39m\\\u001b[39m\u001b[39mw+ must be\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m 219\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mparameter of \u001b[39m\u001b[39m{\u001b[39;00mfunc\u001b[39m.\u001b[39m\u001b[39m__qualname__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m must be\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m 220\u001b[0m \u001b[39mstr\u001b[39m(e),\n\u001b[0;32m 221\u001b[0m )\n", "File \u001b[1;32mc:\\Users\\SC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:2539\u001b[0m, in \u001b[0;36mclassification_report\u001b[1;34m(y_true, y_pred, labels, target_names, sample_weight, digits, output_dict, zero_division)\u001b[0m\n\u001b[0;32m 2405\u001b[0m \u001b[39m@validate_params\u001b[39m(\n\u001b[0;32m 2406\u001b[0m {\n\u001b[0;32m 2407\u001b[0m \u001b[39m\"\u001b[39m\u001b[39my_true\u001b[39m\u001b[39m\"\u001b[39m: [\u001b[39m\"\u001b[39m\u001b[39marray-like\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39msparse matrix\u001b[39m\u001b[39m\"\u001b[39m],\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2430\u001b[0m zero_division\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mwarn\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m 2431\u001b[0m ):\n\u001b[0;32m 2432\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Build a text report showing the main classification metrics.\u001b[39;00m\n\u001b[0;32m 2433\u001b[0m \n\u001b[0;32m 2434\u001b[0m \u001b[39m Read more in the :ref:`User Guide <classification_report>`.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2536\u001b[0m \u001b[39m <BLANKLINE>\u001b[39;00m\n\u001b[0;32m 2537\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 2539\u001b[0m y_type, y_true, y_pred \u001b[39m=\u001b[39m _check_targets(y_true, y_pred)\n\u001b[0;32m 2541\u001b[0m \u001b[39mif\u001b[39;00m labels \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 2542\u001b[0m labels \u001b[39m=\u001b[39m unique_labels(y_true, y_pred)\n", "File \u001b[1;32mc:\\Users\\SC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:93\u001b[0m, in \u001b[0;36m_check_targets\u001b[1;34m(y_true, y_pred)\u001b[0m\n\u001b[0;32m 90\u001b[0m y_type \u001b[39m=\u001b[39m {\u001b[39m\"\u001b[39m\u001b[39mmulticlass\u001b[39m\u001b[39m\"\u001b[39m}\n\u001b[0;32m 92\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(y_type) \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m---> 93\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 94\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mClassification metrics can\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt handle a mix of \u001b[39m\u001b[39m{0}\u001b[39;00m\u001b[39m and \u001b[39m\u001b[39m{1}\u001b[39;00m\u001b[39m targets\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(\n\u001b[0;32m 95\u001b[0m type_true, type_pred\n\u001b[0;32m 96\u001b[0m )\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 99\u001b[0m \u001b[39m# We can't have more than one value on y_type => The set is no more needed\u001b[39;00m\n\u001b[0;32m 100\u001b[0m y_type \u001b[39m=\u001b[39m y_type\u001b[39m.\u001b[39mpop()\n", "\u001b[1;31mValueError\u001b[0m: Classification metrics can't handle a mix of binary and continuous targets" ] } ], "source": [ "# Function to clean the text data\n", "\n", "def clean_text(text):\n", " text = re.sub(r'<.*?>', '', text) # Remove HTML tags\n", " text = re.sub(r'[^a-zA-Z0-9\\s]', '', text) # Remove non-alphanumeric characters\n", " text = text.lower() # Convert to lowercase\n", " return text\n", "\n", "data['cleaned-review'] = data['review'].apply(clean_text)\n", "data['label'] = data['sentiment'].map({'positive': 1, 'negative': 0})\n", "\n", "X_train, X_test, Y_train, Y_test = train_test_split(data['cleaned-review'], data['label'], test_size=0.2, random_state=42)\n", "\n", "vectorizer = TfidfVectorizer(max_features=5000)\n", "X_train_tfidf = vectorizer.fit_transform(X_train)\n", "X_test_tfidf = vectorizer.transform(X_test)\n", "\n", "regression_model = SVR()\n", "regression_model.fit(X_train_tfidf, Y_train)\n", "\n", "predictions = regression_model.predict(X_test_tfidf)\n", "report = classification_report(Y_test, predictions, output_dict=True)\n", "report" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def predict_rating(review):\n", " cleaned_review = clean_text(review)\n", " review_tfidf = vectorizer.transform([cleaned_review])\n", " predicted_rating = regression_model.predict(review_tfidf)[0]\n", " return predicted_rating\n", "\n", "print(predict_rating(\"This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }