{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# ReelRating: CS 4804 Mini-Project\n", "## Jeff Suliga, Tanya Acharya, Rishi Patel, Parth Mittal" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# All the imports for the project:\n", "\n", "import numpy as np \n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import classification_report\n", "from sklearn.svm import SVR\n", "import re" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Read the data from the csv file:\n", "\n", "file_path = 'IMDBDataset.csv'\n", "data = pd.read_csv(file_path)\n", "\n", "# data.info()\n", "\n", "#data.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "Classification metrics can't handle a mix of binary and continuous targets", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\Users\\SC\\OneDrive - Virginia Tech\\Fall 2023\\Intro to AI\\reelrating\\reelrating_training.ipynb Cell 4\u001b[0m line \u001b[0;36m2\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=18'>19</a>\u001b[0m regression_model\u001b[39m.\u001b[39mfit(X_train_tfidf, Y_train)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=20'>21</a>\u001b[0m predictions \u001b[39m=\u001b[39m regression_model\u001b[39m.\u001b[39mpredict(X_test_tfidf)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=21'>22</a>\u001b[0m report \u001b[39m=\u001b[39m classification_report(Y_test, predictions, output_dict\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=22'>23</a>\u001b[0m report\n", "File \u001b[1;32mc:\\Users\\SC\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:211\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 206\u001b[0m \u001b[39mwith\u001b[39;00m config_context(\n\u001b[0;32m 207\u001b[0m skip_parameter_validation\u001b[39m=\u001b[39m(\n\u001b[0;32m 208\u001b[0m prefer_skip_nested_validation \u001b[39mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 209\u001b[0m )\n\u001b[0;32m 210\u001b[0m ):\n\u001b[1;32m--> 211\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 212\u001b[0m \u001b[39mexcept\u001b[39;00m InvalidParameterError \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 213\u001b[0m \u001b[39m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 214\u001b[0m \u001b[39m# the function to delegate 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Clean the review text, map the sentiment labels to 0/1, and train a TF-IDF + SVR model.\n", "\n", "def clean_text(text):\n", "    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags\n", "    text = re.sub(r'[^a-zA-Z0-9\\s]', '', text)  # Remove non-alphanumeric characters\n", "    text = text.lower()  # Convert to lowercase\n", "    return text\n", "\n", "data['cleaned-review'] = data['review'].apply(clean_text)\n", "data['label'] = data['sentiment'].map({'positive': 1, 'negative': 0})\n", "\n", "X_train, X_test, Y_train, Y_test = train_test_split(data['cleaned-review'], data['label'], test_size=0.2, random_state=42)\n", "\n", "vectorizer = TfidfVectorizer(max_features=5000)\n", "X_train_tfidf = vectorizer.fit_transform(X_train)\n", "X_test_tfidf = vectorizer.transform(X_test)\n", "\n", "regression_model = SVR()\n", "regression_model.fit(X_train_tfidf, Y_train)\n", "\n", "# SVR returns continuous scores; passing them straight to classification_report raises\n", "# \"Classification metrics can't handle a mix of binary and continuous targets\",\n", "# so threshold at 0.5 to get 0/1 class predictions first.\n", "predictions = regression_model.predict(X_test_tfidf)\n", "binary_predictions = (predictions >= 0.5).astype(int)\n", "report = classification_report(Y_test, binary_predictions, output_dict=True)\n", "report" ] },
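{ "cell_type": "markdown", "metadata": {}, "source": [ "Because `SVR` predicts a continuous score rather than a class, regression metrics complement the thresholded classification report above. The cell below is a minimal sketch of that idea; the `mse` and `mae` names are illustrative additions, not part of the original notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: score the raw (continuous) SVR predictions against the 0/1 labels.\n", "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", "\n", "mse = mean_squared_error(Y_test, predictions)   # mean squared gap between label and score\n", "mae = mean_absolute_error(Y_test, predictions)  # mean absolute gap\n", "print(f'MSE: {mse:.4f}, MAE: {mae:.4f}')" ] },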
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Predict a sentiment score for a new review; SVR scores near 1 lean positive, scores near 0 lean negative.\n", "\n", "def predict_rating(review):\n", "    cleaned_review = clean_text(review)\n", "    review_tfidf = vectorizer.transform([cleaned_review])\n", "    predicted_rating = regression_model.predict(review_tfidf)[0]\n", "    return predicted_rating\n", "\n", "print(predict_rating(\"This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!\"))" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }