Testing Model Training

8a396e6e · Parth Mittal · 8f329351 · 8a396e6e · 8a396e6e · 8a396e6e
Commit 8a396e6e authored 1 year ago by Parth Mittal
--- a/GUI.py
+++ b/GUI.py
--- a/model.py
+++ b/model.py
--- a/reelrating_training.ipynb
+++ b/reelrating_training.ipynb
@@ -10,7 +10,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -18,9 +18,89 @@
    "\n",
    "import numpy as np \n",
    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import classification_report\n",
+    "from sklearn.svm import SVR\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the data from the csv file:\n",
+    "\n",
+    "file_path = 'IMDBDataset.csv'\n",
+    "data = pd.read_csv(file_path)\n",
+    "\n",
+    "# data.info()\n",
+    "\n",
+    "#data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "Classification metrics can't handle a mix of binary and continuous targets",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "\u001b[1;32mc:\\Users\\SC\\OneDrive - Virginia Tech\\Fall 2023\\Intro to AI\\reelrating\\reelrating_training.ipynb Cell 4\u001b[0m line \u001b[0;36m2\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=18'>19</a>\u001b[0m regression_model\u001b[39m.\u001b[39mfit(X_train_tfidf, Y_train)\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=20'>21</a>\u001b[0m predictions \u001b[39m=\u001b[39m regression_model\u001b[39m.\u001b[39mpredict(X_test_tfidf)\n\u001b[1;32m---> <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=21'>22</a>\u001b[0m report \u001b[39m=\u001b[39m classification_report(Y_test, predictions, output_dict\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m     <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=22'>23</a>\u001b[0m report\n",
+      "File \u001b[1;32mc:\\Users\\SC\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:211\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    205\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m    206\u001b[0m     \u001b[39mwith\u001b[39;00m config_context(\n\u001b[0;32m    207\u001b[0m         skip_parameter_validation\u001b[39m=\u001b[39m(\n\u001b[0;32m    208\u001b[0m             prefer_skip_nested_validation \u001b[39mor\u001b[39;00m global_skip_validation\n\u001b[0;32m    209\u001b[0m         )\n\u001b[0;32m    210\u001b[0m     ):\n\u001b[1;32m--> 211\u001b[0m         \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m    212\u001b[0m \u001b[39mexcept\u001b[39;00m InvalidParameterError \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m    213\u001b[0m     \u001b[39m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m    214\u001b[0m     \u001b[39m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m    215\u001b[0m     \u001b[39m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m    216\u001b[0m     \u001b[39m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m    217\u001b[0m     msg \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39msub(\n\u001b[0;32m    218\u001b[0m         \u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mparameter of \u001b[39m\u001b[39m\\\u001b[39m\u001b[39mw+ must be\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m    219\u001b[0m         \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mparameter of \u001b[39m\u001b[39m{\u001b[39;00mfunc\u001b[39m.\u001b[39m\u001b[39m__qualname__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m must be\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m    220\u001b[0m         
\u001b[39mstr\u001b[39m(e),\n\u001b[0;32m    221\u001b[0m     )\n",
+      "File \u001b[1;32mc:\\Users\\SC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:2539\u001b[0m, in \u001b[0;36mclassification_report\u001b[1;34m(y_true, y_pred, labels, target_names, sample_weight, digits, output_dict, zero_division)\u001b[0m\n\u001b[0;32m   2405\u001b[0m \u001b[39m@validate_params\u001b[39m(\n\u001b[0;32m   2406\u001b[0m     {\n\u001b[0;32m   2407\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39my_true\u001b[39m\u001b[39m\"\u001b[39m: [\u001b[39m\"\u001b[39m\u001b[39marray-like\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39msparse matrix\u001b[39m\u001b[39m\"\u001b[39m],\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   2430\u001b[0m     zero_division\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mwarn\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m   2431\u001b[0m ):\n\u001b[0;32m   2432\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"Build a text report showing the main classification metrics.\u001b[39;00m\n\u001b[0;32m   2433\u001b[0m \n\u001b[0;32m   2434\u001b[0m \u001b[39m    Read more in the :ref:`User Guide <classification_report>`.\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   2536\u001b[0m \u001b[39m    <BLANKLINE>\u001b[39;00m\n\u001b[0;32m   2537\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[1;32m-> 2539\u001b[0m     y_type, y_true, y_pred \u001b[39m=\u001b[39m _check_targets(y_true, y_pred)\n\u001b[0;32m   2541\u001b[0m     \u001b[39mif\u001b[39;00m labels \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m   2542\u001b[0m         labels \u001b[39m=\u001b[39m unique_labels(y_true, y_pred)\n",
+      "File \u001b[1;32mc:\\Users\\SC\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:93\u001b[0m, in \u001b[0;36m_check_targets\u001b[1;34m(y_true, y_pred)\u001b[0m\n\u001b[0;32m     90\u001b[0m     y_type \u001b[39m=\u001b[39m {\u001b[39m\"\u001b[39m\u001b[39mmulticlass\u001b[39m\u001b[39m\"\u001b[39m}\n\u001b[0;32m     92\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(y_type) \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m---> 93\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m     94\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mClassification metrics can\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt handle a mix of \u001b[39m\u001b[39m{0}\u001b[39;00m\u001b[39m and \u001b[39m\u001b[39m{1}\u001b[39;00m\u001b[39m targets\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(\n\u001b[0;32m     95\u001b[0m             type_true, type_pred\n\u001b[0;32m     96\u001b[0m         )\n\u001b[0;32m     97\u001b[0m     )\n\u001b[0;32m     99\u001b[0m \u001b[39m# We can't have more than one value on y_type => The set is no more needed\u001b[39;00m\n\u001b[0;32m    100\u001b[0m y_type \u001b[39m=\u001b[39m y_type\u001b[39m.\u001b[39mpop()\n",
+      "\u001b[1;31mValueError\u001b[0m: Classification metrics can't handle a mix of binary and continuous targets"
+     ]
+    }
+   ],
+   "source": [
+    "# Function to clean the text data\n",
+    "\n",
+    "def clean_text(text):\n",
+    "    text = re.sub(r'<.*?>', '', text) # Remove HTML tags\n",
+    "    text = re.sub(r'[^a-zA-Z0-9\\s]', '', text) # Remove non-alphanumeric characters\n",
+    "    text = text.lower() # Convert to lowercase\n",
+    "    return text\n",
+    "\n",
+    "data['cleaned-review'] = data['review'].apply(clean_text)\n",
+    "data['label'] = data['sentiment'].map({'positive': 1, 'negative': 0})\n",
    "\n",
+    "X_train, X_test, Y_train, Y_test = train_test_split(data['cleaned-review'], data['label'], test_size=0.2, random_state=42)\n",
+    "\n",
+    "vectorizer = TfidfVectorizer(max_features=5000)\n",
+    "X_train_tfidf = vectorizer.fit_transform(X_train)\n",
+    "X_test_tfidf = vectorizer.transform(X_test)\n",
+    "\n",
+    "regression_model = SVR()\n",
+    "regression_model.fit(X_train_tfidf, Y_train)\n",
+    "\n",
+    "predictions = regression_model.predict(X_test_tfidf)\n",
+    "report = classification_report(Y_test, predictions, output_dict=True)\n",
+    "report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict_rating(review):\n",
+    "    cleaned_review = clean_text(review)\n",
+    "    review_tfidf = vectorizer.transform([cleaned_review])\n",
+    "    predicted_rating = regression_model.predict(review_tfidf)[0]\n",
+    "    return predicted_rating\n",
    "\n",
-    "data = pd.read_csv('IMDBDataset.csv')"
+    "print(predict_rating(\"This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!\"))"
   ]
  },
  {
@@ -38,7 +118,15 @@
   "name": "python3"
  },
  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },

 %% Cell type:markdown id: tags:

 # ReelRating: CS 4804 Mini-Project
 ## Jeff Suliga, Tanya Acharya, Rishi Patel, Parth Mittal

 %% Cell type:code id: tags:

 ``` python
 # All the imports for the project:

 import numpy as np
 import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import classification_report
+from sklearn.svm import SVR
+import re
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Read the data from the csv file:
+
+file_path = 'IMDBDataset.csv'
+data = pd.read_csv(file_path)
+
+# data.info()
+
+#data.head()
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Function to clean the text data

def clean_text(text):
    """Normalize a raw review string before vectorization.

    HTML tags are replaced with a single space rather than removed outright,
    so words separated only by markup (e.g. ``great<br />movie``) are not
    fused into one token. Every remaining character that is not an ASCII
    letter, a digit, or whitespace is then dropped, and the result is
    lowercased.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The cleaned, lowercased string.
    """
    text = re.sub(r'<.*?>', ' ', text)  # Replace HTML tags with a space
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    return text.lower()  # Convert to lowercase
+
# Clean every review and encode the sentiment labels as integers
# (positive -> 1, negative -> 0).
data['cleaned-review'] = data['review'].apply(clean_text)
data['label'] = data['sentiment'].map({'positive': 1, 'negative': 0})

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(data['cleaned-review'], data['label'], test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary on the training split only, then reuse it for the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

regression_model = SVR()
regression_model.fit(X_train_tfidf, Y_train)

# SVR predicts continuous scores, but classification_report only accepts
# discrete labels: passing the raw scores raises "Classification metrics can't
# handle a mix of binary and continuous targets". Threshold at 0.5 (the
# midpoint of the 0/1 labels) to obtain class predictions for the report.
# `predictions` keeps the raw continuous scores.
predictions = regression_model.predict(X_test_tfidf)
predicted_labels = (predictions >= 0.5).astype(int)
report = classification_report(Y_test, predicted_labels, output_dict=True)
report
+```
+
+%% Output
+
+    ---------------------------------------------------------------------------
+    ValueError                                Traceback (most recent call last)
+    c:\Users\SC\OneDrive - Virginia Tech\Fall 2023\Intro to AI\reelrating\reelrating_training.ipynb Cell 4 line 2
+         <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=18'>19</a> regression_model.fit(X_train_tfidf, Y_train)
+         <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=20'>21</a> predictions = regression_model.predict(X_test_tfidf)
+    ---> <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=21'>22</a> report = classification_report(Y_test, predictions, output_dict=True)
+         <a href='vscode-notebook-cell:/c%3A/Users/SC/OneDrive%20-%20Virginia%20Tech/Fall%202023/Intro%20to%20AI/reelrating/reelrating_training.ipynb#W3sZmlsZQ%3D%3D?line=22'>23</a> report
+File     c:\Users\SC\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:211, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
+        205 try:
+        206     with config_context(
+        207         skip_parameter_validation=(
+        208             prefer_skip_nested_validation or global_skip_validation
+        209         )
+        210     ):
+    --> 211         return func(*args, **kwargs)
+        212 except InvalidParameterError as e:
+        213     # When the function is just a wrapper around an estimator, we allow
+        214     # the function to delegate validation to the estimator, but we replace
+        215     # the name of the estimator by the name of the function in the error
+        216     # message to avoid confusion.
+        217     msg = re.sub(
+        218         r"parameter of \w+ must be",
+        219         f"parameter of {func.__qualname__} must be",
+        220         str(e),
+        221     )
+File     c:\Users\SC\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:2539, in classification_report(y_true, y_pred, labels, target_names, sample_weight, digits, output_dict, zero_division)
+       2405 @validate_params(
+       2406     {
+       2407         "y_true": ["array-like", "sparse matrix"],
+       (...)
+       2430     zero_division="warn",
+       2431 ):
+       2432     """Build a text report showing the main classification metrics.
+       2433
+       2434     Read more in the :ref:`User Guide <classification_report>`.
+       (...)
+       2536     <BLANKLINE>
+       2537     """
+    -> 2539     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
+       2541     if labels is None:
+       2542         labels = unique_labels(y_true, y_pred)
+File     c:\Users\SC\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:93, in _check_targets(y_true, y_pred)
+         90     y_type = {"multiclass"}
+         92 if len(y_type) > 1:
+    ---> 93     raise ValueError(
+         94         "Classification metrics can't handle a mix of {0} and {1} targets".format(
+         95             type_true, type_pred
+         96         )
+         97     )
+         99 # We can't have more than one value on y_type => The set is no more needed
+        100 y_type = y_type.pop()
+    ValueError: Classification metrics can't handle a mix of binary and continuous targets
+
+%% Cell type:code id: tags:
+
+``` python
def predict_rating(review):
    """Score a single raw review with the trained model.

    The review is passed through ``clean_text``, vectorized with the
    module-level ``vectorizer``, and scored by ``regression_model``.

    Args:
        review: Raw review text.

    Returns:
        The first (and only) element of the model's prediction for the review.
    """
    features = vectorizer.transform([clean_text(review)])
    scores = regression_model.predict(features)
    return scores[0]

-data = pd.read_csv('IMDBDataset.csv')
+print(predict_rating("This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"))
 ```

 %% Cell type:code id: tags:

 ``` python
 ```

--- a/util.py
+++ b/util.py
+import re
+
def clean_text(text):
    """Clean the input text.

    HTML tags are replaced with a single space rather than removed outright,
    so words separated only by markup (e.g. ``great<br />movie``) are not
    fused into one token. Every remaining character that is not an ASCII
    letter, a digit, or whitespace is then dropped, and the result is
    lowercased.

    Args:
        text: Raw text (a ``str``).

    Returns:
        The cleaned, lowercased string.
    """
    text = re.sub(r'<.*?>', ' ', text)  # Replace HTML tags with a space
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    return text.lower()  # Convert to lowercase