diff --git a/NLP_Approach b/NLP_Approach
new file mode 100644
index 0000000000000000000000000000000000000000..70a7747684798a2eceacd6a20a97975bd2912cff
--- /dev/null
+++ b/NLP_Approach
@@ -0,0 +1 @@
+{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":196309,"status":"ok","timestamp":1670510422399,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"},"user_tz":300},"id":"REugZMxlFCvU","outputId":"2b6a1767-4296-4d10-cc7a-fcefdb7d6113"},"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/gdrive/\n"]}],"source":["from google.colab import drive\n","drive.mount(\"/content/gdrive/\")"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"ceMmp0oL8PyI","executionInfo":{"status":"ok","timestamp":1670510423386,"user_tz":300,"elapsed":990,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import shutil\n","import sys  \n","import os\n","import time\n","from sklearn.model_selection import train_test_split"]},{"cell_type":"code","execution_count":3,"metadata":{"id":"qAYz11Vr-k5b","executionInfo":{"status":"ok","timestamp":1670510423386,"user_tz":300,"elapsed":3,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["dir_path = 'gdrive/Shareddrives/CS5024 Ethics Project'\n","sys.path.append(dir_path)"]},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":10280,"status":"ok","timestamp":1670510433664,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"},"user_tz":300},"id":"jmTuxI2t8PwQ","outputId":"d312c306-e195-45bc-95de-fed482fa0869"},"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:3326: DtypeWarning: Columns (5,11) have mixed types.Specify dtype option on import or set low_memory=False.\n","  exec(code_obj, self.user_global_ns, self.user_ns)\n"]}],"source":["# Create DF for train and test set\n","train_df = 
pd.read_csv(f'{dir_path}/DrivenDataCompetition_DataFiles/TrainingData.csv')\n","test_df = pd.read_csv(f'{dir_path}/DrivenDataCompetition_DataFiles/TestData.csv')"]},{"cell_type":"code","source":["print (train_df.shape)\n","print (test_df.shape)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"0NPhp8WQ1YA0","executionInfo":{"status":"ok","timestamp":1670510433665,"user_tz":300,"elapsed":7,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}},"outputId":"33582c11-9cd6-4a16-ef75-957d55f8c906"},"execution_count":5,"outputs":[{"output_type":"stream","name":"stdout","text":["(400277, 26)\n","(50064, 17)\n"]}]},{"cell_type":"markdown","source":["# Data Preprocessing"],"metadata":{"id":"Y558pu6Bdbbw"}},{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":797},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1670510433665,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"},"user_tz":300},"id":"Q9Jzqrlwiu83","outputId":"cf12f590-0d36-4704-de43-6193bc7fcf82"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["                                Function          Use  \\\n","Unnamed: 0                                              \n","134338              Teacher Compensation  Instruction   \n","206341                          NO_LABEL     NO_LABEL   \n","326408              Teacher Compensation  Instruction   \n","364634           Substitute Compensation  Instruction   \n","47683            Substitute Compensation  Instruction   \n","...                                  ...          ...   
\n","109283          Professional Development         ISPD   \n","102430           Substitute Compensation  Instruction   \n","413949      Parent & Community Relations     NO_LABEL   \n","433672                   Library & Media  Instruction   \n","415831           Substitute Compensation  Instruction   \n","\n","                              Sharing   Reporting Student_Type  \\\n","Unnamed: 0                                                       \n","134338                School Reported      School     NO_LABEL   \n","206341                       NO_LABEL    NO_LABEL     NO_LABEL   \n","326408                School Reported      School  Unspecified   \n","364634                School Reported      School  Unspecified   \n","47683                 School Reported      School  Unspecified   \n","...                               ...         ...          ...   \n","109283                Shared Services  Non-School  Unspecified   \n","102430                School Reported      School  Unspecified   \n","413949                School Reported      School     NO_LABEL   \n","433672      School on Central Budgets  Non-School  Unspecified   \n","415831                School Reported      School      Poverty   \n","\n","                  Position_Type                 Object_Type     Pre_K  \\\n","Unnamed: 0                                                              \n","134338                  Teacher                    NO_LABEL  NO_LABEL   \n","206341                 NO_LABEL                    NO_LABEL  NO_LABEL   \n","326408                  Teacher    Base Salary/Compensation  Non PreK   \n","364634               Substitute                    Benefits  NO_LABEL   \n","47683                   Teacher     Substitute Compensation  NO_LABEL   \n","...                         ...                         ...       ...   
\n","109283      Instructional Coach  Other Compensation/Stipend  NO_LABEL   \n","102430               Substitute    Base Salary/Compensation  NO_LABEL   \n","413949                    Other                    NO_LABEL  NO_LABEL   \n","433672                Librarian                    Benefits  NO_LABEL   \n","415831               Substitute     Substitute Compensation  Non PreK   \n","\n","             Operating_Status  \\\n","Unnamed: 0                      \n","134338      PreK-12 Operating   \n","206341          Non-Operating   \n","326408      PreK-12 Operating   \n","364634      PreK-12 Operating   \n","47683       PreK-12 Operating   \n","...                       ...   \n","109283      PreK-12 Operating   \n","102430      PreK-12 Operating   \n","413949      PreK-12 Operating   \n","433672      PreK-12 Operating   \n","415831      PreK-12 Operating   \n","\n","                                         Object_Description  ...  \\\n","Unnamed: 0                                                   ...   \n","134338                                                  NaN  ...   \n","206341                                  CONTRACTOR SERVICES  ...   \n","326408                         Personal Services - Teachers  ...   \n","364634                                    EMPLOYEE BENEFITS  ...   \n","47683                          TEACHER COVERAGE FOR TEACHER  ...   \n","...                                                     ...  ...   \n","109283                       WORKSHOP PARTICIPANT            ...   \n","102430                       SALARIES OF PART TIME EMPLOYEE  ...   \n","413949                                                  NaN  ...   \n","433672                                    EMPLOYEE BENEFITS  ...   \n","415831      Salaries And Wages For Substitute Professionals  ...   
\n","\n","                                Sub_Object_Description  \\\n","Unnamed: 0                                               \n","134338                                             NaN   \n","206341                                             NaN   \n","326408                                             NaN   \n","364634                                             NaN   \n","47683                                              NaN   \n","...                                                ...   \n","109283                                             NaN   \n","102430                                             NaN   \n","413949                                             NaN   \n","433672                                             NaN   \n","415831      Inservice Substitute Teachers Grant Funded   \n","\n","                      Location_Description      FTE  \\\n","Unnamed: 0                                            \n","134338                                 NaN  1.00000   \n","206341                                 NaN      NaN   \n","326408                                 NaN  1.00000   \n","364634                                 NaN      NaN   \n","47683                                  NaN      NaN   \n","...                                    ...      ...   
\n","109283      STAFF DEV AND INSTR MEDIA           NaN   \n","102430                                 NaN  0.00431   \n","413949                                 NaN  1.00000   \n","433672                ED RESOURCE SERVICES      NaN   \n","415831                             School       NaN   \n","\n","                      Function_Description      Facility_or_Department  \\\n","Unnamed: 0                                                               \n","134338                                 NaN                         NaN   \n","206341                            RGN  GOB                         NaN   \n","326408                                 NaN                         NaN   \n","364634             UNALLOC BUDGETS/SCHOOLS                         NaN   \n","47683                          NON-PROJECT                         NaN   \n","...                                    ...                         ...   \n","109283      INST STAFF TRAINING SVCS                               NaN   \n","102430                          TITLE II,D                         NaN   \n","413949                                 NaN                         NaN   \n","433672                         NON-PROJECT                         NaN   \n","415831                         Instruction  Instruction And Curriculum   \n","\n","                           Position_Extra         Total  \\\n","Unnamed: 0                                                \n","134338                      KINDERGARTEN   50471.810000   \n","206341                       UNDESIGNATED   3477.860000   \n","326408                            TEACHER  62237.130000   \n","364634         PROFESSIONAL-INSTRUCTIONAL     22.300000   \n","47683          PROFESSIONAL-INSTRUCTIONAL     54.166000   \n","...                                   ...           ...   
\n","109283                                NaN     48.620000   \n","102430         PROFESSIONAL-INSTRUCTIONAL    128.824985   \n","413949                     PARENT/TITLE I   4902.290000   \n","433672      OFFICE/ADMINISTRATIVE SUPPORT   4020.290000   \n","415831               CERTIFIED SUBSTITUTE     46.530000   \n","\n","                       Program_Description  \\\n","Unnamed: 0                                   \n","134338                        KINDERGARTEN   \n","206341       BUILDING IMPROVEMENT SERVICES   \n","326408               Instruction - Regular   \n","364634      GENERAL MIDDLE/JUNIOR HIGH SCH   \n","47683        GENERAL HIGH SCHOOL EDUCATION   \n","...                                    ...   \n","109283                                 NaN   \n","102430        INSTRUCTIONAL STAFF TRAINING   \n","413949                                Misc   \n","433672              MEDIA SUPPORT SERVICES   \n","415831               Accelerated Education   \n","\n","                                    Fund_Description  \\\n","Unnamed: 0                                             \n","134338                                  General Fund   \n","206341                                           NaN   \n","326408                        General Purpose School   \n","364634                                           NaN   \n","47683                                            NaN   \n","...                                              ...   
\n","109283                GENERAL FUND                     \n","102430                                           NaN   \n","413949                            Schoolwide Schools   \n","433672                                           NaN   \n","415831      \"Title  Part A Improving Basic Programs\"   \n","\n","                                    Text_1  \n","Unnamed: 0                                  \n","134338                                 NaN  \n","206341       BUILDING IMPROVEMENT SERVICES  \n","326408                                 NaN  \n","364634                 REGULAR INSTRUCTION  \n","47683                  REGULAR INSTRUCTION  \n","...                                    ...  \n","109283      STAFF DEV AND INSTR MEDIA       \n","102430                 INSTRUCTIONAL STAFF  \n","413949                                 NaN  \n","433672                 INSTRUCTIONAL STAFF  \n","415831                      MISCELLANEOUS   \n","\n","[400277 rows x 25 columns]"],"text/html":["\n","  <div id=\"df-f666659d-cc53-4867-b64a-0d1b481a7c61\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Function</th>\n","      <th>Use</th>\n","      <th>Sharing</th>\n","      <th>Reporting</th>\n","      <th>Student_Type</th>\n","      <th>Position_Type</th>\n","      <th>Object_Type</th>\n","      <th>Pre_K</th>\n","      <th>Operating_Status</th>\n","      <th>Object_Description</th>\n","      <th>...</th>\n","      <th>Sub_Object_Description</th>\n","      <th>Location_Description</th>\n","      <th>FTE</th>\n","      <th>Function_Description</th>\n","      
<th>Facility_or_Department</th>\n","      <th>Position_Extra</th>\n","      <th>Total</th>\n","      <th>Program_Description</th>\n","      <th>Fund_Description</th>\n","      <th>Text_1</th>\n","    </tr>\n","    <tr>\n","      <th>Unnamed: 0</th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>134338</th>\n","      <td>Teacher Compensation</td>\n","      <td>Instruction</td>\n","      <td>School Reported</td>\n","      <td>School</td>\n","      <td>NO_LABEL</td>\n","      <td>Teacher</td>\n","      <td>NO_LABEL</td>\n","      <td>NO_LABEL</td>\n","      <td>PreK-12 Operating</td>\n","      <td>NaN</td>\n","      <td>...</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>1.00000</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>KINDERGARTEN</td>\n","      <td>50471.810000</td>\n","      <td>KINDERGARTEN</td>\n","      <td>General Fund</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>206341</th>\n","      <td>NO_LABEL</td>\n","      <td>NO_LABEL</td>\n","      <td>NO_LABEL</td>\n","      <td>NO_LABEL</td>\n","      <td>NO_LABEL</td>\n","      <td>NO_LABEL</td>\n","      <td>NO_LABEL</td>\n","      <td>NO_LABEL</td>\n","      <td>Non-Operating</td>\n","      <td>CONTRACTOR SERVICES</td>\n","      <td>...</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>RGN  GOB</td>\n","      <td>NaN</td>\n","      <td>UNDESIGNATED</td>\n","      <td>3477.860000</td>\n","      <td>BUILDING IMPROVEMENT SERVICES</td>\n","      <td>NaN</td>\n","      <td>BUILDING 
IMPROVEMENT SERVICES</td>\n","    </tr>\n","    <tr>\n","      <th>326408</th>\n","      <td>Teacher Compensation</td>\n","      <td>Instruction</td>\n","      <td>School Reported</td>\n","      <td>School</td>\n","      <td>Unspecified</td>\n","      <td>Teacher</td>\n","      <td>Base Salary/Compensation</td>\n","      <td>Non PreK</td>\n","      <td>PreK-12 Operating</td>\n","      <td>Personal Services - Teachers</td>\n","      <td>...</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>1.00000</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>TEACHER</td>\n","      <td>62237.130000</td>\n","      <td>Instruction - Regular</td>\n","      <td>General Purpose School</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>364634</th>\n","      <td>Substitute Compensation</td>\n","      <td>Instruction</td>\n","      <td>School Reported</td>\n","      <td>School</td>\n","      <td>Unspecified</td>\n","      <td>Substitute</td>\n","      <td>Benefits</td>\n","      <td>NO_LABEL</td>\n","      <td>PreK-12 Operating</td>\n","      <td>EMPLOYEE BENEFITS</td>\n","      <td>...</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>UNALLOC BUDGETS/SCHOOLS</td>\n","      <td>NaN</td>\n","      <td>PROFESSIONAL-INSTRUCTIONAL</td>\n","      <td>22.300000</td>\n","      <td>GENERAL MIDDLE/JUNIOR HIGH SCH</td>\n","      <td>NaN</td>\n","      <td>REGULAR INSTRUCTION</td>\n","    </tr>\n","    <tr>\n","      <th>47683</th>\n","      <td>Substitute Compensation</td>\n","      <td>Instruction</td>\n","      <td>School Reported</td>\n","      <td>School</td>\n","      <td>Unspecified</td>\n","      <td>Teacher</td>\n","      <td>Substitute Compensation</td>\n","      <td>NO_LABEL</td>\n","      <td>PreK-12 Operating</td>\n","      <td>TEACHER COVERAGE FOR TEACHER</td>\n","      <td>...</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NON-PROJECT</td>\n","      
<td>NaN</td>\n","      <td>PROFESSIONAL-INSTRUCTIONAL</td>\n","      <td>54.166000</td>\n","      <td>GENERAL HIGH SCHOOL EDUCATION</td>\n","      <td>NaN</td>\n","      <td>REGULAR INSTRUCTION</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>109283</th>\n","      <td>Professional Development</td>\n","      <td>ISPD</td>\n","      <td>Shared Services</td>\n","      <td>Non-School</td>\n","      <td>Unspecified</td>\n","      <td>Instructional Coach</td>\n","      <td>Other Compensation/Stipend</td>\n","      <td>NO_LABEL</td>\n","      <td>PreK-12 Operating</td>\n","      <td>WORKSHOP PARTICIPANT</td>\n","      <td>...</td>\n","      <td>NaN</td>\n","      <td>STAFF DEV AND INSTR MEDIA</td>\n","      <td>NaN</td>\n","      <td>INST STAFF TRAINING SVCS</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>48.620000</td>\n","      <td>NaN</td>\n","      <td>GENERAL FUND</td>\n","      <td>STAFF DEV AND INSTR MEDIA</td>\n","    </tr>\n","    <tr>\n","      <th>102430</th>\n","      <td>Substitute Compensation</td>\n","      <td>Instruction</td>\n","      <td>School Reported</td>\n","      <td>School</td>\n","      <td>Unspecified</td>\n","      <td>Substitute</td>\n","      <td>Base Salary/Compensation</td>\n","      <td>NO_LABEL</td>\n","      <td>PreK-12 Operating</td>\n","      <td>SALARIES OF PART TIME EMPLOYEE</td>\n","      <td>...</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>0.00431</td>\n","      <td>TITLE 
II,D</td>\n","      <td>NaN</td>\n","      <td>PROFESSIONAL-INSTRUCTIONAL</td>\n","      <td>128.824985</td>\n","      <td>INSTRUCTIONAL STAFF TRAINING</td>\n","      <td>NaN</td>\n","      <td>INSTRUCTIONAL STAFF</td>\n","    </tr>\n","    <tr>\n","      <th>413949</th>\n","      <td>Parent &amp; Community Relations</td>\n","      <td>NO_LABEL</td>\n","      <td>School Reported</td>\n","      <td>School</td>\n","      <td>NO_LABEL</td>\n","      <td>Other</td>\n","      <td>NO_LABEL</td>\n","      <td>NO_LABEL</td>\n","      <td>PreK-12 Operating</td>\n","      <td>NaN</td>\n","      <td>...</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>1.00000</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>PARENT/TITLE I</td>\n","      <td>4902.290000</td>\n","      <td>Misc</td>\n","      <td>Schoolwide Schools</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>433672</th>\n","      <td>Library &amp; Media</td>\n","      <td>Instruction</td>\n","      <td>School on Central Budgets</td>\n","      <td>Non-School</td>\n","      <td>Unspecified</td>\n","      <td>Librarian</td>\n","      <td>Benefits</td>\n","      <td>NO_LABEL</td>\n","      <td>PreK-12 Operating</td>\n","      <td>EMPLOYEE BENEFITS</td>\n","      <td>...</td>\n","      <td>NaN</td>\n","      <td>ED RESOURCE SERVICES</td>\n","      <td>NaN</td>\n","      <td>NON-PROJECT</td>\n","      <td>NaN</td>\n","      <td>OFFICE/ADMINISTRATIVE SUPPORT</td>\n","      <td>4020.290000</td>\n","      <td>MEDIA SUPPORT SERVICES</td>\n","      <td>NaN</td>\n","      <td>INSTRUCTIONAL STAFF</td>\n","    </tr>\n","    <tr>\n","      <th>415831</th>\n","      <td>Substitute Compensation</td>\n","      <td>Instruction</td>\n","      <td>School Reported</td>\n","      <td>School</td>\n","      <td>Poverty</td>\n","      <td>Substitute</td>\n","      <td>Substitute Compensation</td>\n","      <td>Non PreK</td>\n","      <td>PreK-12 Operating</td>\n","      <td>Salaries And Wages For 
Substitute Professionals</td>\n","      <td>...</td>\n","      <td>Inservice Substitute Teachers Grant Funded</td>\n","      <td>School</td>\n","      <td>NaN</td>\n","      <td>Instruction</td>\n","      <td>Instruction And Curriculum</td>\n","      <td>CERTIFIED SUBSTITUTE</td>\n","      <td>46.530000</td>\n","      <td>Accelerated Education</td>\n","      <td>\"Title  Part A Improving Basic Programs\"</td>\n","      <td>MISCELLANEOUS</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>400277 rows × 25 columns</p>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f666659d-cc53-4867-b64a-0d1b481a7c61')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 
0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-f666659d-cc53-4867-b64a-0d1b481a7c61 button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-f666659d-cc53-4867-b64a-0d1b481a7c61');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? 
Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":6}],"source":["test_df.set_index('Unnamed: 0')\n","train_df.set_index('Unnamed: 0')"]},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000},"executionInfo":{"elapsed":2146,"status":"ok","timestamp":1670510435808,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"},"user_tz":300},"id":"Thh9gm2q8Pt_","outputId":"4c390cc1-191f-4c51-aa7d-d7a4e549cd93"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["                                                            0  \\\n","Unnamed: 0                                                      \n","134338            Teacher-Elementary              KINDERGA...   \n","206341      CONTRACTOR SERVICES BOND EXPENDITURES BUILDING...   \n","326408      Personal Services - Teachers     TCHER 2ND GRA...   \n","364634      EMPLOYEE BENEFITS TEACHER SUBS GENERAL FUND Te...   \n","47683       TEACHER COVERAGE FOR TEACHER TEACHER SUBS GENE...   \n","...                                                       ...   \n","109283      WORKSHOP PARTICIPANT               CURRICULUM ...   \n","102430      SALARIES OF PART TIME EMPLOYEE   FEDERAL GDPG ...   \n","413949            School Liaison             PARENT/TITLE ...   \n","433672      EMPLOYEE BENEFITS EDUCATIONAL RESOURCE SERVICE...   
\n","415831      Salaries And Wages For Substitute Professional...   \n","\n","            Function_Aides Compensation  \\\n","Unnamed: 0                                \n","134338                                0   \n","206341                                0   \n","326408                                0   \n","364634                                0   \n","47683                                 0   \n","...                                 ...   \n","109283                                0   \n","102430                                0   \n","413949                                0   \n","433672                                0   \n","415831                                0   \n","\n","            Function_Career & Academic Counseling  Function_Communications  \\\n","Unnamed: 0                                                                   \n","134338                                          0                        0   \n","206341                                          0                        0   \n","326408                                          0                        0   \n","364634                                          0                        0   \n","47683                                           0                        0   \n","...                                           ...                      ...   
\n","109283                                          0                        0   \n","102430                                          0                        0   \n","413949                                          0                        0   \n","433672                                          0                        0   \n","415831                                          0                        0   \n","\n","            Function_Curriculum Development  \\\n","Unnamed: 0                                    \n","134338                                    0   \n","206341                                    0   \n","326408                                    0   \n","364634                                    0   \n","47683                                     0   \n","...                                     ...   \n","109283                                    0   \n","102430                                    0   \n","413949                                    0   \n","433672                                    0   \n","415831                                    0   \n","\n","            Function_Data Processing & Information Services  \\\n","Unnamed: 0                                                    \n","134338                                                    0   \n","206341                                                    0   \n","326408                                                    0   \n","364634                                                    0   \n","47683                                                     0   \n","...                                                     ...   
\n","109283                                                    0   \n","102430                                                    0   \n","413949                                                    0   \n","433672                                                    0   \n","415831                                                    0   \n","\n","            Function_Development & Fundraising  Function_Enrichment  \\\n","Unnamed: 0                                                            \n","134338                                       0                    0   \n","206341                                       0                    0   \n","326408                                       0                    0   \n","364634                                       0                    0   \n","47683                                        0                    0   \n","...                                        ...                  ...   \n","109283                                       0                    0   \n","102430                                       0                    0   \n","413949                                       0                    0   \n","433672                                       0                    0   \n","415831                                       0                    0   \n","\n","            Function_Extended Time & Tutoring  \\\n","Unnamed: 0                                      \n","134338                                      0   \n","206341                                      0   \n","326408                                      0   \n","364634                                      0   \n","47683                                       0   \n","...                                       ...   
\n","109283                                      0   \n","102430                                      0   \n","413949                                      0   \n","433672                                      0   \n","415831                                      0   \n","\n","            Function_Facilities & Maintenance  ...  \\\n","Unnamed: 0                                     ...   \n","134338                                      0  ...   \n","206341                                      0  ...   \n","326408                                      0  ...   \n","364634                                      0  ...   \n","47683                                       0  ...   \n","...                                       ...  ...   \n","109283                                      0  ...   \n","102430                                      0  ...   \n","413949                                      0  ...   \n","433672                                      0  ...   \n","415831                                      0  ...   \n","\n","            Student_Type_Special Education  Student_Type_Unspecified  \\\n","Unnamed: 0                                                             \n","134338                                   0                         0   \n","206341                                   0                         0   \n","326408                                   0                         1   \n","364634                                   0                         1   \n","47683                                    0                         1   \n","...                                    ...                       ...   
\n","109283                                   0                         1   \n","102430                                   0                         1   \n","413949                                   0                         0   \n","433672                                   0                         1   \n","415831                                   0                         0   \n","\n","            Use_Business Services  Use_ISPD  Use_Instruction  Use_Leadership  \\\n","Unnamed: 0                                                                     \n","134338                          0         0                1               0   \n","206341                          0         0                0               0   \n","326408                          0         0                1               0   \n","364634                          0         0                1               0   \n","47683                           0         0                1               0   \n","...                           ...       ...              ...             ...   
\n","109283                          0         1                0               0   \n","102430                          0         0                1               0   \n","413949                          0         0                0               0   \n","433672                          0         0                1               0   \n","415831                          0         0                1               0   \n","\n","            Use_NO_LABEL  Use_O&M  Use_Pupil Services & Enrichment  \\\n","Unnamed: 0                                                           \n","134338                 0        0                                0   \n","206341                 1        0                                0   \n","326408                 0        0                                0   \n","364634                 0        0                                0   \n","47683                  0        0                                0   \n","...                  ...      ...                              ...   \n","109283                 0        0                                0   \n","102430                 0        0                                0   \n","413949                 1        0                                0   \n","433672                 0        0                                0   \n","415831                 0        0                                0   \n","\n","            Use_Untracked Budget Set-Aside  \n","Unnamed: 0                                  \n","134338                                   0  \n","206341                                   0  \n","326408                                   0  \n","364634                                   0  \n","47683                                    0  \n","...                                    ...  
\n","109283                                   0  \n","102430                                   0  \n","413949                                   0  \n","433672                                   0  \n","415831                                   0  \n","\n","[400277 rows x 105 columns]"],"text/html":["\n","  <div id=\"df-efe9ca42-7ce1-4fd4-a880-234b5631bd03\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>0</th>\n","      <th>Function_Aides Compensation</th>\n","      <th>Function_Career &amp; Academic Counseling</th>\n","      <th>Function_Communications</th>\n","      <th>Function_Curriculum Development</th>\n","      <th>Function_Data Processing &amp; Information Services</th>\n","      <th>Function_Development &amp; Fundraising</th>\n","      <th>Function_Enrichment</th>\n","      <th>Function_Extended Time &amp; Tutoring</th>\n","      <th>Function_Facilities &amp; Maintenance</th>\n","      <th>...</th>\n","      <th>Student_Type_Special Education</th>\n","      <th>Student_Type_Unspecified</th>\n","      <th>Use_Business Services</th>\n","      <th>Use_ISPD</th>\n","      <th>Use_Instruction</th>\n","      <th>Use_Leadership</th>\n","      <th>Use_NO_LABEL</th>\n","      <th>Use_O&amp;M</th>\n","      <th>Use_Pupil Services &amp; Enrichment</th>\n","      <th>Use_Untracked Budget Set-Aside</th>\n","    </tr>\n","    <tr>\n","      <th>Unnamed: 0</th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      
<th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>134338</th>\n","      <td>Teacher-Elementary              KINDERGA...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>206341</th>\n","      <td>CONTRACTOR SERVICES BOND EXPENDITURES BUILDING...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>326408</th>\n","      <td>Personal Services - Teachers     TCHER 2ND GRA...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>...</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>364634</th>\n","      <td>EMPLOYEE BENEFITS TEACHER SUBS GENERAL FUND 
Te...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>...</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>47683</th>\n","      <td>TEACHER COVERAGE FOR TEACHER TEACHER SUBS GENE...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>...</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>109283</th>\n","      <td>WORKSHOP PARTICIPANT               CURRICULUM ...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>...</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      
<td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>102430</th>\n","      <td>SALARIES OF PART TIME EMPLOYEE   FEDERAL GDPG ...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>...</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>413949</th>\n","      <td>School Liaison             PARENT/TITLE ...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>433672</th>\n","      <td>EMPLOYEE BENEFITS EDUCATIONAL RESOURCE SERVICE...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>...</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","    <tr>\n","      <th>415831</th>\n","      <td>Salaries And Wages For Substitute Professional...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      
<td>...</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>1</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","      <td>0</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>400277 rows × 105 columns</p>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-efe9ca42-7ce1-4fd4-a880-234b5631bd03')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: 
# Build the modelling frames:
#   train_df_dummies -> one free-text column (0) + one-hot label columns
#   test_df_cleaned  -> one free-text column (0)
# NOTE: this cell mutates train_df / test_df, so it is not re-runnable without
# reloading the data first.

# set target cols (each one is one-hot encoded by pd.get_dummies below)
target_cols = ['Function', 'Object_Type', 'Operating_Status', 'Position_Type',  'Pre_K', 'Reporting', 'Sharing', 'Student_Type', 'Use']

# --- train ---
# drop train int cols so that only string-joinable columns remain
train_df.drop(['FTE', 'Total'], axis=1, inplace=True)
# get dummies for target cols
col = pd.get_dummies(train_df[target_cols])
# drop target cols
train_df = train_df.drop(target_cols, axis=1)
# fill NaN with space
train_df.fillna(' ', inplace=True)
# combine all text into single col; columns[1:] skips the first column
# (presumably the 'Unnamed: 0' row id -- confirm against the CSV).
# Reuse train_df's index so the axis=1 concat below pairs each text row with
# its own label row even if train_df does not carry a default RangeIndex.
combined = pd.DataFrame(
    [' '.join(row) for row in train_df[train_df.columns[1:]].values],
    index=train_df.index,
)
# join combined text col with dummy labels
train_df_dummies = pd.concat([combined, col], axis=1)

# --- test ---
# drop test int cols
test_df.drop(['FTE', 'Total'], axis=1, inplace=True)
# fill NaN with space
test_df.fillna(' ', inplace=True)
# combine all text into single col
test_df_cleaned = pd.DataFrame([' '.join(row) for row in test_df[test_df.columns[1:]].values])

# reset indices to original row ids.
# set_index returns a new frame, so the result has to be assigned; the
# original cell discarded it and both frames kept their positional index.
test_df_cleaned = test_df_cleaned.set_index(test_df['Unnamed: 0'])
train_df_dummies = train_df_dummies.set_index(train_df['Unnamed: 0'])

# show a small preview instead of the whole frame
train_df_dummies.head()
def text_processing(df):
    """Normalise the ``text`` column of ``df`` in place.

    Each value is lower-cased and stripped, runs of spaces are collapsed,
    every non-letter character is replaced by a space, English stop words
    are removed and the remaining tokens are Snowball-stemmed and re-joined
    with single spaces.

    df: DataFrame with a string column named ``text``.
    return: None; ``df['text']`` is overwritten.
    """
    english_stops = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')

    def _normalise(raw):
        # same steps, in the same order, as the original column-wise passes
        lowered = raw.lower().strip()
        single_spaced = re.sub(' +', ' ', lowered)
        letters_only = re.sub('[^a-zA-Z]', ' ', single_spaced)
        kept = [stemmer.stem(token)
                for token in letters_only.split()
                if token not in english_stops]
        return ' '.join(kept)

    df['text'] = df['text'].apply(_normalise)
def tfidf_features(X_train, X_val, X_test):
    """
        X_train, X_val, X_test — iterables of text samples

        The vectorizer is fitted on X_train only; X_val and X_test are
        transformed with that fitted vocabulary.

        return: (X_train_tfidf, X_val_tfidf, X_test_tfidf, vocabulary),
        where vocabulary is the fitted vectorizer's ``vocabulary_`` mapping
    """
    # Unigrams + bigrams over whitespace-delimited tokens, no document
    # frequency pruning. The pattern is a raw string so that \S reaches the
    # regex engine without relying on an invalid string escape.
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_df=1.0,
        min_df=1,
        token_pattern=r'(\S+)',
    )
    # Fit the vectorizer on the train set
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    # Transform the validation and test sets with the train vocabulary
    X_val_tfidf = tfidf_vectorizer.transform(X_val)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    return X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vectorizer.vocabulary_
def train_logreg(X_train, y_train, C, regularisation, max_iter=10000, n_jobs=None):
    """
      X_train, y_train — training data
      C — inverse regularisation strength passed to LogisticRegression
      regularisation — penalty name passed to LogisticRegression (e.g. 'l2')
      max_iter — iteration cap for each LogisticRegression fit
      n_jobs — forwarded to OneVsRestClassifier; None keeps the original
               sequential behaviour, -1 fits the per-label models in parallel

      return: trained classifier
    """

    # Create and fit LogisticRegression wrapped into OneVsRestClassifier.
    base_estimator = LogisticRegression(penalty=regularisation, C=C, max_iter=max_iter)
    model = OneVsRestClassifier(base_estimator, n_jobs=n_jobs).fit(X_train, y_train)
    return model
           'eval_metric': 'auc', \n","#               'seed': 42\n","#              }\n","# xgb_tfidf = train_xgb(X_train_tfidf, y_train, xgb_params)\n","# y_val_predicted_labels_xgb = xgb_tfidf.predict(X_val_tfidf)"],"metadata":{"id":"r42fPuwwURXN","executionInfo":{"status":"ok","timestamp":1670513291331,"user_tz":300,"elapsed":10,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"execution_count":26,"outputs":[]},{"cell_type":"markdown","source":["# SGD"],"metadata":{"id":"WeL9xN2WLpyO"}},{"cell_type":"code","source":["# from sklearn.multiclass import OneVsRestClassifier\n","# from sklearn.linear_model import SGDClassifier\n","# def train_sgd(X_train, y_train):\n","#     \"\"\"\n","#       X_train, y_train — training data\n","      \n","#       return: trained classifier\n","#     \"\"\"\n","    \n","#     # Create and fit LogisticRegression wraped into OneVsRestClassifier.\n","\n","#     model = OneVsRestClassifier(SGDClassifier(loss = 'log', penalty = 'l2')).fit(X_train, y_train)\n","#     return model\n","\n","# sgd_tfidf = train_sgd(X_train_tfidf, y_train)\n","# print (f'Time for NLP training\" {time.time()-timer}')\n","# timer = time.time()\n","# y_val_predicted_labels_sgd = sgd_tfidf.predict(X_val_tfidf)\n","# print (f'Time for NLP inference\" {time.time()-timer}')\n"],"metadata":{"id":"CxxB9ABSLrs1","executionInfo":{"status":"ok","timestamp":1670513291331,"user_tz":300,"elapsed":8,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"execution_count":27,"outputs":[]},{"cell_type":"markdown","source":["## Model Evaluation"],"metadata":{"id":"aToqJ94RdO9R"}},{"cell_type":"code","source":["from sklearn.metrics import accuracy_score\n","from sklearn.metrics import f1_score\n","from sklearn.metrics import roc_auc_score \n","from sklearn.metrics import precision_score\n","from sklearn.metrics import average_precision_score\n","from sklearn.metrics import recall_score\n","from sklearn.metrics import 
def print_evaluation_scores(y_test, predicted, predicted_proba=None):
    """Print classification metrics for a multilabel prediction.

    y_test — true label indicator matrix
    predicted — hard 0/1 predictions with the same layout as y_test
    predicted_proba — optional predicted probabilities; when given they are
                      used for the log loss instead of the hard predictions

    return: None (metrics are printed)
    """
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=True))
    # No `labels=` argument: for multilabel targets sklearn interprets
    # `labels` as column indices, so `labels=[1]` scored only the second
    # label column and made the macro and micro figures identical.
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    # precision_score for all three averages, matching the printed labels
    # (average_precision_score is a ranking metric meant for scores, not
    # for hard 0/1 predictions).
    print('Precision macro: ', precision_score(y_test, predicted, average='macro'))
    print('Precision micro: ', precision_score(y_test, predicted, average='micro'))
    print('Precision weighted: ', precision_score(y_test, predicted, average='weighted'))
    # log_loss is documented to take probabilities; fall back to the hard
    # predictions only to stay compatible with existing two-argument calls.
    log_loss_input = predicted if predicted_proba is None else predicted_proba
    print('Log Loss: ', log_loss(y_test, log_loss_input, normalize=True))
27.62320822221433\n"]}]},{"cell_type":"markdown","source":["# Submission"],"metadata":{"id":"4e9yV9CadHwO"}},{"cell_type":"code","execution_count":30,"metadata":{"id":"75VcWGLmvvCi","executionInfo":{"status":"ok","timestamp":1670513300205,"user_tz":300,"elapsed":394,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["test_predictions = logreg_tfidf.predict_proba(X_test_tfidf)"]},{"cell_type":"code","execution_count":31,"metadata":{"id":"GhNpC96cwRmD","executionInfo":{"status":"ok","timestamp":1670513301440,"user_tz":300,"elapsed":1237,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}},"colab":{"base_uri":"https://localhost:8080/","height":554},"outputId":"aa8f5639-897c-4437-aca7-daafddd03797"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["        Function__Aides Compensation  Function__Career & Academic Counseling  \\\n","                                                                               \n","180042                      0.005369                                0.001911   \n","28872                       0.000574                                0.001182   \n","186915                      0.017363                                0.000992   \n","412396                      0.011996                                0.000990   \n","427740                      0.001207                                0.002310   \n","...                              ...                                     ...   
\n","169063                      0.004662                                0.000559   \n","433255                      0.004662                                0.000559   \n","232204                      0.004662                                0.000559   \n","171685                      0.004662                                0.000559   \n","249087                      0.004662                                0.000559   \n","\n","        Function__Communications  Function__Curriculum Development  \\\n","                                                                     \n","180042                  0.000062                          0.000037   \n","28872                   0.000243                          0.005150   \n","186915                  0.000156                          0.001177   \n","412396                  0.000155                          0.001122   \n","427740                  0.000585                          0.002508   \n","...                          ...                               ...   \n","169063                  0.000271                          0.001190   \n","433255                  0.000271                          0.001190   \n","232204                  0.000271                          0.001190   \n","171685                  0.000271                          0.001190   \n","249087                  0.000271                          0.001190   \n","\n","        Function__Data Processing & Information Services  \\\n","                                                           \n","180042                                          0.000424   \n","28872                                           0.001041   \n","186915                                          0.000616   \n","412396                                          0.000625   \n","427740                                          0.002355   \n","...                                                  ...   
\n","169063                                          0.000868   \n","433255                                          0.000868   \n","232204                                          0.000868   \n","171685                                          0.000868   \n","249087                                          0.000868   \n","\n","        Function__Development & Fundraising  Function__Enrichment  \\\n","                                                                    \n","180042                             0.000088              0.000996   \n","28872                              0.000150              0.227637   \n","186915                             0.000108              0.000500   \n","412396                             0.000108              0.000475   \n","427740                             0.000135              0.002673   \n","...                                     ...                   ...   \n","169063                             0.000236              0.002325   \n","433255                             0.000236              0.002325   \n","232204                             0.000236              0.002325   \n","171685                             0.000236              0.002325   \n","249087                             0.000236              0.002325   \n","\n","        Function__Extended Time & Tutoring  \\\n","                                             \n","180042                            0.000203   \n","28872                             0.003900   \n","186915                            0.000463   \n","412396                            0.000489   \n","427740                            0.000067   \n","...                                    ...   
\n","169063                            0.000239   \n","433255                            0.000239   \n","232204                            0.000239   \n","171685                            0.000239   \n","249087                            0.000239   \n","\n","        Function__Facilities & Maintenance  Function__Facilities Planning  \\\n","                                                                            \n","180042                            0.001402                       0.000029   \n","28872                             0.006152                       0.000040   \n","186915                            0.001457                       0.000034   \n","412396                            0.001456                       0.000034   \n","427740                            0.003695                       0.000041   \n","...                                    ...                            ...   \n","169063                            0.001862                       0.000044   \n","433255                            0.001862                       0.000044   \n","232204                            0.001862                       0.000044   \n","171685                            0.001862                       0.000044   \n","249087                            0.001862                       0.000044   \n","\n","        ...  Student_Type__Special Education  Student_Type__Unspecified  \\\n","        ...                                                               \n","180042  ...                         0.002347                   0.836926   \n","28872   ...                         0.005913                   0.894910   \n","186915  ...                         0.004487                   0.282162   \n","412396  ...                         0.004447                   0.250461   \n","427740  ...                         0.004405                   0.980520   \n","...     ...                              ...                        ...   \n","169063  ...                         
0.004677                   0.037555   \n","433255  ...                         0.004677                   0.037555   \n","232204  ...                         0.004677                   0.037555   \n","171685  ...                         0.004677                   0.037555   \n","249087  ...                         0.004677                   0.037555   \n","\n","        Use__Business Services  Use__ISPD  Use__Instruction  Use__Leadership  \\\n","                                                                               \n","180042                0.000200   0.003739          0.064349         0.003398   \n","28872                 0.003721   0.009994          0.011118         0.023945   \n","186915                0.000895   0.012540          0.695857         0.008499   \n","412396                0.000910   0.010479          0.693750         0.007253   \n","427740                0.008272   0.040371          0.001226         0.805063   \n","...                        ...        ...               ...              ...   \n","169063                0.003252   0.003751          0.017954         0.002785   \n","433255                0.003252   0.003751          0.017954         0.002785   \n","232204                0.003252   0.003751          0.017954         0.002785   \n","171685                0.003252   0.003751          0.017954         0.002785   \n","249087                0.003252   0.003751          0.017954         0.002785   \n","\n","        Use__NO_LABEL  Use__O&M  Use__Pupil Services & Enrichment  \\\n","                                                                    \n","180042       0.264019  0.006676                          0.005294   \n","28872        0.047036  0.023556                          0.146622   \n","186915       0.016858  0.003341                          0.009452   \n","412396       0.017822  0.003547                          0.008566   \n","427740       0.005581  0.037933                          0.004177   \n","...               ...     
  ...                               ...   \n","169063       0.883382  0.002572                          0.044943   \n","433255       0.883382  0.002572                          0.044943   \n","232204       0.883382  0.002572                          0.044943   \n","171685       0.883382  0.002572                          0.044943   \n","249087       0.883382  0.002572                          0.044943   \n","\n","        Use__Untracked Budget Set-Aside  \n","                                         \n","180042                         0.000372  \n","28872                          0.000244  \n","186915                         0.000168  \n","412396                         0.000168  \n","427740                         0.000233  \n","...                                 ...  \n","169063                         0.000341  \n","433255                         0.000341  \n","232204                         0.000341  \n","171685                         0.000341  \n","249087                         0.000341  \n","\n","[50064 rows x 104 columns]"],"text/html":["\n","  <div id=\"df-73e59131-014c-49d1-a337-c844b74a21b5\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>Function__Aides Compensation</th>\n","      <th>Function__Career &amp; Academic Counseling</th>\n","      <th>Function__Communications</th>\n","      <th>Function__Curriculum Development</th>\n","      <th>Function__Data Processing &amp; Information Services</th>\n","      <th>Function__Development &amp; Fundraising</th>\n","      <th>Function__Enrichment</th>\n","      <th>Function__Extended Time 
&amp; Tutoring</th>\n","      <th>Function__Facilities &amp; Maintenance</th>\n","      <th>Function__Facilities Planning</th>\n","      <th>...</th>\n","      <th>Student_Type__Special Education</th>\n","      <th>Student_Type__Unspecified</th>\n","      <th>Use__Business Services</th>\n","      <th>Use__ISPD</th>\n","      <th>Use__Instruction</th>\n","      <th>Use__Leadership</th>\n","      <th>Use__NO_LABEL</th>\n","      <th>Use__O&amp;M</th>\n","      <th>Use__Pupil Services &amp; Enrichment</th>\n","      <th>Use__Untracked Budget Set-Aside</th>\n","    </tr>\n","    <tr>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","      <th></th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>180042</th>\n","      <td>0.005369</td>\n","      <td>0.001911</td>\n","      <td>0.000062</td>\n","      <td>0.000037</td>\n","      <td>0.000424</td>\n","      <td>0.000088</td>\n","      <td>0.000996</td>\n","      <td>0.000203</td>\n","      <td>0.001402</td>\n","      <td>0.000029</td>\n","      <td>...</td>\n","      <td>0.002347</td>\n","      <td>0.836926</td>\n","      <td>0.000200</td>\n","      <td>0.003739</td>\n","      <td>0.064349</td>\n","      <td>0.003398</td>\n","      <td>0.264019</td>\n","      <td>0.006676</td>\n","      <td>0.005294</td>\n","      <td>0.000372</td>\n","    </tr>\n","    <tr>\n","      <th>28872</th>\n","      <td>0.000574</td>\n","      <td>0.001182</td>\n","      <td>0.000243</td>\n","      <td>0.005150</td>\n","      <td>0.001041</td>\n","      <td>0.000150</td>\n","      <td>0.227637</td>\n","      <td>0.003900</td>\n","      
<td>0.006152</td>\n","      <td>0.000040</td>\n","      <td>...</td>\n","      <td>0.005913</td>\n","      <td>0.894910</td>\n","      <td>0.003721</td>\n","      <td>0.009994</td>\n","      <td>0.011118</td>\n","      <td>0.023945</td>\n","      <td>0.047036</td>\n","      <td>0.023556</td>\n","      <td>0.146622</td>\n","      <td>0.000244</td>\n","    </tr>\n","    <tr>\n","      <th>186915</th>\n","      <td>0.017363</td>\n","      <td>0.000992</td>\n","      <td>0.000156</td>\n","      <td>0.001177</td>\n","      <td>0.000616</td>\n","      <td>0.000108</td>\n","      <td>0.000500</td>\n","      <td>0.000463</td>\n","      <td>0.001457</td>\n","      <td>0.000034</td>\n","      <td>...</td>\n","      <td>0.004487</td>\n","      <td>0.282162</td>\n","      <td>0.000895</td>\n","      <td>0.012540</td>\n","      <td>0.695857</td>\n","      <td>0.008499</td>\n","      <td>0.016858</td>\n","      <td>0.003341</td>\n","      <td>0.009452</td>\n","      <td>0.000168</td>\n","    </tr>\n","    <tr>\n","      <th>412396</th>\n","      <td>0.011996</td>\n","      <td>0.000990</td>\n","      <td>0.000155</td>\n","      <td>0.001122</td>\n","      <td>0.000625</td>\n","      <td>0.000108</td>\n","      <td>0.000475</td>\n","      <td>0.000489</td>\n","      <td>0.001456</td>\n","      <td>0.000034</td>\n","      <td>...</td>\n","      <td>0.004447</td>\n","      <td>0.250461</td>\n","      <td>0.000910</td>\n","      <td>0.010479</td>\n","      <td>0.693750</td>\n","      <td>0.007253</td>\n","      <td>0.017822</td>\n","      <td>0.003547</td>\n","      <td>0.008566</td>\n","      <td>0.000168</td>\n","    </tr>\n","    <tr>\n","      <th>427740</th>\n","      <td>0.001207</td>\n","      <td>0.002310</td>\n","      <td>0.000585</td>\n","      <td>0.002508</td>\n","      <td>0.002355</td>\n","      <td>0.000135</td>\n","      <td>0.002673</td>\n","      <td>0.000067</td>\n","      <td>0.003695</td>\n","      <td>0.000041</td>\n","      <td>...</td>\n","      
<td>0.004405</td>\n","      <td>0.980520</td>\n","      <td>0.008272</td>\n","      <td>0.040371</td>\n","      <td>0.001226</td>\n","      <td>0.805063</td>\n","      <td>0.005581</td>\n","      <td>0.037933</td>\n","      <td>0.004177</td>\n","      <td>0.000233</td>\n","    </tr>\n","    <tr>\n","      <th>...</th>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","      <td>...</td>\n","    </tr>\n","    <tr>\n","      <th>169063</th>\n","      <td>0.004662</td>\n","      <td>0.000559</td>\n","      <td>0.000271</td>\n","      <td>0.001190</td>\n","      <td>0.000868</td>\n","      <td>0.000236</td>\n","      <td>0.002325</td>\n","      <td>0.000239</td>\n","      <td>0.001862</td>\n","      <td>0.000044</td>\n","      <td>...</td>\n","      <td>0.004677</td>\n","      <td>0.037555</td>\n","      <td>0.003252</td>\n","      <td>0.003751</td>\n","      <td>0.017954</td>\n","      <td>0.002785</td>\n","      <td>0.883382</td>\n","      <td>0.002572</td>\n","      <td>0.044943</td>\n","      <td>0.000341</td>\n","    </tr>\n","    <tr>\n","      <th>433255</th>\n","      <td>0.004662</td>\n","      <td>0.000559</td>\n","      <td>0.000271</td>\n","      <td>0.001190</td>\n","      <td>0.000868</td>\n","      <td>0.000236</td>\n","      <td>0.002325</td>\n","      <td>0.000239</td>\n","      <td>0.001862</td>\n","      <td>0.000044</td>\n","      <td>...</td>\n","      <td>0.004677</td>\n","      <td>0.037555</td>\n","      <td>0.003252</td>\n","      <td>0.003751</td>\n","      <td>0.017954</td>\n","      <td>0.002785</td>\n","      <td>0.883382</td>\n","     
 <td>0.002572</td>\n","      <td>0.044943</td>\n","      <td>0.000341</td>\n","    </tr>\n","    <tr>\n","      <th>232204</th>\n","      <td>0.004662</td>\n","      <td>0.000559</td>\n","      <td>0.000271</td>\n","      <td>0.001190</td>\n","      <td>0.000868</td>\n","      <td>0.000236</td>\n","      <td>0.002325</td>\n","      <td>0.000239</td>\n","      <td>0.001862</td>\n","      <td>0.000044</td>\n","      <td>...</td>\n","      <td>0.004677</td>\n","      <td>0.037555</td>\n","      <td>0.003252</td>\n","      <td>0.003751</td>\n","      <td>0.017954</td>\n","      <td>0.002785</td>\n","      <td>0.883382</td>\n","      <td>0.002572</td>\n","      <td>0.044943</td>\n","      <td>0.000341</td>\n","    </tr>\n","    <tr>\n","      <th>171685</th>\n","      <td>0.004662</td>\n","      <td>0.000559</td>\n","      <td>0.000271</td>\n","      <td>0.001190</td>\n","      <td>0.000868</td>\n","      <td>0.000236</td>\n","      <td>0.002325</td>\n","      <td>0.000239</td>\n","      <td>0.001862</td>\n","      <td>0.000044</td>\n","      <td>...</td>\n","      <td>0.004677</td>\n","      <td>0.037555</td>\n","      <td>0.003252</td>\n","      <td>0.003751</td>\n","      <td>0.017954</td>\n","      <td>0.002785</td>\n","      <td>0.883382</td>\n","      <td>0.002572</td>\n","      <td>0.044943</td>\n","      <td>0.000341</td>\n","    </tr>\n","    <tr>\n","      <th>249087</th>\n","      <td>0.004662</td>\n","      <td>0.000559</td>\n","      <td>0.000271</td>\n","      <td>0.001190</td>\n","      <td>0.000868</td>\n","      <td>0.000236</td>\n","      <td>0.002325</td>\n","      <td>0.000239</td>\n","      <td>0.001862</td>\n","      <td>0.000044</td>\n","      <td>...</td>\n","      <td>0.004677</td>\n","      <td>0.037555</td>\n","      <td>0.003252</td>\n","      <td>0.003751</td>\n","      <td>0.017954</td>\n","      <td>0.002785</td>\n","      <td>0.883382</td>\n","      <td>0.002572</td>\n","      <td>0.044943</td>\n","      <td>0.000341</td>\n","    
</tr>\n","  </tbody>\n","</table>\n","<p>50064 rows × 104 columns</p>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-73e59131-014c-49d1-a337-c844b74a21b5')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      
fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-73e59131-014c-49d1-a337-c844b74a21b5 button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-73e59131-014c-49d1-a337-c844b74a21b5');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":31}],"source":["submission_cols = pd.read_csv(f'{dir_path}/DrivenDataCompetition_DataFiles/SubmissionFormat.csv')\n","cols_list = list(submission_cols.columns.values)\n","cols_list = cols_list[1:] ## remove the first column which is the index\n","submission = pd.DataFrame(test_predictions, columns=cols_list)\n","submission.set_index(test_df['Unnamed: 0'], inplace=True)\n","submission.index.name = \"\"\n","submission"]},{"cell_type":"code","execution_count":32,"metadata":{"id":"ep-ruNCfyMKH","executionInfo":{"status":"ok","timestamp":1670513308615,"user_tz":300,"elapsed":7178,"user":{"displayName":"Danny 
Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["submission.to_csv('12072022_tfidf_logreg_Cval_2.csv')"]},{"cell_type":"code","execution_count":32,"metadata":{"id":"XwaSy1i13nax","executionInfo":{"status":"ok","timestamp":1670513308616,"user_tz":300,"elapsed":5,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":[]}],"metadata":{"colab":{"provenance":[{"file_id":"1dLdfXkr3V01-zlCKmWbNew2UXHqO9Vwo","timestamp":1668433872897}],"machine_shape":"hm"},"gpuClass":"premium","kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"},"accelerator":"GPU"},"nbformat":4,"nbformat_minor":0}
\ No newline at end of file