From 76fe5b7265e4707d401f2cd013cd67f3a927635a Mon Sep 17 00:00:00 2001
From: dmath010 <>
Date: Fri, 9 Dec 2022 16:00:58 +0000
Subject: [PATCH] Upload New File

 NLP_Approach.ipynb | 2362 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 2362 insertions(+)
 create mode 100644 NLP_Approach.ipynb

diff --git a/NLP_Approach.ipynb b/NLP_Approach.ipynb
new file mode 100644
index 0000000..28268f3
--- /dev/null
+++ b/NLP_Approach.ipynb
@@ -0,0 +1,2362 @@
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "REugZMxlFCvU",
+        "outputId": "2b6a1767-4296-4d10-cc7a-fcefdb7d6113"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Mounted at /content/gdrive/\n"
+          ]
+        }
+      ],
+      "source": [
+        "from google.colab import drive  # Colab-only helper module for Google Drive access\n",
+        "drive.mount(\"/content/gdrive/\")  # expose Google Drive under /content/gdrive/"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ceMmp0oL8PyI"
+      },
+      "outputs": [],
+      "source": [
+        "import pandas as pd\n",
+        "import numpy as np\n",
+        "import shutil\n",
+        "import sys  \n",
+        "import os\n",
+        "import time\n",
+        "from sklearn.model_selection import train_test_split"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "qAYz11Vr-k5b"
+      },
+      "outputs": [],
+      "source": [
+        "dir_path = '/content/gdrive/Shareddrives/CS5024 Ethics Project'  # absolute path under the drive.mount() target, so it resolves regardless of the working directory\n",
+        "sys.path.append(dir_path)  # make Python modules stored in the shared project folder importable"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "jmTuxI2t8PwQ",
+        "outputId": "d312c306-e195-45bc-95de-fed482fa0869"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "/usr/local/lib/python3.8/dist-packages/IPython/core/ DtypeWarning: Columns (5,11) have mixed types.Specify dtype option on import or set low_memory=False.\n",
+            "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Load the train and test sets; low_memory=False infers each column's dtype from the whole file, avoiding the mixed-type DtypeWarning\n",
+        "train_df = pd.read_csv(f'{dir_path}/DrivenDataCompetition_DataFiles/TrainingData.csv', low_memory=False)\n",
+        "test_df = pd.read_csv(f'{dir_path}/DrivenDataCompetition_DataFiles/TestData.csv', low_memory=False)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Sanity check: (rows, columns) of the training set, then the test set\n",
+        "print(train_df.shape, test_df.shape, sep='\\n')"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "0NPhp8WQ1YA0",
+        "outputId": "33582c11-9cd6-4a16-ef75-957d55f8c906"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "(400277, 26)\n",
+            "(50064, 17)\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Data Preprocessing"
+      ],
+      "metadata": {
+        "id": "Y558pu6Bdbbw"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 797
+        },
+        "id": "Q9Jzqrlwiu83",
+        "outputId": "cf12f590-0d36-4704-de43-6193bc7fcf82"
+      },
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "                                Function          Use  \\\n",
+              "Unnamed: 0                                              \n",
+              "134338              Teacher Compensation  Instruction   \n",
+              "206341                          NO_LABEL     NO_LABEL   \n",
+              "326408              Teacher Compensation  Instruction   \n",
+              "364634           Substitute Compensation  Instruction   \n",
+              "47683            Substitute Compensation  Instruction   \n",
+              "...                                  ...          ...   \n",
+              "109283          Professional Development         ISPD   \n",
+              "102430           Substitute Compensation  Instruction   \n",
+              "413949      Parent & Community Relations     NO_LABEL   \n",
+              "433672                   Library & Media  Instruction   \n",
+              "415831           Substitute Compensation  Instruction   \n",
+              "\n",
+              "                              Sharing   Reporting Student_Type  \\\n",
+              "Unnamed: 0                                                       \n",
+              "134338                School Reported      School     NO_LABEL   \n",
+              "206341                       NO_LABEL    NO_LABEL     NO_LABEL   \n",
+              "326408                School Reported      School  Unspecified   \n",
+              "364634                School Reported      School  Unspecified   \n",
+              "47683                 School Reported      School  Unspecified   \n",
+              "...                               ...         ...          ...   \n",
+              "109283                Shared Services  Non-School  Unspecified   \n",
+              "102430                School Reported      School  Unspecified   \n",
+              "413949                School Reported      School     NO_LABEL   \n",
+              "433672      School on Central Budgets  Non-School  Unspecified   \n",
+              "415831                School Reported      School      Poverty   \n",
+              "\n",
+              "                  Position_Type                 Object_Type     Pre_K  \\\n",
+              "Unnamed: 0                                                              \n",
+              "134338                  Teacher                    NO_LABEL  NO_LABEL   \n",
+              "206341                 NO_LABEL                    NO_LABEL  NO_LABEL   \n",
+              "326408                  Teacher    Base Salary/Compensation  Non PreK   \n",
+              "364634               Substitute                    Benefits  NO_LABEL   \n",
+              "47683                   Teacher     Substitute Compensation  NO_LABEL   \n",
+              "...                         ...                         ...       ...   \n",
+              "109283      Instructional Coach  Other Compensation/Stipend  NO_LABEL   \n",
+              "102430               Substitute    Base Salary/Compensation  NO_LABEL   \n",
+              "413949                    Other                    NO_LABEL  NO_LABEL   \n",
+              "433672                Librarian                    Benefits  NO_LABEL   \n",
+              "415831               Substitute     Substitute Compensation  Non PreK   \n",
+              "\n",
+              "             Operating_Status  \\\n",
+              "Unnamed: 0                      \n",
+              "134338      PreK-12 Operating   \n",
+              "206341          Non-Operating   \n",
+              "326408      PreK-12 Operating   \n",
+              "364634      PreK-12 Operating   \n",
+              "47683       PreK-12 Operating   \n",
+              "...                       ...   \n",
+              "109283      PreK-12 Operating   \n",
+              "102430      PreK-12 Operating   \n",
+              "413949      PreK-12 Operating   \n",
+              "433672      PreK-12 Operating   \n",
+              "415831      PreK-12 Operating   \n",
+              "\n",
+              "                                         Object_Description  ...  \\\n",
+              "Unnamed: 0                                                   ...   \n",
+              "134338                                                  NaN  ...   \n",
+              "206341                                  CONTRACTOR SERVICES  ...   \n",
+              "326408                         Personal Services - Teachers  ...   \n",
+              "364634                                    EMPLOYEE BENEFITS  ...   \n",
+              "47683                          TEACHER COVERAGE FOR TEACHER  ...   \n",
+              "...                                                     ...  ...   \n",
+              "109283                       WORKSHOP PARTICIPANT            ...   \n",
+              "102430                       SALARIES OF PART TIME EMPLOYEE  ...   \n",
+              "413949                                                  NaN  ...   \n",
+              "433672                                    EMPLOYEE BENEFITS  ...   \n",
+              "415831      Salaries And Wages For Substitute Professionals  ...   \n",
+              "\n",
+              "                                Sub_Object_Description  \\\n",
+              "Unnamed: 0                                               \n",
+              "134338                                             NaN   \n",
+              "206341                                             NaN   \n",
+              "326408                                             NaN   \n",
+              "364634                                             NaN   \n",
+              "47683                                              NaN   \n",
+              "...                                                ...   \n",
+              "109283                                             NaN   \n",
+              "102430                                             NaN   \n",
+              "413949                                             NaN   \n",
+              "433672                                             NaN   \n",
+              "415831      Inservice Substitute Teachers Grant Funded   \n",
+              "\n",
+              "                      Location_Description      FTE  \\\n",
+              "Unnamed: 0                                            \n",
+              "134338                                 NaN  1.00000   \n",
+              "206341                                 NaN      NaN   \n",
+              "326408                                 NaN  1.00000   \n",
+              "364634                                 NaN      NaN   \n",
+              "47683                                  NaN      NaN   \n",
+              "...                                    ...      ...   \n",
+              "109283      STAFF DEV AND INSTR MEDIA           NaN   \n",
+              "102430                                 NaN  0.00431   \n",
+              "413949                                 NaN  1.00000   \n",
+              "433672                ED RESOURCE SERVICES      NaN   \n",
+              "415831                             School       NaN   \n",
+              "\n",
+              "                      Function_Description      Facility_or_Department  \\\n",
+              "Unnamed: 0                                                               \n",
+              "134338                                 NaN                         NaN   \n",
+              "206341                            RGN  GOB                         NaN   \n",
+              "326408                                 NaN                         NaN   \n",
+              "364634             UNALLOC BUDGETS/SCHOOLS                         NaN   \n",
+              "47683                          NON-PROJECT                         NaN   \n",
+              "...                                    ...                         ...   \n",
+              "109283      INST STAFF TRAINING SVCS                               NaN   \n",
+              "102430                          TITLE II,D                         NaN   \n",
+              "413949                                 NaN                         NaN   \n",
+              "433672                         NON-PROJECT                         NaN   \n",
+              "415831                         Instruction  Instruction And Curriculum   \n",
+              "\n",
+              "                           Position_Extra         Total  \\\n",
+              "Unnamed: 0                                                \n",
+              "134338                      KINDERGARTEN   50471.810000   \n",
+              "206341                       UNDESIGNATED   3477.860000   \n",
+              "326408                            TEACHER  62237.130000   \n",
+              "364634         PROFESSIONAL-INSTRUCTIONAL     22.300000   \n",
+              "47683          PROFESSIONAL-INSTRUCTIONAL     54.166000   \n",
+              "...                                   ...           ...   \n",
+              "109283                                NaN     48.620000   \n",
+              "102430         PROFESSIONAL-INSTRUCTIONAL    128.824985   \n",
+              "413949                     PARENT/TITLE I   4902.290000   \n",
+              "433672      OFFICE/ADMINISTRATIVE SUPPORT   4020.290000   \n",
+              "415831               CERTIFIED SUBSTITUTE     46.530000   \n",
+              "\n",
+              "                       Program_Description  \\\n",
+              "Unnamed: 0                                   \n",
+              "134338                        KINDERGARTEN   \n",
+              "206341       BUILDING IMPROVEMENT SERVICES   \n",
+              "326408               Instruction - Regular   \n",
+              "364634      GENERAL MIDDLE/JUNIOR HIGH SCH   \n",
+              "47683        GENERAL HIGH SCHOOL EDUCATION   \n",
+              "...                                    ...   \n",
+              "109283                                 NaN   \n",
+              "102430        INSTRUCTIONAL STAFF TRAINING   \n",
+              "413949                                Misc   \n",
+              "433672              MEDIA SUPPORT SERVICES   \n",
+              "415831               Accelerated Education   \n",
+              "\n",
+              "                                    Fund_Description  \\\n",
+              "Unnamed: 0                                             \n",
+              "134338                                  General Fund   \n",
+              "206341                                           NaN   \n",
+              "326408                        General Purpose School   \n",
+              "364634                                           NaN   \n",
+              "47683                                            NaN   \n",
+              "...                                              ...   \n",
+              "109283                GENERAL FUND                     \n",
+              "102430                                           NaN   \n",
+              "413949                            Schoolwide Schools   \n",
+              "433672                                           NaN   \n",
+              "415831      \"Title  Part A Improving Basic Programs\"   \n",
+              "\n",
+              "                                    Text_1  \n",
+              "Unnamed: 0                                  \n",
+              "134338                                 NaN  \n",
+              "206341       BUILDING IMPROVEMENT SERVICES  \n",
+              "326408                                 NaN  \n",
+              "364634                 REGULAR INSTRUCTION  \n",
+              "47683                  REGULAR INSTRUCTION  \n",
+              "...                                    ...  \n",
+              "109283      STAFF DEV AND INSTR MEDIA       \n",
+              "102430                 INSTRUCTIONAL STAFF  \n",
+              "413949                                 NaN  \n",
+              "433672                 INSTRUCTIONAL STAFF  \n",
+              "415831                      MISCELLANEOUS   \n",
+              "\n",
+              "[400277 rows x 25 columns]"
+            ],
+            "text/html": [
+              "\n",
+              "  <div id=\"df-f666659d-cc53-4867-b64a-0d1b481a7c61\">\n",
+              "    <div class=\"colab-df-container\">\n",
+              "      <div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>Function</th>\n",
+              "      <th>Use</th>\n",
+              "      <th>Sharing</th>\n",
+              "      <th>Reporting</th>\n",
+              "      <th>Student_Type</th>\n",
+              "      <th>Position_Type</th>\n",
+              "      <th>Object_Type</th>\n",
+              "      <th>Pre_K</th>\n",
+              "      <th>Operating_Status</th>\n",
+              "      <th>Object_Description</th>\n",
+              "      <th>...</th>\n",
+              "      <th>Sub_Object_Description</th>\n",
+              "      <th>Location_Description</th>\n",
+              "      <th>FTE</th>\n",
+              "      <th>Function_Description</th>\n",
+              "      <th>Facility_or_Department</th>\n",
+              "      <th>Position_Extra</th>\n",
+              "      <th>Total</th>\n",
+              "      <th>Program_Description</th>\n",
+              "      <th>Fund_Description</th>\n",
+              "      <th>Text_1</th>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>Unnamed: 0</th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>134338</th>\n",
+              "      <td>Teacher Compensation</td>\n",
+              "      <td>Instruction</td>\n",
+              "      <td>School Reported</td>\n",
+              "      <td>School</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>Teacher</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>PreK-12 Operating</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>...</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>1.00000</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>KINDERGARTEN</td>\n",
+              "      <td>50471.810000</td>\n",
+              "      <td>KINDERGARTEN</td>\n",
+              "      <td>General Fund</td>\n",
+              "      <td>NaN</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>206341</th>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>Non-Operating</td>\n",
+              "      <td>CONTRACTOR SERVICES</td>\n",
+              "      <td>...</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>RGN  GOB</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>UNDESIGNATED</td>\n",
+              "      <td>3477.860000</td>\n",
+              "      <td>BUILDING IMPROVEMENT SERVICES</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>BUILDING IMPROVEMENT SERVICES</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>326408</th>\n",
+              "      <td>Teacher Compensation</td>\n",
+              "      <td>Instruction</td>\n",
+              "      <td>School Reported</td>\n",
+              "      <td>School</td>\n",
+              "      <td>Unspecified</td>\n",
+              "      <td>Teacher</td>\n",
+              "      <td>Base Salary/Compensation</td>\n",
+              "      <td>Non PreK</td>\n",
+              "      <td>PreK-12 Operating</td>\n",
+              "      <td>Personal Services - Teachers</td>\n",
+              "      <td>...</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>1.00000</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>TEACHER</td>\n",
+              "      <td>62237.130000</td>\n",
+              "      <td>Instruction - Regular</td>\n",
+              "      <td>General Purpose School</td>\n",
+              "      <td>NaN</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>364634</th>\n",
+              "      <td>Substitute Compensation</td>\n",
+              "      <td>Instruction</td>\n",
+              "      <td>School Reported</td>\n",
+              "      <td>School</td>\n",
+              "      <td>Unspecified</td>\n",
+              "      <td>Substitute</td>\n",
+              "      <td>Benefits</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>PreK-12 Operating</td>\n",
+              "      <td>EMPLOYEE BENEFITS</td>\n",
+              "      <td>...</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>UNALLOC BUDGETS/SCHOOLS</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>PROFESSIONAL-INSTRUCTIONAL</td>\n",
+              "      <td>22.300000</td>\n",
+              "      <td>GENERAL MIDDLE/JUNIOR HIGH SCH</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>REGULAR INSTRUCTION</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>47683</th>\n",
+              "      <td>Substitute Compensation</td>\n",
+              "      <td>Instruction</td>\n",
+              "      <td>School Reported</td>\n",
+              "      <td>School</td>\n",
+              "      <td>Unspecified</td>\n",
+              "      <td>Teacher</td>\n",
+              "      <td>Substitute Compensation</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>PreK-12 Operating</td>\n",
+              "      <td>TEACHER COVERAGE FOR TEACHER</td>\n",
+              "      <td>...</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NON-PROJECT</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>PROFESSIONAL-INSTRUCTIONAL</td>\n",
+              "      <td>54.166000</td>\n",
+              "      <td>GENERAL HIGH SCHOOL EDUCATION</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>REGULAR INSTRUCTION</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>...</th>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>109283</th>\n",
+              "      <td>Professional Development</td>\n",
+              "      <td>ISPD</td>\n",
+              "      <td>Shared Services</td>\n",
+              "      <td>Non-School</td>\n",
+              "      <td>Unspecified</td>\n",
+              "      <td>Instructional Coach</td>\n",
+              "      <td>Other Compensation/Stipend</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>PreK-12 Operating</td>\n",
+              "      <td>WORKSHOP PARTICIPANT</td>\n",
+              "      <td>...</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>STAFF DEV AND INSTR MEDIA</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>INST STAFF TRAINING SVCS</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>48.620000</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>GENERAL FUND</td>\n",
+              "      <td>STAFF DEV AND INSTR MEDIA</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>102430</th>\n",
+              "      <td>Substitute Compensation</td>\n",
+              "      <td>Instruction</td>\n",
+              "      <td>School Reported</td>\n",
+              "      <td>School</td>\n",
+              "      <td>Unspecified</td>\n",
+              "      <td>Substitute</td>\n",
+              "      <td>Base Salary/Compensation</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>PreK-12 Operating</td>\n",
+              "      <td>SALARIES OF PART TIME EMPLOYEE</td>\n",
+              "      <td>...</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>0.00431</td>\n",
+              "      <td>TITLE II,D</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>PROFESSIONAL-INSTRUCTIONAL</td>\n",
+              "      <td>128.824985</td>\n",
+              "      <td>INSTRUCTIONAL STAFF TRAINING</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>INSTRUCTIONAL STAFF</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>413949</th>\n",
+              "      <td>Parent &amp; Community Relations</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>School Reported</td>\n",
+              "      <td>School</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>Other</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>PreK-12 Operating</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>...</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>1.00000</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>PARENT/TITLE I</td>\n",
+              "      <td>4902.290000</td>\n",
+              "      <td>Misc</td>\n",
+              "      <td>Schoolwide Schools</td>\n",
+              "      <td>NaN</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>433672</th>\n",
+              "      <td>Library &amp; Media</td>\n",
+              "      <td>Instruction</td>\n",
+              "      <td>School on Central Budgets</td>\n",
+              "      <td>Non-School</td>\n",
+              "      <td>Unspecified</td>\n",
+              "      <td>Librarian</td>\n",
+              "      <td>Benefits</td>\n",
+              "      <td>NO_LABEL</td>\n",
+              "      <td>PreK-12 Operating</td>\n",
+              "      <td>EMPLOYEE BENEFITS</td>\n",
+              "      <td>...</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>ED RESOURCE SERVICES</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>NON-PROJECT</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>OFFICE/ADMINISTRATIVE SUPPORT</td>\n",
+              "      <td>4020.290000</td>\n",
+              "      <td>MEDIA SUPPORT SERVICES</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>INSTRUCTIONAL STAFF</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>415831</th>\n",
+              "      <td>Substitute Compensation</td>\n",
+              "      <td>Instruction</td>\n",
+              "      <td>School Reported</td>\n",
+              "      <td>School</td>\n",
+              "      <td>Poverty</td>\n",
+              "      <td>Substitute</td>\n",
+              "      <td>Substitute Compensation</td>\n",
+              "      <td>Non PreK</td>\n",
+              "      <td>PreK-12 Operating</td>\n",
+              "      <td>Salaries And Wages For Substitute Professionals</td>\n",
+              "      <td>...</td>\n",
+              "      <td>Inservice Substitute Teachers Grant Funded</td>\n",
+              "      <td>School</td>\n",
+              "      <td>NaN</td>\n",
+              "      <td>Instruction</td>\n",
+              "      <td>Instruction And Curriculum</td>\n",
+              "      <td>CERTIFIED SUBSTITUTE</td>\n",
+              "      <td>46.530000</td>\n",
+              "      <td>Accelerated Education</td>\n",
+              "      <td>\"Title  Part A Improving Basic Programs\"</td>\n",
+              "      <td>MISCELLANEOUS</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "<p>400277 rows × 25 columns</p>\n",
+              "</div>\n",
+              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f666659d-cc53-4867-b64a-0d1b481a7c61')\"\n",
+              "              title=\"Convert this dataframe to an interactive table.\"\n",
+              "              style=\"display:none;\">\n",
+              "        \n",
+              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
+              "       width=\"24px\">\n",
+              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
+              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c. 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
+              "  </svg>\n",
+              "      </button>\n",
+              "      \n",
+              "  <style>\n",
+              "    .colab-df-container {\n",
+              "      display:flex;\n",
+              "      flex-wrap:wrap;\n",
+              "      gap: 12px;\n",
+              "    }\n",
+              "\n",
+              "    .colab-df-convert {\n",
+              "      background-color: #E8F0FE;\n",
+              "      border: none;\n",
+              "      border-radius: 50%;\n",
+              "      cursor: pointer;\n",
+              "      display: none;\n",
+              "      fill: #1967D2;\n",
+              "      height: 32px;\n",
+              "      padding: 0 0 0 0;\n",
+              "      width: 32px;\n",
+              "    }\n",
+              "\n",
+              "    .colab-df-convert:hover {\n",
+              "      background-color: #E2EBFA;\n",
+              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
+              "      fill: #174EA6;\n",
+              "    }\n",
+              "\n",
+              "    [theme=dark] .colab-df-convert {\n",
+              "      background-color: #3B4455;\n",
+              "      fill: #D2E3FC;\n",
+              "    }\n",
+              "\n",
+              "    [theme=dark] .colab-df-convert:hover {\n",
+              "      background-color: #434B5C;\n",
+              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
+              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
+              "      fill: #FFFFFF;\n",
+              "    }\n",
+              "  </style>\n",
+              "\n",
+              "      <script>\n",
+              "        const buttonEl =\n",
+              "          document.querySelector('#df-f666659d-cc53-4867-b64a-0d1b481a7c61 button.colab-df-convert');\n",
+              "        buttonEl.style.display =\n",
+              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
+              "\n",
+              "        async function convertToInteractive(key) {\n",
+              "          const element = document.querySelector('#df-f666659d-cc53-4867-b64a-0d1b481a7c61');\n",
+              "          const dataTable =\n",
+              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
+              "                                                     [key], {});\n",
+              "          if (!dataTable) return;\n",
+              "\n",
+              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
+              "            '<a target=\"_blank\" href=>data table notebook</a>'\n",
+              "            + ' to learn more about interactive tables.';\n",
+              "          element.innerHTML = '';\n",
+              "          dataTable['output_type'] = 'display_data';\n",
+              "          await google.colab.output.renderOutput(dataTable, element);\n",
+              "          const docLink = document.createElement('div');\n",
+              "          docLink.innerHTML = docLinkHtml;\n",
+              "          element.appendChild(docLink);\n",
+              "        }\n",
+              "      </script>\n",
+              "    </div>\n",
+              "  </div>\n",
+              "  "
+            ]
+          },
+          "metadata": {},
+          "execution_count": 6
+        }
+      ],
+      "source": [
+        "test_df.set_index('Unnamed: 0')\n",
+        "train_df.set_index('Unnamed: 0')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 1000
+        },
+        "id": "Thh9gm2q8Pt_",
+        "outputId": "4c390cc1-191f-4c51-aa7d-d7a4e549cd93"
+      },
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "                                                            0  \\\n",
+              "Unnamed: 0                                                      \n",
+              "134338            Teacher-Elementary              KINDERGA...   \n",
+              "206341      CONTRACTOR SERVICES BOND EXPENDITURES BUILDING...   \n",
+              "326408      Personal Services - Teachers     TCHER 2ND GRA...   \n",
+              "364634      EMPLOYEE BENEFITS TEACHER SUBS GENERAL FUND Te...   \n",
+              "47683       TEACHER COVERAGE FOR TEACHER TEACHER SUBS GENE...   \n",
+              "...                                                       ...   \n",
+              "109283      WORKSHOP PARTICIPANT               CURRICULUM ...   \n",
+              "102430      SALARIES OF PART TIME EMPLOYEE   FEDERAL GDPG ...   \n",
+              "413949            School Liaison             PARENT/TITLE ...   \n",
+              "433672      EMPLOYEE BENEFITS EDUCATIONAL RESOURCE SERVICE...   \n",
+              "415831      Salaries And Wages For Substitute Professional...   \n",
+              "\n",
+              "            Function_Aides Compensation  \\\n",
+              "Unnamed: 0                                \n",
+              "134338                                0   \n",
+              "206341                                0   \n",
+              "326408                                0   \n",
+              "364634                                0   \n",
+              "47683                                 0   \n",
+              "...                                 ...   \n",
+              "109283                                0   \n",
+              "102430                                0   \n",
+              "413949                                0   \n",
+              "433672                                0   \n",
+              "415831                                0   \n",
+              "\n",
+              "            Function_Career & Academic Counseling  Function_Communications  \\\n",
+              "Unnamed: 0                                                                   \n",
+              "134338                                          0                        0   \n",
+              "206341                                          0                        0   \n",
+              "326408                                          0                        0   \n",
+              "364634                                          0                        0   \n",
+              "47683                                           0                        0   \n",
+              "...                                           ...                      ...   \n",
+              "109283                                          0                        0   \n",
+              "102430                                          0                        0   \n",
+              "413949                                          0                        0   \n",
+              "433672                                          0                        0   \n",
+              "415831                                          0                        0   \n",
+              "\n",
+              "            Function_Curriculum Development  \\\n",
+              "Unnamed: 0                                    \n",
+              "134338                                    0   \n",
+              "206341                                    0   \n",
+              "326408                                    0   \n",
+              "364634                                    0   \n",
+              "47683                                     0   \n",
+              "...                                     ...   \n",
+              "109283                                    0   \n",
+              "102430                                    0   \n",
+              "413949                                    0   \n",
+              "433672                                    0   \n",
+              "415831                                    0   \n",
+              "\n",
+              "            Function_Data Processing & Information Services  \\\n",
+              "Unnamed: 0                                                    \n",
+              "134338                                                    0   \n",
+              "206341                                                    0   \n",
+              "326408                                                    0   \n",
+              "364634                                                    0   \n",
+              "47683                                                     0   \n",
+              "...                                                     ...   \n",
+              "109283                                                    0   \n",
+              "102430                                                    0   \n",
+              "413949                                                    0   \n",
+              "433672                                                    0   \n",
+              "415831                                                    0   \n",
+              "\n",
+              "            Function_Development & Fundraising  Function_Enrichment  \\\n",
+              "Unnamed: 0                                                            \n",
+              "134338                                       0                    0   \n",
+              "206341                                       0                    0   \n",
+              "326408                                       0                    0   \n",
+              "364634                                       0                    0   \n",
+              "47683                                        0                    0   \n",
+              "...                                        ...                  ...   \n",
+              "109283                                       0                    0   \n",
+              "102430                                       0                    0   \n",
+              "413949                                       0                    0   \n",
+              "433672                                       0                    0   \n",
+              "415831                                       0                    0   \n",
+              "\n",
+              "            Function_Extended Time & Tutoring  \\\n",
+              "Unnamed: 0                                      \n",
+              "134338                                      0   \n",
+              "206341                                      0   \n",
+              "326408                                      0   \n",
+              "364634                                      0   \n",
+              "47683                                       0   \n",
+              "...                                       ...   \n",
+              "109283                                      0   \n",
+              "102430                                      0   \n",
+              "413949                                      0   \n",
+              "433672                                      0   \n",
+              "415831                                      0   \n",
+              "\n",
+              "            Function_Facilities & Maintenance  ...  \\\n",
+              "Unnamed: 0                                     ...   \n",
+              "134338                                      0  ...   \n",
+              "206341                                      0  ...   \n",
+              "326408                                      0  ...   \n",
+              "364634                                      0  ...   \n",
+              "47683                                       0  ...   \n",
+              "...                                       ...  ...   \n",
+              "109283                                      0  ...   \n",
+              "102430                                      0  ...   \n",
+              "413949                                      0  ...   \n",
+              "433672                                      0  ...   \n",
+              "415831                                      0  ...   \n",
+              "\n",
+              "            Student_Type_Special Education  Student_Type_Unspecified  \\\n",
+              "Unnamed: 0                                                             \n",
+              "134338                                   0                         0   \n",
+              "206341                                   0                         0   \n",
+              "326408                                   0                         1   \n",
+              "364634                                   0                         1   \n",
+              "47683                                    0                         1   \n",
+              "...                                    ...                       ...   \n",
+              "109283                                   0                         1   \n",
+              "102430                                   0                         1   \n",
+              "413949                                   0                         0   \n",
+              "433672                                   0                         1   \n",
+              "415831                                   0                         0   \n",
+              "\n",
+              "            Use_Business Services  Use_ISPD  Use_Instruction  Use_Leadership  \\\n",
+              "Unnamed: 0                                                                     \n",
+              "134338                          0         0                1               0   \n",
+              "206341                          0         0                0               0   \n",
+              "326408                          0         0                1               0   \n",
+              "364634                          0         0                1               0   \n",
+              "47683                           0         0                1               0   \n",
+              "...                           ...       ...              ...             ...   \n",
+              "109283                          0         1                0               0   \n",
+              "102430                          0         0                1               0   \n",
+              "413949                          0         0                0               0   \n",
+              "433672                          0         0                1               0   \n",
+              "415831                          0         0                1               0   \n",
+              "\n",
+              "            Use_NO_LABEL  Use_O&M  Use_Pupil Services & Enrichment  \\\n",
+              "Unnamed: 0                                                           \n",
+              "134338                 0        0                                0   \n",
+              "206341                 1        0                                0   \n",
+              "326408                 0        0                                0   \n",
+              "364634                 0        0                                0   \n",
+              "47683                  0        0                                0   \n",
+              "...                  ...      ...                              ...   \n",
+              "109283                 0        0                                0   \n",
+              "102430                 0        0                                0   \n",
+              "413949                 1        0                                0   \n",
+              "433672                 0        0                                0   \n",
+              "415831                 0        0                                0   \n",
+              "\n",
+              "            Use_Untracked Budget Set-Aside  \n",
+              "Unnamed: 0                                  \n",
+              "134338                                   0  \n",
+              "206341                                   0  \n",
+              "326408                                   0  \n",
+              "364634                                   0  \n",
+              "47683                                    0  \n",
+              "...                                    ...  \n",
+              "109283                                   0  \n",
+              "102430                                   0  \n",
+              "413949                                   0  \n",
+              "433672                                   0  \n",
+              "415831                                   0  \n",
+              "\n",
+              "[400277 rows x 105 columns]"
+            ],
+            "text/html": [
+              "\n",
+              "  <div id=\"df-efe9ca42-7ce1-4fd4-a880-234b5631bd03\">\n",
+              "    <div class=\"colab-df-container\">\n",
+              "      <div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>0</th>\n",
+              "      <th>Function_Aides Compensation</th>\n",
+              "      <th>Function_Career &amp; Academic Counseling</th>\n",
+              "      <th>Function_Communications</th>\n",
+              "      <th>Function_Curriculum Development</th>\n",
+              "      <th>Function_Data Processing &amp; Information Services</th>\n",
+              "      <th>Function_Development &amp; Fundraising</th>\n",
+              "      <th>Function_Enrichment</th>\n",
+              "      <th>Function_Extended Time &amp; Tutoring</th>\n",
+              "      <th>Function_Facilities &amp; Maintenance</th>\n",
+              "      <th>...</th>\n",
+              "      <th>Student_Type_Special Education</th>\n",
+              "      <th>Student_Type_Unspecified</th>\n",
+              "      <th>Use_Business Services</th>\n",
+              "      <th>Use_ISPD</th>\n",
+              "      <th>Use_Instruction</th>\n",
+              "      <th>Use_Leadership</th>\n",
+              "      <th>Use_NO_LABEL</th>\n",
+              "      <th>Use_O&amp;M</th>\n",
+              "      <th>Use_Pupil Services &amp; Enrichment</th>\n",
+              "      <th>Use_Untracked Budget Set-Aside</th>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>Unnamed: 0</th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>134338</th>\n",
+              "      <td>Teacher-Elementary              KINDERGA...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>206341</th>\n",
+              "      <td>CONTRACTOR SERVICES BOND EXPENDITURES BUILDING...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>326408</th>\n",
+              "      <td>Personal Services - Teachers     TCHER 2ND GRA...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>364634</th>\n",
+              "      <td>EMPLOYEE BENEFITS TEACHER SUBS GENERAL FUND Te...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>47683</th>\n",
+              "      <td>TEACHER COVERAGE FOR TEACHER TEACHER SUBS GENE...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>...</th>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>109283</th>\n",
+              "      <td>WORKSHOP PARTICIPANT               CURRICULUM ...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>102430</th>\n",
+              "      <td>SALARIES OF PART TIME EMPLOYEE   FEDERAL GDPG ...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>413949</th>\n",
+              "      <td>School Liaison             PARENT/TITLE ...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>433672</th>\n",
+              "      <td>EMPLOYEE BENEFITS EDUCATIONAL RESOURCE SERVICE...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>415831</th>\n",
+              "      <td>Salaries And Wages For Substitute Professional...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "<p>400277 rows × 105 columns</p>\n",
+              "</div>\n",
+              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-efe9ca42-7ce1-4fd4-a880-234b5631bd03')\"\n",
+              "              title=\"Convert this dataframe to an interactive table.\"\n",
+              "              style=\"display:none;\">\n",
+              "        \n",
+              "  <svg xmlns=\"\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
+              "       width=\"24px\">\n",
+              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
+              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c. 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
+              "  </svg>\n",
+              "      </button>\n",
+              "      \n",
+              "  <style>\n",
+              "    .colab-df-container {\n",
+              "      display:flex;\n",
+              "      flex-wrap:wrap;\n",
+              "      gap: 12px;\n",
+              "    }\n",
+              "\n",
+              "    .colab-df-convert {\n",
+              "      background-color: #E8F0FE;\n",
+              "      border: none;\n",
+              "      border-radius: 50%;\n",
+              "      cursor: pointer;\n",
+              "      display: none;\n",
+              "      fill: #1967D2;\n",
+              "      height: 32px;\n",
+              "      padding: 0 0 0 0;\n",
+              "      width: 32px;\n",
+              "    }\n",
+              "\n",
+              "    .colab-df-convert:hover {\n",
+              "      background-color: #E2EBFA;\n",
+              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
+              "      fill: #174EA6;\n",
+              "    }\n",
+              "\n",
+              "    [theme=dark] .colab-df-convert {\n",
+              "      background-color: #3B4455;\n",
+              "      fill: #D2E3FC;\n",
+              "    }\n",
+              "\n",
+              "    [theme=dark] .colab-df-convert:hover {\n",
+              "      background-color: #434B5C;\n",
+              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
+              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
+              "      fill: #FFFFFF;\n",
+              "    }\n",
+              "  </style>\n",
+              "\n",
+              "      <script>\n",
+              "        const buttonEl =\n",
+              "          document.querySelector('#df-efe9ca42-7ce1-4fd4-a880-234b5631bd03 button.colab-df-convert');\n",
+              "        buttonEl.style.display =\n",
+              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
+              "\n",
+              "        async function convertToInteractive(key) {\n",
+              "          const element = document.querySelector('#df-efe9ca42-7ce1-4fd4-a880-234b5631bd03');\n",
+              "          const dataTable =\n",
+              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
+              "                                                     [key], {});\n",
+              "          if (!dataTable) return;\n",
+              "\n",
+              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
+              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
+              "            + ' to learn more about interactive tables.';\n",
+              "          element.innerHTML = '';\n",
+              "          dataTable['output_type'] = 'display_data';\n",
+              "          await google.colab.output.renderOutput(dataTable, element);\n",
+              "          const docLink = document.createElement('div');\n",
+              "          docLink.innerHTML = docLinkHtml;\n",
+              "          element.appendChild(docLink);\n",
+              "        }\n",
+              "      </script>\n",
+              "    </div>\n",
+              "  </div>\n",
+              "  "
+            ]
+          },
+          "metadata": {},
+          "execution_count": 7
+        }
+      ],
+      "source": [
+        "# set target Cols\n",
+        "target_cols = ['Function', 'Object_Type', 'Operating_Status', 'Position_Type',  'Pre_K', 'Reporting', 'Sharing', 'Student_Type', 'Use']\n",
+        "\n",
+        "def _combine_text_columns(df):\n",
+        "    \"\"\"\n",
+        "        df — frame whose columns after the first hold strings only\n",
+        "        return: one-column DataFrame (column 0) with every column except the first joined by spaces\n",
+        "    \"\"\"\n",
+        "    # the first column is skipped (presumably the 'Unnamed: 0' row id - confirm against the CSV layout)\n",
+        "    return pd.DataFrame([' '.join(row) for row in df[df.columns[1:]].values])\n",
+        "\n",
+        "# drop train int cols\n",
+        "train_df.drop(['FTE', 'Total'], axis = 1, inplace=True)\n",
+        "# get dummies for target cols\n",
+        "col = pd.get_dummies(train_df[target_cols])\n",
+        "# drop target cols\n",
+        "train_df = train_df.drop(target_cols, axis=1)\n",
+        "# fill NaN with space so every remaining cell can be joined as text\n",
+        "train_df.fillna(' ', inplace=True)\n",
+        "# combine all text into single col and join it with the dummy labels\n",
+        "train_df_dummies = pd.concat([_combine_text_columns(train_df), col], axis = 1)\n",
+        "# drop test int cols\n",
+        "test_df.drop(['FTE', 'Total'], axis = 1, inplace=True)\n",
+        "# fill NaN with space\n",
+        "test_df.fillna(' ', inplace=True)\n",
+        "# combine all text into single col\n",
+        "test_df_cleaned = _combine_text_columns(test_df)\n",
+        "# reset indices to original; set_index returns a new frame, so the result has to be assigned\n",
+        "test_df_cleaned = test_df_cleaned.set_index(test_df['Unnamed: 0'])\n",
+        "train_df_dummies = train_df_dummies.set_index(train_df['Unnamed: 0'])\n",
+        "# show the assembled training frame\n",
+        "train_df_dummies"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "sIdbRlvsE2Fn"
+      },
+      "outputs": [],
+      "source": [
+        "cols = col.columns"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "5ZhHV0028PjT"
+      },
+      "outputs": [],
+      "source": [
+        "# give the joined-text column (created as column 0) its final name in both frames\n",
+        "for frame in (test_df_cleaned, train_df_dummies):\n",
+        "    frame.rename(columns={0: \"text\"}, inplace=True)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "DSMXH2Qmc0Y_",
+        "outputId": "fe62771e-2bf1-44f4-88e1-d56dae02b300"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
+            "[nltk_data]   Unzipping corpora/stopwords.zip.\n",
+            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
+            "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
+          ]
+        }
+      ],
+      "source": [
+        "import nltk\n",
+        "# fetch the corpora used below: the stop-word list and the punkt tokenizer models\n",
+        "nltk.download('stopwords')\n",
+        "nltk.download('punkt')\n",
+        "from nltk import word_tokenize\n",
+        "from nltk.corpus import stopwords\n",
+        "from nltk.stem.snowball import SnowballStemmer\n",
+        "import re"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Xly7pR2-CZhx"
+      },
+      "outputs": [],
+      "source": [
+        "def text_processing(df):\n",
+        "    \"\"\"\n",
+        "        df — DataFrame with a string column named 'text'\n",
+        "        Rewrites df['text'] in place with lower-cased, letters-only, stop-word-free, stemmed text.\n",
+        "        Returns nothing.\n",
+        "    \"\"\"\n",
+        "    english_stops = set(stopwords.words('english'))\n",
+        "    stemmer = SnowballStemmer('english')\n",
+        "\n",
+        "    def _normalise(raw):\n",
+        "        # lower-case and trim the ends\n",
+        "        lowered = raw.lower().strip()\n",
+        "        # squeeze runs of spaces\n",
+        "        squeezed = re.sub(' +', ' ', lowered)\n",
+        "        # anything that is not a letter becomes a space\n",
+        "        letters_only = re.sub('[^a-zA-Z]', ' ', squeezed)\n",
+        "        # drop stop words, stem what is left\n",
+        "        kept = [stemmer.stem(token) for token in letters_only.split() if token not in english_stops]\n",
+        "        return ' '.join(kept)\n",
+        "\n",
+        "    df['text'] = df['text'].apply(_normalise)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rR1Z3FPoh7c3"
+      },
+      "outputs": [],
+      "source": [
+        "# clean df\n",
+        "text_processing(train_df_dummies)\n",
+        "text_processing(test_df_cleaned)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Embedding"
+      ],
+      "metadata": {
+        "id": "4DkHoAlAdS8j"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "iQAX3AoUnrtv"
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.feature_extraction.text import TfidfVectorizer"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "dpLEeR7dC9Bd"
+      },
+      "outputs": [],
+      "source": [
+        "# features: the cleaned text column; targets: every one-hot label column that follows it\n",
+        "X = train_df_dummies['text'].values\n",
+        "y = train_df_dummies[list(train_df_dummies.columns[1:])].values\n",
+        "# fixed seed so the 80/20 split is identical on every run and the reported metrics are reproducible\n",
+        "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=42)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "J4KQiBI0oZUp"
+      },
+      "outputs": [],
+      "source": [
+        "def tfidf_features(X_train, X_val, X_test):\n",
+        "    \"\"\"\n",
+        "        X_train, X_val, X_test — iterables of text samples\n",
+        "        return: TF-IDF matrices for the train, validation and test samples (in that order)\n",
+        "                followed by the fitted vocabulary (term -> column index)\n",
+        "    \"\"\"\n",
+        "    # Unigrams + bigrams; a token is any run of non-whitespace characters.\n",
+        "    # Raw string: '\\S' in a plain literal is an invalid escape sequence that newer Pythons warn about.\n",
+        "    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=1.0, min_df=1, token_pattern=r'(\\S+)')\n",
+        "    # Fit on the training samples only so validation/test text cannot influence the vocabulary or IDF weights\n",
+        "    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n",
+        "    # Re-use the fitted vectorizer for the held-out sets\n",
+        "    X_val_tfidf = tfidf_vectorizer.transform(X_val)\n",
+        "    X_test_tfidf = tfidf_vectorizer.transform(X_test)\n",
+        "    \n",
+        "    return X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vectorizer.vocabulary_\n",
+        "# start the stopwatch that the training cell reads after fitting\n",
+        "timer = time.time()\n",
+        "X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, test_df_cleaned['text'])\n",
+        "# inverse lookup: column index -> term\n",
+        "tfidf_reversed_vocab = {index: term for term, index in tfidf_vocab.items()}"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Logistic Regression"
+      ],
+      "metadata": {
+        "id": "ydQ-7nJdT7Ad"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "nwyPgHYMplz3",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "045b6532-7308-4018-b3c3-af14efe9aec5"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Time for NLP training\" 1387.7084062099457\n",
+            "Time for NLP inference\" 0.6320400238037109\n"
+          ]
+        }
+      ],
+      "source": [
+        "from sklearn.multiclass import OneVsRestClassifier\n",
+        "from sklearn.linear_model import LogisticRegression, RidgeClassifier\n",
+        "\n",
+        "def train_logreg(X_train, y_train, C, regularisation, max_iter=10000, n_jobs=None):\n",
+        "    \"\"\"\n",
+        "      X_train, y_train — training data\n",
+        "      C — inverse regularisation strength passed to LogisticRegression\n",
+        "      regularisation — penalty name passed to LogisticRegression (e.g. 'l2')\n",
+        "      max_iter — solver iteration cap for each underlying LogisticRegression\n",
+        "      n_jobs — forwarded to OneVsRestClassifier; None keeps the serial default, -1 uses all cores\n",
+        "      \n",
+        "      return: trained classifier\n",
+        "    \"\"\"\n",
+        "    # Create and fit LogisticRegression wrapped into OneVsRestClassifier.\n",
+        "    base_estimator = LogisticRegression(penalty=regularisation, C=C, max_iter=max_iter)\n",
+        "    model = OneVsRestClassifier(base_estimator, n_jobs=n_jobs).fit(X_train, y_train)\n",
+        "    return model\n",
+        "\n",
+        "logreg_tfidf = train_logreg(X_train_tfidf, y_train, C=2, regularisation='l2')\n",
+        "# `timer` was started in the TF-IDF cell, so this figure covers vectorisation as well as fitting\n",
+        "training_seconds = time.time() - timer\n",
+        "print (f'Time for NLP training\" {training_seconds}')\n",
+        "# restart the stopwatch for the validation-set prediction\n",
+        "timer = time.time()\n",
+        "y_val_predicted_labels_logreg = logreg_tfidf.predict(X_val_tfidf)\n",
+        "inference_seconds = time.time() - timer\n",
+        "print (f'Time for NLP inference\" {inference_seconds}')\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# XGBoost"
+      ],
+      "metadata": {
+        "id": "SOFP4vZFUJiu"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# from sklearn.multiclass import OneVsRestClassifier\n",
+        "# from xgboost import XGBClassifier\n",
+        "\n",
+        "# def train_xgb(X_train, y_train, params):\n",
+        "#     \"\"\"\n",
+        "#     X_train, y_train — training data\n",
+        "    \n",
+        "#     return: trained classifier\n",
+        "#     \"\"\"\n",
+        "    \n",
+        "#     # Create and fit XGBoost wrapped into OneVsRestClassifier.\n",
+        "\n",
+        "#     model = OneVsRestClassifier(XGBClassifier(**params)).fit(X_train, y_train)\n",
+        "#     return model\n",
+        "# xgb_params = {'eta': 0.3, \n",
+        "#               'max_depth': 5, \n",
+        "#               'subsample': 0.8, \n",
+        "#               'colsample_bytree': 0.8, \n",
+        "#               'tree_method' : 'gpu_hist',\n",
+        "#               'objective': 'binary:logistic', \n",
+        "#               'eval_metric': 'auc', \n",
+        "#               'seed': 42\n",
+        "#              }\n",
+        "# xgb_tfidf = train_xgb(X_train_tfidf, y_train, xgb_params)\n",
+        "# y_val_predicted_labels_xgb = xgb_tfidf.predict(X_val_tfidf)"
+      ],
+      "metadata": {
+        "id": "r42fPuwwURXN"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# SGD"
+      ],
+      "metadata": {
+        "id": "WeL9xN2WLpyO"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# from sklearn.multiclass import OneVsRestClassifier\n",
+        "# from sklearn.linear_model import SGDClassifier\n",
+        "# def train_sgd(X_train, y_train):\n",
+        "#     \"\"\"\n",
+        "#       X_train, y_train — training data\n",
+        "      \n",
+        "#       return: trained classifier\n",
+        "#     \"\"\"\n",
+        "    \n",
+        "#     # Create and fit SGDClassifier (logistic loss) wrapped into OneVsRestClassifier.\n",
+        "\n",
+        "#     model = OneVsRestClassifier(SGDClassifier(loss = 'log', penalty = 'l2')).fit(X_train, y_train)\n",
+        "#     return model\n",
+        "\n",
+        "# sgd_tfidf = train_sgd(X_train_tfidf, y_train)\n",
+        "# print (f'Time for NLP training\" {time.time()-timer}')\n",
+        "# timer = time.time()\n",
+        "# y_val_predicted_labels_sgd = sgd_tfidf.predict(X_val_tfidf)\n",
+        "# print (f'Time for NLP inference\" {time.time()-timer}')\n"
+      ],
+      "metadata": {
+        "id": "CxxB9ABSLrs1"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Model Evaluation"
+      ],
+      "metadata": {
+        "id": "aToqJ94RdO9R"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from sklearn.metrics import accuracy_score\n",
+        "from sklearn.metrics import f1_score\n",
+        "from sklearn.metrics import roc_auc_score \n",
+        "from sklearn.metrics import precision_score\n",
+        "from sklearn.metrics import average_precision_score\n",
+        "from sklearn.metrics import recall_score\n",
+        "from sklearn.metrics import log_loss"
+      ],
+      "metadata": {
+        "id": "a2ULt-H4dAT5"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def print_evaluation_scores(y_test, predicted):\n",
+        "    \"\"\"\n",
+        "        y_test — true labels, predicted — hard (0/1) predictions for the same samples\n",
+        "        (both assumed to be indicator arrays with one column per label - TODO confirm for other callers)\n",
+        "\n",
+        "        Prints accuracy, F1, precision and log loss; returns nothing.\n",
+        "    \"\"\"\n",
+        "    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=True))\n",
+        "    # No `labels=` restriction here: on indicator targets `labels=[1]` selects label column 1 only,\n",
+        "    # which made the macro and micro figures identical single-label scores.\n",
+        "    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))\n",
+        "    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))\n",
+        "    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))\n",
+        "    # precision_score matches the printed name; average_precision_score is a ranking metric meant for scores\n",
+        "    print('Precision macro: ', precision_score(y_test, predicted, average='macro'))\n",
+        "    print('Precision micro: ', precision_score(y_test, predicted, average='micro'))\n",
+        "    print('Precision weighted: ', precision_score(y_test, predicted, average='weighted'))\n",
+        "    # NOTE(review): log loss is computed from hard 0/1 predictions here, not probabilities -\n",
+        "    # consider evaluating predict_proba output instead if this number is to be interpreted.\n",
+        "    print('Log Loss: ', log_loss(y_test, predicted, normalize=True))\n",
+        "\n",
+        "    \n",
+        "print('Metrics')\n",
+        "print_evaluation_scores(y_val, y_val_predicted_labels_logreg)\n",
+        "# print_evaluation_scores(y_val, y_val_predicted_labels_xgb)\n",
+        "#  print_evaluation_scores(y_val, y_val_predicted_labels_sgd)"
+      ],
+      "metadata": {
+        "id": "g1xaUV6Ic1kS",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "e1fb88d6-f5e0-4b75-c656-9b0ee6ea87da"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Metrics\n",
+            "Accuracy:  0.898533526531428\n",
+            "F1-score macro:  0.857397504456328\n",
+            "F1-score micro:  0.857397504456328\n",
+            "F1-score weighted:  0.9780481032620225\n",
+            "Precision macro:  0.8548790839083935\n",
+            "Precision micro:  0.9595717445537028\n",
+            "Precision weighted:  0.8809523809523809\n",
+            "Log Loss:  27.62320822221433\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Submission"
+      ],
+      "metadata": {
+        "id": "4e9yV9CadHwO"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "75VcWGLmvvCi"
+      },
+      "outputs": [],
+      "source": [
+        "test_predictions = logreg_tfidf.predict_proba(X_test_tfidf)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "GhNpC96cwRmD",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 554
+        },
+        "outputId": "aa8f5639-897c-4437-aca7-daafddd03797"
+      },
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "        Function__Aides Compensation  Function__Career & Academic Counseling  \\\n",
+              "                                                                               \n",
+              "180042                      0.005369                                0.001911   \n",
+              "28872                       0.000574                                0.001182   \n",
+              "186915                      0.017363                                0.000992   \n",
+              "412396                      0.011996                                0.000990   \n",
+              "427740                      0.001207                                0.002310   \n",
+              "...                              ...                                     ...   \n",
+              "169063                      0.004662                                0.000559   \n",
+              "433255                      0.004662                                0.000559   \n",
+              "232204                      0.004662                                0.000559   \n",
+              "171685                      0.004662                                0.000559   \n",
+              "249087                      0.004662                                0.000559   \n",
+              "\n",
+              "        Function__Communications  Function__Curriculum Development  \\\n",
+              "                                                                     \n",
+              "180042                  0.000062                          0.000037   \n",
+              "28872                   0.000243                          0.005150   \n",
+              "186915                  0.000156                          0.001177   \n",
+              "412396                  0.000155                          0.001122   \n",
+              "427740                  0.000585                          0.002508   \n",
+              "...                          ...                               ...   \n",
+              "169063                  0.000271                          0.001190   \n",
+              "433255                  0.000271                          0.001190   \n",
+              "232204                  0.000271                          0.001190   \n",
+              "171685                  0.000271                          0.001190   \n",
+              "249087                  0.000271                          0.001190   \n",
+              "\n",
+              "        Function__Data Processing & Information Services  \\\n",
+              "                                                           \n",
+              "180042                                          0.000424   \n",
+              "28872                                           0.001041   \n",
+              "186915                                          0.000616   \n",
+              "412396                                          0.000625   \n",
+              "427740                                          0.002355   \n",
+              "...                                                  ...   \n",
+              "169063                                          0.000868   \n",
+              "433255                                          0.000868   \n",
+              "232204                                          0.000868   \n",
+              "171685                                          0.000868   \n",
+              "249087                                          0.000868   \n",
+              "\n",
+              "        Function__Development & Fundraising  Function__Enrichment  \\\n",
+              "                                                                    \n",
+              "180042                             0.000088              0.000996   \n",
+              "28872                              0.000150              0.227637   \n",
+              "186915                             0.000108              0.000500   \n",
+              "412396                             0.000108              0.000475   \n",
+              "427740                             0.000135              0.002673   \n",
+              "...                                     ...                   ...   \n",
+              "169063                             0.000236              0.002325   \n",
+              "433255                             0.000236              0.002325   \n",
+              "232204                             0.000236              0.002325   \n",
+              "171685                             0.000236              0.002325   \n",
+              "249087                             0.000236              0.002325   \n",
+              "\n",
+              "        Function__Extended Time & Tutoring  \\\n",
+              "                                             \n",
+              "180042                            0.000203   \n",
+              "28872                             0.003900   \n",
+              "186915                            0.000463   \n",
+              "412396                            0.000489   \n",
+              "427740                            0.000067   \n",
+              "...                                    ...   \n",
+              "169063                            0.000239   \n",
+              "433255                            0.000239   \n",
+              "232204                            0.000239   \n",
+              "171685                            0.000239   \n",
+              "249087                            0.000239   \n",
+              "\n",
+              "        Function__Facilities & Maintenance  Function__Facilities Planning  \\\n",
+              "                                                                            \n",
+              "180042                            0.001402                       0.000029   \n",
+              "28872                             0.006152                       0.000040   \n",
+              "186915                            0.001457                       0.000034   \n",
+              "412396                            0.001456                       0.000034   \n",
+              "427740                            0.003695                       0.000041   \n",
+              "...                                    ...                            ...   \n",
+              "169063                            0.001862                       0.000044   \n",
+              "433255                            0.001862                       0.000044   \n",
+              "232204                            0.001862                       0.000044   \n",
+              "171685                            0.001862                       0.000044   \n",
+              "249087                            0.001862                       0.000044   \n",
+              "\n",
+              "        ...  Student_Type__Special Education  Student_Type__Unspecified  \\\n",
+              "        ...                                                               \n",
+              "180042  ...                         0.002347                   0.836926   \n",
+              "28872   ...                         0.005913                   0.894910   \n",
+              "186915  ...                         0.004487                   0.282162   \n",
+              "412396  ...                         0.004447                   0.250461   \n",
+              "427740  ...                         0.004405                   0.980520   \n",
+              "...     ...                              ...                        ...   \n",
+              "169063  ...                         0.004677                   0.037555   \n",
+              "433255  ...                         0.004677                   0.037555   \n",
+              "232204  ...                         0.004677                   0.037555   \n",
+              "171685  ...                         0.004677                   0.037555   \n",
+              "249087  ...                         0.004677                   0.037555   \n",
+              "\n",
+              "        Use__Business Services  Use__ISPD  Use__Instruction  Use__Leadership  \\\n",
+              "                                                                               \n",
+              "180042                0.000200   0.003739          0.064349         0.003398   \n",
+              "28872                 0.003721   0.009994          0.011118         0.023945   \n",
+              "186915                0.000895   0.012540          0.695857         0.008499   \n",
+              "412396                0.000910   0.010479          0.693750         0.007253   \n",
+              "427740                0.008272   0.040371          0.001226         0.805063   \n",
+              "...                        ...        ...               ...              ...   \n",
+              "169063                0.003252   0.003751          0.017954         0.002785   \n",
+              "433255                0.003252   0.003751          0.017954         0.002785   \n",
+              "232204                0.003252   0.003751          0.017954         0.002785   \n",
+              "171685                0.003252   0.003751          0.017954         0.002785   \n",
+              "249087                0.003252   0.003751          0.017954         0.002785   \n",
+              "\n",
+              "        Use__NO_LABEL  Use__O&M  Use__Pupil Services & Enrichment  \\\n",
+              "                                                                    \n",
+              "180042       0.264019  0.006676                          0.005294   \n",
+              "28872        0.047036  0.023556                          0.146622   \n",
+              "186915       0.016858  0.003341                          0.009452   \n",
+              "412396       0.017822  0.003547                          0.008566   \n",
+              "427740       0.005581  0.037933                          0.004177   \n",
+              "...               ...       ...                               ...   \n",
+              "169063       0.883382  0.002572                          0.044943   \n",
+              "433255       0.883382  0.002572                          0.044943   \n",
+              "232204       0.883382  0.002572                          0.044943   \n",
+              "171685       0.883382  0.002572                          0.044943   \n",
+              "249087       0.883382  0.002572                          0.044943   \n",
+              "\n",
+              "        Use__Untracked Budget Set-Aside  \n",
+              "                                         \n",
+              "180042                         0.000372  \n",
+              "28872                          0.000244  \n",
+              "186915                         0.000168  \n",
+              "412396                         0.000168  \n",
+              "427740                         0.000233  \n",
+              "...                                 ...  \n",
+              "169063                         0.000341  \n",
+              "433255                         0.000341  \n",
+              "232204                         0.000341  \n",
+              "171685                         0.000341  \n",
+              "249087                         0.000341  \n",
+              "\n",
+              "[50064 rows x 104 columns]"
+            ],
+            "text/html": [
+              "\n",
+              "  <div id=\"df-73e59131-014c-49d1-a337-c844b74a21b5\">\n",
+              "    <div class=\"colab-df-container\">\n",
+              "      <div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>Function__Aides Compensation</th>\n",
+              "      <th>Function__Career &amp; Academic Counseling</th>\n",
+              "      <th>Function__Communications</th>\n",
+              "      <th>Function__Curriculum Development</th>\n",
+              "      <th>Function__Data Processing &amp; Information Services</th>\n",
+              "      <th>Function__Development &amp; Fundraising</th>\n",
+              "      <th>Function__Enrichment</th>\n",
+              "      <th>Function__Extended Time &amp; Tutoring</th>\n",
+              "      <th>Function__Facilities &amp; Maintenance</th>\n",
+              "      <th>Function__Facilities Planning</th>\n",
+              "      <th>...</th>\n",
+              "      <th>Student_Type__Special Education</th>\n",
+              "      <th>Student_Type__Unspecified</th>\n",
+              "      <th>Use__Business Services</th>\n",
+              "      <th>Use__ISPD</th>\n",
+              "      <th>Use__Instruction</th>\n",
+              "      <th>Use__Leadership</th>\n",
+              "      <th>Use__NO_LABEL</th>\n",
+              "      <th>Use__O&amp;M</th>\n",
+              "      <th>Use__Pupil Services &amp; Enrichment</th>\n",
+              "      <th>Use__Untracked Budget Set-Aside</th>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "      <th></th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>180042</th>\n",
+              "      <td>0.005369</td>\n",
+              "      <td>0.001911</td>\n",
+              "      <td>0.000062</td>\n",
+              "      <td>0.000037</td>\n",
+              "      <td>0.000424</td>\n",
+              "      <td>0.000088</td>\n",
+              "      <td>0.000996</td>\n",
+              "      <td>0.000203</td>\n",
+              "      <td>0.001402</td>\n",
+              "      <td>0.000029</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0.002347</td>\n",
+              "      <td>0.836926</td>\n",
+              "      <td>0.000200</td>\n",
+              "      <td>0.003739</td>\n",
+              "      <td>0.064349</td>\n",
+              "      <td>0.003398</td>\n",
+              "      <td>0.264019</td>\n",
+              "      <td>0.006676</td>\n",
+              "      <td>0.005294</td>\n",
+              "      <td>0.000372</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>28872</th>\n",
+              "      <td>0.000574</td>\n",
+              "      <td>0.001182</td>\n",
+              "      <td>0.000243</td>\n",
+              "      <td>0.005150</td>\n",
+              "      <td>0.001041</td>\n",
+              "      <td>0.000150</td>\n",
+              "      <td>0.227637</td>\n",
+              "      <td>0.003900</td>\n",
+              "      <td>0.006152</td>\n",
+              "      <td>0.000040</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0.005913</td>\n",
+              "      <td>0.894910</td>\n",
+              "      <td>0.003721</td>\n",
+              "      <td>0.009994</td>\n",
+              "      <td>0.011118</td>\n",
+              "      <td>0.023945</td>\n",
+              "      <td>0.047036</td>\n",
+              "      <td>0.023556</td>\n",
+              "      <td>0.146622</td>\n",
+              "      <td>0.000244</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>186915</th>\n",
+              "      <td>0.017363</td>\n",
+              "      <td>0.000992</td>\n",
+              "      <td>0.000156</td>\n",
+              "      <td>0.001177</td>\n",
+              "      <td>0.000616</td>\n",
+              "      <td>0.000108</td>\n",
+              "      <td>0.000500</td>\n",
+              "      <td>0.000463</td>\n",
+              "      <td>0.001457</td>\n",
+              "      <td>0.000034</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0.004487</td>\n",
+              "      <td>0.282162</td>\n",
+              "      <td>0.000895</td>\n",
+              "      <td>0.012540</td>\n",
+              "      <td>0.695857</td>\n",
+              "      <td>0.008499</td>\n",
+              "      <td>0.016858</td>\n",
+              "      <td>0.003341</td>\n",
+              "      <td>0.009452</td>\n",
+              "      <td>0.000168</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>412396</th>\n",
+              "      <td>0.011996</td>\n",
+              "      <td>0.000990</td>\n",
+              "      <td>0.000155</td>\n",
+              "      <td>0.001122</td>\n",
+              "      <td>0.000625</td>\n",
+              "      <td>0.000108</td>\n",
+              "      <td>0.000475</td>\n",
+              "      <td>0.000489</td>\n",
+              "      <td>0.001456</td>\n",
+              "      <td>0.000034</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0.004447</td>\n",
+              "      <td>0.250461</td>\n",
+              "      <td>0.000910</td>\n",
+              "      <td>0.010479</td>\n",
+              "      <td>0.693750</td>\n",
+              "      <td>0.007253</td>\n",
+              "      <td>0.017822</td>\n",
+              "      <td>0.003547</td>\n",
+              "      <td>0.008566</td>\n",
+              "      <td>0.000168</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>427740</th>\n",
+              "      <td>0.001207</td>\n",
+              "      <td>0.002310</td>\n",
+              "      <td>0.000585</td>\n",
+              "      <td>0.002508</td>\n",
+              "      <td>0.002355</td>\n",
+              "      <td>0.000135</td>\n",
+              "      <td>0.002673</td>\n",
+              "      <td>0.000067</td>\n",
+              "      <td>0.003695</td>\n",
+              "      <td>0.000041</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0.004405</td>\n",
+              "      <td>0.980520</td>\n",
+              "      <td>0.008272</td>\n",
+              "      <td>0.040371</td>\n",
+              "      <td>0.001226</td>\n",
+              "      <td>0.805063</td>\n",
+              "      <td>0.005581</td>\n",
+              "      <td>0.037933</td>\n",
+              "      <td>0.004177</td>\n",
+              "      <td>0.000233</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>...</th>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>169063</th>\n",
+              "      <td>0.004662</td>\n",
+              "      <td>0.000559</td>\n",
+              "      <td>0.000271</td>\n",
+              "      <td>0.001190</td>\n",
+              "      <td>0.000868</td>\n",
+              "      <td>0.000236</td>\n",
+              "      <td>0.002325</td>\n",
+              "      <td>0.000239</td>\n",
+              "      <td>0.001862</td>\n",
+              "      <td>0.000044</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0.004677</td>\n",
+              "      <td>0.037555</td>\n",
+              "      <td>0.003252</td>\n",
+              "      <td>0.003751</td>\n",
+              "      <td>0.017954</td>\n",
+              "      <td>0.002785</td>\n",
+              "      <td>0.883382</td>\n",
+              "      <td>0.002572</td>\n",
+              "      <td>0.044943</td>\n",
+              "      <td>0.000341</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>433255</th>\n",
+              "      <td>0.004662</td>\n",
+              "      <td>0.000559</td>\n",
+              "      <td>0.000271</td>\n",
+              "      <td>0.001190</td>\n",
+              "      <td>0.000868</td>\n",
+              "      <td>0.000236</td>\n",
+              "      <td>0.002325</td>\n",
+              "      <td>0.000239</td>\n",
+              "      <td>0.001862</td>\n",
+              "      <td>0.000044</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0.004677</td>\n",
+              "      <td>0.037555</td>\n",
+              "      <td>0.003252</td>\n",
+              "      <td>0.003751</td>\n",
+              "      <td>0.017954</td>\n",
+              "      <td>0.002785</td>\n",
+              "      <td>0.883382</td>\n",
+              "      <td>0.002572</td>\n",
+              "      <td>0.044943</td>\n",
+              "      <td>0.000341</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>232204</th>\n",
+              "      <td>0.004662</td>\n",
+              "      <td>0.000559</td>\n",
+              "      <td>0.000271</td>\n",
+              "      <td>0.001190</td>\n",
+              "      <td>0.000868</td>\n",
+              "      <td>0.000236</td>\n",
+              "      <td>0.002325</td>\n",
+              "      <td>0.000239</td>\n",
+              "      <td>0.001862</td>\n",
+              "      <td>0.000044</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0.004677</td>\n",
+              "      <td>0.037555</td>\n",
+              "      <td>0.003252</td>\n",
+              "      <td>0.003751</td>\n",
+              "      <td>0.017954</td>\n",
+              "      <td>0.002785</td>\n",
+              "      <td>0.883382</td>\n",
+              "      <td>0.002572</td>\n",
+              "      <td>0.044943</td>\n",
+              "      <td>0.000341</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>171685</th>\n",
+              "      <td>0.004662</td>\n",
+              "      <td>0.000559</td>\n",
+              "      <td>0.000271</td>\n",
+              "      <td>0.001190</td>\n",
+              "      <td>0.000868</td>\n",
+              "      <td>0.000236</td>\n",
+              "      <td>0.002325</td>\n",
+              "      <td>0.000239</td>\n",
+              "      <td>0.001862</td>\n",
+              "      <td>0.000044</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0.004677</td>\n",
+              "      <td>0.037555</td>\n",
+              "      <td>0.003252</td>\n",
+              "      <td>0.003751</td>\n",
+              "      <td>0.017954</td>\n",
+              "      <td>0.002785</td>\n",
+              "      <td>0.883382</td>\n",
+              "      <td>0.002572</td>\n",
+              "      <td>0.044943</td>\n",
+              "      <td>0.000341</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>249087</th>\n",
+              "      <td>0.004662</td>\n",
+              "      <td>0.000559</td>\n",
+              "      <td>0.000271</td>\n",
+              "      <td>0.001190</td>\n",
+              "      <td>0.000868</td>\n",
+              "      <td>0.000236</td>\n",
+              "      <td>0.002325</td>\n",
+              "      <td>0.000239</td>\n",
+              "      <td>0.001862</td>\n",
+              "      <td>0.000044</td>\n",
+              "      <td>...</td>\n",
+              "      <td>0.004677</td>\n",
+              "      <td>0.037555</td>\n",
+              "      <td>0.003252</td>\n",
+              "      <td>0.003751</td>\n",
+              "      <td>0.017954</td>\n",
+              "      <td>0.002785</td>\n",
+              "      <td>0.883382</td>\n",
+              "      <td>0.002572</td>\n",
+              "      <td>0.044943</td>\n",
+              "      <td>0.000341</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "<p>50064 rows × 104 columns</p>\n",
+              "</div>\n",
+              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-73e59131-014c-49d1-a337-c844b74a21b5')\"\n",
+              "              title=\"Convert this dataframe to an interactive table.\"\n",
+              "              style=\"display:none;\">\n",
+              "        \n",
+              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
+              "       width=\"24px\">\n",
+              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
+              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
+              "  </svg>\n",
+              "      </button>\n",
+              "      \n",
+              "  <style>\n",
+              "    .colab-df-container {\n",
+              "      display:flex;\n",
+              "      flex-wrap:wrap;\n",
+              "      gap: 12px;\n",
+              "    }\n",
+              "\n",
+              "    .colab-df-convert {\n",
+              "      background-color: #E8F0FE;\n",
+              "      border: none;\n",
+              "      border-radius: 50%;\n",
+              "      cursor: pointer;\n",
+              "      display: none;\n",
+              "      fill: #1967D2;\n",
+              "      height: 32px;\n",
+              "      padding: 0 0 0 0;\n",
+              "      width: 32px;\n",
+              "    }\n",
+              "\n",
+              "    .colab-df-convert:hover {\n",
+              "      background-color: #E2EBFA;\n",
+              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
+              "      fill: #174EA6;\n",
+              "    }\n",
+              "\n",
+              "    [theme=dark] .colab-df-convert {\n",
+              "      background-color: #3B4455;\n",
+              "      fill: #D2E3FC;\n",
+              "    }\n",
+              "\n",
+              "    [theme=dark] .colab-df-convert:hover {\n",
+              "      background-color: #434B5C;\n",
+              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
+              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
+              "      fill: #FFFFFF;\n",
+              "    }\n",
+              "  </style>\n",
+              "\n",
+              "      <script>\n",
+              "        const buttonEl =\n",
+              "          document.querySelector('#df-73e59131-014c-49d1-a337-c844b74a21b5 button.colab-df-convert');\n",
+              "        buttonEl.style.display =\n",
+              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
+              "\n",
+              "        async function convertToInteractive(key) {\n",
+              "          const element = document.querySelector('#df-73e59131-014c-49d1-a337-c844b74a21b5');\n",
+              "          const dataTable =\n",
+              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
+              "                                                     [key], {});\n",
+              "          if (!dataTable) return;\n",
+              "\n",
+              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
+              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
+              "            + ' to learn more about interactive tables.';\n",
+              "          element.innerHTML = '';\n",
+              "          dataTable['output_type'] = 'display_data';\n",
+              "          await google.colab.output.renderOutput(dataTable, element);\n",
+              "          const docLink = document.createElement('div');\n",
+              "          docLink.innerHTML = docLinkHtml;\n",
+              "          element.appendChild(docLink);\n",
+              "        }\n",
+              "      </script>\n",
+              "    </div>\n",
+              "  </div>\n",
+              "  "
+            ]
+          },
+          "metadata": {},
+          "execution_count": 31
+        }
+      ],
+      "source": [
+        "# Assemble the submission frame: label columns come from the SubmissionFormat.csv header, rows are keyed by the test ids.\n",
+        "submission_cols = pd.read_csv(f'{dir_path}/DrivenDataCompetition_DataFiles/SubmissionFormat.csv')\n",
+        "cols_list = list(submission_cols.columns.values)[1:]  ## remove the first column which is the index\n",
+        "submission = pd.DataFrame(test_predictions, columns=cols_list)\n",
+        "submission.set_index(test_df['Unnamed: 0'], inplace=True)  # row labels taken from test_df's 'Unnamed: 0' column (presumably the test ids)\n",
+        "submission.index.name = \"\"  # blank the index label so the id column has an empty header, as in the rendered table\n",
+        "submission"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ep-ruNCfyMKH"
+      },
+      "outputs": [],
+      "source": [
+        "submission.to_csv('12072022_tfidf_logreg_Cval_2.csv')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "XwaSy1i13nax"
+      },
+      "outputs": [],
+      "source": []
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "machine_shape": "hm"
+    },
+    "gpuClass": "premium",
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
\ No newline at end of file