diff --git a/NLP_Approach.ipynb b/NLP_Approach.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..28268f3e8fa57e1e931cb08b6097b8f144ea5270 --- /dev/null +++ b/NLP_Approach.ipynb @@ -0,0 +1,2362 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "REugZMxlFCvU", + "outputId": "2b6a1767-4296-4d10-cc7a-fcefdb7d6113" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/gdrive/\n" + ] + } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount(\"/content/gdrive/\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ceMmp0oL8PyI" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import shutil\n", + "import sys \n", + "import os\n", + "import time\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qAYz11Vr-k5b" + }, + "outputs": [], + "source": [ + "dir_path = 'gdrive/Shareddrives/CS5024 Ethics Project'\n", + "sys.path.append(dir_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jmTuxI2t8PwQ", + "outputId": "d312c306-e195-45bc-95de-fed482fa0869" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:3326: DtypeWarning: Columns (5,11) have mixed types.Specify dtype option on import or set low_memory=False.\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n" + ] + } + ], + "source": [ + "# Create DF for train and test set\n", + "train_df = pd.read_csv(f'{dir_path}/DrivenDataCompetition_DataFiles/TrainingData.csv')\n", + "test_df = pd.read_csv(f'{dir_path}/DrivenDataCompetition_DataFiles/TestData.csv')" + ] + }, + { + "cell_type": "code", + "source": [ + "print (train_df.shape)\n", + "print (test_df.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0NPhp8WQ1YA0", + "outputId": "33582c11-9cd6-4a16-ef75-957d55f8c906" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(400277, 26)\n", + "(50064, 17)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Data Preprocessing" + ], + "metadata": { + "id": "Y558pu6Bdbbw" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 797 + }, + "id": "Q9Jzqrlwiu83", + "outputId": "cf12f590-0d36-4704-de43-6193bc7fcf82" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Function Use \\\n", + "Unnamed: 0 \n", + "134338 Teacher Compensation Instruction \n", + "206341 NO_LABEL NO_LABEL \n", + "326408 Teacher Compensation Instruction \n", + "364634 Substitute Compensation Instruction \n", + "47683 Substitute Compensation Instruction \n", + "... ... ... \n", + "109283 Professional Development ISPD \n", + "102430 Substitute Compensation Instruction \n", + "413949 Parent & Community Relations NO_LABEL \n", + "433672 Library & Media Instruction \n", + "415831 Substitute Compensation Instruction \n", + "\n", + " Sharing Reporting Student_Type \\\n", + "Unnamed: 0 \n", + "134338 School Reported School NO_LABEL \n", + "206341 NO_LABEL NO_LABEL NO_LABEL \n", + "326408 School Reported School Unspecified \n", + "364634 School Reported School Unspecified \n", + "47683 School Reported School Unspecified \n", + "... ... ... ... \n", + "109283 Shared Services Non-School Unspecified \n", + "102430 School Reported School Unspecified \n", + "413949 School Reported School NO_LABEL \n", + "433672 School on Central Budgets Non-School Unspecified \n", + "415831 School Reported School Poverty \n", + "\n", + " Position_Type Object_Type Pre_K \\\n", + "Unnamed: 0 \n", + "134338 Teacher NO_LABEL NO_LABEL \n", + "206341 NO_LABEL NO_LABEL NO_LABEL \n", + "326408 Teacher Base Salary/Compensation Non PreK \n", + "364634 Substitute Benefits NO_LABEL \n", + "47683 Teacher Substitute Compensation NO_LABEL \n", + "... ... ... ... \n", + "109283 Instructional Coach Other Compensation/Stipend NO_LABEL \n", + "102430 Substitute Base Salary/Compensation NO_LABEL \n", + "413949 Other NO_LABEL NO_LABEL \n", + "433672 Librarian Benefits NO_LABEL \n", + "415831 Substitute Substitute Compensation Non PreK \n", + "\n", + " Operating_Status \\\n", + "Unnamed: 0 \n", + "134338 PreK-12 Operating \n", + "206341 Non-Operating \n", + "326408 PreK-12 Operating \n", + "364634 PreK-12 Operating \n", + "47683 PreK-12 Operating \n", + "... ... \n", + "109283 PreK-12 Operating \n", + "102430 PreK-12 Operating \n", + "413949 PreK-12 Operating \n", + "433672 PreK-12 Operating \n", + "415831 PreK-12 Operating \n", + "\n", + " Object_Description ... \\\n", + "Unnamed: 0 ... \n", + "134338 NaN ... \n", + "206341 CONTRACTOR SERVICES ... \n", + "326408 Personal Services - Teachers ... \n", + "364634 EMPLOYEE BENEFITS ... \n", + "47683 TEACHER COVERAGE FOR TEACHER ... \n", + "... ... ... \n", + "109283 WORKSHOP PARTICIPANT ... \n", + "102430 SALARIES OF PART TIME EMPLOYEE ... \n", + "413949 NaN ... \n", + "433672 EMPLOYEE BENEFITS ... \n", + "415831 Salaries And Wages For Substitute Professionals ... \n", + "\n", + " Sub_Object_Description \\\n", + "Unnamed: 0 \n", + "134338 NaN \n", + "206341 NaN \n", + "326408 NaN \n", + "364634 NaN \n", + "47683 NaN \n", + "... ... \n", + "109283 NaN \n", + "102430 NaN \n", + "413949 NaN \n", + "433672 NaN \n", + "415831 Inservice Substitute Teachers Grant Funded \n", + "\n", + " Location_Description FTE \\\n", + "Unnamed: 0 \n", + "134338 NaN 1.00000 \n", + "206341 NaN NaN \n", + "326408 NaN 1.00000 \n", + "364634 NaN NaN \n", + "47683 NaN NaN \n", + "... ... ... \n", + "109283 STAFF DEV AND INSTR MEDIA NaN \n", + "102430 NaN 0.00431 \n", + "413949 NaN 1.00000 \n", + "433672 ED RESOURCE SERVICES NaN \n", + "415831 School NaN \n", + "\n", + " Function_Description Facility_or_Department \\\n", + "Unnamed: 0 \n", + "134338 NaN NaN \n", + "206341 RGN GOB NaN \n", + "326408 NaN NaN \n", + "364634 UNALLOC BUDGETS/SCHOOLS NaN \n", + "47683 NON-PROJECT NaN \n", + "... ... ... \n", + "109283 INST STAFF TRAINING SVCS NaN \n", + "102430 TITLE II,D NaN \n", + "413949 NaN NaN \n", + "433672 NON-PROJECT NaN \n", + "415831 Instruction Instruction And Curriculum \n", + "\n", + " Position_Extra Total \\\n", + "Unnamed: 0 \n", + "134338 KINDERGARTEN 50471.810000 \n", + "206341 UNDESIGNATED 3477.860000 \n", + "326408 TEACHER 62237.130000 \n", + "364634 PROFESSIONAL-INSTRUCTIONAL 22.300000 \n", + "47683 PROFESSIONAL-INSTRUCTIONAL 54.166000 \n", + "... ... ... \n", + "109283 NaN 48.620000 \n", + "102430 PROFESSIONAL-INSTRUCTIONAL 128.824985 \n", + "413949 PARENT/TITLE I 4902.290000 \n", + "433672 OFFICE/ADMINISTRATIVE SUPPORT 4020.290000 \n", + "415831 CERTIFIED SUBSTITUTE 46.530000 \n", + "\n", + " Program_Description \\\n", + "Unnamed: 0 \n", + "134338 KINDERGARTEN \n", + "206341 BUILDING IMPROVEMENT SERVICES \n", + "326408 Instruction - Regular \n", + "364634 GENERAL MIDDLE/JUNIOR HIGH SCH \n", + "47683 GENERAL HIGH SCHOOL EDUCATION \n", + "... ... \n", + "109283 NaN \n", + "102430 INSTRUCTIONAL STAFF TRAINING \n", + "413949 Misc \n", + "433672 MEDIA SUPPORT SERVICES \n", + "415831 Accelerated Education \n", + "\n", + " Fund_Description \\\n", + "Unnamed: 0 \n", + "134338 General Fund \n", + "206341 NaN \n", + "326408 General Purpose School \n", + "364634 NaN \n", + "47683 NaN \n", + "... ... \n", + "109283 GENERAL FUND \n", + "102430 NaN \n", + "413949 Schoolwide Schools \n", + "433672 NaN \n", + "415831 \"Title Part A Improving Basic Programs\" \n", + "\n", + " Text_1 \n", + "Unnamed: 0 \n", + "134338 NaN \n", + "206341 BUILDING IMPROVEMENT SERVICES \n", + "326408 NaN \n", + "364634 REGULAR INSTRUCTION \n", + "47683 REGULAR INSTRUCTION \n", + "... ... \n", + "109283 STAFF DEV AND INSTR MEDIA \n", + "102430 INSTRUCTIONAL STAFF \n", + "413949 NaN \n", + "433672 INSTRUCTIONAL STAFF \n", + "415831 MISCELLANEOUS \n", + "\n", + "[400277 rows x 25 columns]" + ], + "text/html": [ + "\n", + " <div id=\"df-f666659d-cc53-4867-b64a-0d1b481a7c61\">\n", + " <div class=\"colab-df-container\">\n", + " <div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Function</th>\n", + " <th>Use</th>\n", + " <th>Sharing</th>\n", + " <th>Reporting</th>\n", + " <th>Student_Type</th>\n", + " <th>Position_Type</th>\n", + " <th>Object_Type</th>\n", + " <th>Pre_K</th>\n", + " <th>Operating_Status</th>\n", + " <th>Object_Description</th>\n", + " <th>...</th>\n", + " <th>Sub_Object_Description</th>\n", + " <th>Location_Description</th>\n", + " <th>FTE</th>\n", + " <th>Function_Description</th>\n", + " <th>Facility_or_Department</th>\n", + " <th>Position_Extra</th>\n", + " <th>Total</th>\n", + " <th>Program_Description</th>\n", + " <th>Fund_Description</th>\n", + " <th>Text_1</th>\n", + " </tr>\n", + " <tr>\n", + " <th>Unnamed: 0</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>134338</th>\n", + " <td>Teacher Compensation</td>\n", + " <td>Instruction</td>\n", + " <td>School Reported</td>\n", + " <td>School</td>\n", + " <td>NO_LABEL</td>\n", + " <td>Teacher</td>\n", + " <td>NO_LABEL</td>\n", + " <td>NO_LABEL</td>\n", + " <td>PreK-12 Operating</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>1.00000</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>KINDERGARTEN</td>\n", + " <td>50471.810000</td>\n", + " <td>KINDERGARTEN</td>\n", + " <td>General Fund</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>206341</th>\n", + " <td>NO_LABEL</td>\n", + " <td>NO_LABEL</td>\n", + " <td>NO_LABEL</td>\n", + " <td>NO_LABEL</td>\n", + " <td>NO_LABEL</td>\n", + " <td>NO_LABEL</td>\n", + " <td>NO_LABEL</td>\n", + " <td>NO_LABEL</td>\n", + " <td>Non-Operating</td>\n", + " <td>CONTRACTOR SERVICES</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>RGN GOB</td>\n", + " <td>NaN</td>\n", + " <td>UNDESIGNATED</td>\n", + " <td>3477.860000</td>\n", + " <td>BUILDING IMPROVEMENT SERVICES</td>\n", + " <td>NaN</td>\n", + " <td>BUILDING IMPROVEMENT SERVICES</td>\n", + " </tr>\n", + " <tr>\n", + " <th>326408</th>\n", + " <td>Teacher Compensation</td>\n", + " <td>Instruction</td>\n", + " <td>School Reported</td>\n", + " <td>School</td>\n", + " <td>Unspecified</td>\n", + " <td>Teacher</td>\n", + " <td>Base Salary/Compensation</td>\n", + " <td>Non PreK</td>\n", + " <td>PreK-12 Operating</td>\n", + " <td>Personal Services - Teachers</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>1.00000</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>TEACHER</td>\n", + " <td>62237.130000</td>\n", + " <td>Instruction - Regular</td>\n", + " <td>General Purpose School</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>364634</th>\n", + " <td>Substitute Compensation</td>\n", + " <td>Instruction</td>\n", + " <td>School Reported</td>\n", + " <td>School</td>\n", + " <td>Unspecified</td>\n", + " <td>Substitute</td>\n", + " <td>Benefits</td>\n", + " <td>NO_LABEL</td>\n", + " <td>PreK-12 Operating</td>\n", + " <td>EMPLOYEE BENEFITS</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>UNALLOC BUDGETS/SCHOOLS</td>\n", + " <td>NaN</td>\n", + " <td>PROFESSIONAL-INSTRUCTIONAL</td>\n", + " <td>22.300000</td>\n", + " <td>GENERAL MIDDLE/JUNIOR HIGH SCH</td>\n", + " <td>NaN</td>\n", + " <td>REGULAR INSTRUCTION</td>\n", + " </tr>\n", + " <tr>\n", + " <th>47683</th>\n", + " <td>Substitute Compensation</td>\n", + " <td>Instruction</td>\n", + " <td>School Reported</td>\n", + " <td>School</td>\n", + " <td>Unspecified</td>\n", + " <td>Teacher</td>\n", + " <td>Substitute Compensation</td>\n", + " <td>NO_LABEL</td>\n", + " <td>PreK-12 Operating</td>\n", + " <td>TEACHER COVERAGE FOR TEACHER</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NON-PROJECT</td>\n", + " <td>NaN</td>\n", + " <td>PROFESSIONAL-INSTRUCTIONAL</td>\n", + " <td>54.166000</td>\n", + " <td>GENERAL HIGH SCHOOL EDUCATION</td>\n", + " <td>NaN</td>\n", + " <td>REGULAR INSTRUCTION</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>109283</th>\n", + " <td>Professional Development</td>\n", + " <td>ISPD</td>\n", + " <td>Shared Services</td>\n", + " <td>Non-School</td>\n", + " <td>Unspecified</td>\n", + " <td>Instructional Coach</td>\n", + " <td>Other Compensation/Stipend</td>\n", + " <td>NO_LABEL</td>\n", + " <td>PreK-12 Operating</td>\n", + " <td>WORKSHOP PARTICIPANT</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>STAFF DEV AND INSTR MEDIA</td>\n", + " <td>NaN</td>\n", + " <td>INST STAFF TRAINING SVCS</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>48.620000</td>\n", + " <td>NaN</td>\n", + " <td>GENERAL FUND</td>\n", + " <td>STAFF DEV AND INSTR MEDIA</td>\n", + " </tr>\n", + " <tr>\n", + " <th>102430</th>\n", + " <td>Substitute Compensation</td>\n", + " <td>Instruction</td>\n", + " <td>School Reported</td>\n", + " <td>School</td>\n", + " <td>Unspecified</td>\n", + " <td>Substitute</td>\n", + " <td>Base Salary/Compensation</td>\n", + " <td>NO_LABEL</td>\n", + " <td>PreK-12 Operating</td>\n", + " <td>SALARIES OF PART TIME EMPLOYEE</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>0.00431</td>\n", + " <td>TITLE II,D</td>\n", + " <td>NaN</td>\n", + " <td>PROFESSIONAL-INSTRUCTIONAL</td>\n", + " <td>128.824985</td>\n", + " <td>INSTRUCTIONAL STAFF TRAINING</td>\n", + " <td>NaN</td>\n", + " <td>INSTRUCTIONAL STAFF</td>\n", + " </tr>\n", + " <tr>\n", + " <th>413949</th>\n", + " <td>Parent & Community Relations</td>\n", + " <td>NO_LABEL</td>\n", + " <td>School Reported</td>\n", + " <td>School</td>\n", + " <td>NO_LABEL</td>\n", + " <td>Other</td>\n", + " <td>NO_LABEL</td>\n", + " <td>NO_LABEL</td>\n", + " <td>PreK-12 Operating</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>1.00000</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>PARENT/TITLE I</td>\n", + " <td>4902.290000</td>\n", + " <td>Misc</td>\n", + " <td>Schoolwide Schools</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>433672</th>\n", + " <td>Library & Media</td>\n", + " <td>Instruction</td>\n", + " <td>School on Central Budgets</td>\n", + " <td>Non-School</td>\n", + " <td>Unspecified</td>\n", + " <td>Librarian</td>\n", + " <td>Benefits</td>\n", + " <td>NO_LABEL</td>\n", + " <td>PreK-12 Operating</td>\n", + " <td>EMPLOYEE BENEFITS</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>ED RESOURCE SERVICES</td>\n", + " <td>NaN</td>\n", + " <td>NON-PROJECT</td>\n", + " <td>NaN</td>\n", + " <td>OFFICE/ADMINISTRATIVE SUPPORT</td>\n", + " <td>4020.290000</td>\n", + " <td>MEDIA SUPPORT SERVICES</td>\n", + " <td>NaN</td>\n", + " <td>INSTRUCTIONAL STAFF</td>\n", + " </tr>\n", + " <tr>\n", + " <th>415831</th>\n", + " <td>Substitute Compensation</td>\n", + " <td>Instruction</td>\n", + " <td>School Reported</td>\n", + " <td>School</td>\n", + " <td>Poverty</td>\n", + " <td>Substitute</td>\n", + " <td>Substitute Compensation</td>\n", + " <td>Non PreK</td>\n", + " <td>PreK-12 Operating</td>\n", + " <td>Salaries And Wages For Substitute Professionals</td>\n", + " <td>...</td>\n", + " <td>Inservice Substitute Teachers Grant Funded</td>\n", + " <td>School</td>\n", + " <td>NaN</td>\n", + " <td>Instruction</td>\n", + " <td>Instruction And Curriculum</td>\n", + " <td>CERTIFIED SUBSTITUTE</td>\n", + " <td>46.530000</td>\n", + " <td>Accelerated Education</td>\n", + " <td>\"Title Part A Improving Basic Programs\"</td>\n", + " <td>MISCELLANEOUS</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>400277 rows × 25 columns</p>\n", + "</div>\n", + " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f666659d-cc53-4867-b64a-0d1b481a7c61')\"\n", + " title=\"Convert this dataframe to an interactive table.\"\n", + " style=\"display:none;\">\n", + " \n", + " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", + " width=\"24px\">\n", + " <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", + " <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", + " </svg>\n", + " </button>\n", + " \n", + " <style>\n", + " .colab-df-container {\n", + " display:flex;\n", + " flex-wrap:wrap;\n", + " gap: 12px;\n", + " }\n", + "\n", + " .colab-df-convert {\n", + " background-color: #E8F0FE;\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: #1967D2;\n", + " height: 32px;\n", + " padding: 0 0 0 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-convert:hover {\n", + " background-color: #E2EBFA;\n", + " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: #174EA6;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert {\n", + " background-color: #3B4455;\n", + " fill: #D2E3FC;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert:hover {\n", + " background-color: #434B5C;\n", + " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", + " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", + " fill: #FFFFFF;\n", + " }\n", + " </style>\n", + "\n", + " <script>\n", + " const buttonEl =\n", + " document.querySelector('#df-f666659d-cc53-4867-b64a-0d1b481a7c61 button.colab-df-convert');\n", + " buttonEl.style.display =\n", + " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + "\n", + " async function convertToInteractive(key) {\n", + " const element = document.querySelector('#df-f666659d-cc53-4867-b64a-0d1b481a7c61');\n", + " const dataTable =\n", + " await google.colab.kernel.invokeFunction('convertToInteractive',\n", + " [key], {});\n", + " if (!dataTable) return;\n", + "\n", + " const docLinkHtml = 'Like what you see? Visit the ' +\n", + " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", + " + ' to learn more about interactive tables.';\n", + " element.innerHTML = '';\n", + " dataTable['output_type'] = 'display_data';\n", + " await google.colab.output.renderOutput(dataTable, element);\n", + " const docLink = document.createElement('div');\n", + " docLink.innerHTML = docLinkHtml;\n", + " element.appendChild(docLink);\n", + " }\n", + " </script>\n", + " </div>\n", + " </div>\n", + " " + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "test_df.set_index('Unnamed: 0')\n", + "train_df.set_index('Unnamed: 0')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "Thh9gm2q8Pt_", + "outputId": "4c390cc1-191f-4c51-aa7d-d7a4e549cd93" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0 \\\n", + "Unnamed: 0 \n", + "134338 Teacher-Elementary KINDERGA... \n", + "206341 CONTRACTOR SERVICES BOND EXPENDITURES BUILDING... \n", + "326408 Personal Services - Teachers TCHER 2ND GRA... \n", + "364634 EMPLOYEE BENEFITS TEACHER SUBS GENERAL FUND Te... \n", + "47683 TEACHER COVERAGE FOR TEACHER TEACHER SUBS GENE... \n", + "... ... \n", + "109283 WORKSHOP PARTICIPANT CURRICULUM ... \n", + "102430 SALARIES OF PART TIME EMPLOYEE FEDERAL GDPG ... \n", + "413949 School Liaison PARENT/TITLE ... \n", + "433672 EMPLOYEE BENEFITS EDUCATIONAL RESOURCE SERVICE... \n", + "415831 Salaries And Wages For Substitute Professional... \n", + "\n", + " Function_Aides Compensation \\\n", + "Unnamed: 0 \n", + "134338 0 \n", + "206341 0 \n", + "326408 0 \n", + "364634 0 \n", + "47683 0 \n", + "... ... \n", + "109283 0 \n", + "102430 0 \n", + "413949 0 \n", + "433672 0 \n", + "415831 0 \n", + "\n", + " Function_Career & Academic Counseling Function_Communications \\\n", + "Unnamed: 0 \n", + "134338 0 0 \n", + "206341 0 0 \n", + "326408 0 0 \n", + "364634 0 0 \n", + "47683 0 0 \n", + "... ... ... \n", + "109283 0 0 \n", + "102430 0 0 \n", + "413949 0 0 \n", + "433672 0 0 \n", + "415831 0 0 \n", + "\n", + " Function_Curriculum Development \\\n", + "Unnamed: 0 \n", + "134338 0 \n", + "206341 0 \n", + "326408 0 \n", + "364634 0 \n", + "47683 0 \n", + "... ... \n", + "109283 0 \n", + "102430 0 \n", + "413949 0 \n", + "433672 0 \n", + "415831 0 \n", + "\n", + " Function_Data Processing & Information Services \\\n", + "Unnamed: 0 \n", + "134338 0 \n", + "206341 0 \n", + "326408 0 \n", + "364634 0 \n", + "47683 0 \n", + "... ... \n", + "109283 0 \n", + "102430 0 \n", + "413949 0 \n", + "433672 0 \n", + "415831 0 \n", + "\n", + " Function_Development & Fundraising Function_Enrichment \\\n", + "Unnamed: 0 \n", + "134338 0 0 \n", + "206341 0 0 \n", + "326408 0 0 \n", + "364634 0 0 \n", + "47683 0 0 \n", + "... ... ... \n", + "109283 0 0 \n", + "102430 0 0 \n", + "413949 0 0 \n", + "433672 0 0 \n", + "415831 0 0 \n", + "\n", + " Function_Extended Time & Tutoring \\\n", + "Unnamed: 0 \n", + "134338 0 \n", + "206341 0 \n", + "326408 0 \n", + "364634 0 \n", + "47683 0 \n", + "... ... \n", + "109283 0 \n", + "102430 0 \n", + "413949 0 \n", + "433672 0 \n", + "415831 0 \n", + "\n", + " Function_Facilities & Maintenance ... \\\n", + "Unnamed: 0 ... \n", + "134338 0 ... \n", + "206341 0 ... \n", + "326408 0 ... \n", + "364634 0 ... \n", + "47683 0 ... \n", + "... ... ... \n", + "109283 0 ... \n", + "102430 0 ... \n", + "413949 0 ... \n", + "433672 0 ... \n", + "415831 0 ... \n", + "\n", + " Student_Type_Special Education Student_Type_Unspecified \\\n", + "Unnamed: 0 \n", + "134338 0 0 \n", + "206341 0 0 \n", + "326408 0 1 \n", + "364634 0 1 \n", + "47683 0 1 \n", + "... ... ... \n", + "109283 0 1 \n", + "102430 0 1 \n", + "413949 0 0 \n", + "433672 0 1 \n", + "415831 0 0 \n", + "\n", + " Use_Business Services Use_ISPD Use_Instruction Use_Leadership \\\n", + "Unnamed: 0 \n", + "134338 0 0 1 0 \n", + "206341 0 0 0 0 \n", + "326408 0 0 1 0 \n", + "364634 0 0 1 0 \n", + "47683 0 0 1 0 \n", + "... ... ... ... ... \n", + "109283 0 1 0 0 \n", + "102430 0 0 1 0 \n", + "413949 0 0 0 0 \n", + "433672 0 0 1 0 \n", + "415831 0 0 1 0 \n", + "\n", + " Use_NO_LABEL Use_O&M Use_Pupil Services & Enrichment \\\n", + "Unnamed: 0 \n", + "134338 0 0 0 \n", + "206341 1 0 0 \n", + "326408 0 0 0 \n", + "364634 0 0 0 \n", + "47683 0 0 0 \n", + "... ... ... ... \n", + "109283 0 0 0 \n", + "102430 0 0 0 \n", + "413949 1 0 0 \n", + "433672 0 0 0 \n", + "415831 0 0 0 \n", + "\n", + " Use_Untracked Budget Set-Aside \n", + "Unnamed: 0 \n", + "134338 0 \n", + "206341 0 \n", + "326408 0 \n", + "364634 0 \n", + "47683 0 \n", + "... ... \n", + "109283 0 \n", + "102430 0 \n", + "413949 0 \n", + "433672 0 \n", + "415831 0 \n", + "\n", + "[400277 rows x 105 columns]" + ], + "text/html": [ + "\n", + " <div id=\"df-efe9ca42-7ce1-4fd4-a880-234b5631bd03\">\n", + " <div class=\"colab-df-container\">\n", + " <div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>0</th>\n", + " <th>Function_Aides Compensation</th>\n", + " <th>Function_Career & Academic Counseling</th>\n", + " <th>Function_Communications</th>\n", + " <th>Function_Curriculum Development</th>\n", + " <th>Function_Data Processing & Information Services</th>\n", + " <th>Function_Development & Fundraising</th>\n", + " <th>Function_Enrichment</th>\n", + " <th>Function_Extended Time & Tutoring</th>\n", + " <th>Function_Facilities & Maintenance</th>\n", + " <th>...</th>\n", + " <th>Student_Type_Special Education</th>\n", + " <th>Student_Type_Unspecified</th>\n", + " <th>Use_Business Services</th>\n", + " <th>Use_ISPD</th>\n", + " <th>Use_Instruction</th>\n", + " <th>Use_Leadership</th>\n", + " <th>Use_NO_LABEL</th>\n", + " <th>Use_O&M</th>\n", + " <th>Use_Pupil Services & Enrichment</th>\n", + " <th>Use_Untracked Budget Set-Aside</th>\n", + " </tr>\n", + " <tr>\n", + " <th>Unnamed: 0</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>134338</th>\n", + " <td>Teacher-Elementary KINDERGA...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>206341</th>\n", + " <td>CONTRACTOR SERVICES BOND EXPENDITURES BUILDING...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>326408</th>\n", + " <td>Personal Services - Teachers TCHER 2ND GRA...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>364634</th>\n", + " <td>EMPLOYEE BENEFITS TEACHER SUBS GENERAL FUND Te...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>47683</th>\n", + " <td>TEACHER COVERAGE FOR TEACHER TEACHER SUBS GENE...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>109283</th>\n", + " <td>WORKSHOP PARTICIPANT CURRICULUM ...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>102430</th>\n", + " <td>SALARIES OF PART TIME EMPLOYEE FEDERAL GDPG ...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>413949</th>\n", + " <td>School Liaison PARENT/TITLE ...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>433672</th>\n", + " <td>EMPLOYEE BENEFITS EDUCATIONAL RESOURCE SERVICE...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>415831</th>\n", + " <td>Salaries And Wages For Substitute Professional...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>400277 rows × 105 columns</p>\n", + "</div>\n", + " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-efe9ca42-7ce1-4fd4-a880-234b5631bd03')\"\n", + " title=\"Convert this dataframe to an interactive table.\"\n", + " style=\"display:none;\">\n", + " \n", + " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", + " width=\"24px\">\n", + " <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", + " <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", + " </svg>\n", + " </button>\n", + " \n", + " <style>\n", + " .colab-df-container {\n", + " display:flex;\n", + " flex-wrap:wrap;\n", + " gap: 12px;\n", + " }\n", + "\n", + " .colab-df-convert {\n", + " background-color: #E8F0FE;\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: #1967D2;\n", + " height: 32px;\n", + " padding: 0 0 0 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-convert:hover {\n", + " background-color: #E2EBFA;\n", + " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: #174EA6;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert {\n", + " background-color: #3B4455;\n", + " fill: #D2E3FC;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert:hover {\n", + " background-color: #434B5C;\n", + " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", + " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", + " fill: #FFFFFF;\n", + " }\n", + " </style>\n", + "\n", + " <script>\n", + " const buttonEl =\n", + " document.querySelector('#df-efe9ca42-7ce1-4fd4-a880-234b5631bd03 button.colab-df-convert');\n", + " buttonEl.style.display =\n", + " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + "\n", + " async function convertToInteractive(key) {\n", + " const element = document.querySelector('#df-efe9ca42-7ce1-4fd4-a880-234b5631bd03');\n", + " const dataTable =\n", + " await google.colab.kernel.invokeFunction('convertToInteractive',\n", + " [key], {});\n", + " if (!dataTable) return;\n", + "\n", + " const docLinkHtml = 'Like what you see? Visit the ' +\n", + " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", + " + ' to learn more about interactive tables.';\n", + " element.innerHTML = '';\n", + " dataTable['output_type'] = 'display_data';\n", + " await google.colab.output.renderOutput(dataTable, element);\n", + " const docLink = document.createElement('div');\n", + " docLink.innerHTML = docLinkHtml;\n", + " element.appendChild(docLink);\n", + " }\n", + " </script>\n", + " </div>\n", + " </div>\n", + " " + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "# set target Cols\n", + "target_cols = ['Function', 'Object_Type', 'Operating_Status', 'Position_Type', 'Pre_K', 'Reporting', 'Sharing', 'Student_Type', 'Use']\n", + "train_df_dummies = pd.DataFrame()\n", + "# drop train int cols\n", + "train_df.drop(['FTE', 'Total'], axis = 1, inplace=True)\n", + "# get dummies for target cols\n", + "col = pd.get_dummies(train_df[target_cols])\n", + "# drop target cols\n", + "train_df = train_df.drop(target_cols, axis=1)\n", + "# fill NaN with space\n", + "train_df.fillna(' ', inplace=True)\n", + "# combine all text into single col\n", + "combined = pd.DataFrame([' '.join(row) for row in train_df[train_df.columns[1:]].values])\n", + "# join combined text col with dummy labels\n", + "train_df_dummies = pd.concat([combined, col], axis = 1)\n", + "# drop test int cols\n", + "test_df.drop(['FTE', 'Total'], axis = 1, inplace=True)\n", + "# fill NaN with space\n", + "test_df.fillna(' ', inplace=True)\n", + "# combine all text into single col\n", + "test_df_cleaned = pd.DataFrame([' '.join(row) for row in test_df[test_df.columns[1:]].values])\n", + "# reset indices to original\n", + "test_df_cleaned.set_index(test_df['Unnamed: 0'])\n", + "train_df_dummies.set_index(train_df['Unnamed: 0'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sIdbRlvsE2Fn" + }, + "outputs": [], + "source": [ + "cols = col.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5ZhHV0028PjT" + }, + "outputs": [], + "source": [ + "# rename text col\n", + "test_df_cleaned.rename(columns={0: \"text\"}, inplace=True)\n", + "train_df_dummies.rename(columns={0: \"text\"}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DSMXH2Qmc0Y_", + "outputId": "fe62771e-2bf1-44f4-88e1-d56dae02b300" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ] + } + ], + "source": [ + "import nltk\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')\n", + "from nltk import word_tokenize\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem.snowball import SnowballStemmer\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Xly7pR2-CZhx" + }, + "outputs": [], + "source": [ + "def text_processing(df):\n", + " stop_words = set(stopwords.words('english'))\n", + " st = SnowballStemmer('english')\n", + " # lower and trim spaces\n", + " df['text'] = df['text'].apply(lambda x: x.lower().strip())\n", + " # remove other spaces\n", + " df['text'] = df['text'].apply(lambda x: re.sub(' +', ' ', x))\n", + " # remove punctuation\n", + " df['text'] = df['text'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))\n", + " # remove stopwords stem\n", + " df['text'] = df['text'].apply(lambda x: ' '.join(st.stem(text) for text in x.split() if text not in stop_words))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rR1Z3FPoh7c3" + }, + "outputs": [], + "source": [ + "# clean df\n", + "text_processing(train_df_dummies)\n", + "text_processing(test_df_cleaned)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Embedding" + ], + "metadata": { + "id": "4DkHoAlAdS8j" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iQAX3AoUnrtv" + }, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dpLEeR7dC9Bd" + }, + "outputs": [], + "source": [ + "X = train_df_dummies['text'].values\n", + "y = train_df_dummies[list(train_df_dummies.columns[1:])].values\n", + "X_train, X_val, y_train, y_val = train_test_split(X,y,test_size = 0.2,train_size =0.8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "J4KQiBI0oZUp" + }, + "outputs": [], + "source": [ + "def tfidf_features(X_train, X_val, X_test):\n", + " \"\"\"\n", + " X_train, X_test — samples \n", + " return TF-IDF vectorized representation of each sample and vocabulary\n", + " \"\"\"\n", + " # Create TF-IDF vectorizer with a proper parameters choice\n", + " tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=1.0, min_df=1, token_pattern='(\\S+)')\n", + " # Fit the vectorizer on the train set\n", + " X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n", + " # Transform the train, test set and return the result\n", + " X_val_tfidf = tfidf_vectorizer.transform(X_val)\n", + " X_test_tfidf = tfidf_vectorizer.transform(X_test)\n", + " \n", + " return X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vectorizer.vocabulary_\n", + "timer = time.time()\n", + "X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, test_df_cleaned.text)\n", + "tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Logistic Regression" + ], + "metadata": { + "id": "ydQ-7nJdT7Ad" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nwyPgHYMplz3", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "045b6532-7308-4018-b3c3-af14efe9aec5" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Time for NLP training\" 1387.7084062099457\n", + "Time for NLP inference\" 0.6320400238037109\n" + ] + } + ], + "source": [ + "from sklearn.multiclass import OneVsRestClassifier\n", + "from sklearn.linear_model import LogisticRegression, RidgeClassifier\n", + "\n", + "def train_logreg(X_train, y_train, C, regularisation):\n", + " \"\"\"\n", + " X_train, y_train — training data\n", + " \n", + " return: trained classifier\n", + " \"\"\"\n", + " \n", + " # Create and fit LogisticRegression wraped into OneVsRestClassifier.\n", + "\n", + " model = OneVsRestClassifier(LogisticRegression(penalty=regularisation, C=C, max_iter=10000)).fit(X_train, y_train)\n", + " return model\n", + "\n", + "logreg_tfidf = train_logreg(X_train_tfidf, y_train, C = 2, regularisation = 'l2')\n", + "print (f'Time for NLP training\" {time.time()-timer}')\n", + "timer = time.time()\n", + "y_val_predicted_labels_logreg = logreg_tfidf.predict(X_val_tfidf)\n", + "print (f'Time for NLP inference\" {time.time()-timer}')\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# XG Boost" + ], + "metadata": { + "id": "SOFP4vZFUJiu" + } + }, + { + "cell_type": "code", + "source": [ + "# from sklearn.multiclass import OneVsRestClassifier\n", + "# from xgboost import XGBClassifier\n", + "\n", + "# def train_xgb(X_train, y_train, params):\n", + "# \"\"\"\n", + "# X_train, y_train — training data\n", + " \n", + "# return: trained classifier\n", + "# \"\"\"\n", + " \n", + "# # Create and fit XGBoost wraped into OneVsRestClassifier.\n", + "\n", + "# model = OneVsRestClassifier(XGBClassifier(**params)).fit(X_train, y_train)\n", + "# return model\n", + "# xgb_params = {'eta': 0.3, \n", + "# 'max_depth': 5, \n", + "# 'subsample': 0.8, \n", + "# 'colsample_bytree': 0.8, \n", + "# 'tree_method' : 'gpu_hist',\n", + "# 'objective': 'binary:logistic', \n", + "# 'eval_metric': 'auc', \n", + "# 'seed': 42\n", + "# }\n", + "# xgb_tfidf = train_xgb(X_train_tfidf, y_train, xgb_params)\n", + "# y_val_predicted_labels_xgb = xgb_tfidf.predict(X_val_tfidf)" + ], + "metadata": { + "id": "r42fPuwwURXN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# SGD" + ], + "metadata": { + "id": "WeL9xN2WLpyO" + } + }, + { + "cell_type": "code", + "source": [ + "# from sklearn.multiclass import OneVsRestClassifier\n", + "# from sklearn.linear_model import SGDClassifier\n", + "# def train_sgd(X_train, y_train):\n", + "# \"\"\"\n", + "# X_train, y_train — training data\n", + " \n", + "# return: trained classifier\n", + "# \"\"\"\n", + " \n", + "# # Create and fit LogisticRegression wraped into OneVsRestClassifier.\n", + "\n", + "# model = OneVsRestClassifier(SGDClassifier(loss = 'log', penalty = 'l2')).fit(X_train, y_train)\n", + "# return model\n", + "\n", + "# sgd_tfidf = train_sgd(X_train_tfidf, y_train)\n", + "# print (f'Time for NLP training\" {time.time()-timer}')\n", + "# timer = time.time()\n", + "# y_val_predicted_labels_sgd = sgd_tfidf.predict(X_val_tfidf)\n", + "# print (f'Time for NLP inference\" {time.time()-timer}')\n" + ], + "metadata": { + "id": "CxxB9ABSLrs1" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Model Evaluation" + ], + "metadata": { + "id": "aToqJ94RdO9R" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import f1_score\n", + "from sklearn.metrics import roc_auc_score \n", + "from sklearn.metrics import precision_score\n", + "from sklearn.metrics import average_precision_score\n", + "from sklearn.metrics import recall_score\n", + "from sklearn.metrics import log_loss" + ], + "metadata": { + "id": "a2ULt-H4dAT5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def print_evaluation_scores(y_test, predicted):\n", + " \n", + " print('Accuracy: ', accuracy_score(y_test, predicted, normalize=True))\n", + " print('F1-score macro: ', f1_score(y_test, predicted, average='macro', labels=[1]))\n", + " print('F1-score micro: ', f1_score(y_test, predicted, average='micro', labels=[1]))\n", + " print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))\n", + " print('Precision macro: ', average_precision_score(y_test, predicted, average='macro'))\n", + " print('Precision micro: ', average_precision_score(y_test, predicted, average='micro'))\n", + " print('Precision weighted: ', precision_score(y_test, predicted, average='weighted', labels=[1]))\n", + " print('Log Loss: ', log_loss(y_test, predicted, normalize=True))\n", + "\n", + " \n", + "print('Metrics')\n", + "print_evaluation_scores(y_val, y_val_predicted_labels_logreg)\n", + "# print_evaluation_scores(y_val, y_val_predicted_labels_xgb)\n", + "# print_evaluation_scores(y_val, y_val_predicted_labels_sgd)" + ], + "metadata": { + "id": "g1xaUV6Ic1kS", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "e1fb88d6-f5e0-4b75-c656-9b0ee6ea87da" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Metrics\n", + "Accuracy: 0.898533526531428\n", + "F1-score macro: 0.857397504456328\n", + "F1-score micro: 0.857397504456328\n", + "F1-score weighted: 0.9780481032620225\n", + "Precision macro: 0.8548790839083935\n", + "Precision micro: 0.9595717445537028\n", + "Precision weighted: 0.8809523809523809\n", + "Log Loss: 27.62320822221433\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Submission" + ], + "metadata": { + "id": "4e9yV9CadHwO" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "75VcWGLmvvCi" + }, + "outputs": [], + "source": [ + "test_predictions = logreg_tfidf.predict_proba(X_test_tfidf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GhNpC96cwRmD", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 554 + }, + "outputId": "aa8f5639-897c-4437-aca7-daafddd03797" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Function__Aides Compensation Function__Career & Academic Counseling \\\n", + " \n", + "180042 0.005369 0.001911 \n", + "28872 0.000574 0.001182 \n", + "186915 0.017363 0.000992 \n", + "412396 0.011996 0.000990 \n", + "427740 0.001207 0.002310 \n", + "... ... ... \n", + "169063 0.004662 0.000559 \n", + "433255 0.004662 0.000559 \n", + "232204 0.004662 0.000559 \n", + "171685 0.004662 0.000559 \n", + "249087 0.004662 0.000559 \n", + "\n", + " Function__Communications Function__Curriculum Development \\\n", + " \n", + "180042 0.000062 0.000037 \n", + "28872 0.000243 0.005150 \n", + "186915 0.000156 0.001177 \n", + "412396 0.000155 0.001122 \n", + "427740 0.000585 0.002508 \n", + "... ... ... \n", + "169063 0.000271 0.001190 \n", + "433255 0.000271 0.001190 \n", + "232204 0.000271 0.001190 \n", + "171685 0.000271 0.001190 \n", + "249087 0.000271 0.001190 \n", + "\n", + " Function__Data Processing & Information Services \\\n", + " \n", + "180042 0.000424 \n", + "28872 0.001041 \n", + "186915 0.000616 \n", + "412396 0.000625 \n", + "427740 0.002355 \n", + "... ... \n", + "169063 0.000868 \n", + "433255 0.000868 \n", + "232204 0.000868 \n", + "171685 0.000868 \n", + "249087 0.000868 \n", + "\n", + " Function__Development & Fundraising Function__Enrichment \\\n", + " \n", + "180042 0.000088 0.000996 \n", + "28872 0.000150 0.227637 \n", + "186915 0.000108 0.000500 \n", + "412396 0.000108 0.000475 \n", + "427740 0.000135 0.002673 \n", + "... ... ... \n", + "169063 0.000236 0.002325 \n", + "433255 0.000236 0.002325 \n", + "232204 0.000236 0.002325 \n", + "171685 0.000236 0.002325 \n", + "249087 0.000236 0.002325 \n", + "\n", + " Function__Extended Time & Tutoring \\\n", + " \n", + "180042 0.000203 \n", + "28872 0.003900 \n", + "186915 0.000463 \n", + "412396 0.000489 \n", + "427740 0.000067 \n", + "... ... \n", + "169063 0.000239 \n", + "433255 0.000239 \n", + "232204 0.000239 \n", + "171685 0.000239 \n", + "249087 0.000239 \n", + "\n", + " Function__Facilities & Maintenance Function__Facilities Planning \\\n", + " \n", + "180042 0.001402 0.000029 \n", + "28872 0.006152 0.000040 \n", + "186915 0.001457 0.000034 \n", + "412396 0.001456 0.000034 \n", + "427740 0.003695 0.000041 \n", + "... ... ... \n", + "169063 0.001862 0.000044 \n", + "433255 0.001862 0.000044 \n", + "232204 0.001862 0.000044 \n", + "171685 0.001862 0.000044 \n", + "249087 0.001862 0.000044 \n", + "\n", + " ... Student_Type__Special Education Student_Type__Unspecified \\\n", + " ... \n", + "180042 ... 0.002347 0.836926 \n", + "28872 ... 0.005913 0.894910 \n", + "186915 ... 0.004487 0.282162 \n", + "412396 ... 0.004447 0.250461 \n", + "427740 ... 0.004405 0.980520 \n", + "... ... ... ... \n", + "169063 ... 0.004677 0.037555 \n", + "433255 ... 0.004677 0.037555 \n", + "232204 ... 0.004677 0.037555 \n", + "171685 ... 0.004677 0.037555 \n", + "249087 ... 0.004677 0.037555 \n", + "\n", + " Use__Business Services Use__ISPD Use__Instruction Use__Leadership \\\n", + " \n", + "180042 0.000200 0.003739 0.064349 0.003398 \n", + "28872 0.003721 0.009994 0.011118 0.023945 \n", + "186915 0.000895 0.012540 0.695857 0.008499 \n", + "412396 0.000910 0.010479 0.693750 0.007253 \n", + "427740 0.008272 0.040371 0.001226 0.805063 \n", + "... ... ... ... ... \n", + "169063 0.003252 0.003751 0.017954 0.002785 \n", + "433255 0.003252 0.003751 0.017954 0.002785 \n", + "232204 0.003252 0.003751 0.017954 0.002785 \n", + "171685 0.003252 0.003751 0.017954 0.002785 \n", + "249087 0.003252 0.003751 0.017954 0.002785 \n", + "\n", + " Use__NO_LABEL Use__O&M Use__Pupil Services & Enrichment \\\n", + " \n", + "180042 0.264019 0.006676 0.005294 \n", + "28872 0.047036 0.023556 0.146622 \n", + "186915 0.016858 0.003341 0.009452 \n", + "412396 0.017822 0.003547 0.008566 \n", + "427740 0.005581 0.037933 0.004177 \n", + "... ... ... ... \n", + "169063 0.883382 0.002572 0.044943 \n", + "433255 0.883382 0.002572 0.044943 \n", + "232204 0.883382 0.002572 0.044943 \n", + "171685 0.883382 0.002572 0.044943 \n", + "249087 0.883382 0.002572 0.044943 \n", + "\n", + " Use__Untracked Budget Set-Aside \n", + " \n", + "180042 0.000372 \n", + "28872 0.000244 \n", + "186915 0.000168 \n", + "412396 0.000168 \n", + "427740 0.000233 \n", + "... ... \n", + "169063 0.000341 \n", + "433255 0.000341 \n", + "232204 0.000341 \n", + "171685 0.000341 \n", + "249087 0.000341 \n", + "\n", + "[50064 rows x 104 columns]" + ], + "text/html": [ + "\n", + " <div id=\"df-73e59131-014c-49d1-a337-c844b74a21b5\">\n", + " <div class=\"colab-df-container\">\n", + " <div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Function__Aides Compensation</th>\n", + " <th>Function__Career & Academic Counseling</th>\n", + " <th>Function__Communications</th>\n", + " <th>Function__Curriculum Development</th>\n", + " <th>Function__Data Processing & Information Services</th>\n", + " <th>Function__Development & Fundraising</th>\n", + " <th>Function__Enrichment</th>\n", + " <th>Function__Extended Time & Tutoring</th>\n", + " <th>Function__Facilities & Maintenance</th>\n", + " <th>Function__Facilities Planning</th>\n", + " <th>...</th>\n", + " <th>Student_Type__Special Education</th>\n", + " <th>Student_Type__Unspecified</th>\n", + " <th>Use__Business Services</th>\n", + " <th>Use__ISPD</th>\n", + " <th>Use__Instruction</th>\n", + " <th>Use__Leadership</th>\n", + " <th>Use__NO_LABEL</th>\n", + " <th>Use__O&M</th>\n", + " <th>Use__Pupil Services & Enrichment</th>\n", + " <th>Use__Untracked Budget Set-Aside</th>\n", + " </tr>\n", + " <tr>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>180042</th>\n", + " <td>0.005369</td>\n", + " <td>0.001911</td>\n", + " <td>0.000062</td>\n", + " <td>0.000037</td>\n", + " <td>0.000424</td>\n", + " <td>0.000088</td>\n", + " <td>0.000996</td>\n", + " <td>0.000203</td>\n", + " <td>0.001402</td>\n", + " <td>0.000029</td>\n", + " <td>...</td>\n", + " <td>0.002347</td>\n", + " <td>0.836926</td>\n", + " <td>0.000200</td>\n", + " <td>0.003739</td>\n", + " <td>0.064349</td>\n", + " <td>0.003398</td>\n", + " <td>0.264019</td>\n", + " <td>0.006676</td>\n", + " <td>0.005294</td>\n", + " <td>0.000372</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28872</th>\n", + " <td>0.000574</td>\n", + " <td>0.001182</td>\n", + " <td>0.000243</td>\n", + " <td>0.005150</td>\n", + " <td>0.001041</td>\n", + " <td>0.000150</td>\n", + " <td>0.227637</td>\n", + " <td>0.003900</td>\n", + " <td>0.006152</td>\n", + " <td>0.000040</td>\n", + " <td>...</td>\n", + " <td>0.005913</td>\n", + " <td>0.894910</td>\n", + " <td>0.003721</td>\n", + " <td>0.009994</td>\n", + " <td>0.011118</td>\n", + " <td>0.023945</td>\n", + " <td>0.047036</td>\n", + " <td>0.023556</td>\n", + " <td>0.146622</td>\n", + " <td>0.000244</td>\n", + " </tr>\n", + " <tr>\n", + " <th>186915</th>\n", + " <td>0.017363</td>\n", + " <td>0.000992</td>\n", + " <td>0.000156</td>\n", + " <td>0.001177</td>\n", + " <td>0.000616</td>\n", + " <td>0.000108</td>\n", + " <td>0.000500</td>\n", + " <td>0.000463</td>\n", + " <td>0.001457</td>\n", + " <td>0.000034</td>\n", + " <td>...</td>\n", + " <td>0.004487</td>\n", + " <td>0.282162</td>\n", + " <td>0.000895</td>\n", + " <td>0.012540</td>\n", + " <td>0.695857</td>\n", + " <td>0.008499</td>\n", + " <td>0.016858</td>\n", + " <td>0.003341</td>\n", + " <td>0.009452</td>\n", + " <td>0.000168</td>\n", + " </tr>\n", + " <tr>\n", + " <th>412396</th>\n", + " <td>0.011996</td>\n", + " <td>0.000990</td>\n", + " <td>0.000155</td>\n", + " <td>0.001122</td>\n", + " <td>0.000625</td>\n", + " <td>0.000108</td>\n", + " <td>0.000475</td>\n", + " <td>0.000489</td>\n", + " <td>0.001456</td>\n", + " <td>0.000034</td>\n", + " <td>...</td>\n", + " <td>0.004447</td>\n", + " <td>0.250461</td>\n", + " <td>0.000910</td>\n", + " <td>0.010479</td>\n", + " <td>0.693750</td>\n", + " <td>0.007253</td>\n", + " <td>0.017822</td>\n", + " <td>0.003547</td>\n", + " <td>0.008566</td>\n", + " <td>0.000168</td>\n", + " </tr>\n", + " <tr>\n", + " <th>427740</th>\n", + " <td>0.001207</td>\n", + " <td>0.002310</td>\n", + " <td>0.000585</td>\n", + " <td>0.002508</td>\n", + " <td>0.002355</td>\n", + " <td>0.000135</td>\n", + " <td>0.002673</td>\n", + " <td>0.000067</td>\n", + " <td>0.003695</td>\n", + " <td>0.000041</td>\n", + " <td>...</td>\n", + " <td>0.004405</td>\n", + " <td>0.980520</td>\n", + " <td>0.008272</td>\n", + " <td>0.040371</td>\n", + " <td>0.001226</td>\n", + " <td>0.805063</td>\n", + " <td>0.005581</td>\n", + " <td>0.037933</td>\n", + " <td>0.004177</td>\n", + " <td>0.000233</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>169063</th>\n", + " <td>0.004662</td>\n", + " <td>0.000559</td>\n", + " <td>0.000271</td>\n", + " <td>0.001190</td>\n", + " <td>0.000868</td>\n", + " <td>0.000236</td>\n", + " <td>0.002325</td>\n", + " <td>0.000239</td>\n", + " <td>0.001862</td>\n", + " <td>0.000044</td>\n", + " <td>...</td>\n", + " <td>0.004677</td>\n", + " <td>0.037555</td>\n", + " <td>0.003252</td>\n", + " <td>0.003751</td>\n", + " <td>0.017954</td>\n", + " <td>0.002785</td>\n", + " <td>0.883382</td>\n", + " <td>0.002572</td>\n", + " <td>0.044943</td>\n", + " <td>0.000341</td>\n", + " </tr>\n", + " <tr>\n", + " <th>433255</th>\n", + " <td>0.004662</td>\n", + " <td>0.000559</td>\n", + " <td>0.000271</td>\n", + " <td>0.001190</td>\n", + " <td>0.000868</td>\n", + " <td>0.000236</td>\n", + " <td>0.002325</td>\n", + " <td>0.000239</td>\n", + " <td>0.001862</td>\n", + " <td>0.000044</td>\n", + " <td>...</td>\n", + " <td>0.004677</td>\n", + " <td>0.037555</td>\n", + " <td>0.003252</td>\n", + " <td>0.003751</td>\n", + " <td>0.017954</td>\n", + " <td>0.002785</td>\n", + " <td>0.883382</td>\n", + " <td>0.002572</td>\n", + " <td>0.044943</td>\n", + " <td>0.000341</td>\n", + " </tr>\n", + " <tr>\n", + " <th>232204</th>\n", + " <td>0.004662</td>\n", + " <td>0.000559</td>\n", + " <td>0.000271</td>\n", + " <td>0.001190</td>\n", + " <td>0.000868</td>\n", + " <td>0.000236</td>\n", + " <td>0.002325</td>\n", + " <td>0.000239</td>\n", + " <td>0.001862</td>\n", + " <td>0.000044</td>\n", + " <td>...</td>\n", + " <td>0.004677</td>\n", + " <td>0.037555</td>\n", + " <td>0.003252</td>\n", + " <td>0.003751</td>\n", + " <td>0.017954</td>\n", + " <td>0.002785</td>\n", + " <td>0.883382</td>\n", + " <td>0.002572</td>\n", + " <td>0.044943</td>\n", + " <td>0.000341</td>\n", + " </tr>\n", + " <tr>\n", + " <th>171685</th>\n", + " <td>0.004662</td>\n", + " <td>0.000559</td>\n", + " <td>0.000271</td>\n", + " <td>0.001190</td>\n", + " <td>0.000868</td>\n", + " <td>0.000236</td>\n", + " <td>0.002325</td>\n", + " <td>0.000239</td>\n", + " <td>0.001862</td>\n", + " <td>0.000044</td>\n", + " <td>...</td>\n", + " <td>0.004677</td>\n", + " <td>0.037555</td>\n", + " <td>0.003252</td>\n", + " <td>0.003751</td>\n", + " <td>0.017954</td>\n", + " <td>0.002785</td>\n", + " <td>0.883382</td>\n", + " <td>0.002572</td>\n", + " <td>0.044943</td>\n", + " <td>0.000341</td>\n", + " </tr>\n", + " <tr>\n", + " <th>249087</th>\n", + " <td>0.004662</td>\n", + " <td>0.000559</td>\n", + " <td>0.000271</td>\n", + " <td>0.001190</td>\n", + " <td>0.000868</td>\n", + " <td>0.000236</td>\n", + " <td>0.002325</td>\n", + " <td>0.000239</td>\n", + " <td>0.001862</td>\n", + " <td>0.000044</td>\n", + " <td>...</td>\n", + " <td>0.004677</td>\n", + " <td>0.037555</td>\n", + " <td>0.003252</td>\n", + " <td>0.003751</td>\n", + " <td>0.017954</td>\n", + " <td>0.002785</td>\n", + " <td>0.883382</td>\n", + " <td>0.002572</td>\n", + " <td>0.044943</td>\n", + " <td>0.000341</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>50064 rows × 104 columns</p>\n", + "</div>\n", + " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-73e59131-014c-49d1-a337-c844b74a21b5')\"\n", + " title=\"Convert this dataframe to an interactive table.\"\n", + " style=\"display:none;\">\n", + " \n", + " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", + " width=\"24px\">\n", + " <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", + " <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", + " </svg>\n", + " </button>\n", + " \n", + " <style>\n", + " .colab-df-container {\n", + " display:flex;\n", + " flex-wrap:wrap;\n", + " gap: 12px;\n", + " }\n", + "\n", + " .colab-df-convert {\n", + " background-color: #E8F0FE;\n", + " border: none;\n", + " border-radius: 50%;\n", + " cursor: pointer;\n", + " display: none;\n", + " fill: #1967D2;\n", + " height: 32px;\n", + " padding: 0 0 0 0;\n", + " width: 32px;\n", + " }\n", + "\n", + " .colab-df-convert:hover {\n", + " background-color: #E2EBFA;\n", + " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", + " fill: #174EA6;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert {\n", + " background-color: #3B4455;\n", + " fill: #D2E3FC;\n", + " }\n", + "\n", + " [theme=dark] .colab-df-convert:hover {\n", + " background-color: #434B5C;\n", + " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", + " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", + " fill: #FFFFFF;\n", + " }\n", + " </style>\n", + "\n", + " <script>\n", + " const buttonEl =\n", + " document.querySelector('#df-73e59131-014c-49d1-a337-c844b74a21b5 button.colab-df-convert');\n", + " buttonEl.style.display =\n", + " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", + "\n", + " async function convertToInteractive(key) {\n", + " const element = document.querySelector('#df-73e59131-014c-49d1-a337-c844b74a21b5');\n", + " const dataTable =\n", + " await google.colab.kernel.invokeFunction('convertToInteractive',\n", + " [key], {});\n", + " if (!dataTable) return;\n", + "\n", + " const docLinkHtml = 'Like what you see? Visit the ' +\n", + " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", + " + ' to learn more about interactive tables.';\n", + " element.innerHTML = '';\n", + " dataTable['output_type'] = 'display_data';\n", + " await google.colab.output.renderOutput(dataTable, element);\n", + " const docLink = document.createElement('div');\n", + " docLink.innerHTML = docLinkHtml;\n", + " element.appendChild(docLink);\n", + " }\n", + " </script>\n", + " </div>\n", + " </div>\n", + " " + ] + }, + "metadata": {}, + "execution_count": 31 + } + ], + "source": [ + "submission_cols = pd.read_csv(f'{dir_path}/DrivenDataCompetition_DataFiles/SubmissionFormat.csv')\n", + "cols_list = list(submission_cols.columns.values)\n", + "cols_list = cols_list[1:] ## remove the first column which is the index\n", + "submission = pd.DataFrame(test_predictions, columns=cols_list)\n", + "submission.set_index(test_df['Unnamed: 0'], inplace=True)\n", + "submission.index.name = \"\"\n", + "submission" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ep-ruNCfyMKH" + }, + "outputs": [], + "source": [ + "submission.to_csv('12072022_tfidf_logreg_Cval_2.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XwaSy1i13nax" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [], + "machine_shape": "hm" + }, + "gpuClass": "premium", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file