Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
Box-Plots for Education
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
dmath010
Box-Plots for Education
Commits
341de568
Commit
341de568
authored
2 years ago
by
dmath010
Browse files
Options
Downloads
Patches
Plain Diff
Upload New File
parent
b445af89
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
NLP_Approach
+1
-0
1 addition, 0 deletions
NLP_Approach
with
1 addition
and
0 deletions
NLP_Approach
0 → 100644
+
1
−
0
View file @
341de568
{
"cells"
:
[{
"cell_type"
:
"code"
,
"execution_count"
:
1
,
"metadata"
:
{
"colab"
:
{
"base_uri"
:
"https://localhost:8080/"
},
"executionInfo"
:
{
"elapsed"
:
196309
,
"status"
:
"ok"
,
"timestamp"
:
1670510422399
,
"user"
:
{
"displayName"
:
"Danny Mathieson"
,
"userId"
:
"14123752102169145219"
},
"user_tz"
:
300
},
"id"
:
"REugZMxlFCvU"
,
"outputId"
:
"2b6a1767-4296-4d10-cc7a-fcefdb7d6113"
},
"outputs"
:
[{
"output_type"
:
"stream"
,
"name"
:
"stdout"
,
"text"
:
[
"Mounted at /content/gdrive/
\n
"
]}],
"source"
:
[
"from google.colab import drive
\n
"
,
"drive.mount(
\"
/content/gdrive/
\"
)"
]},{
"cell_type"
:
"code"
,
"execution_count"
:
2
,
"metadata"
:
{
"id"
:
"ceMmp0oL8PyI"
,
"executionInfo"
:
{
"status"
:
"ok"
,
"timestamp"
:
1670510423386
,
"user_tz"
:
300
,
"elapsed"
:
990
,
"user"
:
{
"displayName"
:
"Danny Mathieson"
,
"userId"
:
"14123752102169145219"
}}},
"outputs"
:
[],
"source"
:
[
"import pandas as pd
\n
"
,
"import numpy as np
\n
"
,
"import shutil
\n
"
,
"import sys
\n
"
,
"import os
\n
"
,
"import time
\n
"
,
"from sklearn.model_selection import train_test_split"
]},{
"cell_type"
:
"code"
,
"execution_count"
:
3
,
"metadata"
:
{
"id"
:
"qAYz11Vr-k5b"
,
"executionInfo"
:
{
"status"
:
"ok"
,
"timestamp"
:
1670510423386
,
"user_tz"
:
300
,
"elapsed"
:
3
,
"user"
:
{
"displayName"
:
"Danny Mathieson"
,
"userId"
:
"14123752102169145219"
}}},
"outputs"
:
[],
"source"
:
[
"dir_path = 'gdrive/Shareddrives/CS5024 Ethics Project'
\n
"
,
"sys.path.append(dir_path)"
]},{
"cell_type"
:
"code"
,
"execution_count"
:
4
,
"metadata"
:
{
"colab"
:
{
"base_uri"
:
"https://localhost:8080/"
},
"executionInfo"
:
{
"elapsed"
:
10280
,
"status"
:
"ok"
,
"timestamp"
:
1670510433664
,
"user"
:
{
"displayName"
:
"Danny Mathieson"
,
"userId"
:
"14123752102169145219"
},
"user_tz"
:
300
},
"id"
:
"jmTuxI2t8PwQ"
,
"outputId"
:
"d312c306-e195-45bc-95de-fed482fa0869"
},
"outputs"
:
[{
"output_type"
:
"stream"
,
"name"
:
"stderr"
,
"text"
:
[
"/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:3326: DtypeWarning: Columns (5,11) have mixed types.Specify dtype option on import or set low_memory=False.
\n
"
,
" exec(code_obj, self.user_global_ns, self.user_ns)
\n
"
]}],
"source"
:
[
"# Create DF for train and test set
\n
"
,
"train_df = pd.read_csv(f'
{
dir_path
}
/DrivenDataCompetition_DataFiles/TrainingData.csv')
\n
"
,
"test_df = pd.read_csv(f'
{
dir_path}/DrivenDataCompetition_DataFiles/TestData.csv')"]
}
,{"
cell_type
":"
code
","
source
":["
print
(
train_df
.
shape
)
\n
","
print
(
test_df
.
shape
)
"],"
metadata
":
{
"colab":{"base_uri":"https://localhost:8080/"
}
,"
id
":"
0
NPhp8WQ1YA0
","
executionInfo
":{"
status
":"
ok
","
timestamp
":1670510433665,"
user_tz
":300,"
elapsed
":7,"
user
":{"
displayName
":"
Danny
Mathieson
","
userId
":"
14123752102169145219
"}},"
outputId
":"
33582
c11
-
9
cd6
-
4
a16
-
ef75
-
957
d55f8c906
"},"
execution_count
":5,"
outputs
":[{"
output_type
":"
stream
","
name
":"
stdout
","
text
":["
(
400277
,
26
)
\n
","
(
50064
,
17
)
\n
"]}]},{"
cell_type
":"
markdown
","
source
":["
# Data Preprocessing"],"metadata":{"id":"Y558pu6Bdbbw"}},{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":797},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1670510433665,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"},"user_tz":300},"id":"Q9Jzqrlwiu83","outputId":"cf12f590-0d36-4704-de43-6193bc7fcf82"},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Function Use \\\n","Unnamed: 0 \n","134338 Teacher Compensation Instruction \n","206341 NO_LABEL NO_LABEL \n","326408 Teacher Compensation Instruction \n","364634 Substitute Compensation Instruction \n","47683 Substitute Compensation Instruction \n","... ... ... \n","109283 Professional Development ISPD \n","102430 Substitute Compensation Instruction \n","413949 Parent & Community Relations NO_LABEL \n","433672 Library & Media Instruction \n","415831 Substitute Compensation Instruction \n","\n"," Sharing Reporting Student_Type \\\n","Unnamed: 0 \n","134338 School Reported School NO_LABEL \n","206341 NO_LABEL NO_LABEL NO_LABEL \n","326408 School Reported School Unspecified \n","364634 School Reported School Unspecified \n","47683 School Reported School Unspecified \n","... ... ... ... \n","109283 Shared Services Non-School Unspecified \n","102430 School Reported School Unspecified \n","413949 School Reported School NO_LABEL \n","433672 School on Central Budgets Non-School Unspecified \n","415831 School Reported School Poverty \n","\n"," Position_Type Object_Type Pre_K \\\n","Unnamed: 0 \n","134338 Teacher NO_LABEL NO_LABEL \n","206341 NO_LABEL NO_LABEL NO_LABEL \n","326408 Teacher Base Salary/Compensation Non PreK \n","364634 Substitute Benefits NO_LABEL \n","47683 Teacher Substitute Compensation NO_LABEL \n","... ... ... ... 
\n","109283 Instructional Coach Other Compensation/Stipend NO_LABEL \n","102430 Substitute Base Salary/Compensation NO_LABEL \n","413949 Other NO_LABEL NO_LABEL \n","433672 Librarian Benefits NO_LABEL \n","415831 Substitute Substitute Compensation Non PreK \n","\n"," Operating_Status \\\n","Unnamed: 0 \n","134338 PreK-12 Operating \n","206341 Non-Operating \n","326408 PreK-12 Operating \n","364634 PreK-12 Operating \n","47683 PreK-12 Operating \n","... ... \n","109283 PreK-12 Operating \n","102430 PreK-12 Operating \n","413949 PreK-12 Operating \n","433672 PreK-12 Operating \n","415831 PreK-12 Operating \n","\n"," Object_Description ... \\\n","Unnamed: 0 ... \n","134338 NaN ... \n","206341 CONTRACTOR SERVICES ... \n","326408 Personal Services - Teachers ... \n","364634 EMPLOYEE BENEFITS ... \n","47683 TEACHER COVERAGE FOR TEACHER ... \n","... ... ... \n","109283 WORKSHOP PARTICIPANT ... \n","102430 SALARIES OF PART TIME EMPLOYEE ... \n","413949 NaN ... \n","433672 EMPLOYEE BENEFITS ... \n","415831 Salaries And Wages For Substitute Professionals ... \n","\n"," Sub_Object_Description \\\n","Unnamed: 0 \n","134338 NaN \n","206341 NaN \n","326408 NaN \n","364634 NaN \n","47683 NaN \n","... ... \n","109283 NaN \n","102430 NaN \n","413949 NaN \n","433672 NaN \n","415831 Inservice Substitute Teachers Grant Funded \n","\n"," Location_Description FTE \\\n","Unnamed: 0 \n","134338 NaN 1.00000 \n","206341 NaN NaN \n","326408 NaN 1.00000 \n","364634 NaN NaN \n","47683 NaN NaN \n","... ... ... \n","109283 STAFF DEV AND INSTR MEDIA NaN \n","102430 NaN 0.00431 \n","413949 NaN 1.00000 \n","433672 ED RESOURCE SERVICES NaN \n","415831 School NaN \n","\n"," Function_Description Facility_or_Department \\\n","Unnamed: 0 \n","134338 NaN NaN \n","206341 RGN GOB NaN \n","326408 NaN NaN \n","364634 UNALLOC BUDGETS/SCHOOLS NaN \n","47683 NON-PROJECT NaN \n","... ... ... 
\n","109283 INST STAFF TRAINING SVCS NaN \n","102430 TITLE II,D NaN \n","413949 NaN NaN \n","433672 NON-PROJECT NaN \n","415831 Instruction Instruction And Curriculum \n","\n"," Position_Extra Total \\\n","Unnamed: 0 \n","134338 KINDERGARTEN 50471.810000 \n","206341 UNDESIGNATED 3477.860000 \n","326408 TEACHER 62237.130000 \n","364634 PROFESSIONAL-INSTRUCTIONAL 22.300000 \n","47683 PROFESSIONAL-INSTRUCTIONAL 54.166000 \n","... ... ... \n","109283 NaN 48.620000 \n","102430 PROFESSIONAL-INSTRUCTIONAL 128.824985 \n","413949 PARENT/TITLE I 4902.290000 \n","433672 OFFICE/ADMINISTRATIVE SUPPORT 4020.290000 \n","415831 CERTIFIED SUBSTITUTE 46.530000 \n","\n"," Program_Description \\\n","Unnamed: 0 \n","134338 KINDERGARTEN \n","206341 BUILDING IMPROVEMENT SERVICES \n","326408 Instruction - Regular \n","364634 GENERAL MIDDLE/JUNIOR HIGH SCH \n","47683 GENERAL HIGH SCHOOL EDUCATION \n","... ... \n","109283 NaN \n","102430 INSTRUCTIONAL STAFF TRAINING \n","413949 Misc \n","433672 MEDIA SUPPORT SERVICES \n","415831 Accelerated Education \n","\n"," Fund_Description \\\n","Unnamed: 0 \n","134338 General Fund \n","206341 NaN \n","326408 General Purpose School \n","364634 NaN \n","47683 NaN \n","... ... \n","109283 GENERAL FUND \n","102430 NaN \n","413949 Schoolwide Schools \n","433672 NaN \n","415831 \"Title Part A Improving Basic Programs\" \n","\n"," Text_1 \n","Unnamed: 0 \n","134338 NaN \n","206341 BUILDING IMPROVEMENT SERVICES \n","326408 NaN \n","364634 REGULAR INSTRUCTION \n","47683 REGULAR INSTRUCTION \n","... ... 
\n","109283 STAFF DEV AND INSTR MEDIA \n","102430 INSTRUCTIONAL STAFF \n","413949 NaN \n","433672 INSTRUCTIONAL STAFF \n","415831 MISCELLANEOUS \n","\n","[400277 rows x 25 columns]"],"text/html":["\n"," <div id=\"df-f666659d-cc53-4867-b64a-0d1b481a7c61\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Function</th>\n"," <th>Use</th>\n"," <th>Sharing</th>\n"," <th>Reporting</th>\n"," <th>Student_Type</th>\n"," <th>Position_Type</th>\n"," <th>Object_Type</th>\n"," <th>Pre_K</th>\n"," <th>Operating_Status</th>\n"," <th>Object_Description</th>\n"," <th>...</th>\n"," <th>Sub_Object_Description</th>\n"," <th>Location_Description</th>\n"," <th>FTE</th>\n"," <th>Function_Description</th>\n"," <th>Facility_or_Department</th>\n"," <th>Position_Extra</th>\n"," <th>Total</th>\n"," <th>Program_Description</th>\n"," <th>Fund_Description</th>\n"," <th>Text_1</th>\n"," </tr>\n"," <tr>\n"," <th>Unnamed: 0</th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>134338</th>\n"," <td>Teacher Compensation</td>\n"," <td>Instruction</td>\n"," <td>School Reported</td>\n"," <td>School</td>\n"," <td>NO_LABEL</td>\n"," <td>Teacher</td>\n"," <td>NO_LABEL</td>\n"," <td>NO_LABEL</td>\n"," <td>PreK-12 Operating</td>\n"," <td>NaN</td>\n"," <td>...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>1.00000</td>\n"," 
<td>NaN</td>\n"," <td>NaN</td>\n"," <td>KINDERGARTEN</td>\n"," <td>50471.810000</td>\n"," <td>KINDERGARTEN</td>\n"," <td>General Fund</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>206341</th>\n"," <td>NO_LABEL</td>\n"," <td>NO_LABEL</td>\n"," <td>NO_LABEL</td>\n"," <td>NO_LABEL</td>\n"," <td>NO_LABEL</td>\n"," <td>NO_LABEL</td>\n"," <td>NO_LABEL</td>\n"," <td>NO_LABEL</td>\n"," <td>Non-Operating</td>\n"," <td>CONTRACTOR SERVICES</td>\n"," <td>...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>RGN GOB</td>\n"," <td>NaN</td>\n"," <td>UNDESIGNATED</td>\n"," <td>3477.860000</td>\n"," <td>BUILDING IMPROVEMENT SERVICES</td>\n"," <td>NaN</td>\n"," <td>BUILDING IMPROVEMENT SERVICES</td>\n"," </tr>\n"," <tr>\n"," <th>326408</th>\n"," <td>Teacher Compensation</td>\n"," <td>Instruction</td>\n"," <td>School Reported</td>\n"," <td>School</td>\n"," <td>Unspecified</td>\n"," <td>Teacher</td>\n"," <td>Base Salary/Compensation</td>\n"," <td>Non PreK</td>\n"," <td>PreK-12 Operating</td>\n"," <td>Personal Services - Teachers</td>\n"," <td>...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>1.00000</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>TEACHER</td>\n"," <td>62237.130000</td>\n"," <td>Instruction - Regular</td>\n"," <td>General Purpose School</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>364634</th>\n"," <td>Substitute Compensation</td>\n"," <td>Instruction</td>\n"," <td>School Reported</td>\n"," <td>School</td>\n"," <td>Unspecified</td>\n"," <td>Substitute</td>\n"," <td>Benefits</td>\n"," <td>NO_LABEL</td>\n"," <td>PreK-12 Operating</td>\n"," <td>EMPLOYEE BENEFITS</td>\n"," <td>...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>UNALLOC BUDGETS/SCHOOLS</td>\n"," <td>NaN</td>\n"," <td>PROFESSIONAL-INSTRUCTIONAL</td>\n"," <td>22.300000</td>\n"," <td>GENERAL MIDDLE/JUNIOR HIGH SCH</td>\n"," <td>NaN</td>\n"," <td>REGULAR INSTRUCTION</td>\n"," </tr>\n"," <tr>\n"," <th>47683</th>\n"," <td>Substitute Compensation</td>\n"," 
<td>Instruction</td>\n"," <td>School Reported</td>\n"," <td>School</td>\n"," <td>Unspecified</td>\n"," <td>Teacher</td>\n"," <td>Substitute Compensation</td>\n"," <td>NO_LABEL</td>\n"," <td>PreK-12 Operating</td>\n"," <td>TEACHER COVERAGE FOR TEACHER</td>\n"," <td>...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NON-PROJECT</td>\n"," <td>NaN</td>\n"," <td>PROFESSIONAL-INSTRUCTIONAL</td>\n"," <td>54.166000</td>\n"," <td>GENERAL HIGH SCHOOL EDUCATION</td>\n"," <td>NaN</td>\n"," <td>REGULAR INSTRUCTION</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>109283</th>\n"," <td>Professional Development</td>\n"," <td>ISPD</td>\n"," <td>Shared Services</td>\n"," <td>Non-School</td>\n"," <td>Unspecified</td>\n"," <td>Instructional Coach</td>\n"," <td>Other Compensation/Stipend</td>\n"," <td>NO_LABEL</td>\n"," <td>PreK-12 Operating</td>\n"," <td>WORKSHOP PARTICIPANT</td>\n"," <td>...</td>\n"," <td>NaN</td>\n"," <td>STAFF DEV AND INSTR MEDIA</td>\n"," <td>NaN</td>\n"," <td>INST STAFF TRAINING SVCS</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>48.620000</td>\n"," <td>NaN</td>\n"," <td>GENERAL FUND</td>\n"," <td>STAFF DEV AND INSTR MEDIA</td>\n"," </tr>\n"," <tr>\n"," <th>102430</th>\n"," <td>Substitute Compensation</td>\n"," <td>Instruction</td>\n"," <td>School Reported</td>\n"," <td>School</td>\n"," <td>Unspecified</td>\n"," <td>Substitute</td>\n"," <td>Base Salary/Compensation</td>\n"," <td>NO_LABEL</td>\n"," <td>PreK-12 Operating</td>\n"," <td>SALARIES OF PART TIME EMPLOYEE</td>\n"," <td>...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," 
<td>0.00431</td>\n"," <td>TITLE II,D</td>\n"," <td>NaN</td>\n"," <td>PROFESSIONAL-INSTRUCTIONAL</td>\n"," <td>128.824985</td>\n"," <td>INSTRUCTIONAL STAFF TRAINING</td>\n"," <td>NaN</td>\n"," <td>INSTRUCTIONAL STAFF</td>\n"," </tr>\n"," <tr>\n"," <th>413949</th>\n"," <td>Parent & Community Relations</td>\n"," <td>NO_LABEL</td>\n"," <td>School Reported</td>\n"," <td>School</td>\n"," <td>NO_LABEL</td>\n"," <td>Other</td>\n"," <td>NO_LABEL</td>\n"," <td>NO_LABEL</td>\n"," <td>PreK-12 Operating</td>\n"," <td>NaN</td>\n"," <td>...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>1.00000</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>PARENT/TITLE I</td>\n"," <td>4902.290000</td>\n"," <td>Misc</td>\n"," <td>Schoolwide Schools</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>433672</th>\n"," <td>Library & Media</td>\n"," <td>Instruction</td>\n"," <td>School on Central Budgets</td>\n"," <td>Non-School</td>\n"," <td>Unspecified</td>\n"," <td>Librarian</td>\n"," <td>Benefits</td>\n"," <td>NO_LABEL</td>\n"," <td>PreK-12 Operating</td>\n"," <td>EMPLOYEE BENEFITS</td>\n"," <td>...</td>\n"," <td>NaN</td>\n"," <td>ED RESOURCE SERVICES</td>\n"," <td>NaN</td>\n"," <td>NON-PROJECT</td>\n"," <td>NaN</td>\n"," <td>OFFICE/ADMINISTRATIVE SUPPORT</td>\n"," <td>4020.290000</td>\n"," <td>MEDIA SUPPORT SERVICES</td>\n"," <td>NaN</td>\n"," <td>INSTRUCTIONAL STAFF</td>\n"," </tr>\n"," <tr>\n"," <th>415831</th>\n"," <td>Substitute Compensation</td>\n"," <td>Instruction</td>\n"," <td>School Reported</td>\n"," <td>School</td>\n"," <td>Poverty</td>\n"," <td>Substitute</td>\n"," <td>Substitute Compensation</td>\n"," <td>Non PreK</td>\n"," <td>PreK-12 Operating</td>\n"," <td>Salaries And Wages For Substitute Professionals</td>\n"," <td>...</td>\n"," <td>Inservice Substitute Teachers Grant Funded</td>\n"," <td>School</td>\n"," <td>NaN</td>\n"," <td>Instruction</td>\n"," <td>Instruction And Curriculum</td>\n"," <td>CERTIFIED SUBSTITUTE</td>\n"," <td>46.530000</td>\n"," <td>Accelerated 
Education</td>\n"," <td>\"Title Part A Improving Basic Programs\"</td>\n"," <td>MISCELLANEOUS</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>400277 rows × 25 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f666659d-cc53-4867-b64a-0d1b481a7c61')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," 
document.querySelector('#df-f666659d-cc53-4867-b64a-0d1b481a7c61 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-f666659d-cc53-4867-b64a-0d1b481a7c61');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":6}],"source":["test_df.set_index('Unnamed: 0')\n","train_df.set_index('Unnamed: 0')"]},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000},"executionInfo":{"elapsed":2146,"status":"ok","timestamp":1670510435808,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"},"user_tz":300},"id":"Thh9gm2q8Pt_","outputId":"4c390cc1-191f-4c51-aa7d-d7a4e549cd93"},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" 0 \\\n","Unnamed: 0 \n","134338 Teacher-Elementary KINDERGA... \n","206341 CONTRACTOR SERVICES BOND EXPENDITURES BUILDING... \n","326408 Personal Services - Teachers TCHER 2ND GRA... \n","364634 EMPLOYEE BENEFITS TEACHER SUBS GENERAL FUND Te... \n","47683 TEACHER COVERAGE FOR TEACHER TEACHER SUBS GENE... \n","... ... \n","109283 WORKSHOP PARTICIPANT CURRICULUM ... \n","102430 SALARIES OF PART TIME EMPLOYEE FEDERAL GDPG ... 
\n","413949 School Liaison PARENT/TITLE ... \n","433672 EMPLOYEE BENEFITS EDUCATIONAL RESOURCE SERVICE... \n","415831 Salaries And Wages For Substitute Professional... \n","\n"," Function_Aides Compensation \\\n","Unnamed: 0 \n","134338 0 \n","206341 0 \n","326408 0 \n","364634 0 \n","47683 0 \n","... ... \n","109283 0 \n","102430 0 \n","413949 0 \n","433672 0 \n","415831 0 \n","\n"," Function_Career & Academic Counseling Function_Communications \\\n","Unnamed: 0 \n","134338 0 0 \n","206341 0 0 \n","326408 0 0 \n","364634 0 0 \n","47683 0 0 \n","... ... ... \n","109283 0 0 \n","102430 0 0 \n","413949 0 0 \n","433672 0 0 \n","415831 0 0 \n","\n"," Function_Curriculum Development \\\n","Unnamed: 0 \n","134338 0 \n","206341 0 \n","326408 0 \n","364634 0 \n","47683 0 \n","... ... \n","109283 0 \n","102430 0 \n","413949 0 \n","433672 0 \n","415831 0 \n","\n"," Function_Data Processing & Information Services \\\n","Unnamed: 0 \n","134338 0 \n","206341 0 \n","326408 0 \n","364634 0 \n","47683 0 \n","... ... \n","109283 0 \n","102430 0 \n","413949 0 \n","433672 0 \n","415831 0 \n","\n"," Function_Development & Fundraising Function_Enrichment \\\n","Unnamed: 0 \n","134338 0 0 \n","206341 0 0 \n","326408 0 0 \n","364634 0 0 \n","47683 0 0 \n","... ... ... \n","109283 0 0 \n","102430 0 0 \n","413949 0 0 \n","433672 0 0 \n","415831 0 0 \n","\n"," Function_Extended Time & Tutoring \\\n","Unnamed: 0 \n","134338 0 \n","206341 0 \n","326408 0 \n","364634 0 \n","47683 0 \n","... ... \n","109283 0 \n","102430 0 \n","413949 0 \n","433672 0 \n","415831 0 \n","\n"," Function_Facilities & Maintenance ... \\\n","Unnamed: 0 ... \n","134338 0 ... \n","206341 0 ... \n","326408 0 ... \n","364634 0 ... \n","47683 0 ... \n","... ... ... \n","109283 0 ... \n","102430 0 ... \n","413949 0 ... \n","433672 0 ... \n","415831 0 ... 
\n","\n"," Student_Type_Special Education Student_Type_Unspecified \\\n","Unnamed: 0 \n","134338 0 0 \n","206341 0 0 \n","326408 0 1 \n","364634 0 1 \n","47683 0 1 \n","... ... ... \n","109283 0 1 \n","102430 0 1 \n","413949 0 0 \n","433672 0 1 \n","415831 0 0 \n","\n"," Use_Business Services Use_ISPD Use_Instruction Use_Leadership \\\n","Unnamed: 0 \n","134338 0 0 1 0 \n","206341 0 0 0 0 \n","326408 0 0 1 0 \n","364634 0 0 1 0 \n","47683 0 0 1 0 \n","... ... ... ... ... \n","109283 0 1 0 0 \n","102430 0 0 1 0 \n","413949 0 0 0 0 \n","433672 0 0 1 0 \n","415831 0 0 1 0 \n","\n"," Use_NO_LABEL Use_O&M Use_Pupil Services & Enrichment \\\n","Unnamed: 0 \n","134338 0 0 0 \n","206341 1 0 0 \n","326408 0 0 0 \n","364634 0 0 0 \n","47683 0 0 0 \n","... ... ... ... \n","109283 0 0 0 \n","102430 0 0 0 \n","413949 1 0 0 \n","433672 0 0 0 \n","415831 0 0 0 \n","\n"," Use_Untracked Budget Set-Aside \n","Unnamed: 0 \n","134338 0 \n","206341 0 \n","326408 0 \n","364634 0 \n","47683 0 \n","... ... \n","109283 0 \n","102430 0 \n","413949 0 \n","433672 0 \n","415831 0 \n","\n","[400277 rows x 105 columns]"],"text/html":["\n"," <div id=\"df-efe9ca42-7ce1-4fd4-a880-234b5631bd03\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>0</th>\n"," <th>Function_Aides Compensation</th>\n"," <th>Function_Career & Academic Counseling</th>\n"," <th>Function_Communications</th>\n"," <th>Function_Curriculum Development</th>\n"," <th>Function_Data Processing & Information Services</th>\n"," <th>Function_Development & Fundraising</th>\n"," <th>Function_Enrichment</th>\n"," <th>Function_Extended Time & Tutoring</th>\n"," 
<th>Function_Facilities & Maintenance</th>\n"," <th>...</th>\n"," <th>Student_Type_Special Education</th>\n"," <th>Student_Type_Unspecified</th>\n"," <th>Use_Business Services</th>\n"," <th>Use_ISPD</th>\n"," <th>Use_Instruction</th>\n"," <th>Use_Leadership</th>\n"," <th>Use_NO_LABEL</th>\n"," <th>Use_O&M</th>\n"," <th>Use_Pupil Services & Enrichment</th>\n"," <th>Use_Untracked Budget Set-Aside</th>\n"," </tr>\n"," <tr>\n"," <th>Unnamed: 0</th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>134338</th>\n"," <td>Teacher-Elementary KINDERGA...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>206341</th>\n"," <td>CONTRACTOR SERVICES BOND EXPENDITURES BUILDING...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>326408</th>\n"," <td>Personal Services - Teachers TCHER 2ND GRA...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>...</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," 
<td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>364634</th>\n"," <td>EMPLOYEE BENEFITS TEACHER SUBS GENERAL FUND Te...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>...</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>47683</th>\n"," <td>TEACHER COVERAGE FOR TEACHER TEACHER SUBS GENE...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>...</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>109283</th>\n"," <td>WORKSHOP PARTICIPANT CURRICULUM ...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>...</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>102430</th>\n"," <td>SALARIES OF PART TIME EMPLOYEE FEDERAL GDPG ...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," 
<td>...</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>413949</th>\n"," <td>School Liaison PARENT/TITLE ...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>433672</th>\n"," <td>EMPLOYEE BENEFITS EDUCATIONAL RESOURCE SERVICE...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>...</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>415831</th>\n"," <td>Salaries And Wages For Substitute Professional...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>...</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>400277 rows × 105 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-efe9ca42-7ce1-4fd4-a880-234b5631bd03')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 
8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-efe9ca42-7ce1-4fd4-a880-234b5631bd03 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-efe9ca42-7ce1-4fd4-a880-234b5631bd03');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":7}],"source":["# set target Cols\n","target_cols = ['Function', 'Object_Type', 'Operating_Status', 'Position_Type', 'Pre_K', 'Reporting', 'Sharing', 'Student_Type', 'Use']\n","train_df_dummies = pd.DataFrame()\n","# drop train int cols\n","train_df.drop(['FTE', 'Total'], axis = 1, inplace=True)\n","# get dummies for target cols\n","col = pd.get_dummies(train_df[target_cols])\n","# drop target cols\n","train_df = train_df.drop(target_cols, axis=1)\n","# fill NaN with space\n","train_df.fillna(' ', inplace=True)\n","# combine all text into single col\n","combined = pd.DataFrame([' '.join(row) for row in train_df[train_df.columns[1:]].values])\n","# join combined text col with dummy labels\n","train_df_dummies = pd.concat([combined, col], axis = 1)\n","# drop test int cols\n","test_df.drop(['FTE', 'Total'], axis = 1, inplace=True)\n","# fill NaN with space\n","test_df.fillna(' ', inplace=True)\n","# combine all text into single col\n","test_df_cleaned = pd.DataFrame([' '.join(row) for row in test_df[test_df.columns[1:]].values])\n","# reset indices to original\n","test_df_cleaned.set_index(test_df['Unnamed: 0'])\n","train_df_dummies.set_index(train_df['Unnamed: 0'])"]},{"cell_type":"code","execution_count":8,"metadata":{"id":"sIdbRlvsE2Fn","executionInfo":{"status":"ok","timestamp":1670510435809,"user_tz":300,"elapsed":5,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["cols = 
col.columns"]},{"cell_type":"code","execution_count":9,"metadata":{"id":"5ZhHV0028PjT","executionInfo":{"status":"ok","timestamp":1670510435809,"user_tz":300,"elapsed":4,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["# rename text col\n","test_df_cleaned.rename(columns={0: \"text\"}, inplace=True)\n","train_df_dummies.rename(columns={0: \"text\"}, inplace=True)"]},{"cell_type":"code","execution_count":10,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1292,"status":"ok","timestamp":1670510437097,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"},"user_tz":300},"id":"DSMXH2Qmc0Y_","outputId":"fe62771e-2bf1-44f4-88e1-d56dae02b300"},"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package stopwords to /root/nltk_data...\n","[nltk_data] Unzipping corpora/stopwords.zip.\n","[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data] Unzipping tokenizers/punkt.zip.\n"]}],"source":["import nltk\n","nltk.download('stopwords')\n","nltk.download('punkt')\n","from nltk import word_tokenize\n","from nltk.corpus import stopwords\n","from nltk.stem.snowball import SnowballStemmer\n","import re"]},{"cell_type":"code","execution_count":11,"metadata":{"id":"Xly7pR2-CZhx","executionInfo":{"status":"ok","timestamp":1670510437098,"user_tz":300,"elapsed":4,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["def text_processing(df):\n"," stop_words = set(stopwords.words('english'))\n"," st = SnowballStemmer('english')\n"," # lower and trim spaces\n"," df['text'] = df['text'].apply(lambda x: x.lower().strip())\n"," # remove other spaces\n"," df['text'] = df['text'].apply(lambda x: re.sub(' +', ' ', x))\n"," # remove punctuation\n"," df['text'] = df['text'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))\n"," # remove stopwords stem\n"," df['text'] = df['text'].apply(lambda x: ' 
'.join(st.stem(text) for text in x.split() if text not in stop_words))"]},{"cell_type":"code","execution_count":12,"metadata":{"id":"rR1Z3FPoh7c3","executionInfo":{"status":"ok","timestamp":1670510527234,"user_tz":300,"elapsed":90139,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["# clean df\n","text_processing(train_df_dummies)\n","text_processing(test_df_cleaned)"]},{"cell_type":"markdown","source":["# Embedding"],"metadata":{"id":"4DkHoAlAdS8j"}},{"cell_type":"code","execution_count":13,"metadata":{"id":"iQAX3AoUnrtv","executionInfo":{"status":"ok","timestamp":1670510527235,"user_tz":300,"elapsed":4,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["from sklearn.feature_extraction.text import TfidfVectorizer"]},{"cell_type":"code","execution_count":14,"metadata":{"id":"dpLEeR7dC9Bd","executionInfo":{"status":"ok","timestamp":1670510527655,"user_tz":300,"elapsed":423,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["X = train_df_dummies['text'].values\n","y = train_df_dummies[list(train_df_dummies.columns[1:])].values\n","X_train, X_val, y_train, y_val = train_test_split(X,y,test_size = 0.2,train_size =0.8)"]},{"cell_type":"code","execution_count":24,"metadata":{"id":"J4KQiBI0oZUp","executionInfo":{"status":"ok","timestamp":1670511912627,"user_tz":300,"elapsed":9889,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["def tfidf_features(X_train, X_val, X_test):\n"," \"\"\"\n"," X_train, X_val, X_test — samples \n"," return TF-IDF vectorized representation of each sample and vocabulary\n"," \"\"\"\n"," # Create TF-IDF vectorizer with a suitable choice of parameters\n"," tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=1.0, min_df=1, token_pattern='(\\S+)')\n"," # Fit the vectorizer on the train set\n"," X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\n"," # 
Transform the validation and test sets and return the result\n"," X_val_tfidf = tfidf_vectorizer.transform(X_val)\n"," X_test_tfidf = tfidf_vectorizer.transform(X_test)\n"," \n"," return X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vectorizer.vocabulary_\n","timer = time.time()\n","X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, test_df_cleaned.text)\n","tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}"]},{"cell_type":"markdown","source":["# Logistic Regression"],"metadata":{"id":"ydQ-7nJdT7Ad"}},{"cell_type":"code","execution_count":25,"metadata":{"id":"nwyPgHYMplz3","executionInfo":{"status":"ok","timestamp":1670513291331,"user_tz":300,"elapsed":1378713,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"045b6532-7308-4018-b3c3-af14efe9aec5"},"outputs":[{"output_type":"stream","name":"stdout","text":["Time for NLP training\" 1387.7084062099457\n","Time for NLP inference\" 0.6320400238037109\n"]}],"source":["from sklearn.multiclass import OneVsRestClassifier\n","from sklearn.linear_model import LogisticRegression, RidgeClassifier\n","\n","def train_logreg(X_train, y_train, C, regularisation):\n"," \"\"\"\n"," X_train, y_train — training data\n"," \n"," return: trained classifier\n"," \"\"\"\n"," \n"," # Create and fit LogisticRegression wrapped in OneVsRestClassifier.\n","\n"," model = OneVsRestClassifier(LogisticRegression(penalty=regularisation, C=C, max_iter=10000)).fit(X_train, y_train)\n"," return model\n","\n","logreg_tfidf = train_logreg(X_train_tfidf, y_train, C = 2, regularisation = 'l2')\n","print (f'Time for NLP training\" {time.time()-timer}')\n","timer = time.time()\n","y_val_predicted_labels_logreg = logreg_tfidf.predict(X_val_tfidf)\n","print (f'Time for NLP inference\" {time.time()-timer}')\n"]},{"cell_type":"markdown","source":["# XGBoost"],"metadata":{"id":"SOFP4vZFUJiu"}},{"cell_type":"code","source":["# from 
sklearn.multiclass import OneVsRestClassifier\n","# from xgboost import XGBClassifier\n","\n","# def train_xgb(X_train, y_train, params):\n","# \"\"\"\n","# X_train, y_train — training data\n"," \n","# return: trained classifier\n","# \"\"\"\n"," \n","# # Create and fit XGBoost wrapped in OneVsRestClassifier.\n","\n","# model = OneVsRestClassifier(XGBClassifier(**params)).fit(X_train, y_train)\n","# return model\n","# xgb_params = {'eta': 0.3, \n","# 'max_depth': 5, \n","# 'subsample': 0.8, \n","# 'colsample_bytree': 0.8, \n","# 'tree_method' : 'gpu_hist',\n","# 'objective': 'binary:logistic', \n","# 'eval_metric': 'auc', \n","# 'seed': 42\n","# }\n","# xgb_tfidf = train_xgb(X_train_tfidf, y_train, xgb_params)\n","# y_val_predicted_labels_xgb = xgb_tfidf.predict(X_val_tfidf)"],"metadata":{"id":"r42fPuwwURXN","executionInfo":{"status":"ok","timestamp":1670513291331,"user_tz":300,"elapsed":10,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"execution_count":26,"outputs":[]},{"cell_type":"markdown","source":["# SGD"],"metadata":{"id":"WeL9xN2WLpyO"}},{"cell_type":"code","source":["# from sklearn.multiclass import OneVsRestClassifier\n","# from sklearn.linear_model import SGDClassifier\n","# def train_sgd(X_train, y_train):\n","# \"\"\"\n","# X_train, y_train — training data\n"," \n","# return: trained classifier\n","# \"\"\"\n"," \n","# # Create and fit SGDClassifier (logistic loss) wrapped in OneVsRestClassifier.\n","\n","# model = OneVsRestClassifier(SGDClassifier(loss = 'log', penalty = 'l2')).fit(X_train, y_train)\n","# return model\n","\n","# sgd_tfidf = train_sgd(X_train_tfidf, y_train)\n","# print (f'Time for NLP training\" {time.time()-timer}')\n","# timer = time.time()\n","# y_val_predicted_labels_sgd = sgd_tfidf.predict(X_val_tfidf)\n","# print (f'Time for NLP inference\" {time.time()-timer}')\n"],"metadata":{"id":"CxxB9ABSLrs1","executionInfo":{"status":"ok","timestamp":1670513291331,"user_tz":300,"elapsed":8,"user":{"displayName":"Danny 
Mathieson","userId":"14123752102169145219"}}},"execution_count":27,"outputs":[]},{"cell_type":"markdown","source":["## Model Evaluation"],"metadata":{"id":"aToqJ94RdO9R"}},{"cell_type":"code","source":["from sklearn.metrics import accuracy_score\n","from sklearn.metrics import f1_score\n","from sklearn.metrics import roc_auc_score \n","from sklearn.metrics import precision_score\n","from sklearn.metrics import average_precision_score\n","from sklearn.metrics import recall_score\n","from sklearn.metrics import log_loss"],"metadata":{"id":"a2ULt-H4dAT5","executionInfo":{"status":"ok","timestamp":1670513291332,"user_tz":300,"elapsed":8,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"execution_count":28,"outputs":[]},{"cell_type":"code","source":["def print_evaluation_scores(y_test, predicted):\n"," \n"," print('Accuracy: ', accuracy_score(y_test, predicted, normalize=True))\n"," print('F1-score macro: ', f1_score(y_test, predicted, average='macro', labels=[1]))\n"," print('F1-score micro: ', f1_score(y_test, predicted, average='micro', labels=[1]))\n"," print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))\n"," print('Precision macro: ', average_precision_score(y_test, predicted, average='macro'))\n"," print('Precision micro: ', average_precision_score(y_test, predicted, average='micro'))\n"," print('Precision weighted: ', precision_score(y_test, predicted, average='weighted', labels=[1]))\n"," print('Log Loss: ', log_loss(y_test, predicted, normalize=True))\n","\n"," \n","print('Metrics')\n","print_evaluation_scores(y_val, y_val_predicted_labels_logreg)\n","# print_evaluation_scores(y_val, y_val_predicted_labels_xgb)\n","# print_evaluation_scores(y_val, y_val_predicted_labels_sgd)"],"metadata":{"id":"g1xaUV6Ic1kS","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1670513299813,"user_tz":300,"elapsed":8489,"user":{"displayName":"Danny 
Mathieson","userId":"14123752102169145219"}},"outputId":"e1fb88d6-f5e0-4b75-c656-9b0ee6ea87da"},"execution_count":29,"outputs":[{"output_type":"stream","name":"stdout","text":["Metrics\n","Accuracy: 0.898533526531428\n","F1-score macro: 0.857397504456328\n","F1-score micro: 0.857397504456328\n","F1-score weighted: 0.9780481032620225\n","Precision macro: 0.8548790839083935\n","Precision micro: 0.9595717445537028\n","Precision weighted: 0.8809523809523809\n","Log Loss: 27.62320822221433\n"]}]},{"cell_type":"markdown","source":["# Submission"],"metadata":{"id":"4e9yV9CadHwO"}},{"cell_type":"code","execution_count":30,"metadata":{"id":"75VcWGLmvvCi","executionInfo":{"status":"ok","timestamp":1670513300205,"user_tz":300,"elapsed":394,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["test_predictions = logreg_tfidf.predict_proba(X_test_tfidf)"]},{"cell_type":"code","execution_count":31,"metadata":{"id":"GhNpC96cwRmD","executionInfo":{"status":"ok","timestamp":1670513301440,"user_tz":300,"elapsed":1237,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}},"colab":{"base_uri":"https://localhost:8080/","height":554},"outputId":"aa8f5639-897c-4437-aca7-daafddd03797"},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Function__Aides Compensation Function__Career & Academic Counseling \\\n"," \n","180042 0.005369 0.001911 \n","28872 0.000574 0.001182 \n","186915 0.017363 0.000992 \n","412396 0.011996 0.000990 \n","427740 0.001207 0.002310 \n","... ... ... \n","169063 0.004662 0.000559 \n","433255 0.004662 0.000559 \n","232204 0.004662 0.000559 \n","171685 0.004662 0.000559 \n","249087 0.004662 0.000559 \n","\n"," Function__Communications Function__Curriculum Development \\\n"," \n","180042 0.000062 0.000037 \n","28872 0.000243 0.005150 \n","186915 0.000156 0.001177 \n","412396 0.000155 0.001122 \n","427740 0.000585 0.002508 \n","... ... ... 
\n","169063 0.000271 0.001190 \n","433255 0.000271 0.001190 \n","232204 0.000271 0.001190 \n","171685 0.000271 0.001190 \n","249087 0.000271 0.001190 \n","\n"," Function__Data Processing & Information Services \\\n"," \n","180042 0.000424 \n","28872 0.001041 \n","186915 0.000616 \n","412396 0.000625 \n","427740 0.002355 \n","... ... \n","169063 0.000868 \n","433255 0.000868 \n","232204 0.000868 \n","171685 0.000868 \n","249087 0.000868 \n","\n"," Function__Development & Fundraising Function__Enrichment \\\n"," \n","180042 0.000088 0.000996 \n","28872 0.000150 0.227637 \n","186915 0.000108 0.000500 \n","412396 0.000108 0.000475 \n","427740 0.000135 0.002673 \n","... ... ... \n","169063 0.000236 0.002325 \n","433255 0.000236 0.002325 \n","232204 0.000236 0.002325 \n","171685 0.000236 0.002325 \n","249087 0.000236 0.002325 \n","\n"," Function__Extended Time & Tutoring \\\n"," \n","180042 0.000203 \n","28872 0.003900 \n","186915 0.000463 \n","412396 0.000489 \n","427740 0.000067 \n","... ... \n","169063 0.000239 \n","433255 0.000239 \n","232204 0.000239 \n","171685 0.000239 \n","249087 0.000239 \n","\n"," Function__Facilities & Maintenance Function__Facilities Planning \\\n"," \n","180042 0.001402 0.000029 \n","28872 0.006152 0.000040 \n","186915 0.001457 0.000034 \n","412396 0.001456 0.000034 \n","427740 0.003695 0.000041 \n","... ... ... \n","169063 0.001862 0.000044 \n","433255 0.001862 0.000044 \n","232204 0.001862 0.000044 \n","171685 0.001862 0.000044 \n","249087 0.001862 0.000044 \n","\n"," ... Student_Type__Special Education Student_Type__Unspecified \\\n"," ... \n","180042 ... 0.002347 0.836926 \n","28872 ... 0.005913 0.894910 \n","186915 ... 0.004487 0.282162 \n","412396 ... 0.004447 0.250461 \n","427740 ... 0.004405 0.980520 \n","... ... ... ... \n","169063 ... 0.004677 0.037555 \n","433255 ... 0.004677 0.037555 \n","232204 ... 0.004677 0.037555 \n","171685 ... 0.004677 0.037555 \n","249087 ... 
0.004677 0.037555 \n","\n"," Use__Business Services Use__ISPD Use__Instruction Use__Leadership \\\n"," \n","180042 0.000200 0.003739 0.064349 0.003398 \n","28872 0.003721 0.009994 0.011118 0.023945 \n","186915 0.000895 0.012540 0.695857 0.008499 \n","412396 0.000910 0.010479 0.693750 0.007253 \n","427740 0.008272 0.040371 0.001226 0.805063 \n","... ... ... ... ... \n","169063 0.003252 0.003751 0.017954 0.002785 \n","433255 0.003252 0.003751 0.017954 0.002785 \n","232204 0.003252 0.003751 0.017954 0.002785 \n","171685 0.003252 0.003751 0.017954 0.002785 \n","249087 0.003252 0.003751 0.017954 0.002785 \n","\n"," Use__NO_LABEL Use__O&M Use__Pupil Services & Enrichment \\\n"," \n","180042 0.264019 0.006676 0.005294 \n","28872 0.047036 0.023556 0.146622 \n","186915 0.016858 0.003341 0.009452 \n","412396 0.017822 0.003547 0.008566 \n","427740 0.005581 0.037933 0.004177 \n","... ... ... ... \n","169063 0.883382 0.002572 0.044943 \n","433255 0.883382 0.002572 0.044943 \n","232204 0.883382 0.002572 0.044943 \n","171685 0.883382 0.002572 0.044943 \n","249087 0.883382 0.002572 0.044943 \n","\n"," Use__Untracked Budget Set-Aside \n"," \n","180042 0.000372 \n","28872 0.000244 \n","186915 0.000168 \n","412396 0.000168 \n","427740 0.000233 \n","... ... 
\n","169063 0.000341 \n","433255 0.000341 \n","232204 0.000341 \n","171685 0.000341 \n","249087 0.000341 \n","\n","[50064 rows x 104 columns]"],"text/html":["\n"," <div id=\"df-73e59131-014c-49d1-a337-c844b74a21b5\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Function__Aides Compensation</th>\n"," <th>Function__Career & Academic Counseling</th>\n"," <th>Function__Communications</th>\n"," <th>Function__Curriculum Development</th>\n"," <th>Function__Data Processing & Information Services</th>\n"," <th>Function__Development & Fundraising</th>\n"," <th>Function__Enrichment</th>\n"," <th>Function__Extended Time & Tutoring</th>\n"," <th>Function__Facilities & Maintenance</th>\n"," <th>Function__Facilities Planning</th>\n"," <th>...</th>\n"," <th>Student_Type__Special Education</th>\n"," <th>Student_Type__Unspecified</th>\n"," <th>Use__Business Services</th>\n"," <th>Use__ISPD</th>\n"," <th>Use__Instruction</th>\n"," <th>Use__Leadership</th>\n"," <th>Use__NO_LABEL</th>\n"," <th>Use__O&M</th>\n"," <th>Use__Pupil Services & Enrichment</th>\n"," <th>Use__Untracked Budget Set-Aside</th>\n"," </tr>\n"," <tr>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>180042</th>\n"," <td>0.005369</td>\n"," <td>0.001911</td>\n"," <td>0.000062</td>\n"," <td>0.000037</td>\n"," 
<td>0.000424</td>\n"," <td>0.000088</td>\n"," <td>0.000996</td>\n"," <td>0.000203</td>\n"," <td>0.001402</td>\n"," <td>0.000029</td>\n"," <td>...</td>\n"," <td>0.002347</td>\n"," <td>0.836926</td>\n"," <td>0.000200</td>\n"," <td>0.003739</td>\n"," <td>0.064349</td>\n"," <td>0.003398</td>\n"," <td>0.264019</td>\n"," <td>0.006676</td>\n"," <td>0.005294</td>\n"," <td>0.000372</td>\n"," </tr>\n"," <tr>\n"," <th>28872</th>\n"," <td>0.000574</td>\n"," <td>0.001182</td>\n"," <td>0.000243</td>\n"," <td>0.005150</td>\n"," <td>0.001041</td>\n"," <td>0.000150</td>\n"," <td>0.227637</td>\n"," <td>0.003900</td>\n"," <td>0.006152</td>\n"," <td>0.000040</td>\n"," <td>...</td>\n"," <td>0.005913</td>\n"," <td>0.894910</td>\n"," <td>0.003721</td>\n"," <td>0.009994</td>\n"," <td>0.011118</td>\n"," <td>0.023945</td>\n"," <td>0.047036</td>\n"," <td>0.023556</td>\n"," <td>0.146622</td>\n"," <td>0.000244</td>\n"," </tr>\n"," <tr>\n"," <th>186915</th>\n"," <td>0.017363</td>\n"," <td>0.000992</td>\n"," <td>0.000156</td>\n"," <td>0.001177</td>\n"," <td>0.000616</td>\n"," <td>0.000108</td>\n"," <td>0.000500</td>\n"," <td>0.000463</td>\n"," <td>0.001457</td>\n"," <td>0.000034</td>\n"," <td>...</td>\n"," <td>0.004487</td>\n"," <td>0.282162</td>\n"," <td>0.000895</td>\n"," <td>0.012540</td>\n"," <td>0.695857</td>\n"," <td>0.008499</td>\n"," <td>0.016858</td>\n"," <td>0.003341</td>\n"," <td>0.009452</td>\n"," <td>0.000168</td>\n"," </tr>\n"," <tr>\n"," <th>412396</th>\n"," <td>0.011996</td>\n"," <td>0.000990</td>\n"," <td>0.000155</td>\n"," <td>0.001122</td>\n"," <td>0.000625</td>\n"," <td>0.000108</td>\n"," <td>0.000475</td>\n"," <td>0.000489</td>\n"," <td>0.001456</td>\n"," <td>0.000034</td>\n"," <td>...</td>\n"," <td>0.004447</td>\n"," <td>0.250461</td>\n"," <td>0.000910</td>\n"," <td>0.010479</td>\n"," <td>0.693750</td>\n"," <td>0.007253</td>\n"," <td>0.017822</td>\n"," <td>0.003547</td>\n"," <td>0.008566</td>\n"," <td>0.000168</td>\n"," </tr>\n"," <tr>\n"," <th>427740</th>\n"," 
<td>0.001207</td>\n"," <td>0.002310</td>\n"," <td>0.000585</td>\n"," <td>0.002508</td>\n"," <td>0.002355</td>\n"," <td>0.000135</td>\n"," <td>0.002673</td>\n"," <td>0.000067</td>\n"," <td>0.003695</td>\n"," <td>0.000041</td>\n"," <td>...</td>\n"," <td>0.004405</td>\n"," <td>0.980520</td>\n"," <td>0.008272</td>\n"," <td>0.040371</td>\n"," <td>0.001226</td>\n"," <td>0.805063</td>\n"," <td>0.005581</td>\n"," <td>0.037933</td>\n"," <td>0.004177</td>\n"," <td>0.000233</td>\n"," </tr>\n"," <tr>\n"," <th>...</th>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," <td>...</td>\n"," </tr>\n"," <tr>\n"," <th>169063</th>\n"," <td>0.004662</td>\n"," <td>0.000559</td>\n"," <td>0.000271</td>\n"," <td>0.001190</td>\n"," <td>0.000868</td>\n"," <td>0.000236</td>\n"," <td>0.002325</td>\n"," <td>0.000239</td>\n"," <td>0.001862</td>\n"," <td>0.000044</td>\n"," <td>...</td>\n"," <td>0.004677</td>\n"," <td>0.037555</td>\n"," <td>0.003252</td>\n"," <td>0.003751</td>\n"," <td>0.017954</td>\n"," <td>0.002785</td>\n"," <td>0.883382</td>\n"," <td>0.002572</td>\n"," <td>0.044943</td>\n"," <td>0.000341</td>\n"," </tr>\n"," <tr>\n"," <th>433255</th>\n"," <td>0.004662</td>\n"," <td>0.000559</td>\n"," <td>0.000271</td>\n"," <td>0.001190</td>\n"," <td>0.000868</td>\n"," <td>0.000236</td>\n"," <td>0.002325</td>\n"," <td>0.000239</td>\n"," <td>0.001862</td>\n"," <td>0.000044</td>\n"," <td>...</td>\n"," <td>0.004677</td>\n"," <td>0.037555</td>\n"," <td>0.003252</td>\n"," <td>0.003751</td>\n"," <td>0.017954</td>\n"," <td>0.002785</td>\n"," <td>0.883382</td>\n"," <td>0.002572</td>\n"," <td>0.044943</td>\n"," <td>0.000341</td>\n"," </tr>\n"," <tr>\n"," <th>232204</th>\n"," <td>0.004662</td>\n"," 
<td>0.000559</td>\n"," <td>0.000271</td>\n"," <td>0.001190</td>\n"," <td>0.000868</td>\n"," <td>0.000236</td>\n"," <td>0.002325</td>\n"," <td>0.000239</td>\n"," <td>0.001862</td>\n"," <td>0.000044</td>\n"," <td>...</td>\n"," <td>0.004677</td>\n"," <td>0.037555</td>\n"," <td>0.003252</td>\n"," <td>0.003751</td>\n"," <td>0.017954</td>\n"," <td>0.002785</td>\n"," <td>0.883382</td>\n"," <td>0.002572</td>\n"," <td>0.044943</td>\n"," <td>0.000341</td>\n"," </tr>\n"," <tr>\n"," <th>171685</th>\n"," <td>0.004662</td>\n"," <td>0.000559</td>\n"," <td>0.000271</td>\n"," <td>0.001190</td>\n"," <td>0.000868</td>\n"," <td>0.000236</td>\n"," <td>0.002325</td>\n"," <td>0.000239</td>\n"," <td>0.001862</td>\n"," <td>0.000044</td>\n"," <td>...</td>\n"," <td>0.004677</td>\n"," <td>0.037555</td>\n"," <td>0.003252</td>\n"," <td>0.003751</td>\n"," <td>0.017954</td>\n"," <td>0.002785</td>\n"," <td>0.883382</td>\n"," <td>0.002572</td>\n"," <td>0.044943</td>\n"," <td>0.000341</td>\n"," </tr>\n"," <tr>\n"," <th>249087</th>\n"," <td>0.004662</td>\n"," <td>0.000559</td>\n"," <td>0.000271</td>\n"," <td>0.001190</td>\n"," <td>0.000868</td>\n"," <td>0.000236</td>\n"," <td>0.002325</td>\n"," <td>0.000239</td>\n"," <td>0.001862</td>\n"," <td>0.000044</td>\n"," <td>...</td>\n"," <td>0.004677</td>\n"," <td>0.037555</td>\n"," <td>0.003252</td>\n"," <td>0.003751</td>\n"," <td>0.017954</td>\n"," <td>0.002785</td>\n"," <td>0.883382</td>\n"," <td>0.002572</td>\n"," <td>0.044943</td>\n"," <td>0.000341</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>50064 rows × 104 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-73e59131-014c-49d1-a337-c844b74a21b5')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 
2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-73e59131-014c-49d1-a337-c844b74a21b5 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-73e59131-014c-49d1-a337-c844b74a21b5');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":31}],"source":["submission_cols = pd.read_csv(f'{dir_path}/DrivenDataCompetition_DataFiles/SubmissionFormat.csv')\n","cols_list = list(submission_cols.columns.values)\n","cols_list = cols_list[1:] ## remove the first column which is the index\n","submission = pd.DataFrame(test_predictions, columns=cols_list)\n","submission.set_index(test_df['Unnamed: 0'], inplace=True)\n","submission.index.name = \"\"\n","submission"]},{"cell_type":"code","execution_count":32,"metadata":{"id":"ep-ruNCfyMKH","executionInfo":{"status":"ok","timestamp":1670513308615,"user_tz":300,"elapsed":7178,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":["submission.to_csv('12072022_tfidf_logreg_Cval_2.csv')"]},{"cell_type":"code","execution_count":32,"metadata":{"id":"XwaSy1i13nax","executionInfo":{"status":"ok","timestamp":1670513308616,"user_tz":300,"elapsed":5,"user":{"displayName":"Danny Mathieson","userId":"14123752102169145219"}}},"outputs":[],"source":[]}],"metadata":{"colab":{"provenance":[{"file_id":"1dLdfXkr3V01-zlCKmWbNew2UXHqO9Vwo","timestamp":1668433872897}],"machine_shape":"hm"},"gpuClass":"premium","kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"},"accelerator":"GPU"},"nbformat":4,"nbformat_minor":0}
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment