Skip to content
Snippets Groups Projects
Commit 5ec5b210 authored by Ishi Bansal
Browse files

Delete add_labels3.py

parent 7b4feac8
No related branches found
No related tags found
No related merge requests found
import json
# Skeleton of the labels project file. "documents" starts empty and is
# filled with one entry per labelled text file before being serialized.
json_data = {
    "projectFileVersion": "2022-05-01",
    # Declares the unit in which label offsets and lengths are expressed.
    "stringIndexType": "Utf16CodeUnit",
    "metadata": {
        "projectKind": "CustomEntityRecognition",
        "storageInputContainerName": "example-data",
        "projectName": "dcc-capstone",
        # A real boolean so the JSON output contains `false`, not the
        # string "false".
        "multilingual": False,
        "description": "Project-description",
        "language": "en",
        "settings": {},
    },
    "assets": {
        "projectKind": "CustomEntityRecognition",
        # Entity categories that labels in "documents" may refer to.
        "entities": [
            {"category": "positive"},
            {"category": "negative"},
            {"category": "percentage"},
            {"category": "demographic"},
        ],
        "documents": [],
    },
}
# Keyword fragments for each entity category. Categories are tried in this
# order and the first one with a fragment contained in the casefolded token
# wins. Fragments must be lowercase because they are compared against
# casefolded text (an uppercase fragment could never match).
# NOTE(review): matching is by substring, so a short fragment such as "men"
# also hits unrelated words like "government"; kept as-is, confirm if intended.
_CATEGORY_FRAGMENTS = (
    ("positive", ("positive", "excite", "benefit", "beneficial", "favor",
                  "good", "optimist")),
    ("negative", ("negative", "concern", "harm", "reject", "oppose", "danger",
                  "bad", "pessimist", "risk", "threat")),
    ("percentage", ("%", "half", "third", "fourth", "fifth", "tenth")),
    ("demographic", ("american", "white", "black", "asian", "hispanic",
                     "women", "men")),
)


def _categorize(word):
    """Return the first category whose fragment occurs in *word*, else None."""
    folded = word.casefold()
    for category, fragments in _CATEGORY_FRAGMENTS:
        if any(fragment in folded for fragment in fragments):
            return category
    return None


def _utf16_units(text):
    """Return the length of *text* counted in UTF-16 code units."""
    # Characters outside the BMP take two UTF-16 units but count as one in
    # len(); "surrogatepass" keeps stray surrogates from raising.
    return len(text.encode("utf-16-le", "surrogatepass")) // 2


# Label the documents 1.txt .. 6.txt and append one entry per file to
# json_data["assets"]["documents"].
for i in range(1, 7):
    file_name = str(i) + ".txt"
    # newline="" disables "\r\n" -> "\n" translation so that offsets index
    # the text exactly as it is stored in the file.
    with open(file_name, "r", encoding="utf-8", newline="") as f:
        utf_data = f.read()

    labels = []
    document_structure = {
        "location": file_name,
        "language": "en-US",
        "entities": [
            {
                # A single region spanning the whole document.
                "regionOffset": 0,
                "regionLength": _utf16_units(utf_data),
                "labels": labels,
            }
        ],
    }

    char_pos = 0  # cursor into utf_data, in Python characters
    unit_pos = 0  # the same position, in UTF-16 code units
    for word in utf_data.split():
        # The cursor advances past every token, so find() always lands on
        # this very occurrence of the token.
        start = utf_data.find(word, char_pos)
        offset = unit_pos + _utf16_units(utf_data[char_pos:start])
        length = _utf16_units(word)
        char_pos = start + len(word)
        unit_pos = offset + length

        category = _categorize(word)
        if category is not None:
            labels.append({
                "category": category,
                "offset": offset,
                "length": length,
            })

    json_data["assets"]["documents"].append(document_structure)
# Write the complete json_data structure to labels.json as JSON indented
# by four spaces.
with open('labels.json', 'w') as out_file:
    out_file.write(json.dumps(json_data, indent=4))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment