# nltk_summarizer.py
# Frequency-based extractive text summarizer built on NLTK.
import nltk          
from nltk.corpus import stopwords      
nltk.download('stopwords') 
nltk.download('punkt')                  
from nltk.tokenize import word_tokenize, sent_tokenize

# Read the whole document to be summarised into memory as a single string.
with open('summarizer_input.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# English stop words, held in a set for fast membership tests per token.
englishStopList = stopwords.words("english")
stopWords = set(englishStopList)

# Word-level tokens of the complete text.
words = word_tokenize(text)

# Count how often each meaningful token occurs in the text.
# Keys are lower-cased tokens; values are their occurrence counts.
freqTable = dict()
for word in words:
    word = word.lower()
    # Stop words carry no topical signal, so they are left out of the table.
    if word in stopWords:
        continue
    # Tokens without a single letter or digit (".", ",", quotes, ...) are
    # punctuation. They are not stop words, so without this check they would
    # enter the table with the highest counts and inflate sentence scores.
    if not any(ch.isalnum() for ch in word):
        continue
    freqTable[word] = freqTable.get(word, 0) + 1

# Split the text into sentences and score each one: the score is the sum of
# the text-wide frequencies of the distinct counted words it contains.
sentences = sent_tokenize(text)
sentenceValue = dict()

for sentence in sentences:
    # Tokenize once per sentence and compare whole tokens. A substring test
    # against the raw sentence would credit "art" to a sentence that only
    # contains "party", and would rescan the sentence for every table entry.
    sentenceWords = {token.lower() for token in word_tokenize(sentence)}
    score = sum(freqTable[token] for token in sentenceWords if token in freqTable)
    # Sentences without any counted word stay out of sentenceValue, so they
    # neither pull the average down nor become summary candidates.
    # Assigning (not accumulating) keeps a repeated sentence at its own score
    # instead of doubling it on the second occurrence.
    if score:
        sentenceValue[sentence] = score

# Total and (truncated) mean score over all scored sentences; the mean is the
# baseline that the summary selection compares each sentence against.
sumValues = sum(sentenceValue.values())

# Empty or stop-word-only input leaves sentenceValue empty; fall back to 0
# instead of raising ZeroDivisionError.
average = int(sumValues / len(sentenceValue)) if sentenceValue else 0

# Build the summary: keep, in document order, every scored sentence whose
# score is strictly above 1.2 times the average. Each kept sentence is
# prefixed with one space, so a non-empty summary starts with a space.
cutoff = 1.2 * average
selected = [
    candidate
    for candidate in sentences
    if candidate in sentenceValue and sentenceValue[candidate] > cutoff
]
summary = ''.join(" " + candidate for candidate in selected)
print(summary)