# gnu-keywords/gnukeywords.py
#
# find keywords in articles and returns them
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git.jdmweb2.ch>
# SPDX-License-Identifier: GPL-3.0-or-later
#
import spacy
from collections import Counter
from config import posAllow, posDeny, skipWord, remap, entsScore
from nltk.stem.snowball import SnowballStemmer
# from snowballstemmer import stemmer
class GnuKeywords:
    """Extract the most relevant keywords from an article.

    Scores words twice -- named entities get a configurable bonus
    (``addScoreByEnts``) and every allowed lemma earns one point per
    occurrence (``addScoreByCount``) -- then returns the top-scoring
    words, collapsing longer keywords onto contained subwords.
    """

    # Maps a supported language code to the spaCy pipeline to load.
    modules = {
        'de': 'de_core_news_lg',
        'de_sm': 'de_core_news_sm',
        'en': 'en_core_web_lg'
    }

    def __init__(self, lang, amountofkeywords=5):
        """Load the spaCy pipeline for *lang*.

        :param lang: key into ``modules`` ('de', 'de_sm' or 'en')
        :param amountofkeywords: maximum number of keywords returned
        :raises KeyError: if *lang* is not a supported language code
        """
        self.loadModule(self.modules[lang])
        self.amount = amountofkeywords
        # self.stemmer = stemmer('german')

    def loadModule(self, module):
        """Load a spaCy pipeline, downloading it first if missing."""
        try:
            self.nlp = spacy.load(module)
        except OSError:
            # Model not installed yet -- fetch it once, then retry.
            spacy.cli.download(module)
            self.nlp = spacy.load(module)

    def __call__(self, htmlstr):
        """Analyse *htmlstr* and return its keywords as a list."""
        self.score = Counter()
        doc = self.nlp(htmlstr)
        self.addScoreByEnts(doc)
        self.addScoreByCount(doc)
        keywords = self.getKeywordsByScore()
        # Drop per-call state so consecutive calls start fresh.
        del self.score
        return keywords

    def customFilter(self, word):
        """Drop words listed in ``skipWord``; rewrite via ``remap``.

        :returns: the (possibly remapped) word, or None when the word
            should be ignored entirely.
        """
        if word in skipWord:
            return None
        # EAFP: most words have no remap entry.
        try:
            word = remap[word]
        except KeyError:
            pass
        return word

    # Backward-compatible alias for the historical misspelled name.
    cusromFilter = customFilter

    def addScoreByCount(self, doc):
        """Add one point per occurrence of each interesting lemma."""
        for token in doc:
            if token.is_punct or token.is_stop or token.is_digit:
                continue
            if token.pos_ in posDeny:
                continue
            if token.pos_ not in posAllow:
                # Force config.py to classify every part-of-speech tag
                # explicitly instead of silently ignoring unknown ones.
                raise NotImplementedError(
                    'A token.pos_ is not set in config.py:',
                    token, token.pos_)
            word = self.customFilter(token.lemma_)
            if word is None:
                continue
            self.score[word] += 1

    def getKeywordsByScore(self, minscore=2):
        """Return up to ``self.amount`` keywords with score >= *minscore*.

        Words are taken in descending score order; a word that is a
        subword of an already-chosen keyword replaces that keyword.
        """
        keywords = []
        for word, score in self.score.most_common():
            # most_common() is sorted descending, so everything from
            # here on is below the threshold.  Checking BEFORE the
            # append fixes the old off-by-one that let a single word
            # with score < minscore slip into the result.
            if score < minscore:
                break
            # Merge/skip words that overlap an existing keyword.
            if self.thereIsSubWord(word, keywords):
                continue
            keywords.append(word)
            if len(keywords) >= self.amount:
                break
        return keywords

    def thereIsSubWord(self, word, keywords):
        """Return True when *word* overlaps an entry of *keywords*.

        If *word* is contained in a keyword, that keyword is replaced
        in place by the shorter *word*; if a keyword is contained in
        *word*, the longer *word* is discarded by the caller.
        """
        for i, kw in enumerate(keywords):
            if word in kw:
                keywords[i] = word
                return True
            elif kw in word:
                return True
        return False

    def addScoreByEnts(self, doc):
        """Add ``entsScore`` points per cleaned named-entity lemma."""
        for wordspan in doc.ents:
            for word in wordspan.lemma_.split('--'):
                word = word.strip(' .,!?_-')
                # This idea with stem words doesn't work well:
                # phone -> phon is not wished
                # word = self.stemmer.stemWord(word)
                if len(word) <= 1:
                    continue
                if '://' in word:
                    continue
                if "\n" in word:
                    continue
                self.score[word] += entsScore