102 lines
3.1 KiB
Python
102 lines
3.1 KiB
Python
#
|
|
# Finds keywords in articles and returns them
|
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
#
|
|
import spacy
|
|
from collections import Counter
|
|
from config import posAllow, posDeny, skipWord, remap, entsScore
|
|
# from snowballstemmer import stemmer
|
|
|
|
|
|
class GnuKeywords:
    """Extract keywords from a text with a spaCy pipeline.

    An instance wraps one loaded spaCy pipeline.  Calling the instance with
    a text scores the document's named entities and token lemmas and
    returns the best-scoring words as a list of strings.
    """

    # Maps the language codes accepted by __init__ to spaCy model names.
    modules = {
        'de': 'de_core_news_lg',
        'de_sm': 'de_core_news_sm',
        'en': 'en_core_web_lg'
    }

    def __init__(self, lang, amountofkeywords=5):
        """Load the pipeline for *lang* and remember the keyword limit.

        Args:
            lang: Key of ``modules`` ('de', 'de_sm' or 'en').
            amountofkeywords: Upper bound used by getKeywordsByScore() to
                stop collecting keywords.

        Raises:
            KeyError: If *lang* is not a key of ``modules``.
        """
        self.loadModule(self.modules[lang])
        self.amount = amountofkeywords
        # self.stemmer = stemmer('german')

    def loadModule(self, module):
        """Load the spaCy model *module* into ``self.nlp``.

        If loading raises OSError, the model is downloaded through
        ``spacy.cli.download`` and loading is attempted once more.
        """
        try:
            self.nlp = spacy.load(module)
        except OSError:
            spacy.cli.download(module)
            self.nlp = spacy.load(module)

    def __call__(self, htmlstr):
        """Return the keywords found in *htmlstr*.

        Args:
            htmlstr: Text that is passed unchanged to the spaCy pipeline.

        Returns:
            The list produced by getKeywordsByScore().

        Raises:
            NotImplementedError: Propagated from addScoreByCount().
        """
        # The score table only lives for the duration of one call.
        self.score = Counter()
        try:
            doc = self.nlp(htmlstr)
            self.addScoreByEnts(doc)
            self.addScoreByCount(doc)
            return self.getKeywordsByScore()
        finally:
            # Drop the table even when scoring raised, so that no stale
            # scores stay attached to the instance.
            del self.score

    def cusromFilter(self, word):
        """Apply the configured skip list and remapping to *word*.

        The misspelled method name is kept so existing callers keep working.

        Returns:
            None if *word* is in ``skipWord``; otherwise ``remap[word]`` if
            such an entry exists, else *word* unchanged.
        """
        if word in skipWord:
            return None
        return remap.get(word, word)

    def addScoreByCount(self, doc):
        """Add one point to ``self.score`` per accepted token lemma.

        Punctuation, stop words, digits and tokens whose part of speech is
        in ``posDeny`` are ignored.  Lemmas rejected by cusromFilter() are
        ignored as well.

        Raises:
            NotImplementedError: If a token's part of speech is in neither
                ``posDeny`` nor ``posAllow``.
        """
        for token in doc:
            if token.is_punct or token.is_stop or token.is_digit:
                continue
            if token.pos_ in posDeny:
                continue
            if token.pos_ not in posAllow:
                # Fail loudly so that the tag gets classified in config.py.
                raise NotImplementedError(
                    'A token.pos_ is not set in config.py:',
                    token, token.pos_)

            word = self.cusromFilter(token.lemma_)
            if word is None:
                continue

            self.score[word] += 1

    def getKeywordsByScore(self, minscore=2):
        """Pick the best-scoring words from ``self.score``.

        Words are visited from the highest to the lowest score.  A word
        related to an already chosen keyword by containment is merged into
        that keyword by thereIsSubWord() instead of being added.

        Args:
            minscore: Words scoring below this value are never returned.

        Returns:
            A list of keywords, best score first.  Collection stops as soon
            as the list holds ``self.amount`` entries.
        """
        keywords = []
        for word, score in self.score.most_common():
            # most_common() is ordered by descending score, so once one word
            # is below the threshold every remaining word is as well.  The
            # check comes first so that such a word is neither returned nor
            # allowed to replace an accepted keyword.
            if score < minscore:
                break
            # Check if there is already a related word in the keywords
            # and replace it if the new word is a subword of it.
            if self.thereIsSubWord(word, keywords):
                continue
            keywords.append(word)
            if len(keywords) >= self.amount:
                break
        return keywords

    def thereIsSubWord(self, word, keywords):
        """Tell whether *word* overlaps with an entry of *keywords*.

        Note that *keywords* is modified in place: if *word* is a substring
        of an entry, the first such entry is replaced by the shorter *word*
        and any later entries that also contain *word* are removed, so that
        no keyword in the list contains another one.

        Returns:
            True if *word* is contained in, or contains, an entry of
            *keywords*; False otherwise.
        """
        for i, kw in enumerate(keywords):
            if word in kw:
                keywords[i] = word
                keywords[i + 1:] = [
                    other for other in keywords[i + 1:] if word not in other
                ]
                return True
            elif kw in word:
                return True
        return False

    def addScoreByEnts(self, doc):
        """Add ``entsScore`` points per named-entity part to ``self.score``.

        The lemma of every entity span in ``doc.ents`` is split at '--' and
        each part is stripped of surrounding blanks and punctuation.  Parts
        that are at most one character long, contain '://' or contain a
        newline are ignored.
        """
        for wordspan in doc.ents:
            for word in wordspan.lemma_.split('--'):
                word = word.strip(' .,!?_-')
                # This idea with stem words doesn't work well.
                # phone -> phon is not wished
                # word = self.stemmer.stemWord(word)
                if len(word) <= 1:
                    continue
                if '://' in word:
                    continue
                if "\n" in word:
                    continue
                self.score[word] += entsScore
|