#
# Find keywords in articles and return them.
# SPDX-FileCopyrightText: 2022 Beat Jäckle
# SPDX-License-Identifier: GPL-3.0-or-later
#
from collections import Counter

import spacy

from config import posAllow, posDeny, skipWord, remap, entsScore
# from snowballstemmer import stemmer


class GnuKeywords:
    """Extract keywords from a text with a spaCy pipeline.

    An instance is callable: calling it with a text returns a list of
    keywords. Words are scored in two passes over the parsed document:
    named entities add ``entsScore`` each, and every accepted token adds 1
    for its lemma. The highest-scoring words are then returned.
    """

    # Maps the language key accepted by __init__ to the name of the spaCy
    # pipeline package that is loaded for it.
    modules = {
        'de': 'de_core_news_lg',
        'de_sm': 'de_core_news_sm',
        'en': 'en_core_web_lg'
    }

    def __init__(self, lang, amountofkeywords=5):
        """Load the pipeline for *lang* and remember the keyword limit.

        Args:
            lang: A key of ``GnuKeywords.modules``.
            amountofkeywords: Number of keywords at which
                ``getKeywordsByScore`` stops collecting.

        Raises:
            KeyError: If *lang* is not a key of ``modules``.
        """
        self.loadModule(self.modules[lang])
        self.amount = amountofkeywords
        # self.stemmer = stemmer('german')

    def loadModule(self, module):
        """Load the spaCy pipeline *module* into ``self.nlp``.

        If loading raises ``OSError``, the pipeline is downloaded with
        ``spacy.cli.download`` and loading is attempted once more.
        """
        try:
            self.nlp = spacy.load(module)
        except OSError:
            spacy.cli.download(module)
            self.nlp = spacy.load(module)

    def __call__(self, htmlstr):
        """Return the keywords found in *htmlstr*.

        The text is handed to the pipeline unchanged. ``self.score`` only
        exists for the duration of the call.

        Raises:
            NotImplementedError: Propagated from ``addScoreByCount``.
        """
        self.score = Counter()
        try:
            doc = self.nlp(htmlstr)
            self.addScoreByEnts(doc)
            self.addScoreByCount(doc)
            return self.getKeywordsByScore()
        finally:
            # Drop the per-call counter even if a scoring pass raised, so
            # no stale scores stay attached to the instance.
            del self.score

    # NOTE: the method name is misspelled ("cusrom"); it is kept as is so
    # that existing callers keep working.
    def cusromFilter(self, word):
        """Apply the ``skipWord`` and ``remap`` settings to *word*.

        Returns:
            ``None`` if *word* is in ``skipWord``; otherwise ``remap[word]``
            if such an entry exists, else *word* unchanged.
        """
        if word in skipWord:
            return None
        try:
            return remap[word]
        except KeyError:
            return word

    def addScoreByCount(self, doc):
        """Add 1 to the score of the lemma of every accepted token in *doc*.

        Tokens flagged as punctuation, stop word or digit are skipped, as
        are tokens whose ``pos_`` is in ``posDeny`` and lemmas rejected by
        ``cusromFilter``.

        Raises:
            NotImplementedError: If a token's ``pos_`` is in neither
                ``posDeny`` nor ``posAllow``.
        """
        for token in doc:
            if token.is_punct or token.is_stop or token.is_digit:
                continue
            if token.pos_ in posDeny:
                continue
            if token.pos_ not in posAllow:
                # Every part-of-speech tag has to be classified in
                # config.py; fail loudly on an unclassified one.
                raise NotImplementedError(
                    'A token.pos_ is not set in config.py:',
                    token,
                    token.pos_)
            word = self.cusromFilter(token.lemma_)
            if word is None:
                continue
            self.score[word] += 1

    def getKeywordsByScore(self, minscore=2):
        """Return keywords from ``self.score``, highest score first.

        Words are visited in ``most_common()`` order. A word that overlaps
        with an already collected keyword (see ``thereIsSubWord``) is not
        appended. Collecting stops once ``self.amount`` keywords are
        gathered or after appending a word whose score is below *minscore*.

        Args:
            minscore: Score threshold that ends the collection.

        Returns:
            The list of collected keywords.
        """
        keywords = []
        for word, score in self.score.most_common():
            # Check if there is already a related word in the keywords
            # and replace it if the new word is a subword of it.
            if self.thereIsSubWord(word, keywords):
                continue
            keywords.append(word)
            # NOTE: the threshold is checked after appending, so the first
            # word scoring below minscore is still part of the result.
            if score < minscore:
                break
            if len(keywords) >= self.amount:
                break
        return keywords

    def thereIsSubWord(self, word, keywords):
        """Tell whether *word* overlaps with an entry of *keywords*.

        The first entry that contains *word*, or is contained in it,
        decides. If *word* is contained in that entry, the entry is
        replaced in place by *word*; if the entry is contained in *word*,
        *keywords* is left untouched.

        Returns:
            ``True`` if such an entry was found, ``False`` otherwise.
        """
        for i, kw in enumerate(keywords):
            if word in kw:
                keywords[i] = word
                return True
            if kw in word:
                return True
        return False

    def addScoreByEnts(self, doc):
        """Add ``entsScore`` to the score of every usable entity in *doc*.

        The lemma of each entity span is split on ``'--'`` and each part is
        stripped of surrounding `` .,!?_-`` characters. Parts of at most
        one character, or containing ``'://'`` or a newline, are ignored.
        """
        for wordspan in doc.ents:
            for word in wordspan.lemma_.split('--'):
                word = word.strip(' .,!?_-')
                # The idea of using stemmed words doesn't work well:
                # phone -> phon is not wanted.
                # word = self.stemmer.stemWord(word)
                if len(word) <= 1:
                    continue
                if '://' in word:
                    continue
                if "\n" in word:
                    continue
                self.score[word] += entsScore