gnu-keywords/gnukeywords.py

#
# Finds keywords in articles and returns them.
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
# SPDX-License-Identifier: GPL-3.0-or-later
#
import spacy
from collections import Counter
from config import posAllow, posDeny, skipWord, remap, entsScore
# from snowballstemmer import stemmer


class GnuKeywords:
    """Extract keywords from a text with a spaCy pipeline.

    Calling an instance scores the words of a text in two passes (named
    entities first, then lemma counts) and returns the best scoring
    words.  The filter settings ``posAllow``, ``posDeny``, ``skipWord``
    and ``remap`` as well as the entity weight ``entsScore`` are read
    from ``config``.
    """

    # Language key accepted by __init__ -> name of the spaCy model to load.
    modules = {
        'de': 'de_core_news_lg',
        'de_sm': 'de_core_news_sm',
        'en': 'en_core_web_lg'
    }

    def __init__(self, lang, amountofkeywords=5):
        """Load the model registered for *lang* and store the keyword limit.

        lang -- a key of ``modules``.
        amountofkeywords -- upper bound for the number of keywords that
            one call returns.

        Raises KeyError if *lang* is not a key of ``modules``.
        """
        try:
            module = self.modules[lang]
        except KeyError:
            # Same exception type as before, but with a usable message.
            raise KeyError(
                'Unsupported language {!r}; expected one of {}'.format(
                    lang, sorted(self.modules))) from None
        self.loadModule(module)
        self.amount = amountofkeywords
        # Stemming (snowballstemmer) was tried and dropped, see
        # addScoreByEnts.

    def loadModule(self, module):
        """Load the spaCy model *module* into ``self.nlp``.

        If the first ``spacy.load`` raises OSError, the model is fetched
        with ``spacy.cli.download`` and loading is attempted once more.
        """
        try:
            self.nlp = spacy.load(module)
        except OSError:
            spacy.cli.download(module)
            self.nlp = spacy.load(module)

    def __call__(self, htmlstr):
        """Return the list of keywords found in *htmlstr*.

        The string is handed to the spaCy pipeline unmodified.
        ``self.score`` only exists for the duration of the call.
        """
        self.score = Counter()
        try:
            doc = self.nlp(htmlstr)
            self.addScoreByEnts(doc)
            self.addScoreByCount(doc)
            return self.getKeywordsByScore()
        finally:
            # Drop the per-call scores even if a scoring pass raised.
            del self.score

    def cusromFilter(self, word):
        """Apply the ``skipWord`` and ``remap`` settings to *word*.

        Returns None if *word* is listed in ``skipWord``, the ``remap``
        entry for *word* if there is one, and *word* itself otherwise.
        """
        if word in skipWord:
            return None
        try:
            return remap[word]
        except KeyError:
            return word

    # Correctly spelled alias; the misspelled name stays for existing
    # callers and is still the one used internally so overrides keep working.
    customFilter = cusromFilter

    def addScoreByCount(self, doc):
        """Add 1 to ``self.score`` for every accepted token lemma in *doc*.

        Punctuation, stop words, digits and tokens whose ``pos_`` is in
        ``posDeny`` are ignored.  Expects ``self.score`` to be set.

        Raises NotImplementedError for a ``pos_`` that is neither in
        ``posDeny`` nor in ``posAllow``.
        """
        for token in doc:
            if token.is_punct or token.is_stop or token.is_digit:
                continue
            if token.pos_ in posDeny:
                continue
            if token.pos_ not in posAllow:
                # An unclassified tag means config.py is incomplete.
                raise NotImplementedError(
                    'A token.pos_ is not set in config.py:',
                    token, token.pos_)

            word = self.cusromFilter(token.lemma_)
            if word is not None:
                self.score[word] += 1

    def getKeywordsByScore(self, minscore=2):
        """Return at most ``self.amount`` words from ``self.score``.

        Words are visited from highest to lowest score.  A word scoring
        below *minscore* is never returned and never replaces a keyword
        that was already selected.  Words that overlap an already
        selected keyword are merged by ``thereIsSubWord`` instead of
        being added.
        """
        keywords = []
        # most_common() is sorted by descending score, so the first word
        # below the threshold ends the search.
        for word, score in self.score.most_common():
            if score < minscore or len(keywords) >= self.amount:
                break
            if not word:
                # '' is a substring of every keyword and would overwrite
                # the first one.
                continue
            if self.thereIsSubWord(word, keywords):
                continue
            keywords.append(word)
        return keywords

    def thereIsSubWord(self, word, keywords):
        """Tell whether *word* overlaps one of *keywords*.

        Only the first overlapping keyword is considered.  If *word* is
        a substring of it, that keyword is replaced in place by *word*;
        if the keyword is a substring of *word*, the list is left
        untouched.  Returns True in both cases and False if nothing
        overlaps.
        """
        for i, kw in enumerate(keywords):
            if word in kw:
                keywords[i] = word
                return True
            if kw in word:
                return True
        return False

    def addScoreByEnts(self, doc):
        """Add ``entsScore`` to ``self.score`` for each entity part in *doc*.

        The lemma of every entity in ``doc.ents`` is split at '--' and
        each part is stripped of surrounding ' .,!?_-' characters.
        Expects ``self.score`` to be set.
        """
        for wordspan in doc.ents:
            for word in wordspan.lemma_.split('--'):
                word = word.strip(' .,!?_-')
                # Stemming the word here does not work well:
                # phone -> phon is not wanted.
                if len(word) <= 1:
                    continue
                # Parts containing '://' or a line break are not scored.
                if '://' in word or "\n" in word:
                    continue
                self.score[word] += entsScore