gnu-keywords/gnukeywords.py

#
# Find keywords in articles and return them
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
# SPDX-License-Identifier: GPL-3.0-or-later
#
import spacy
from collections import Counter
from config import posAllow, posDeny, skipWord, remap, entsScore
# from snowballstemmer import stemmer


class GnuKeywords:
    """Extract a short list of keywords from a text with a spaCy pipeline.

    An instance is callable: ``GnuKeywords('de')(text)`` returns a list of
    keyword strings.  Every call builds a fresh score table from two
    sources: the entities reported by spaCy (``entsScore`` per occurrence)
    and the lemmas of the remaining tokens (1 per occurrence).  The
    filters ``posAllow``, ``posDeny``, ``skipWord``, ``remap`` and the
    weight ``entsScore`` are read from ``config``.
    """

    # Language key accepted by __init__ -> name of the spaCy model to load.
    modules = {
        'de': 'de_core_news_lg',
        'de_sm': 'de_core_news_sm',
        'en': 'en_core_web_lg'
    }

    def __init__(self, lang, amountofkeywords=5):
        """Load the model registered for *lang* and store the keyword limit.

        lang -- a key of ``modules``; an unknown key raises KeyError.
        amountofkeywords -- maximum number of keywords returned per call.
        """
        self.loadModule(self.modules[lang])
        self.amount = amountofkeywords

    def loadModule(self, module):
        """Load the spaCy model *module* into ``self.nlp``.

        If ``spacy.load`` raises OSError, the model is fetched with
        ``spacy.cli.download`` and loading is retried once.
        """
        try:
            self.nlp = spacy.load(module)
        except OSError:
            spacy.cli.download(module)
            self.nlp = spacy.load(module)

    def __call__(self, htmlstr):
        """Return the keywords found in *htmlstr* as a list of strings.

        The text is handed to the spaCy pipeline unchanged; nothing in
        this method strips markup.  ``self.score`` only exists while the
        call is running.
        """
        self.score = Counter()
        try:
            doc = self.nlp(htmlstr)
            self.addScoreByEnts(doc)
            self.addScoreByCount(doc)
            return self.getKeywordsByScore()
        finally:
            # Drop the per-call state even when the pipeline or one of the
            # scoring steps raises, so no stale scores stay on the instance.
            del self.score

    def cusromFilter(self, word):
        """Apply the ``skipWord`` and ``remap`` settings to *word*.

        Returns None when *word* is listed in ``skipWord``, the mapped
        value when *word* is a key of ``remap``, and *word* itself
        otherwise.  (The method name keeps its historical spelling so
        existing callers keep working.)
        """
        if word in skipWord:
            return None
        try:
            word = remap[word]
        except KeyError:
            pass
        return word

    def addScoreByCount(self, doc):
        """Add 1 to ``self.score`` for every countable token lemma in *doc*.

        Punctuation, stop words and digit tokens are skipped, as are
        tokens whose part of speech is in ``posDeny`` and lemmas rejected
        by ``cusromFilter``.

        Raises NotImplementedError when a token's part of speech is in
        neither ``posDeny`` nor ``posAllow``, i.e. the configuration does
        not say what to do with it.
        """
        for token in doc:
            if token.is_punct or token.is_stop or token.is_digit:
                continue
            if token.pos_ in posDeny:
                continue
            if token.pos_ not in posAllow:
                raise NotImplementedError(
                    'A token.pos_ is not set in config.py:',
                    token, token.pos_)

            word = self.cusromFilter(token.lemma_)
            if word is None:
                continue

            self.score[word] += 1

    def getKeywordsByScore(self, minscore=2):
        """Pick up to ``self.amount`` keywords from ``self.score``.

        Words are visited from the highest score downwards.  A word that
        contains, or is contained in, an already selected keyword is
        merged into that keyword (see ``thereIsSubWord``) instead of being
        added.  Selection stops at the first unrelated word whose score is
        below *minscore*; that word is not included.

        Returns a list of keyword strings, possibly empty.
        """
        if self.amount <= 0:
            # Nothing was asked for; without this guard the first word
            # would be appended before the limit is ever checked.
            return []

        keywords = []
        for word, score in self.score.most_common():
            if not word:
                # '' is a substring of every keyword and would overwrite
                # an existing entry in thereIsSubWord.
                continue
            # Check if the word is related to a keyword that is already
            # selected; a shorter subword replaces the longer keyword.
            if self.thereIsSubWord(word, keywords):
                continue
            # Test the threshold before appending so that a word below
            # minscore never ends up in the result as a new keyword.
            if score < minscore:
                break
            keywords.append(word)
            if len(keywords) >= self.amount:
                break
        return keywords

    def thereIsSubWord(self, word, keywords):
        """Tell whether *word* overlaps with an entry of *keywords*.

        Only the first overlapping entry is considered.  If *word* is a
        substring of that entry, the entry is replaced by *word* in place
        (the list passed in is modified).  If the entry is a substring of
        *word*, the list is left alone.  Returns True in both cases and
        False when no entry overlaps.
        """
        for i, kw in enumerate(keywords):
            if word in kw:
                keywords[i] = word
                return True
            elif kw in word:
                return True
        return False

    def addScoreByEnts(self, doc):
        """Add ``entsScore`` to ``self.score`` for each entity word in *doc*.

        The lemma of every span in ``doc.ents`` is split on '--' and each
        part is stripped of surrounding ' .,!?_-' characters.  Parts that
        are at most one character long, contain '://' or contain a
        newline are ignored.
        """
        # NOTE(review): unlike addScoreByCount, entity words are not passed
        # through cusromFilter (skipWord / remap) -- confirm this is wanted.
        for wordspan in doc.ents:
            for word in wordspan.lemma_.split('--'):
                word = word.strip(' .,!?_-')
                # Stemming the word (snowballstemmer) was tried and dropped:
                # results such as phone -> phon are not wanted.
                if len(word) <= 1:
                    continue
                if '://' in word:
                    continue
                if "\n" in word:
                    continue
                self.score[word] += entsScore
First public version 2022-05-18 18:51:42 +02:00			`#`
			`# find keywords in articles and returns them`
			`# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>`
			`# SPDX-License-Identifier: GPL-3.0-or-later`
			`#`
			`import spacy`
			`from collections import Counter`
			`from config import posAllow, posDeny, skipWord, remap, entsScore`
			`# from snowballstemmer import stemmer`


			`class GnuKeywords:`
			`modules = {`
			`'de': 'de_core_news_lg',`
			`'de_sm': 'de_core_news_sm',`
			`'en': 'en_core_web_lg'`
			`}`

			`def __init__(self, lang, amountofkeywords=5):`
			`self.loadModule(self.modules[lang])`
			`self.amount = amountofkeywords`
			`# self.stemmer = stemmer('german')`

			`def loadModule(self, module):`
			`try:`
			`self.nlp = spacy.load(module)`
			`except OSError:`
			`spacy.cli.download(module)`
			`self.nlp = spacy.load(module)`

			`def __call__(self, htmlstr):`
			`self.score = Counter()`
			`doc = self.nlp(htmlstr)`
			`self.addScoreByEnts(doc)`
			`self.addScoreByCount(doc)`
			`keywords = self.getKeywordsByScore()`
			`del self.score`
			`return keywords`

			`def cusromFilter(self, word):`
			`if word in skipWord:`
			`return None`
			`try:`
			`word = remap[word]`
			`except KeyError:`
			`pass`
			`return word`

			`def addScoreByCount(self, doc):`
			`for token in doc:`
			`if token.is_punct or token.is_stop or token.is_digit:`
			`continue`
			`if token.pos_ in posDeny:`
			`continue`
			`if token.pos_ not in posAllow:`
			`raise NotImplementedError(`
			`'A token.pos_ is not set in config.py:',`
			`token, token.pos_)`

			`word = self.cusromFilter(token.lemma_)`
			`if word is None:`
			`continue`

			`self.score[word] += 1`

			`def getKeywordsByScore(self, minscore=2):`
			`keywords = []`
			`for (word, score) in self.score.most_common():`
			`# Check if there is already a word in the the keywords`
			`# and replace it if it is a subword`
			`if self.thereIsSubWord(word, keywords):`
			`continue`
			`keywords.append(word)`
			`if score < minscore:`
			`break`
			`if len(keywords) >= self.amount:`
			`break`
			`return keywords`

			`def thereIsSubWord(self, word, keywords):`
			`for i, kw in enumerate(keywords):`
			`if word in kw:`
			`keywords[i] = word`
			`return True`
			`elif kw in word:`
			`return True`
			`return False`

			`def addScoreByEnts(self, doc):`
			`for wordspan in doc.ents:`
			`for word in wordspan.lemma_.split('--'):`
			`word = word.strip(' .,!?_-')`
			`# This idea with stem words doesn't works well.`
			`# phone -> phon is not wished`
			`# word = self.stemmer.stemWord(word)`
			`if len(word) <= 1:`
			`continue`
			`if '://' in word:`
			`continue`
			`if "\n" in word:`
			`continue`
			`self.score[word] += entsScore`