First public version

2022-05-18 18:51:42 +02:00 · 2022-05-18 18:51:42 +02:00 · a842542ffa
commit a842542ffa
parent a13fd12aa0
9 changed files with 303 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,8 @@
 data
 data.zip
 *.kate-swp
 tags.json
 # ---> Python
 # Byte-compiled / optimized / DLL files
 __pycache__/
--- a/README.md
+++ b/README.md
@ -1,3 +1,46 @@
 # gnu-keywords
-https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten
+Das ist das Repo für die Abgabe von Beat Jäckle zu dem [Wettbewerb von gnulinux.ch](https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten).
 Die Datei `data.zip` wird benötigt und stammt von der [GNU/Linux.ch Cloud](https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi).
 Der Download kann mit `setup.sh` durchgeführt werden.
 Diese Software ist unter GPLv3+ lizenziert.
 ## Installation
 Kopiere dieses Repository an einen beliebigen Ort auf deinem Computer.
 In der `setup.sh` Datei sind die Anweisungen gespeichert.
 Die Datei kann auch als Bash-Skript ausgeführt werden.
 ## Start
 Die Pythonumgebung muss aktiviert sein.
 `. .env/bin/activate`
 Danach kann man die `main.py` ausführen.
 Momentan gibt es keine Optionen, die man in der Befehlszeile anpassen könnte.
 Beim ersten Start lädt die Software das Sprachmodul herunter. Dies kann länger dauern.
 ## Anpassungen
 Die Konfiguration für das Programm ist in der Datei `config.py` direkt in Python-Syntax gespeichert.
 Darin kann man die Parameter anpassen, welche schlüsselgebend für die Bestimmung der Keywords sind.
 Wenn der Ablauf angepasst werden sollte, so kann man die `main.py` sehr gut anpassen.
 Der Prozess besteht aus 9 Zeilen. Dieser ist in drei Blöcke unterteilt:
 - Initialisierung der Objekte  
 4 Zeilen
 - Keywords bestimmen  
 3 Zeilen
 - Export  
 2 Zeilen
 # Mögliche Roadmap
 - Die `main.py` Datei könnte eine usage/help bekommen.
 - Optionen in die `main.py` einbauen.
 - Die Score-Funktionen in `gnukeywords.py` können natürlich ganz anders geschrieben werden.
--- a/config.py
+++ b/config.py
@ -0,0 +1,33 @@
 #
 # settings for gnukeywords
 # SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
 # SPDX-License-Identifier: WTFPL
 # Part-of-speech tags (spaCy `token.pos_`, Universal POS tag set).
 # posAllow: tags whose tokens are counted as keyword candidates.
 # posDeny: tags whose tokens are skipped.
 # gnukeywords.py raises NotImplementedError for a tag that is in neither
 # set, so together both sets have to cover every tag the model can emit.
 posAllow = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
 posDeny = {'VERB', 'SPACE', 'X', 'PUNCT', 'ADP',
            'NUM', 'PRON', 'DET', 'SCONJ', 'PART', 'AUX', 'CCONJ',
            # Interjections and symbols are no keyword material; listing
            # them keeps such a token from aborting the whole run.
            'INTJ', 'SYM'}
 # Single words that are left out completely
 skipWord = {
    'Mo', 'Di', 'Mi', 'Do', 'Fr', 'Sa', 'So',
    'Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
    'August', 'September', 'Oktober', 'November', 'Dezember',
    'Linux', 'Befehl',
    'S.u', 'S.E.', '4.4.1',
    }
 # Words or abbreviations that the software replaces by another word
 # while processing (key: word as found, value: word to use instead).
 remap = {
    'Apps': 'App',
    'Pakete': 'Paket',
    'CLT': 'Chemnitzer Linux Tage'
 }
 # Weight of an "ent" word, i.e. a word taken from the entities of the
 # document (doc.ents in gnukeywords.py).
 # The author considers the "ent" words very good keywords,
 # hence the high score.
 entsScore = 10
--- a/datareader.py
+++ b/datareader.py
@ -0,0 +1,43 @@
 #
 # reads the index file of every article directory below the data directory
 # SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 from os import listdir
 class DataReader:
    """Iterate over the articles stored below a data directory.

    Every entry of ``datadir`` is treated as an article directory that
    holds an index file. Iterating over the reader yields one
    ``[articlename, text]`` list per article, where ``text`` is the
    content of that index file.
    """

    def __init__(self, datadir,
                 indexname='index.txt',
                 skippFirstLine=True
                 ):
        """Remember the settings and read the list of articles.

        datadir -- directory with one sub-directory per article; an
            empty string stands for the current directory.
        indexname -- name of the file read inside every article directory.
        skippFirstLine -- if true, the first line of each index file is
            dropped from the returned text.
        """
        # An empty path must not turn into '/' (the file system root).
        if not datadir:
            datadir = '.'
        if not datadir.endswith('/'):
            datadir += '/'
        self.datadir = datadir
        self.indexname = indexname
        self.skippFirstLine = skippFirstLine
        # Start position, so next() also works before iter() was called.
        self.indexnext = 0
        self.__readTree__()

    def __readTree__(self):
        """Store the names of all entries of ``datadir`` in ``articles``."""
        # listdir() returns the names in arbitrary order; sorting makes the
        # iteration order reproducible between runs and machines.
        self.articles = sorted(listdir(self.datadir))

    def __iter__(self):
        """Restart at the first article and return the reader itself."""
        self.indexnext = 0
        return self

    def __next__(self):
        """Return ``[articlename, text]`` for the next article.

        Raises StopIteration once every article was returned. An article
        without a readable index file raises the OSError of open().
        """
        if self.indexnext >= len(self.articles):
            raise StopIteration
        articlename = self.articles[self.indexnext]
        self.indexnext += 1
        path = self.datadir + articlename + '/' + self.indexname
        # Read as UTF-8 instead of the locale's default encoding, so that
        # umlauts survive on every platform.
        # NOTE(review): assumes the data files are UTF-8 -- confirm.
        with open(path, "r", encoding="utf-8") as f:
            if self.skippFirstLine:
                f.readline()
            return [articlename, f.read()]
--- a/gnukeywords.py
+++ b/gnukeywords.py
@ -0,0 +1,103 @@
 #
 # find keywords in articles and returns them
 # SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 import spacy
 from collections import Counter
 from config import posAllow, posDeny, skipWord, remap, entsScore
 from nltk.stem.snowball import SnowballStemmer
 # from snowballstemmer import stemmer
 class GnuKeywords:
    """Pick the keywords of a text by scoring its words with spaCy.

    An instance is callable: it takes a text and returns a list of
    keywords. Words of the document's entities and lemmas of tokens with
    an allowed part of speech collect points in a Counter; the words with
    the highest score become the keywords.
    """

    # Maps the language key accepted by __init__ to a spaCy pipeline name.
    modules = {
        'de': 'de_core_news_lg',
        'de_sm': 'de_core_news_sm',
        'en': 'en_core_web_lg'
    }

    def __init__(self, lang, amountofkeywords=5):
        """Load the pipeline for `lang` and set the keyword limit.

        lang -- key of `modules`; an unknown key raises KeyError.
        amountofkeywords -- maximal number of keywords returned per text.
        """
        self.loadModule(self.modules[lang])
        self.amount = amountofkeywords

    def loadModule(self, module):
        """Load the spaCy pipeline `module` into `self.nlp`.

        If loading fails with OSError the pipeline is downloaded and
        loaded again.
        """
        try:
            self.nlp = spacy.load(module)
        except OSError:
            spacy.cli.download(module)
            self.nlp = spacy.load(module)

    def __call__(self, htmlstr):
        """Return the list of keywords for the text `htmlstr`.

        htmlstr -- the text to analyse. None or an empty string (e.g. a
            text extraction that found nothing) yields an empty list.
        """
        if not htmlstr:
            return []
        self.score = Counter()
        try:
            doc = self.nlp(htmlstr)
            self.addScoreByEnts(doc)
            self.addScoreByCount(doc)
            return self.getKeywordsByScore()
        finally:
            # The scores belong to this one text only, also after an error.
            del self.score

    def cusromFilter(self, word):
        """Apply `skipWord` and `remap` from config.py to `word`.

        Returns None if the word has to be left out, its replacement if
        `remap` has one, otherwise the word unchanged.
        """
        if word in skipWord:
            return None
        return remap.get(word, word)

    # Correctly spelled alias; the original name stays for existing callers.
    customFilter = cusromFilter

    def addScoreByCount(self, doc):
        """Add one point to `self.score` per occurrence of a countable token.

        Punctuation, stop words, digits and tokens whose part of speech is
        in `posDeny` are ignored. The token's lemma is scored after it
        passed the custom filter.

        Raises NotImplementedError for a part of speech that is listed
        neither in `posAllow` nor in `posDeny`.
        """
        for token in doc:
            if token.is_punct or token.is_stop or token.is_digit:
                continue
            if token.pos_ in posDeny:
                continue
            if token.pos_ not in posAllow:
                raise NotImplementedError(
                    'A token.pos_ is not set in config.py:',
                    token, token.pos_)
            word = self.cusromFilter(token.lemma_)
            if word is None:
                continue
            self.score[word] += 1

    def getKeywordsByScore(self, minscore=2):
        """Return the best scored words of `self.score` as a list.

        Words are visited from the highest to the lowest score. The list
        is complete at the first word scoring below `minscore` or as soon
        as `self.amount` keywords are collected. A word that contains an
        already chosen keyword is dropped; a word contained in a chosen
        keyword takes that keyword's place.
        """
        keywords = []
        for (word, score) in self.score.most_common():
            # most_common() is sorted by descending score, so nothing
            # usable follows once a score is below the threshold. Checking
            # before appending keeps that first weak word out of the list.
            if score < minscore or len(keywords) >= self.amount:
                break
            if self.thereIsSubWord(word, keywords):
                continue
            keywords.append(word)
        return keywords

    def thereIsSubWord(self, word, keywords):
        """Tell whether `word` overlaps with one of `keywords`.

        Returns True if `word` is contained in a keyword or contains one.
        In the first case that keyword is replaced in place by `word`.
        Only the first overlapping keyword is looked at. Returns False and
        leaves `keywords` untouched if there is no overlap.
        """
        for i, kw in enumerate(keywords):
            if word in kw:
                keywords[i] = word
                return True
            if kw in word:
                return True
        return False

    def addScoreByEnts(self, doc):
        """Add `entsScore` points to `self.score` per word of an entity.

        The lemma of every entity in `doc.ents` is split at '--' and each
        part is stripped of surrounding blanks and punctuation. Parts of
        at most one character, parts containing '://' or a line break and
        words ruled out by the custom filter are ignored.
        """
        for wordspan in doc.ents:
            for word in wordspan.lemma_.split('--'):
                word = word.strip(' .,!?_-')
                # NOTE: stemming the word was tried and dropped again,
                # it shortens too much (phone -> phon).
                if len(word) <= 1:
                    continue
                if '://' in word:
                    continue
                if "\n" in word:
                    continue
                # Entity words obey skipWord and remap like counted tokens,
                # otherwise a skipped word could still become a keyword.
                word = self.cusromFilter(word)
                if word is None:
                    continue
                self.score[word] += entsScore
--- a/hmlttotext.py
+++ b/hmlttotext.py
@ -0,0 +1,14 @@
 #
 # takes html string from gnulinux.ch and return the plain text as a string
 # SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 from trafilatura import extract
 class HmltToText:
    """Callable that turns an HTML string into its plain text."""

    def __call__(self, htmlstr):
        """Hand `htmlstr` to trafilatura's extract() and return its result."""
        plaintext = extract(htmlstr)
        return plaintext

    def __init__(self):
        """There is no state to set up."""
--- a/main.py
+++ b/main.py
@ -0,0 +1,34 @@
 #!/usr/bin/env python3
 # find keywords and exports them
 # SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 from datareader import DataReader
 from gnukeywords import GnuKeywords
 from hmlttotext import HmltToText
 from json import dump as jsondump
 def main(args):
    """Write the keywords of every article below ./data/ to tags.json.

    args -- the command line arguments; there are no options yet, so
        the value is not evaluated.
    Returns 0, which the caller uses as exit status.
    """
    # Set up the objects
    keywords = dict()
    datareader = DataReader(datadir='./data/')
    gnukeywords = GnuKeywords(lang='de')
    hmltToText = HmltToText()
    # Write the keywords of each article into the dictionary
    for articlename, htmlstr in datareader:
        text = hmltToText(htmlstr)
        # Without extracted text (None or empty) the article is recorded
        # with no keywords instead of passing None to the keyword finder.
        keywords[articlename] = gnukeywords(text) if text else []
    # Save the keywords. ensure_ascii=False writes umlauts verbatim, so the
    # file encoding is fixed to UTF-8 instead of the locale's default.
    with open('tags.json', 'w', encoding='utf-8') as f:
        jsondump(keywords, f, indent=4, ensure_ascii=False)
    return 0
 if __name__ == '__main__':
    # Executed as a script: run main() with the command line and use its
    # return value as the exit status of the process.
    import sys
    exit_status = main(sys.argv)
    sys.exit(exit_status)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
 spacy
 trafilatura
 nltk
--- a/setup.sh
+++ b/setup.sh
@ -0,0 +1,25 @@
 #!/bin/bash
 # Set up gnu-keywords; automated script.
 # Perhaps you want to do it yourself, in your way. Feel free
 # SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
 # SPDX-License-Identifier: GPL-3.0-or-later
 #
 # Create a Python environment unless one already exists.
 if [ ! -d .env ]
 then
    # Prefer python3: a plain `python` command is missing on many systems.
    if command -v python3 >/dev/null 2>&1
    then python3 -m venv .env
    else python -m venv .env
    fi
 fi
 # Activate the Python environment
 . .env/bin/activate
 # Install the required packages
 pip install -U pip
 pip install -r requirements.txt
 # Download the articles of gnulinux.ch and unpack them into the folder ./data
 # -O fixes the file name: a rerun overwrites data.zip instead of saving
 # the download as data.zip.1 next to a stale copy.
 wget -O data.zip https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi/download/data.zip
 # -o overwrites already unpacked files without asking, so a rerun does
 # not stop at an interactive prompt.
 unzip -o data.zip
 # The beautiful-soup article is unfortunately incomplete. We delete it right here.
 if [ -f data/beautiful-soup/index.txt ];then rm data/beautiful-soup/index.txt; rmdir data/beautiful-soup/;fi