First public version

2022-05-18 18:51:42 +02:00 · 2022-05-18 18:51:42 +02:00 · a842542ffa
commit a842542ffa
parent a13fd12aa0
9 changed files with 303 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,8 @@
+data
+data.zip
+*.kate-swp
+tags.json
+
 # ---> Python
 # Byte-compiled / optimized / DLL files
 __pycache__/
--- a/README.md
+++ b/README.md
@ -1,3 +1,46 @@
 # gnu-keywords

-https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten
+Das ist das Repo für die Abgabe von Beat Jäckle zu dem [Wettbewerb von gnulinux.ch](https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten).
+
+Die Datei `data.zip` wird benötigt und stammt von der [GNU/Linux.ch Cloud](https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi).
+Der Download kann mit `setup.sh` durchgeführt werden.
+
+Diese Software ist lizenziert unter GPL3+.
+
+## Installation
+
+Kopiere dieses Repository an einen beliebigen Ort auf deinem Computer.
+
+In der `setup.sh` Datei sind die Anweisungen gespeichert.
+Die Datei kann auch als Bash-Skript ausgeführt werden.
+
+## Start
+
+Die Pythonumgebung muss aktiviert sein.
+`. .env/bin/activate`
+
+Danach kann man die `main.py` ausführen.
+Momentan gibt es keine Optionen, die man in der Befehlszeile anpassen könnte.
+
+Beim ersten Start lädt die Software das Sprachmodul herunter. Dies kann länger dauern.
+
+## Anpassungen
+
+Die Konfiguration für das Programm ist in der `config.py`-Datei direkt in Python-Syntax gespeichert.
+Darin kann man die Parameter anpassen, welche schlüsselgebend für die Bestimmung der Keywords sind.
+
+Wenn der Ablauf angepasst werden sollte, so kann man die `main.py` sehr gut anpassen.
+Der Prozess besteht aus 9 Zeilen. Dieser ist in drei Blöcke unterteilt:
+
+ - Initialisierung der Objekte  
+ 4 Zeilen
+ - Keywords bestimmen  
+ 3 Zeilen
+ - Export  
+ 2 Zeilen
+
+# Mögliche Roadmap
+
+ - Die `main.py` Datei könnte eine usage/help bekommen.
+ - Optionen in die `main.py` einbauen.
+ - Die Score-Funktionen in `gnukeywords.py` können natürlich ganz anders geschrieben werden.
--- a/config.py
+++ b/config.py
@ -0,0 +1,33 @@
+#
+# settings for gnukeywords
+# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
+# SPDX-License-Identifier: WTFPL
+
+# Parts of speech (values of spaCy's ``token.pos_``).
+# posAllow: parts of speech that may become keywords.
+# posDeny: parts of speech that are skipped.
+# gnukeywords.py raises NotImplementedError for a tag that is in neither
+# set, so the remaining universal POS tags (INTJ, SYM) are denied here too.
+posAllow = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
+posDeny = {'VERB', 'SPACE', 'X', 'PUNCT', 'ADP',
+           'NUM', 'PRON', 'DET', 'SCONJ', 'PART', 'AUX', 'CCONJ',
+           'INTJ', 'SYM'}
+
+# Single words that are left out completely.
+skipWord = {
+    'Mo', 'Di', 'Mi', 'Do', 'Fr', 'Sa', 'So',
+    'Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
+    'August', 'September', 'Oktober', 'November', 'Dezember',
+    'Linux', 'Befehl',
+    'S.u', 'S.E.', '4.4.1',
+    }
+
+# Words or abbreviations that are replaced by another form
+# while the software processes them.
+remap = {
+    'Apps': 'App',
+    'Pakete': 'Paket',
+    'CLT': 'Chemnitzer Linux Tage'
+}
+
+# Weight of an "ent" word (a word taken from ``doc.ents``).
+# The author considers "ent" words very good keywords,
+# hence the high score.
+entsScore = 10
--- a/datareader.py
+++ b/datareader.py
@ -0,0 +1,43 @@
+#
+# reads the article texts from the data directory
+# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+from os import listdir
+
+
+class DataReader:
+    """Iterate over the articles stored below a data directory.
+
+    Every entry of ``datadir`` that contains a file named ``indexname``
+    is treated as one article.  Iterating over the reader yields
+    ``[articlename, text]`` lists in alphabetical order of the names.
+    """
+
+    def __init__(self, datadir,
+                 indexname='index.txt',
+                 skippFirstLine=True
+                 ):
+        """Store the settings and list the available articles.
+
+        datadir: directory with one sub-directory per article.
+        indexname: name of the text file inside each article directory.
+        skippFirstLine: if true, the first line of each file is dropped.
+        """
+        if datadir[-1] != '/':
+            datadir += '/'
+        self.datadir = datadir
+        self.skippFirstLine = skippFirstLine
+        # Must be set before __readTree__(), which looks for this file.
+        self.indexname = indexname
+        # Lets __next__() work even if __iter__() has not been called.
+        self.indexnext = 0
+
+        self.__readTree__()
+
+    def __readTree__(self):
+        """Collect the names of all entries that hold an index file.
+
+        Entries without the index file (stray files, incomplete
+        articles) are left out so that they cannot abort the iteration.
+        The names are sorted to make the processing order reproducible.
+        """
+        from os.path import isfile
+
+        self.articles = sorted(
+            name for name in listdir(self.datadir)
+            if isfile(self.datadir + name + '/' + self.indexname)
+        )
+
+    def __iter__(self):
+        """Restart the iteration at the first article and return self."""
+        self.indexnext = 0
+        return self
+
+    def __next__(self):
+        """Return ``[articlename, text]`` of the next article.
+
+        Raises StopIteration once every article has been returned.
+        """
+        if self.indexnext >= len(self.articles):
+            raise StopIteration
+
+        articlename = self.articles[self.indexnext]
+        self.indexnext += 1
+        path = self.datadir + articlename + '/' + self.indexname
+
+        # Read as UTF-8 instead of the locale's default encoding.
+        # NOTE(review): assumes the data files are UTF-8 - confirm.
+        with open(path, "r", encoding="utf-8") as f:
+            if self.skippFirstLine:
+                f.readline()
+            return [articlename, f.read()]
--- a/gnukeywords.py
+++ b/gnukeywords.py
@ -0,0 +1,103 @@
+#
+# finds keywords in articles and returns them
+# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+import spacy
+from collections import Counter
+from config import posAllow, posDeny, skipWord, remap, entsScore
+from nltk.stem.snowball import SnowballStemmer
+# from snowballstemmer import stemmer
+
+
+class GnuKeywords:
+    """Callable that picks keywords out of an article text.
+
+    Words are scored in a ``Counter``: every word taken from ``doc.ents``
+    gets ``entsScore`` points, every accepted token lemma gets one point.
+    The best scored words are returned as keywords.
+    """
+
+    # Maps the ``lang`` argument of __init__ to a spaCy module name.
+    modules = {
+        'de': 'de_core_news_lg',
+        'de_sm': 'de_core_news_sm',
+        'en': 'en_core_web_lg'
+    }
+
+    def __init__(self, lang, amountofkeywords=5):
+        """Load the language module.
+
+        lang: key of ``modules``; an unknown key raises KeyError.
+        amountofkeywords: maximum number of keywords per article.
+        """
+        self.loadModule(self.modules[lang])
+        self.amount = amountofkeywords
+
+    def loadModule(self, module):
+        """Load the spaCy module, downloading it first if loading fails."""
+        try:
+            self.nlp = spacy.load(module)
+        except OSError:
+            spacy.cli.download(module)
+            self.nlp = spacy.load(module)
+
+    def __call__(self, htmlstr):
+        """Return the list of keywords for one article.
+
+        htmlstr: the text to analyse.  ``None`` or an empty string
+        (nothing could be extracted from the article) gives ``[]``.
+        """
+        if not htmlstr:
+            return []
+        self.score = Counter()
+        doc = self.nlp(htmlstr)
+        self.addScoreByEnts(doc)
+        self.addScoreByCount(doc)
+        keywords = self.getKeywordsByScore()
+        del self.score
+        return keywords
+
+    def cusromFilter(self, word):
+        """Apply ``skipWord`` and ``remap`` from config.py to a word.
+
+        Returns None for a word that must be skipped, the replacement
+        for a remapped word and the unchanged word otherwise.
+        """
+        if word in skipWord:
+            return None
+        return remap.get(word, word)
+
+    def addScoreByCount(self, doc):
+        """Add one point for every accepted token lemma in ``doc``.
+
+        Punctuation, stop words, digits and tokens with a part of speech
+        from ``posDeny`` are ignored.
+
+        Raises NotImplementedError for a part of speech that is listed
+        neither in ``posAllow`` nor in ``posDeny``.
+        """
+        for token in doc:
+            if token.is_punct or token.is_stop or token.is_digit:
+                continue
+            if token.pos_ in posDeny:
+                continue
+            if token.pos_ not in posAllow:
+                raise NotImplementedError(
+                    'A token.pos_ is not set in config.py:',
+                    token, token.pos_)
+
+            word = self.cusromFilter(token.lemma_)
+            if word is None:
+                continue
+
+            self.score[word] += 1
+
+    def getKeywordsByScore(self, minscore=2):
+        """Return at most ``self.amount`` keywords, best score first.
+
+        Words scored below ``minscore`` are not used.  The only exception
+        is the best word of an article in which no word reaches
+        ``minscore``; it is kept so that the result is not empty.
+        """
+        keywords = []
+        for (word, score) in self.score.most_common():
+            # most_common() is sorted by score, so nothing after the
+            # first word below the threshold can qualify any more.
+            if score < minscore and keywords:
+                break
+            # Skip a word that overlaps with an existing keyword; a
+            # longer keyword is replaced by the word if it contains it.
+            if self.thereIsSubWord(word, keywords):
+                continue
+            keywords.append(word)
+            if len(keywords) >= self.amount:
+                break
+        return keywords
+
+    def thereIsSubWord(self, word, keywords):
+        """Tell whether ``word`` overlaps with one of the ``keywords``.
+
+        If ``word`` is contained in a keyword, that keyword is replaced
+        in place by ``word``.  If a keyword is contained in ``word``,
+        the list stays unchanged.  Returns True in both cases and False
+        if there is no overlap.
+        """
+        for i, kw in enumerate(keywords):
+            if word in kw:
+                keywords[i] = word
+                return True
+            elif kw in word:
+                return True
+        return False
+
+    def addScoreByEnts(self, doc):
+        """Add ``entsScore`` points for every word found in ``doc.ents``."""
+        for wordspan in doc.ents:
+            for word in wordspan.lemma_.split('--'):
+                word = word.strip(' .,!?_-')
+                # Stemming was tried here and dropped again: it shortens
+                # words too much (phone -> phon).
+                if len(word) <= 1:
+                    continue
+                if '://' in word:
+                    continue
+                if "\n" in word:
+                    continue
+                self.score[word] += entsScore
--- a/hmlttotext.py
+++ b/hmlttotext.py
@ -0,0 +1,14 @@
+#
+# takes an HTML string from gnulinux.ch and returns the plain text as a string
+# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+from trafilatura import extract
+
+
+class HmltToText:
+    """Callable that converts an HTML string into plain text."""
+
+    def __call__(self, htmlstr):
+        """Return what trafilatura's ``extract`` yields for ``htmlstr``."""
+        text = extract(htmlstr)
+        return text
--- a/main.py
+++ b/main.py
@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+# find keywords and exports them
+# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+from datareader import DataReader
+from gnukeywords import GnuKeywords
+from hmlttotext import HmltToText
+from json import dump as jsondump
+
+
+def main(args):
+    """Determine the keywords of every article and save them in tags.json.
+
+    args: the command line arguments; currently not evaluated.
+    Returns 0, which the caller uses as exit status.
+    """
+
+    # Set up the objects
+    keywords = dict()
+    datareader = DataReader(datadir='./data/')
+    gnukeywords = GnuKeywords(lang='de')
+    hmltToText = HmltToText()
+
+    # Write the keywords into the dictionary
+    for articlename, htmlstr in datareader:
+        text = hmltToText(htmlstr)
+        keywords[articlename] = gnukeywords(text)
+
+    # Save the keywords.  ensure_ascii=False writes non-ASCII characters
+    # verbatim, so the file is opened as UTF-8 explicitly instead of
+    # depending on the locale's default encoding.
+    with open('tags.json', 'w', encoding='utf-8') as f:
+        jsondump(keywords, f, indent=4, ensure_ascii=False)
+
+    return 0
+
+
+if __name__ == '__main__':
+    # Run main() and hand its return value to the shell as exit status.
+    import sys
+    exit_status = main(sys.argv)
+    sys.exit(exit_status)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
+spacy
+trafilatura
--- a/setup.sh
+++ b/setup.sh
@ -0,0 +1,25 @@
+#!/bin/bash
+# Sets up gnu-keywords; automated script.
+# Perhaps you want to do it yourself, in your own way. Feel free.
+# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+# Abort on the first failing command, so that e.g. a failed creation of
+# the virtual environment does not lead to a system-wide pip install.
+set -e
+
+# Create a Python virtual environment if none exists yet.
+# Prefer python3; fall back to python where only that name is available.
+if [ ! -d .env ]
+then
+    PYTHON=$(command -v python3 || command -v python)
+    "$PYTHON" -m venv .env
+fi
+
+# Activate the Python virtual environment
+. .env/bin/activate
+
+# Install the required packages
+pip install -U pip
+pip install -r requirements.txt
+
+# Download the articles of gnulinux.ch and unpack them into the folder ./data.
+# -O keeps the file name on repeated runs (no data.zip.1),
+# -o lets unzip overwrite an earlier extraction without asking.
+wget -O data.zip https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi/download/data.zip
+unzip -o data.zip
+
+# The beautiful-soup article is unfortunately incomplete. It is removed right here.
+if [ -d data/beautiful-soup ]; then rm -r data/beautiful-soup; fi