First public version
This commit is contained in:
parent
a13fd12aa0
commit
a842542ffa
9 changed files with 303 additions and 1 deletions
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -1,3 +1,8 @@
|
||||||
|
data
|
||||||
|
data.zip
|
||||||
|
*.kate-swp
|
||||||
|
tags.json
|
||||||
|
|
||||||
# ---> Python
|
# ---> Python
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
|
45
README.md
45
README.md
|
@ -1,3 +1,46 @@
|
||||||
# gnu-keywords
|
# gnu-keywords
|
||||||
|
|
||||||
https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten
|
Das ist das Repo für die Abgabe von Beat Jäckle zu dem [Wettbewerb von gnulinux.ch](https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten).
|
||||||
|
|
||||||
|
Die Datei `data.zip` wird benötigt und stammt von der [GNU/Linux.ch Cloud](https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi).
|
||||||
|
Der Download kann mit `setup.sh` durchgeführt werden.
|
||||||
|
|
||||||
|
Diese Software ist lizenziert unter GPL3+.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
Kopiere dieses Repository an einen beliebigen Ort auf deinem Computer.
|
||||||
|
|
||||||
|
In der `setup.sh` Datei sind die Anweisungen gespeichert.
|
||||||
|
Die Datei kann auch als Bash-Skript ausgeführt werden.
|
||||||
|
|
||||||
|
## Start
|
||||||
|
|
||||||
|
Die Pythonumgebung muss aktiviert sein.
|
||||||
|
`. .env/bin/activate`
|
||||||
|
|
||||||
|
Danach kann man die `main.py` ausführen.
|
||||||
|
Momentan gibt es keine Optionen, die man in der Befehlszeile anpassen könnte.
|
||||||
|
|
||||||
|
Beim ersten Start lädt die Software das Sprachmodul herunter. Dies kann länger dauern.
|
||||||
|
|
||||||
|
## Anpassungen
|
||||||
|
|
||||||
|
Die Konfiguration des Programms ist in der Datei `config.py` direkt in Python-Syntax gespeichert.
|
||||||
|
Darin kann man die Parameter anpassen, welche schlüsselgebend für die Bestimmung der Keywords sind.
|
||||||
|
|
||||||
|
Wenn der Ablauf angepasst werden sollte, so kann man die `main.py` sehr gut anpassen.
|
||||||
|
Der Prozess besteht aus 9 Zeilen. Dieser ist in drei Blöcke unterteilt:
|
||||||
|
|
||||||
|
- Initialisierung der Objekte
|
||||||
|
4 Zeilen
|
||||||
|
- Keywords bestimmen
|
||||||
|
3 Zeilen
|
||||||
|
- Export
|
||||||
|
2 Zeilen
|
||||||
|
|
||||||
|
# Mögliche Roadmap
|
||||||
|
|
||||||
|
- Die `main.py` Datei könnte eine usage/help bekommen.
|
||||||
|
- Optionen in die `main.py` einbauen.
|
||||||
|
- Die Score-Funktionen in `gnukeywords.py` können natürlich ganz anders geschrieben werden.
|
||||||
|
|
33
config.py
Normal file
33
config.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
#
|
||||||
|
# settings for gnukeywords
|
||||||
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||||
|
# SPDX-License-Identifier: WTFPL
|
||||||
|
|
||||||
|
# Parts of speech (spaCy's coarse-grained `token.pos_` tags).
# posAllow: tags whose tokens may be counted as keyword candidates.
# posDeny: tags whose tokens are skipped.
# Every tag the language model can emit has to appear in one of the two
# sets: gnukeywords.py raises NotImplementedError for a tag found in neither.
# 'INTJ' (interjection) and 'SYM' (symbol) are part of the universal tag set
# and are therefore listed explicitly as denied.
posAllow = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
posDeny = {'VERB', 'SPACE', 'X', 'PUNCT', 'ADP',
           'NUM', 'PRON', 'DET', 'SCONJ', 'PART', 'AUX', 'CCONJ',
           'INTJ', 'SYM'}

# Words that are never counted (compared against the token lemma).
skipWord = {
    'Mo', 'Di', 'Mi', 'Do', 'Fr', 'Sa', 'So',
    'Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
    'August', 'September', 'Oktober', 'November', 'Dezember',
    'Linux', 'Befehl',
    'S.u', 'S.E.', '4.4.1',
}

# Words or abbreviations that are replaced by another spelling
# while the text is processed.
remap = {
    'Apps': 'App',
    'Pakete': 'Paket',
    'CLT': 'Chemnitzer Linux Tage'
}

# Score added for every occurrence of a word inside a named entity
# (spaCy's `doc.ents`). Entities tend to make very good keywords,
# hence the high value compared to the 1 point a plain occurrence gets.
entsScore = 10
|
43
datareader.py
Normal file
43
datareader.py
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
#
|
||||||
|
# iterate over the article directories and read each article's index file
|
||||||
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
from os import listdir
|
||||||
|
|
||||||
|
|
||||||
|
class DataReader:
    """Iterate over the articles stored below a data directory.

    Every entry of ``datadir`` is treated as one article whose content lives
    in ``<datadir>/<entry>/<indexname>``. Iterating yields
    ``[articlename, text]`` lists.

    Parameters:
        datadir: directory that contains one sub-directory per article.
            A trailing ``/`` is appended when missing; an empty string
            means the current directory.
        indexname: name of the file to read inside each article directory.
        skippFirstLine: when true, the first line of each index file is
            dropped from the returned text.
    """

    def __init__(self, datadir,
                 indexname='index.txt',
                 skippFirstLine=True
                 ):
        # An empty string has no last character to inspect; treat it as
        # the current directory instead of failing with IndexError.
        datadir = datadir or '.'
        if not datadir.endswith('/'):
            datadir += '/'
        self.datadir = datadir
        self.indexname = indexname
        self.skippFirstLine = skippFirstLine

        self.__readTree__()

    def __readTree__(self):
        """Collect the entry names of the data directory."""
        # listdir() returns names in arbitrary order; sorting makes the
        # iteration order (and thus the exported result) reproducible.
        self.articles = sorted(listdir(self.datadir))

    def __iter__(self):
        """Restart the iteration at the first article and return self."""
        self.indexnext = 0
        return self

    def __next__(self):
        """Return ``[articlename, text]`` for the next readable article.

        Entries that are not directories or that do not contain the index
        file are skipped.

        Raises:
            StopIteration: when all entries have been visited.
        """
        while self.indexnext < len(self.articles):
            articlename = self.articles[self.indexnext]
            self.indexnext += 1
            path = self.datadir + articlename + '/' + self.indexname

            try:
                # Explicit encoding: do not depend on the locale of the
                # machine when decoding the article text.
                with open(path, "r", encoding="utf-8") as f:
                    if self.skippFirstLine:
                        f.readline()
                    return [articlename, f.read()]
            except (FileNotFoundError, NotADirectoryError):
                # Stray file in the data directory or an article directory
                # without an index file: nothing to read, try the next one.
                continue

        raise StopIteration
|
103
gnukeywords.py
Normal file
103
gnukeywords.py
Normal file
|
@ -0,0 +1,103 @@
|
||||||
|
#
|
||||||
|
# find keywords in articles and returns them
|
||||||
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
import spacy
|
||||||
|
from collections import Counter
|
||||||
|
from config import posAllow, posDeny, skipWord, remap, entsScore
|
||||||
|
from nltk.stem.snowball import SnowballStemmer
|
||||||
|
# from snowballstemmer import stemmer
|
||||||
|
|
||||||
|
|
||||||
|
class GnuKeywords:
|
||||||
|
modules = {
|
||||||
|
'de': 'de_core_news_lg',
|
||||||
|
'de_sm': 'de_core_news_sm',
|
||||||
|
'en': 'en_core_web_lg'
|
||||||
|
}
|
||||||
|
|
||||||
|
def __init__(self, lang, amountofkeywords=5):
|
||||||
|
self.loadModule(self.modules[lang])
|
||||||
|
self.amount = amountofkeywords
|
||||||
|
# self.stemmer = stemmer('german')
|
||||||
|
|
||||||
|
def loadModule(self, module):
|
||||||
|
try:
|
||||||
|
self.nlp = spacy.load(module)
|
||||||
|
except OSError:
|
||||||
|
spacy.cli.download(module)
|
||||||
|
self.nlp = spacy.load(module)
|
||||||
|
|
||||||
|
def __call__(self, htmlstr):
|
||||||
|
self.score = Counter()
|
||||||
|
doc = self.nlp(htmlstr)
|
||||||
|
self.addScoreByEnts(doc)
|
||||||
|
self.addScoreByCount(doc)
|
||||||
|
keywords = self.getKeywordsByScore()
|
||||||
|
del self.score
|
||||||
|
return keywords
|
||||||
|
|
||||||
|
def cusromFilter(self, word):
|
||||||
|
if word in skipWord:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
word = remap[word]
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
return word
|
||||||
|
|
||||||
|
def addScoreByCount(self, doc):
|
||||||
|
for token in doc:
|
||||||
|
if token.is_punct or token.is_stop or token.is_digit:
|
||||||
|
continue
|
||||||
|
if token.pos_ in posDeny:
|
||||||
|
continue
|
||||||
|
if token.pos_ not in posAllow:
|
||||||
|
raise NotImplementedError(
|
||||||
|
'A token.pos_ is not set in config.py:',
|
||||||
|
token, token.pos_)
|
||||||
|
|
||||||
|
word = self.cusromFilter(token.lemma_)
|
||||||
|
if word is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
self.score[word] += 1
|
||||||
|
|
||||||
|
def getKeywordsByScore(self, minscore=2):
|
||||||
|
keywords = []
|
||||||
|
for (word, score) in self.score.most_common():
|
||||||
|
# Check if there is already a word in the the keywords
|
||||||
|
# and replace it if it is a subword
|
||||||
|
if self.thereIsSubWord(word, keywords):
|
||||||
|
continue
|
||||||
|
keywords.append(word)
|
||||||
|
if score < minscore:
|
||||||
|
break
|
||||||
|
if len(keywords) >= self.amount:
|
||||||
|
break
|
||||||
|
return keywords
|
||||||
|
|
||||||
|
def thereIsSubWord(self, word, keywords):
|
||||||
|
for i, kw in enumerate(keywords):
|
||||||
|
if word in kw:
|
||||||
|
keywords[i] = word
|
||||||
|
return True
|
||||||
|
elif kw in word:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def addScoreByEnts(self, doc):
|
||||||
|
for wordspan in doc.ents:
|
||||||
|
for word in wordspan.lemma_.split('--'):
|
||||||
|
word = word.strip(' .,!?_-')
|
||||||
|
# This idea with stem words doesn't works well.
|
||||||
|
# phone -> phon is not wished
|
||||||
|
# word = self.stemmer.stemWord(word)
|
||||||
|
if len(word) <= 1:
|
||||||
|
continue
|
||||||
|
if '://' in word:
|
||||||
|
continue
|
||||||
|
if "\n" in word:
|
||||||
|
continue
|
||||||
|
self.score[word] += entsScore
|
14
hmlttotext.py
Normal file
14
hmlttotext.py
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
#
|
||||||
|
# takes html string from gnulinux.ch and return the plain text as a string
|
||||||
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
from trafilatura import extract
|
||||||
|
|
||||||
|
|
||||||
|
class HmltToText:
    """Callable that turns an HTML string into plain text.

    The conversion is delegated to trafilatura's ``extract``.
    """

    def __call__(self, htmlstr):
        """Return the plain text extracted from ``htmlstr``.

        ``extract`` yields None when it cannot find any content; an empty
        string is returned in that case so that callers always receive a
        ``str``.
        """
        text = extract(htmlstr)
        return '' if text is None else text
|
34
main.py
Executable file
34
main.py
Executable file
|
@ -0,0 +1,34 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# find keywords and exports them
|
||||||
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
from datareader import DataReader
|
||||||
|
from gnukeywords import GnuKeywords
|
||||||
|
from hmlttotext import HmltToText
|
||||||
|
from json import dump as jsondump
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Determine the keywords of all articles and export them.

    Reads the articles below ``./data/``, converts each one to plain
    text, determines its keywords and writes the mapping
    ``articlename -> keywords`` to ``tags.json``.

    Parameters:
        args: command line arguments; currently not evaluated.

    Returns:
        0 as the process exit status.
    """

    # Set up the objects
    keywords = {}
    datareader = DataReader(datadir='./data/')
    gnukeywords = GnuKeywords(lang='de')
    hmltToText = HmltToText()

    # Fill the dictionary with the keywords of every article
    for articlename, htmlstr in datareader:
        text = hmltToText(htmlstr)
        keywords[articlename] = gnukeywords(text)

    # Store the keywords. ensure_ascii=False writes non-ASCII characters
    # verbatim, so the file encoding must not depend on the locale.
    with open('tags.json', 'w', encoding='utf-8') as f:
        jsondump(keywords, f, indent=4, ensure_ascii=False)

    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: hand the command line to main() and use its
# return value as the exit status of the process.
if __name__ == '__main__':
    from sys import argv
    raise SystemExit(main(argv))
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
spacy
trafilatura
# gnukeywords.py imports nltk.stem.snowball at module level
nltk
|
25
setup.sh
Executable file
25
setup.sh
Executable file
|
@ -0,0 +1,25 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# setup gnu-keywords, automated script.
|
||||||
|
# Perhaps you want to do it yourself, in your way. Feel free
|
||||||
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
|
||||||
|
# Stop at the first failing command. Otherwise a failed creation or
# activation of the virtual environment would let the pip calls below
# install into whatever Python environment happens to be active.
set -e

# Create a Python virtual environment if none exists yet.
# python3 matches the interpreter named in the shebang of main.py.
if [ ! -d .env ]
then python3 -m venv .env
fi

# Activate the virtual environment
. .env/bin/activate

# Install the required packages
pip install -U pip
pip install -r requirements.txt

# Download the articles from gnulinux.ch and unpack them into ./data.
# -O makes a repeated run overwrite data.zip instead of creating
# data.zip.1; -o lets unzip overwrite existing files without prompting.
wget -O data.zip https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi/download/data.zip
unzip -o data.zip

# The beautiful-soup article is unfortunately incomplete, so it is
# removed right here.
if [ -f data/beautiful-soup/index.txt ]
then
    rm data/beautiful-soup/index.txt
    rmdir data/beautiful-soup/
fi
|
Loading…
Reference in a new issue