First public version
This commit is contained in:
parent
a13fd12aa0
commit
a842542ffa
9 changed files with 303 additions and 1 deletions
5
.gitignore
vendored
5
.gitignore
vendored
|
@ -1,3 +1,8 @@
|
|||
data
|
||||
data.zip
|
||||
*.kate-swp
|
||||
tags.json
|
||||
|
||||
# ---> Python
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
|
45
README.md
45
README.md
|
@ -1,3 +1,46 @@
|
|||
# gnu-keywords
|
||||
|
||||
https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten
|
||||
Das ist das Repo für die Abgabe von Beat Jäckle zu dem [Wettbewerb von gnulinux.ch](https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten).
|
||||
|
||||
Die Datei `data.zip` wird benötigt und stammt von der [GNU/Linux.ch Cloud](https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi).
|
||||
Der Download kann mit `setup.sh` durchgeführt werden.
|
||||
|
||||
Diese Software ist lizenziert unter GPL3+.
|
||||
|
||||
## Installation
|
||||
|
||||
Kopiere dieses Repository an einen beliebigen Ort auf deinem Computer.
|
||||
|
||||
In der `setup.sh` Datei sind die Anweisungen gespeichert.
|
||||
Die Datei kann auch als Bash-Skript ausgeführt werden.
|
||||
|
||||
## Start
|
||||
|
||||
Die Pythonumgebung muss aktiviert sein.
|
||||
`. .env/bin/activate`
|
||||
|
||||
Danach kann man die `main.py` ausführen.
|
||||
Momentan gibt es keine Optionen, die man in der Befehlszeile anpassen könnte.
|
||||
|
||||
Beim ersten Start lädt die Software das Sprachmodul herunter. Dies kann länger dauern.
|
||||
|
||||
## Anpassungen
|
||||
|
||||
Die Konfiguration für das Programm ist in der Datei `config.py` direkt in Python-Syntax gespeichert.
|
||||
Darin kann man die Parameter anpassen, welche schlüsselgebend für die Bestimmung der Keywords sind.
|
||||
|
||||
Wenn der Ablauf angepasst werden sollte, so kann man die `main.py` sehr gut anpassen.
|
||||
Der Prozess besteht aus 9 Zeilen. Dieser ist in drei Blöcke unterteilt:
|
||||
|
||||
- Initialisierung der Objekte
|
||||
4 Zeilen
|
||||
- Keywords bestimmen
|
||||
3 Zeilen
|
||||
- Export
|
||||
2 Zeilen
|
||||
|
||||
# Mögliche Roadmap
|
||||
|
||||
- Die `main.py` Datei könnte eine usage/help bekommen.
|
||||
- Optionen in die `main.py` einbauen.
|
||||
- Die Score Funktionen in `gnukeywords.py` können natürlich ganz anders gschrieben werden.
|
||||
|
|
33
config.py
Normal file
33
config.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
#
|
||||
# settings for gnukeywords
|
||||
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||
# SPDX-License-Identifier: WTFPL
|
||||
|
||||
# Parts of speech (spaCy coarse-grained ``token.pos_`` tags).
# posAllow: tags whose tokens may be counted as keyword candidates.
# posDeny: tags whose tokens are always ignored.
# gnukeywords.py raises NotImplementedError for a token whose tag is in
# neither set, so every tag the language pipeline can emit has to be listed
# in exactly one of them.  INTJ (interjection) and SYM (symbol) are part of
# the Universal POS tag set and are listed as denied for that reason.
posAllow = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
posDeny = {'VERB', 'SPACE', 'X', 'PUNCT', 'ADP',
           'NUM', 'PRON', 'DET', 'SCONJ', 'PART', 'AUX', 'CCONJ',
           'INTJ', 'SYM'}

# Words that are skipped entirely.
skipWord = {
    'Mo', 'Di', 'Mi', 'Do', 'Fr', 'Sa', 'So',
    'Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
    'August', 'September', 'Oktober', 'November', 'Dezember',
    'Linux', 'Befehl',
    'S.u', 'S.E.', '4.4.1',
}

# Words or abbreviations that are replaced by another spelling while the
# text is processed (key: word as found, value: word that is scored).
remap = {
    'Apps': 'App',
    'Pakete': 'Paket',
    'CLT': 'Chemnitzer Linux Tage',
}

# Score added for every occurrence of a word inside a named entity
# (``doc.ents``).  Entity words make very good keywords, hence the high
# value compared to the 1 point a plain token occurrence is worth.
entsScore = 10
|
43
datareader.py
Normal file
43
datareader.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
#
|
||||
# reads the articles below a data directory and yields their name and content
|
||||
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
from os import listdir
|
||||
|
||||
|
||||
class DataReader:
    """Iterate over the articles stored below a data directory.

    Every entry of ``datadir`` is treated as one article: a directory that
    contains an index file.  Iterating over a reader yields
    ``[articlename, content]`` lists, one per entry, in sorted name order.
    The iteration state lives on the instance, so a single reader cannot be
    iterated in two nested loops at the same time.
    """

    def __init__(self, datadir,
                 indexname='index.txt',
                 skippFirstLine=True
                 ):
        """Remember the settings and read the list of articles.

        Args:
            datadir: Directory that holds one sub-directory per article.
                A trailing ``/`` is added when missing; an empty string
                means the current directory.
            indexname: Name of the file to read inside each article
                directory.
            skippFirstLine: When true, the first line of every index file
                is dropped from the returned content.
        """
        if not datadir:
            # ''[-1] used to raise IndexError; treat it as "here" instead.
            datadir = './'
        elif not datadir.endswith('/'):
            datadir += '/'
        self.datadir = datadir
        self.indexname = indexname
        self.skippFirstLine = skippFirstLine
        # Defined up front so next() also works without a prior iter().
        self.indexnext = 0

        self.__readTree__()

    def __readTree__(self):
        """Store the sorted entry names of ``datadir`` in ``self.articles``."""
        # listdir() returns names in arbitrary order; sorting makes the
        # iteration order reproducible between runs and machines.
        self.articles = sorted(listdir(self.datadir))

    def __iter__(self):
        """Restart the iteration at the first article and return self."""
        self.indexnext = 0
        return self

    def __next__(self):
        """Return ``[articlename, content]`` for the next article.

        Raises:
            StopIteration: When all articles have been returned.
            OSError: When the index file of an entry cannot be opened.
        """
        if self.indexnext >= len(self.articles):
            raise StopIteration
        articlename = self.articles[self.indexnext]
        self.indexnext += 1
        path = self.datadir + articlename + '/' + self.indexname

        # Explicit encoding: the result must not depend on the locale.
        with open(path, "r", encoding="utf-8") as f:
            if self.skippFirstLine:
                f.readline()
            return [articlename, f.read()]
|
103
gnukeywords.py
Normal file
103
gnukeywords.py
Normal file
|
@ -0,0 +1,103 @@
|
|||
#
|
||||
# finds keywords in articles and returns them
|
||||
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
import spacy
|
||||
from collections import Counter
|
||||
from config import posAllow, posDeny, skipWord, remap, entsScore
|
||||
from nltk.stem.snowball import SnowballStemmer
|
||||
# from snowballstemmer import stemmer
|
||||
|
||||
|
||||
class GnuKeywords:
    """Pick keywords for a text with the help of a spaCy pipeline.

    An instance is callable: ``GnuKeywords('de')(text)`` returns a list of
    keywords.  Words are scored in two passes over the parsed document
    (named entities and plain token counts) and the best scored words are
    returned.
    """

    # Short language codes accepted by __init__ -> spaCy pipeline package.
    modules = {
        'de': 'de_core_news_lg',
        'de_sm': 'de_core_news_sm',
        'en': 'en_core_web_lg'
    }

    def __init__(self, lang, amountofkeywords=5):
        """Load the pipeline for ``lang`` and store the keyword limit.

        Args:
            lang: Key of ``GnuKeywords.modules``.
            amountofkeywords: Maximum number of keywords per text.

        Raises:
            KeyError: When ``lang`` is not a key of ``modules``.
        """
        self.loadModule(self.modules[lang])
        self.amount = amountofkeywords

    def loadModule(self, module):
        """Load the spaCy package ``module`` into ``self.nlp``.

        When loading raises OSError the package is downloaded through
        ``spacy.cli.download`` and loading is tried a second time.
        """
        try:
            self.nlp = spacy.load(module)
        except OSError:
            spacy.cli.download(module)
            self.nlp = spacy.load(module)

    def __call__(self, htmlstr):
        """Return the list of keywords found in the text ``htmlstr``.

        ``None`` or an empty string (e.g. when the text extraction found
        nothing) yields an empty list instead of an error.
        """
        if not htmlstr:
            return []
        # The score table only lives for the duration of one call.
        self.score = Counter()
        try:
            doc = self.nlp(htmlstr)
            self.addScoreByEnts(doc)
            self.addScoreByCount(doc)
            return self.getKeywordsByScore()
        finally:
            del self.score

    def cusromFilter(self, word):
        """Apply ``skipWord`` and ``remap`` from the configuration.

        Returns:
            None when ``word`` is listed in ``skipWord``, the replacement
            when it is a key of ``remap``, otherwise ``word`` unchanged.
        """
        if word in skipWord:
            return None
        return remap.get(word, word)

    def addScoreByCount(self, doc):
        """Add one point per occurrence of every allowed token lemma.

        Punctuation, stop words, digits and tokens whose part of speech is
        in ``posDeny`` are ignored.

        Raises:
            NotImplementedError: When a token has a part of speech that is
                in neither ``posDeny`` nor ``posAllow``.
        """
        for token in doc:
            if token.is_punct or token.is_stop or token.is_digit:
                continue
            if token.pos_ in posDeny:
                continue
            if token.pos_ not in posAllow:
                raise NotImplementedError(
                    'A token.pos_ is not set in config.py:',
                    token, token.pos_)

            word = self.cusromFilter(token.lemma_)
            if word is None:
                continue

            self.score[word] += 1

    def getKeywordsByScore(self, minscore=2):
        """Return the best scored words as a list.

        Words are visited from the highest to the lowest score.  The walk
        stops at the first word scored below ``minscore`` or as soon as
        ``self.amount`` keywords are collected.  A word that contains, or is
        contained in, an already chosen keyword does not add a new entry.

        Args:
            minscore: Lowest score a word needs to become a keyword.
        """
        keywords = []
        for (word, score) in self.score.most_common():
            # Both limits are checked before the word is touched, so a word
            # below minscore can neither be appended nor replace a keyword.
            if len(keywords) >= self.amount or score < minscore:
                break
            # Either skips the word or swaps a longer keyword for it.
            if self.thereIsSubWord(word, keywords):
                continue
            keywords.append(word)
        return keywords

    def thereIsSubWord(self, word, keywords):
        """Tell whether ``word`` overlaps with one of ``keywords``.

        When ``word`` is a substring of a keyword, that keyword is replaced
        in place by the shorter ``word``.

        Returns:
            True when ``word`` is contained in a keyword or contains one,
            False otherwise.
        """
        for i, kw in enumerate(keywords):
            if word in kw:
                keywords[i] = word
                return True
            elif kw in word:
                return True
        return False

    def addScoreByEnts(self, doc):
        """Add ``entsScore`` points for every usable part of an entity.

        The lemma of each entity span is split at ``--`` and every part is
        stripped of surrounding blanks and punctuation.  Parts of at most
        one character, parts containing ``://`` and parts containing a
        newline are ignored.
        """
        # NOTE: stemming the parts was tried and dropped; it shortens words
        # too aggressively (phone -> phon).
        for wordspan in doc.ents:
            for word in wordspan.lemma_.split('--'):
                word = word.strip(' .,!?_-')
                if len(word) <= 1:
                    continue
                if '://' in word:
                    continue
                if "\n" in word:
                    continue
                self.score[word] += entsScore
|
14
hmlttotext.py
Normal file
14
hmlttotext.py
Normal file
|
@ -0,0 +1,14 @@
|
|||
#
|
||||
# takes html string from gnulinux.ch and return the plain text as a string
|
||||
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
from trafilatura import extract
|
||||
|
||||
|
||||
class HmltToText:
    """Callable that turns an HTML document into its plain text."""

    def __init__(self):
        """Nothing to configure; present for a uniform object interface."""
        pass

    def __call__(self, htmlstr):
        """Return the plain text of ``htmlstr`` as a string.

        The extraction is delegated to ``trafilatura.extract``.  When it
        yields no text (``None``), an empty string is returned so that the
        result is always a string.
        """
        text = extract(htmlstr)
        return '' if text is None else text
|
34
main.py
Executable file
34
main.py
Executable file
|
@ -0,0 +1,34 @@
|
|||
#!/usr/bin/env python3
|
||||
# find keywords and exports them
|
||||
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
from datareader import DataReader
|
||||
from gnukeywords import GnuKeywords
|
||||
from hmlttotext import HmltToText
|
||||
from json import dump as jsondump
|
||||
|
||||
|
||||
def main(args):
    """Find the keywords of all articles and write them to ``tags.json``.

    Args:
        args: Command line arguments; currently not evaluated.

    Returns:
        0, used as the exit status of the process.
    """

    # Set up the objects
    keywords = dict()
    datareader = DataReader(datadir='./data/')
    gnukeywords = GnuKeywords(lang='de')
    hmltToText = HmltToText()

    # Fill the dictionary: article name -> list of keywords
    for articlename, htmlstr in datareader:
        text = hmltToText(htmlstr)
        keywords[articlename] = gnukeywords(text)

    # Store the keywords.  ensure_ascii=False writes umlauts verbatim, so
    # the file encoding is fixed to UTF-8 instead of the locale default.
    with open('tags.json', 'w', encoding='utf-8') as f:
        jsondump(keywords, f, indent=4, ensure_ascii=False)

    return 0


if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
spacy
|
||||
trafilatura
|
25
setup.sh
Executable file
25
setup.sh
Executable file
|
@ -0,0 +1,25 @@
|
|||
#!/bin/bash
# setup gnu-keywords, automated script.
# Perhaps you want to do it yourself, in your way. Feel free
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
# SPDX-License-Identifier: GPL-3.0-or-later
#

# Stop at the first failing step instead of continuing with a broken setup.
set -e

# Create a Python environment unless one already exists.
# python3 matches the interpreter named in the shebang of main.py.
if [ ! -d .env ]
then python3 -m venv .env
fi

# Activate the Python environment
. .env/bin/activate

# Install the required packages
pip install -U pip
pip install -r requirements.txt

# Download the articles of gnulinux.ch and unpack them into ./data.
# -O makes a rerun overwrite data.zip instead of creating data.zip.1,
# -o makes unzip overwrite existing files without asking.
wget -O data.zip https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi/download/data.zip
unzip -o data.zip

# The beautiful-soup article is incomplete, so it is removed right here.
if [ -f "data/beautiful-soup/index.txt" ]
then
    rm "data/beautiful-soup/index.txt"
    rmdir "data/beautiful-soup/"
fi
|
Loading…
Reference in a new issue