First public version

This commit is contained in:
Beat Jäckle 2022-05-18 18:51:42 +02:00
parent a13fd12aa0
commit 3d91ad040e
7 changed files with 107 additions and 1 deletions

4
.gitignore vendored
View file

@ -1,3 +1,7 @@
data
data.zip
*.kate-swp
# ---> Python # ---> Python
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/

View file

@ -1,3 +1,7 @@
# gnu-keywords # gnu-keywords
https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten Das ist das Repo für den [Wettbewerb von gnulinux.ch](https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten).
Die Datei data.zip stammt von der [GNU/Linux.ch Cloud](https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi).
Momentan ist das Repo noch privat. Der Code an sich ist GPL3+.

39
datareader.py Normal file
View file

@ -0,0 +1,39 @@
#
# reads the index file of every article in the data directory
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
# SPDX-License-Identifier: GPL-3.0-or-later
#
from os import listdir
class DataReader:
    """Iterate over the articles stored below a data directory.

    Every entry of ``datadir`` is treated as one article. Iterating yields
    ``[articlename, text]`` where ``text`` is the content of the file
    ``<datadir>/<articlename>/<indexname>``.
    """

    def __init__(self, datadir,
                 indexname='index.txt',
                 encoding='utf-8'
                 ):
        """Remember the data directory and list the articles it contains.

        datadir   -- directory with one entry per article; a trailing '/'
                     is appended when missing
        indexname -- name of the file to read inside each article entry
        encoding  -- text encoding used to read the index files
                     NOTE(review): UTF-8 is assumed for the article data,
                     please confirm; pass None for the locale default.

        Raises ValueError if datadir is empty.
        """
        if not datadir:
            # An empty string used to fail with an IndexError on
            # datadir[-1]; fail with a clear message instead of guessing.
            raise ValueError('datadir must not be empty')
        # Compare by value: "is not" tests object identity, which is not
        # guaranteed for equal strings.
        if not datadir.endswith('/'):
            datadir += '/'
        self.datadir = datadir
        self.indexname = indexname
        self.encoding = encoding
        # Defined here so that next() also works before iter() was called.
        self.indexnext = 0
        self.__readTree__()

    def __readTree__(self):
        """Store the names of all entries of the data directory."""
        self.articles = listdir(self.datadir)

    def __iter__(self):
        """Restart the iteration at the first article and return self."""
        self.indexnext = 0
        return self

    def __next__(self):
        """Return [articlename, text] for the next article.

        Raises StopIteration after the last article. Errors raised by
        open() (e.g. a missing index file) are not caught.
        """
        if self.indexnext >= len(self.articles):
            raise StopIteration
        articlename = self.articles[self.indexnext]
        self.indexnext += 1
        path = self.datadir + articlename + '/' + self.indexname
        with open(path, "r", encoding=self.encoding) as f:
            return [articlename, f.read()]

19
gnukeywords.py Normal file
View file

@ -0,0 +1,19 @@
#
# finds keywords in articles and returns them
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
# SPDX-License-Identifier: GPL-3.0-or-later
#
import spacy
class GnuKeywords:
    """Callable that returns a list of keywords for an article.

    The extraction is still a placeholder: calling an instance returns the
    same fixed list of words, whatever input it is given.
    """

    # Model name per language code; nothing in this class reads it yet.
    modules = dict(
        de='de_core_news_sm',
        en='en_core_web_lg',
    )

    def __init__(self, lang):
        """Store the language code (presumably a key of ``modules``)."""
        self.lang = lang

    def __call__(self, htmlstr):
        """Return the keywords for htmlstr (currently a fixed list)."""
        placeholder = ('gnu', 'linux', 'for', 'ever', 'love')
        # Build a new list on every call, as a list literal would.
        return list(placeholder)

22
main.py Normal file
View file

@ -0,0 +1,22 @@
#!/usr/bin/env python3
# find keywords and exports them
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
# SPDX-License-Identifier: GPL-3.0-or-later
#
from datareader import DataReader
from gnukeywords import GnuKeywords
def main(args):
    """Print the keywords of every article found below ./data/.

    args is accepted but not used. For each article the keywords are
    stored under the article name and one line "<name> -> <keywords>" is
    printed. Always returns 0.
    """
    reader = DataReader(datadir='./data/')
    extract = GnuKeywords(lang='de')
    keywords = {}
    for entry in reader:
        articlename, htmlstr = entry
        found = extract(htmlstr)
        keywords[articlename] = found
        print(f"{articlename} -> {found}")
    return 0
if __name__ == '__main__':
    # Script entry point: hand the command line to main() and use its
    # return value as the process exit status.
    import sys
    raise SystemExit(main(sys.argv))

1
requirements.txt Normal file
View file

@ -0,0 +1 @@
spacy

17
setup.sh Executable file
View file

@ -0,0 +1,17 @@
#!/bin/bash
# Set up gnu-keywords; automated script.
# Perhaps you want to do it yourself, in your own way. Feel free.
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
# SPDX-License-Identifier: GPL-3.0-or-later
#
# Abort on the first failing step instead of continuing with a broken setup.
set -e

# Create the virtual environment only once. Prefer python3 and fall back
# to python where python3 is not installed.
if [ ! -d .env ]
then
    PYTHON=$(command -v python3 || command -v python)
    "$PYTHON" -m venv .env
fi
. .env/bin/activate
pip install -U pip
pip install -r requirements.txt
python -m spacy download de_core_news_sm
# -O: always write to data.zip; a plain wget would store a repeated
# download as data.zip.1 and leave the old archive in place.
wget -O data.zip https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi/download/data.zip
# -o: overwrite already extracted files without prompting.
unzip -o data.zip