From 3d91ad040e215c1d1fba9324284742bbfcec721e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beat=20J=C3=A4ckle?=
Date: Wed, 18 May 2022 18:51:42 +0200
Subject: [PATCH] First public version

---
Review note: file contents were revised after export; the blob ids on
the "index" lines below still refer to the originally exported blobs.

 .gitignore       |  4 ++++
 README.md        |  6 +++++-
 datareader.py    | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 gnukeywords.py   | 27 +++++++++++++++++++++++++++
 main.py          | 27 +++++++++++++++++++++++++++
 requirements.txt |  1 +
 setup.sh         | 23 +++++++++++++++++++++++
 7 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 datareader.py
 create mode 100644 gnukeywords.py
 create mode 100644 main.py
 create mode 100644 requirements.txt
 create mode 100755 setup.sh

diff --git a/.gitignore b/.gitignore
index f8b73e7..7411747 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
+data
+data.zip
+*.kate-swp
+
 # ---> Python
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index 51ecdc5..ef27c3b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
 # gnu-keywords
 
-https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten
\ No newline at end of file
+Das ist das Repo für den [Wettbewerb von gnulinux.ch](https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten).
+
+Die Datei data.zip stammt von der [GNU/Linux.ch Cloud](https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi).
+
+Momentan ist das Repo noch privat. Der Code an sich ist GPL3+
diff --git a/datareader.py b/datareader.py
new file mode 100644
index 0000000..70f8aea
--- /dev/null
+++ b/datareader.py
@@ -0,0 +1,48 @@
+#
+#
+# SPDX-FileCopyrightText: 2022 Beat Jäckle
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+from os import listdir
+
+
+class DataReader:
+    """Iterate over the articles stored below a data directory.
+
+    Every entry of ``datadir`` is treated as one article whose text
+    is read from ``<datadir>/<entry>/<indexname>``.  Iterating yields
+    ``[articlename, text]`` lists.
+    """
+
+    def __init__(self, datadir, indexname='index.txt'):
+        if not datadir:
+            raise ValueError('datadir must not be empty')
+        # Compare by value: "is not" tests object identity, which is
+        # not guaranteed for equal strings (SyntaxWarning since 3.8).
+        if not datadir.endswith('/'):
+            datadir += '/'
+        self.datadir = datadir
+        self.indexname = indexname
+        self.indexnext = 0  # lets next() work before iter() is called
+
+        self.__readTree__()
+
+    def __readTree__(self):
+        """Cache the entry names of ``datadir``, sorted for stable output."""
+        self.articles = sorted(listdir(self.datadir))
+
+    def __iter__(self):
+        self.indexnext = 0
+        return self
+
+    def __next__(self):
+        """Return ``[articlename, text]`` for the next article."""
+        try:
+            articlename = self.articles[self.indexnext]
+        except IndexError:
+            raise StopIteration from None
+        self.indexnext += 1
+
+        path = self.datadir + articlename + '/' + self.indexname
+        with open(path, "r") as f:
+            return [articlename, f.read()]
diff --git a/gnukeywords.py b/gnukeywords.py
new file mode 100644
index 0000000..1f4c030
--- /dev/null
+++ b/gnukeywords.py
@@ -0,0 +1,27 @@
+#
+# finds keywords in articles and returns them
+# SPDX-FileCopyrightText: 2022 Beat Jäckle
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+import spacy
+
+
+class GnuKeywords:
+    """Callable that returns a list of keywords for an article."""
+
+    # language code -> name of a spaCy model package
+    modules = {
+        'de': 'de_core_news_sm',
+        'en': 'en_core_web_lg'
+    }
+
+    def __init__(self, lang):
+        self.lang = lang  # presumably a key of ``modules``, e.g. 'de'
+
+    def __call__(self, htmlstr):
+        """Return the keywords for ``htmlstr``.
+
+        Placeholder: ``htmlstr`` and ``self.lang`` are not evaluated
+        yet; the same fixed list is returned for every article.
+        """
+        return ['gnu', 'linux', 'for', 'ever', 'love']
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..981fcb9
--- /dev/null
+++ b/main.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+# finds keywords and exports them
+# SPDX-FileCopyrightText: 2022 Beat Jäckle
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+from datareader import DataReader
+from gnukeywords import GnuKeywords
+
+
+def main(args):
+    """Print the keywords found for every article below ./data/.
+
+    ``args`` (the command line) is currently unused.  Returns 0,
+    which the caller passes on as the exit status.
+    """
+    keywords = {}
+    datareader = DataReader(datadir='./data/')
+    gnukeywords = GnuKeywords(lang='de')
+    for articlename, htmlstr in datareader:
+        keywords[articlename] = gnukeywords(htmlstr)
+        print(f"{articlename} -> {keywords[articlename]}")
+    return 0
+
+
+if __name__ == '__main__':
+    import sys
+    sys.exit(main(sys.argv))
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..568e4fc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+spacy
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000..3eaeffe
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# setup gnu-keywords, automated script.
+# Perhaps you want to do it yourself, in your way. Feel free
+# SPDX-FileCopyrightText: 2022 Beat Jäckle
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Stop at the first failing step, so that a failed venv setup does not
+# make pip install into another Python environment.
+set -e
+
+if [ ! -d .env ]
+then python3 -m venv .env
+fi
+
+. .env/bin/activate
+pip install -U pip
+pip install -r requirements.txt
+python -m spacy download de_core_news_sm
+
+# -O / -o: a rerun replaces the old download instead of creating
+# data.zip.1 and waiting for an overwrite confirmation.
+wget -O data.zip https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi/download/data.zip
+unzip -o data.zip