First public version

2022-05-18 18:51:42 +02:00 · 2022-05-18 18:51:42 +02:00 · 3d91ad040e
commit 3d91ad040e
parent a13fd12aa0
7 changed files with 107 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,7 @@
+data
+data.zip
+*.kate-swp
+
 # ---> Python
 # Byte-compiled / optimized / DLL files
 __pycache__/
--- a/README.md
+++ b/README.md
@ -1,3 +1,7 @@
 # gnu-keywords

-https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten
+Das ist das Repo für den [Wettbewerb von gnulinux.ch](https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten).
+
+Die Datei data.zip stammt von der [GNU/Linux.ch Cloud](https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi).
+
+Momentan ist das Repo noch privat. Der Code an sich ist GPL3+.
--- a/datareader.py
+++ b/datareader.py
@ -0,0 +1,39 @@
+#
+# reads the index file of every article found below a data directory
+# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+from os import listdir
+
+
+class DataReader:
+    """Iterate over the articles stored below a data directory.
+
+    Every entry of ``datadir`` is treated as an article directory that
+    contains a file named ``indexname``.  Iterating over the reader
+    yields ``[articlename, content]`` lists, where ``content`` is the
+    text of that file.
+    """
+
+    def __init__(self, datadir,
+                 indexname='index.txt'
+                 ):
+        """List the articles found in ``datadir``.
+
+        datadir: directory whose entries are the article directories;
+            a trailing '/' is appended when missing.
+        indexname: name of the file read inside each article directory.
+
+        Raises ValueError when ``datadir`` is empty.
+        """
+        if not datadir:
+            raise ValueError('datadir must not be empty')
+        # Compare by value: ``is not`` tests object identity, which only
+        # happens to work when the interpreter shares the string object.
+        if not datadir.endswith('/'):
+            datadir += '/'
+        self.datadir = datadir
+        self.indexname = indexname
+        # Start position, so next() also works before iter() was called.
+        self.indexnext = 0
+
+        self.__readTree__()
+
+    def __readTree__(self):
+        """Store the entry names of ``self.datadir`` in ``self.articles``."""
+        # listdir() returns the entries in arbitrary order; sorting makes
+        # the iteration order reproducible between runs.
+        self.articles = sorted(listdir(self.datadir))
+
+    def __iter__(self):
+        """Restart at the first article and return the reader itself."""
+        self.indexnext = 0
+        return self
+
+    def __next__(self):
+        """Return ``[articlename, content]`` for the next article.
+
+        Raises StopIteration once every article has been returned.
+        Errors from open() (e.g. a missing index file) are not caught.
+        """
+        if self.indexnext >= len(self.articles):
+            raise StopIteration
+
+        articlename = self.articles[self.indexnext]
+        self.indexnext += 1
+        path = self.datadir + articlename + '/' + self.indexname
+
+        with open(path, "r") as f:
+            return [articlename, f.read()]
--- a/gnukeywords.py
+++ b/gnukeywords.py
@ -0,0 +1,19 @@
+#
+# finds keywords in articles and returns them
+# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+import spacy
+
+
+class GnuKeywords:
+    """Callable that turns the HTML of an article into a keyword list.
+
+    The extraction is still a stub: every call returns the same fixed
+    list, whatever the input is.
+    """
+
+    # Model name per language code (presumably spaCy model packages;
+    # the mapping is not read anywhere in this class yet).
+    modules = dict(
+        de='de_core_news_sm',
+        en='en_core_web_lg',
+    )
+
+    def __init__(self, lang):
+        """Remember ``lang``; the value is stored as given, unvalidated."""
+        # Presumably a key of ``modules`` such as 'de' -- TODO confirm.
+        self.lang = lang
+
+    def __call__(self, htmlstr):
+        """Return the keywords for ``htmlstr`` (currently a placeholder)."""
+        placeholder = ['gnu', 'linux', 'for', 'ever', 'love']
+        return placeholder
--- a/main.py
+++ b/main.py
@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+# finds keywords and exports them
+# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+from datareader import DataReader
+from gnukeywords import GnuKeywords
+
+
+def main(args):
+    """Print the keywords of every article below ``./data/``.
+
+    args: command-line arguments; currently not used.
+
+    Returns 0, which the caller uses as process exit status.
+    """
+    reader = DataReader(datadir='./data/')
+    extract = GnuKeywords(lang='de')
+
+    # Results per article name; filled while printing.
+    keywords = {}
+    for name, html in reader:
+        found = extract(html)
+        keywords[name] = found
+        print(f"{name} -> {found}")
+    return 0
+
+
+if __name__ == '__main__':
+    # Run main() with the process arguments and exit with its result.
+    from sys import argv, exit as _exit
+    _exit(main(argv))
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1 @@
+spacy
--- a/setup.sh
+++ b/setup.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+# setup gnu-keywords, automated script.
+# Perhaps you want to do it yourself, in your way. Feel free.
+# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Stop at the first failing command, so that e.g. a failed venv activation
+# does not let the pip calls below run outside the virtual environment.
+set -e
+
+# Create the virtual environment only once. python3 is named explicitly
+# because a plain "python" may be missing or point to Python 2.
+if [ ! -d .env ]
+then python3 -m venv .env
+fi
+
+. .env/bin/activate
+pip install -U pip
+pip install -r requirements.txt
+python -m spacy download de_core_news_sm
+
+# -O: always write to data.zip. Without it a re-run saves the download as
+# data.zip.1 and the stale data.zip is unpacked again.
+wget -O data.zip https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi/download/data.zip
+# -o: overwrite already unpacked files instead of prompting on re-runs.
+unzip -o data.zip