First public version
This commit is contained in:
parent
a13fd12aa0
commit
3d91ad040e
7 changed files with 107 additions and 1 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -1,3 +1,7 @@
|
||||||
|
data
|
||||||
|
data.zip
|
||||||
|
*.kate-swp
|
||||||
|
|
||||||
# ---> Python
|
# ---> Python
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
# gnu-keywords
|
# gnu-keywords
|
||||||
|
|
||||||
https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten
|
Das ist das Repo für den [Wettbewerb von gnulinux.ch](https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten).
|
||||||
|
|
||||||
|
Die Datei data.zip stammt von der [GNU/Linux.ch Cloud](https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi).
|
||||||
|
|
||||||
|
Momentan ist das Repo noch privat. Der Code an sich ist GPL3+.
|
||||||
|
|
39
datareader.py
Normal file
39
datareader.py
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
#
|
||||||
|
# iterate over the articles stored in a data directory
|
||||||
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
from os import listdir
|
||||||
|
|
||||||
|
|
||||||
|
class DataReader:
    """Iterate over the articles stored below a data directory.

    Every entry of ``datadir`` is treated as one article whose text is read
    from ``<datadir><entry>/<indexname>``.  Iterating over an instance
    yields one ``[articlename, content]`` list per entry, in sorted order
    of the entry names.
    """

    def __init__(self, datadir,
                 indexname='index.txt',
                 encoding=None
                 ):
        """Remember the directory layout and list the available articles.

        Args:
            datadir: Path of the directory that holds one sub-directory per
                article.  A trailing ``/`` is appended when it is missing.
            indexname: Name of the file read inside each article directory.
            encoding: Text encoding passed to ``open``; ``None`` keeps the
                platform default.

        Raises:
            OSError: If ``datadir`` cannot be listed.
        """
        # Compare by value.  ``is not '/'`` tested object identity, which
        # only worked by accident of string interning and triggers a
        # SyntaxWarning on current Python versions.
        if datadir and not datadir.endswith('/'):
            datadir += '/'
        self.datadir = datadir
        self.indexname = indexname
        self.encoding = encoding
        # Start position, so next() also works without a prior iter().
        self.indexnext = 0

        self.__readTree__()

    def __readTree__(self):
        """Store the entry names of ``datadir`` in ``self.articles``."""
        # listdir() returns names in arbitrary order; sorting makes the
        # iteration order reproducible between runs and machines.
        self.articles = sorted(listdir(self.datadir))

    def __iter__(self):
        """Restart the iteration at the first article and return self."""
        self.indexnext = 0
        return self

    def __next__(self):
        """Return ``[articlename, content]`` for the next article.

        Raises:
            StopIteration: When every article has been returned.
            OSError: If the index file of the article cannot be opened.
        """
        if self.indexnext >= len(self.articles):
            raise StopIteration

        articlename = self.articles[self.indexnext]
        # Advance before reading, so a failing article is not retried
        # forever by a caller that keeps calling next().
        self.indexnext += 1

        path = self.datadir + articlename + '/' + self.indexname
        with open(path, "r", encoding=self.encoding) as f:
            return [articlename, f.read()]
|
19
gnukeywords.py
Normal file
19
gnukeywords.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
#
|
||||||
|
# find keywords in articles and returns them
|
||||||
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
import spacy
|
||||||
|
|
||||||
|
|
||||||
|
class GnuKeywords:
    """Keyword finder for article HTML (placeholder implementation).

    Calling an instance with an HTML string returns a list of keywords.
    At the moment the input is ignored and a fixed list is returned.
    """

    # Language code -> model name (these look like spaCy model packages).
    # The mapping is not consulted by the methods below.
    modules = dict(
        de='de_core_news_sm',
        en='en_core_web_lg',
    )

    def __init__(self, lang):
        """Store the language code; nothing is loaded or validated here."""
        self.lang = lang  # presumably a key of ``modules`` such as 'de'

    def __call__(self, htmlstr):
        """Return the keyword list for ``htmlstr``.

        ``htmlstr`` is currently unused; every call returns a new list
        holding the same five fixed words.
        """
        placeholder = ('gnu', 'linux', 'for', 'ever', 'love')
        return list(placeholder)
|
22
main.py
Normal file
22
main.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# find keywords and exports them
|
||||||
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
from datareader import DataReader
|
||||||
|
from gnukeywords import GnuKeywords
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Print the keywords found for every article below ``./data/``.

    Args:
        args: Command-line arguments; accepted but not used.

    Returns:
        Always 0, which the caller uses as the exit status.
    """
    reader = DataReader(datadir='./data/')
    extract = GnuKeywords(lang='de')

    # Maps article name -> keyword list; filled while printing.
    keywords = {}
    for entry in reader:
        articlename, htmlstr = entry
        keywords[articlename] = extract(htmlstr)
        print(f"{articlename} -> {keywords[articlename]}")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: hand the command line to main() and use its
    # return value as the process exit status.
    import sys

    raise SystemExit(main(sys.argv))
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
spacy
|
17
setup.sh
Executable file
17
setup.sh
Executable file
|
@ -0,0 +1,17 @@
|
||||||
|
#!/bin/bash
# setup gnu-keywords, automated script.
# Perhaps you want to do it yourself, in your way. Feel free
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
# SPDX-License-Identifier: GPL-3.0-or-later
#

# Stop at the first failing step instead of carrying on with a
# half-built environment.
set -e

# Create the virtual environment only once. python3 is used here because
# a plain "python" command is not available on every system; inside the
# activated environment "python" always exists.
if [ ! -d .env ]
then python3 -m venv .env
fi

. .env/bin/activate
pip install -U pip
pip install -r requirements.txt
python -m spacy download de_core_news_sm

# Download the article archive only when it is missing; a second wget
# would otherwise store another copy as data.zip.1.
if [ ! -f data.zip ]
then wget https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi/download/data.zip
fi

# -o: overwrite already extracted files without asking, so repeated runs
# stay non-interactive.
unzip -o data.zip
|
Loading…
Add table
Reference in a new issue