First public version
This commit is contained in:
parent
a13fd12aa0
commit
3d91ad040e
7 changed files with 107 additions and 1 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -1,3 +1,7 @@
|
|||
data
|
||||
data.zip
|
||||
*.kate-swp
|
||||
|
||||
# ---> Python
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
# gnu-keywords
|
||||
|
||||
https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten
|
||||
Das ist das Repo für den [Wettbewerb von gnulinux.ch](https://gnulinux.ch/wettbewerb-gnu-linux-ch-verschlagworten).
|
||||
|
||||
Die Datei data.zip stammt von der [GNU/Linux.ch Cloud](https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi).
|
||||
|
||||
Momentan ist das Repo noch privat. Der Code an sich ist unter GPL-3.0-or-later lizenziert.
|
||||
|
|
39
datareader.py
Normal file
39
datareader.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
#
|
||||
# read the articles of a data directory and iterate over them
|
||||
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
from os import listdir
|
||||
|
||||
|
||||
class DataReader:
    """Iterate over the articles stored below a data directory.

    Every entry of ``datadir`` is treated as one article whose text is read
    from ``<datadir>/<article>/<indexname>``.  Iterating over an instance
    yields ``[articlename, content]`` lists, one per article.
    """

    def __init__(self, datadir,
                 indexname='index.txt'
                 ):
        """Remember the directory layout and list the available articles.

        Args:
            datadir: Path of the directory that contains one sub-directory
                per article.  A trailing ``/`` is appended when missing.
            indexname: Name of the file read inside every article directory.

        Raises:
            ValueError: If ``datadir`` is empty.
            OSError: If ``datadir`` cannot be listed.
        """
        if not datadir:
            raise ValueError('datadir must not be empty')
        # Compare by value.  The former ``is not '/'`` tested object
        # identity, which only worked by accident of string interning.
        if not datadir.endswith('/'):
            datadir += '/'
        self.datadir = datadir
        self.indexname = indexname
        # Initialised here so that next() works without a prior iter().
        self.indexnext = 0

        self.__readTree__()

    def __readTree__(self):
        """Store the names of all entries of ``datadir`` in ``self.articles``."""
        # listdir() returns entries in arbitrary order; sorting makes the
        # iteration order reproducible between runs and machines.
        self.articles = sorted(listdir(self.datadir))

    def __iter__(self):
        """Restart the iteration at the first article and return ``self``."""
        self.indexnext = 0
        return self

    def __next__(self):
        """Return ``[articlename, content]`` for the next article.

        Raises:
            StopIteration: When every article has been returned.
            OSError: If the article's index file cannot be opened.
        """
        if self.indexnext >= len(self.articles):
            raise StopIteration

        articlename = self.articles[self.indexnext]
        self.indexnext += 1
        path = self.datadir + articlename + '/' + self.indexname

        # NOTE(review): assumes the article files are UTF-8 encoded; the
        # previous code used the locale's default encoding - confirm.
        with open(path, "r", encoding="utf-8") as f:
            return [articlename, f.read()]
|
19
gnukeywords.py
Normal file
19
gnukeywords.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
#
|
||||
# find keywords in articles and returns them
|
||||
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
import spacy
|
||||
|
||||
|
||||
class GnuKeywords:
    """Callable that maps the HTML text of an article to a keyword list.

    The extraction itself is still a stub: calling an instance ignores its
    argument and always returns the same five placeholder keywords.
    """

    # Language code -> model name (presumably spaCy pipeline names; the
    # class does not read this table yet).
    modules = dict(
        de='de_core_news_sm',
        en='en_core_web_lg',
    )

    def __init__(self, lang):
        """Store the language code (a key of ``modules`` such as 'de')."""
        self.lang = lang

    def __call__(self, htmlstr):
        """Return the keywords for ``htmlstr`` (currently a fixed list)."""
        placeholder = ('gnu', 'linux', 'for', 'ever', 'love')
        # A fresh list per call, so callers may mutate the result freely.
        return list(placeholder)
|
22
main.py
Normal file
22
main.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
#!/usr/bin/env python3
|
||||
# find keywords and exports them
|
||||
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
from datareader import DataReader
|
||||
from gnukeywords import GnuKeywords
|
||||
|
||||
|
||||
def main(args):
    """Print the keywords of every article found below ``./data/``.

    Args:
        args: Command line arguments; currently not evaluated.

    Returns:
        The process exit status, always 0.
    """
    reader = DataReader(datadir='./data/')
    extractor = GnuKeywords(lang='de')

    # article name -> list of keywords found for that article
    keywords = {}
    for articlename, htmlstr in reader:
        keywords[articlename] = extractor(htmlstr)
        print(f"{articlename} -> {keywords[articlename]}")
    return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Only run when executed as a script: hand the command line to main()
    # and use its return value as the process exit status.
    import sys

    raise SystemExit(main(sys.argv))
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
|
@ -0,0 +1 @@
|
|||
spacy
|
17
setup.sh
Executable file
17
setup.sh
Executable file
|
@ -0,0 +1,17 @@
|
|||
#!/bin/bash
|
||||
# Set up gnu-keywords; automated script.
|
||||
# Perhaps you want to do it yourself, in your way. Feel free
|
||||
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# Abort on the first failing command, so that a broken step (venv creation,
# pip, download) is not silently followed by the remaining ones.
set -e

# Create the virtual environment only once; later runs reuse it.
if [ ! -d .env ]
then python3 -m venv .env
fi

. .env/bin/activate
pip install -U pip
pip install -r requirements.txt
python -m spacy download de_core_news_sm

# -O pins the target name: without it a second run stores the archive as
# data.zip.1 and the stale data.zip would be unpacked again.
wget -O data.zip https://cloud.gnulinux.ch/index.php/s/YTw6dn4wFKGA7oi/download/data.zip
# -o overwrites already extracted files instead of prompting on reruns.
unzip -o data.zip
|
Loading…
Add table
Reference in a new issue