35 lines
874 B
Python
35 lines
874 B
Python
|
#!/usr/bin/env python3
|
||
|
# Finds keywords and exports them.
|
||
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git,jdmweb2.ch>
|
||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||
|
#
|
||
|
from datareader import DataReader
|
||
|
from gnukeywords import GnuKeywords
|
||
|
from hmlttotext import HmltToText
|
||
|
from json import dump as jsondump
|
||
|
|
||
|
|
||
|
def main(args):
    """Extract keywords for every article and write them to ``tags.json``.

    Iterates over a ``DataReader`` for ``./data/``, which yields
    ``(articlename, htmlstr)`` pairs, passes each HTML string through
    ``HmltToText`` and then ``GnuKeywords`` (configured with ``lang='de'``),
    and stores the result under the article name.

    Args:
        args: Command-line arguments handed in by the caller; currently
            unused.

    Returns:
        Always 0; the script entry point uses it as the process exit status.
    """
    # Set up the helper objects.
    datareader = DataReader(datadir='./data/')
    gnukeywords = GnuKeywords(lang='de')
    hmltToText = HmltToText()

    # Map every article name to the keywords found in its text.
    # NOTE(review): the values must be JSON-serializable for the dump below;
    # confirm against GnuKeywords.
    keywords = {
        articlename: gnukeywords(hmltToText(htmlstr))
        for articlename, htmlstr in datareader
    }

    # Save the keywords. ensure_ascii=False writes non-ASCII characters
    # verbatim, so the file is opened explicitly as UTF-8 instead of relying
    # on the platform's default encoding.
    with open('tags.json', 'w', encoding='utf-8') as f:
        jsondump(keywords, f, indent=4, ensure_ascii=False)

    return 0
|
||
|
|
||
|
|
||
|
# Script entry point: run main() with the command-line arguments and pass its
# return value to sys.exit() as the process exit status.
if __name__ == '__main__':
    import sys

    exit_status = main(sys.argv)
    sys.exit(exit_status)
|