#
# Takes a string of any length and returns a list of word blocks, each with
# at most `maxlen` (default 128) words.
# SPDX-FileCopyrightText: 2022 Beat Jäckle
# SPDX-License-Identifier: GPL-3.0-or-later
#
# Origin: patch ad4bf818fab563a9e668b54829b5cc3486665d4a ("CutWords"),
# Beat Jäckle, Wed, 11 May 2022 20:00:30 +0200, new file cutwords.py.
#
import re

# A token consisting only of word characters followed by exactly one closing
# '.', '?' or '!'.  Tokens come from str.split(), so they hold no whitespace.
_SENTENCE_END = re.compile(r'^\w*[.?!]$')


class CutWords:
    """Split a text into blocks of at most ``maxlen`` whitespace-separated words.

    Two separator modes are supported:

    ``'points'`` (default)
        Cut after the last word inside the current window that ends a
        sentence ('.', '?' or '!').  When the window holds no such word the
        text is cut hard after ``maxlen`` words and a warning is printed.
    ``'even'``
        Use the minimum number of blocks and make them (almost) equally long.

    Blocks are computed lazily and cached; every setter invalidates the cache.
    The instance supports ``len()``, indexing, slicing and iteration, all of
    which yield blocks joined back into strings.
    """

    def __init__(self, string, maxlen=128, mode=None):
        """Store the text and the splitting parameters.

        Args:
            string: The text to split.
            maxlen: Maximum number of words per block (must be >= 1).
            mode: Separator mode, ``'points'``, ``'even'`` or ``None``
                (treated as ``'points'``).

        Raises:
            ValueError: If ``maxlen`` is smaller than 1.
        """
        # Cursor used only by the legacy ``next(instance)`` protocol.
        self.indexlast = -1
        self.setMaxlen(maxlen)
        self.setString(string)
        self.setMode(mode)

    def __call__(self):
        """Return the blocks as lists of words (same as ``getBlocks()``)."""
        return self.getBlocks()

    def setMode(self, mode):
        """Set the separator mode and drop any cached result."""
        self.mode = mode
        self.seperators = None
        self.blocks = None

    def setMaxlen(self, maxlen):
        """Set the maximum block length in words and drop any cached result.

        Raises:
            ValueError: If ``maxlen`` is smaller than 1.
        """
        if maxlen < 1:
            raise ValueError(
                f"CutWords: maxlen = {maxlen} must be at least 1.")
        self.maxlen = maxlen
        self.seperators = None
        self.blocks = None

    def setString(self, string):
        """Set the text, tokenize it on whitespace and drop any cached result."""
        self.string = string
        self.words = string.split()
        self.seperators = None
        self.blocks = None

    def getBlocks(self, mode=None):
        """Return the blocks, computing them on first use.

        Args:
            mode: Output format.  ``'strings'`` returns every block joined
                with single spaces; anything else returns lists of words.
        """
        if self.blocks is None:
            self.makeBlocks()
        if mode == 'strings':
            return [' '.join(block) for block in self.blocks]
        return self.blocks

    def makeBlocks(self):
        """Slice the word list at the separators, cache and return the blocks.

        The text after the last separator always forms a final block, so an
        empty text produces a single empty block.
        """
        if self.seperators is None:
            # Honour the configured mode instead of always using 'points'.
            self.makeSeperators(mode=self.mode)
        bounds = [0, *self.seperators, len(self.words)]
        self.blocks = [
            self.words[begin:end]
            for begin, end in zip(bounds, bounds[1:])
        ]
        return self.blocks

    def makeSeperators(self, mode=None):
        """Compute and cache the separators for ``mode``.

        ``None`` falls back to the instance mode (see ``findSeperators``).
        """
        self.seperators = self.findSeperators(mode=mode)

    def findSeperators(self, mode=None):
        """Return the word indices at which a new block starts.

        Args:
            mode: ``'points'`` or ``'even'``.  ``None`` uses the instance
                mode and, if that is ``None`` too, ``'points'``.

        Returns:
            A strictly increasing list of indices into ``self.words``.

        Raises:
            ValueError: If the mode is unknown.
        """
        if mode is None:
            mode = self.mode if self.mode is not None else 'points'
        if mode == 'even':
            return self._findEvenSeperators()
        if mode == 'points':
            return self._findPointSeperators()
        raise ValueError(f"CutWords: mode = {mode} is unknown.")

    def _findEvenSeperators(self):
        """Return separators for the fewest, equally sized blocks."""
        total = len(self.words)
        # Ceiling division: the fewest blocks that keep each one <= maxlen.
        nblocks = -(-total // self.maxlen)
        # Dividing by the number of BLOCKS (not separators) keeps every block
        # at floor/ceil(total / nblocks) words, which never exceeds maxlen.
        return [total * i // nblocks for i in range(1, nblocks)]

    def _findPointSeperators(self):
        """Return separators placed after sentence-ending words."""
        words = self.words
        total = len(words)
        seperators = []
        start = 0
        while start + self.maxlen < total:
            # Search backwards from the last word that still fits, so the
            # block becomes as long as possible.
            for index in range(start + self.maxlen - 1, start, -1):
                if _SENTENCE_END.search(words[index]):
                    cut = index + 1
                    break
            else:
                # No sentence end inside the window: cut hard at maxlen.
                preview = ' '.join(words[start:start + 5])
                print('WARNING CutWords: No points found')
                print(f"WARNING: string = {self.string[:40]}")
                print(f"WARNING: position {start} = {preview}...")
                cut = start + self.maxlen
            seperators.append(cut)
            start = cut
        return seperators

    def __getitem__(self, i):
        """Return block ``i`` as a string, or a list of strings for a slice."""
        if self.blocks is None:
            self.makeBlocks()
        if isinstance(i, slice):
            return [' '.join(block) for block in self.blocks[i]]
        return ' '.join(self.blocks[i])

    def __iter__(self):
        """Return a fresh iterator over the blocks as strings.

        Each call returns an independent iterator, so nested or repeated
        loops over the same instance do not interfere.  The legacy cursor
        used by ``next(instance)`` is rewound as before.
        """
        if self.blocks is None:
            self.makeBlocks()
        self.indexlast = -1
        return iter([' '.join(block) for block in self.blocks])

    def __next__(self):
        """Advance the instance-level cursor and return that block.

        Kept for callers that use ``iter(obj)`` followed by ``next(obj)``.

        Raises:
            StopIteration: When the cursor has moved past the last block.
        """
        self.indexlast += 1
        try:
            return self[self.indexlast]
        except IndexError:
            raise StopIteration from None

    def __len__(self):
        """Return the number of blocks."""
        if self.blocks is None:
            self.makeBlocks()
        return len(self.blocks)