#
# Takes a string of any length and returns a list of blocks, each holding
# at most `maxlen` words (128 by default).
# SPDX-FileCopyrightText: 2022 Beat Jäckle
# SPDX-License-Identifier: GPL-3.0-or-later
#
import re


class CutWords:
    """Split a whitespace-separated text into blocks of at most `maxlen` words.

    Two separator strategies are available:

    * ``'points'`` (default, also used when mode is ``None``): cut after the
      last word inside each window that ends a sentence (``.``, ``?`` or
      ``!``); fall back to a hard cut at `maxlen` words, with a warning
      printed to stdout, when the window contains no such word.
    * ``'even'``: cut into the minimum number of blocks, all of (almost)
      equal size.

    Blocks are computed lazily and cached; every setter invalidates the cache.
    The instance is its own iterator (``__iter__`` returns ``self``), so two
    simultaneous loops over the same instance share one cursor.
    """

    version = '1.0'

    # A word made only of word characters followed by one sentence-ending
    # mark. Compiled once instead of on every inner-loop iteration.
    _SENTENCE_END = re.compile(r'^[\w]*[\.\?!]$')

    def __init__(self, string, maxlen=128, mode=None):
        """Store the text, the block size limit and the separator mode."""
        self.setMaxlen(maxlen)
        self.setString(string)
        self.setMode(mode)
        # Cursor used by __next__; reset by every call to __iter__.
        self.indexlast = -1

    def __call__(self):
        """Return the blocks as lists of words (same as ``getBlocks()``)."""
        return self.getBlocks()

    def _invalidate(self):
        """Drop cached separators and blocks so they are recomputed."""
        self.seperators = None
        self.blocks = None

    def setMode(self, mode):
        """Set the separator mode ('points', 'even' or None) and reset caches."""
        self.mode = mode
        self._invalidate()

    def setMaxlen(self, maxlen):
        """Set the maximum number of words per block and reset caches.

        Raises:
            ValueError: if `maxlen` is smaller than 1; such a value cannot
                make progress through the text.
        """
        if maxlen < 1:
            raise ValueError(f"CutWords: maxlen = {maxlen} must be >= 1.")
        self.maxlen = maxlen
        self._invalidate()

    def setString(self, string):
        """Set the text, split it on whitespace into words and reset caches."""
        self.string = string
        self.words = string.split()
        self._invalidate()

    def getBlocks(self, mode=None):
        """Return the blocks, computing them on first use.

        With ``mode='strings'`` each block is returned as a single
        space-joined string; otherwise each block is a list of words.
        """
        if self.blocks is None:
            self.makeBlocks()
        if mode == 'strings':
            return [' '.join(block) for block in self.blocks]
        return self.blocks

    def makeBlocks(self):
        """Slice the word list at the separators, cache and return the blocks."""
        if self.seperators is None:
            # Honour the mode given to the constructor / setMode.
            self.makeSeperators(mode=self.mode)
        blocks = []
        lastSeperator = 0
        for seperator in self.seperators:
            blocks.append(self.words[lastSeperator:seperator])
            lastSeperator = seperator
        # Everything after the final separator forms the last block.
        blocks.append(self.words[lastSeperator:])
        self.blocks = blocks
        return blocks

    def makeSeperators(self, mode=None):
        """Compute the separators for `mode` and cache them."""
        self.seperators = self.findSeperators(mode=mode)

    def findSeperators(self, mode=None):
        """Return the word indices at which a new block starts.

        Args:
            mode: 'points', 'even' or None (treated as 'points').

        Raises:
            ValueError: if `mode` is not a known mode.
        """
        if mode is None:
            mode = 'points'
        if mode == 'even':
            return self._findEvenSeperators()
        if mode == 'points':
            return self._findPointSeperators()
        raise ValueError(f"CutWords: mode = {mode} is unknown.")

    def _findEvenSeperators(self):
        """Return separators giving the fewest blocks, all of similar size."""
        nwords = len(self.words)
        if nwords == 0:
            return []
        # Smallest number of blocks that keeps every block within maxlen.
        nblocks = (nwords - 1) // self.maxlen + 1
        return [nwords * (i + 1) // nblocks for i in range(nblocks - 1)]

    def _findPointSeperators(self):
        """Return separators placed after sentence-ending words where possible."""
        seperators = []
        lastSeperator = 0
        nwords = len(self.words)
        while lastSeperator + self.maxlen < nwords:
            # Scan the window backwards so the block becomes as long as
            # possible; the first word of the window is never a cut point.
            window = range(lastSeperator + self.maxlen - 1, lastSeperator, -1)
            for index in window:
                if self._SENTENCE_END.search(self.words[index]):
                    lastSeperator = index + 1
                    break
            else:
                # No sentence end in this window: warn and cut at maxlen.
                print('WARNING CutWords: No points found')
                print(f"WARNING: string = {self.string[:40]}")
                # lastSeperator is a word index, so preview words, not chars.
                preview = ' '.join(
                    self.words[lastSeperator:lastSeperator + 5])
                print(
                    f"WARNING: position {lastSeperator} = "
                    f"{preview[:20]}...")
                lastSeperator = lastSeperator + self.maxlen
            seperators.append(lastSeperator)
        return seperators

    def __getitem__(self, i):
        """Return block `i` as a single space-joined string."""
        if self.blocks is None:
            self.makeBlocks()
        return ' '.join(self.blocks[i])

    def __iter__(self):
        """Reset the cursor and return the instance itself as iterator."""
        if self.blocks is None:
            self.makeBlocks()
        self.indexlast = -1
        return self

    def __next__(self):
        """Return the next block as a string; stop after the last block."""
        self.indexlast += 1
        try:
            return self.__getitem__(self.indexlast)
        except IndexError:
            raise StopIteration from None

    def __len__(self):
        """Return the number of blocks."""
        if self.blocks is None:
            self.makeBlocks()
        return len(self.blocks)