114 lines
3.6 KiB
Python
114 lines
3.6 KiB
Python
|
#
|
||
|
# Takes a string of any length and returns a list of word blocks, each at most 128 words long by default.
|
||
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git.jdmweb2.ch>
|
||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||
|
#
|
||
|
import re
|
||
|
|
||
|
class CutWords:
    """Split a string into consecutive blocks of at most ``maxlen`` words.

    The string is split on whitespace into words.  Separators are word
    indices at which a new block starts; they are computed lazily and
    cached until the string, ``maxlen`` or ``mode`` changes.

    Separator modes:

    * ``'points'`` (default): cut after the last word inside the
      ``maxlen`` window that ends a sentence (``.``, ``?`` or ``!``).  If
      the window contains no such word, a warning is printed and the block
      is cut hard after ``maxlen`` words.
    * ``'even'``: use the minimum number of blocks and distribute the
      words as evenly as possible among them.

    Instances support ``len()``, indexing, slicing and iteration; each
    item is a block joined back into a single space-separated string.
    """

    # A word made only of word characters followed by exactly one '.', '?'
    # or '!' counts as the end of a sentence.  Compiled once because it is
    # used inside the search loop.
    _SENTENCE_END = re.compile(r'^[\w]*[\.\?!]$')

    def __init__(self, string, maxlen=128, mode=None):
        """Store the input and reset all cached results.

        Args:
            string: Text to split.
            maxlen: Maximum number of words per block (must be >= 1).
            mode: Separator mode, ``'points'``, ``'even'`` or ``None``
                (treated as ``'points'``).

        Raises:
            ValueError: If ``maxlen`` is smaller than 1.
        """
        # Cursor for the legacy ``next(obj)`` protocol; set here so that
        # ``next()`` works even if ``iter()`` was never called.
        self.indexlast = -1
        self.setMaxlen(maxlen)
        self.setString(string)
        self.setMode(mode)

    def __call__(self):
        """Return the blocks as lists of words (see ``getBlocks``)."""
        return self.getBlocks()

    def _invalidate(self):
        """Drop cached separators and blocks after an input change."""
        self.seperators = None
        self.blocks = None

    def _ensureBlocks(self):
        """Build the blocks if they are not cached yet."""
        if self.blocks is None:
            self.makeBlocks()

    def setMode(self, mode):
        """Set the separator mode and invalidate cached results."""
        self.mode = mode
        self._invalidate()

    def setMaxlen(self, maxlen):
        """Set the maximum block length and invalidate cached results.

        Raises:
            ValueError: If ``maxlen`` is smaller than 1.  Such a value
                can never make progress through the word list.
        """
        if maxlen < 1:
            raise ValueError(f"CutWords: maxlen = {maxlen} must be >= 1.")
        self.maxlen = maxlen
        self._invalidate()

    def setString(self, string):
        """Set the text, split it into words and invalidate the caches."""
        self.string = string
        self.words = string.split()
        self._invalidate()

    def getBlocks(self, mode=None):
        """Return the blocks, building them on first use.

        Args:
            mode: ``'strings'`` returns every block joined with single
                spaces; any other value returns the cached list of word
                lists itself.
        """
        self._ensureBlocks()
        if mode == 'strings':
            return [' '.join(block) for block in self.blocks]
        return self.blocks

    def makeBlocks(self):
        """Slice the word list at the separators and cache the result.

        Separators are computed with the instance's mode if they are not
        cached yet.  Returns the list of blocks (lists of words).
        """
        if self.seperators is None:
            self.makeSeperators(mode=self.mode)
        blocks = []
        start = 0
        for stop in self.seperators:
            blocks.append(self.words[start:stop])
            start = stop
        # Whatever follows the last separator forms the final block.
        blocks.append(self.words[start:])
        self.blocks = blocks
        return blocks

    def makeSeperators(self, mode=None):
        """Compute the separators for ``mode`` and cache them."""
        self.seperators = self.findSeperators(mode=mode)

    def findSeperators(self, mode=None):
        """Return the word indices at which a new block starts.

        Args:
            mode: ``'points'`` or ``'even'``.  ``None`` falls back to the
                instance's mode and, if that is ``None`` too, ``'points'``.

        Raises:
            ValueError: If the mode is unknown.
        """
        if mode is None:
            mode = self.mode if self.mode is not None else 'points'
        if mode == 'even':
            return self._findEvenSeperators()
        if mode == 'points':
            return self._findPointSeperators()
        raise ValueError(f"CutWords: mode = {mode} is unknown.")

    def _findEvenSeperators(self):
        """Spread the words evenly over the minimum number of blocks."""
        nwords = len(self.words)
        # ceil(nwords / maxlen); evaluates to 0 for an empty word list.
        nblocks = (nwords - 1) // self.maxlen + 1
        # nblocks blocks need nblocks - 1 separators.
        return [nwords * k // nblocks for k in range(1, nblocks)]

    def _findPointSeperators(self):
        """Cut after the last sentence end inside every ``maxlen`` window."""
        words = self.words
        maxlen = self.maxlen
        seperators = []
        last = 0
        while last + maxlen < len(words):
            # Walk backwards from the last word that still fits into the
            # block so the block becomes as long as possible.
            for index in range(last + maxlen - 1, last, -1):
                if self._SENTENCE_END.search(words[index]):
                    last = index + 1
                    break
            else:
                # No sentence end in the window: cut hard after maxlen
                # words.  With maxlen == 1 there is nothing to search, so
                # a warning would only be noise.
                if maxlen > 1:
                    self._warnNoPoint(last)
                last += maxlen
            seperators.append(last)
        return seperators

    def _warnNoPoint(self, position):
        """Report that the block starting at ``position`` has no point."""
        print('WARNING CutWords: No points found')
        print(f"WARNING: string = {self.string[:40]}")
        # ``position`` is a word index, so the excerpt is taken from the
        # words and not from the raw character string.
        excerpt = ' '.join(self.words[position:position + self.maxlen])[:20]
        print(f"WARNING: position {position} = {excerpt}...")

    def __getitem__(self, i):
        """Return block ``i`` as a string, or a list of strings for a slice."""
        self._ensureBlocks()
        if isinstance(i, slice):
            return [' '.join(block) for block in self.blocks[i]]
        return ' '.join(self.blocks[i])

    def __iter__(self):
        """Return a fresh iterator over the blocks as strings.

        Every call returns an independent iterator, so nested loops over
        the same object do not disturb each other.  The cursor used by the
        legacy ``next(obj)`` protocol is reset as well.
        """
        self._ensureBlocks()
        self.indexlast = -1
        return iter([' '.join(block) for block in self.blocks])

    def __next__(self):
        """Advance the legacy cursor and return the next block string.

        Raises:
            StopIteration: When the cursor has moved past the last block.
        """
        self.indexlast += 1
        try:
            return self[self.indexlast]
        except IndexError:
            raise StopIteration from None

    def __len__(self):
        """Return the number of blocks."""
        self._ensureBlocks()
        return len(self.blocks)
|