cutwords/cutwords.py

114 lines
3.6 KiB
Python

#
# Takes a string of any length and returns a list of blocks, each containing at most 128 words by default.
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git.jdmweb2.ch>
# SPDX-License-Identifier: GPL-3.0-or-later
#
import re
class CutWords:
    """Cut a text into consecutive blocks of at most ``maxlen`` words.

    The text is split on whitespace and the resulting words are grouped
    into blocks. Two strategies (``mode``) are available:

    * ``'points'`` (the default, also used when ``mode`` is ``None``):
      a block ends after the last word inside the ``maxlen`` window that
      consists of word characters followed by a single ``.``, ``?`` or
      ``!``. The first word of a block is never used as its end. If the
      window holds no such word, the block is cut after exactly
      ``maxlen`` words and a warning is printed.
    * ``'even'``: the words are spread over the smallest possible number
      of blocks so that all blocks have (almost) the same size.

    The instance can be used like a read-only sequence of block strings:
    ``len(cw)``, ``cw[i]``, ``cw[a:b]`` and ``for block in cw`` all work.
    Calling the instance returns the blocks as lists of words. An empty
    text yields exactly one empty block.
    """

    version = '1.0'

    # A word made only of word characters followed by one of . ? !
    # Compiled once instead of on every lookup; raw string avoids invalid
    # escape sequences.
    _SENTENCE_END = re.compile(r'^[\w]*[\.\?!]$')

    def __init__(self, string, maxlen=128, mode=None):
        """Store the text, the block size and the cutting strategy."""
        self.setMaxlen(maxlen)
        self.setString(string)
        self.setMode(mode)

    def __call__(self):
        """Return the blocks as lists of words (same as ``getBlocks()``)."""
        return self.getBlocks()

    def _invalidate(self):
        """Drop cached separators and blocks after a settings change."""
        self.seperators = None
        self.blocks = None

    def setMode(self, mode):
        """Set the cutting strategy (``'points'``, ``'even'`` or ``None``)."""
        self.mode = mode
        self._invalidate()

    def setMaxlen(self, maxlen):
        """Set the maximum number of words per block."""
        self.maxlen = maxlen
        self._invalidate()

    def setString(self, string):
        """Set the text to cut and split it into words."""
        self.string = string
        self.words = string.split()
        self._invalidate()

    def getBlocks(self, mode=None):
        """Return the blocks, computing them on first use.

        With ``mode='strings'`` every block is returned as one
        space-joined string; otherwise each block is a list of words.
        """
        if self.blocks is None:
            self.makeBlocks()
        if mode == 'strings':
            return [' '.join(block) for block in self.blocks]
        return self.blocks

    def makeBlocks(self):
        """Slice the words at the separators and cache the result.

        Separators are computed with the instance's own mode if they are
        not available yet. Returns the list of blocks.
        """
        if self.seperators is None:
            # Honour the mode given to the constructor / setMode.
            self.makeSeperators()
        bounds = [0, *self.seperators, len(self.words)]
        blocks = [
            self.words[begin:end]
            for begin, end in zip(bounds, bounds[1:])
        ]
        self.blocks = blocks
        return blocks

    def makeSeperators(self, mode=None):
        """Compute and store the separators.

        ``mode`` overrides the instance's mode for this call; ``None``
        means "use the instance's mode".
        """
        if mode is None:
            mode = self.mode
        self.seperators = self.findSeperators(mode=mode)
        # Blocks built from older separators are no longer valid.
        self.blocks = None

    def findSeperators(self, mode=None):
        """Return the word indices at which a new block starts.

        ``mode`` is ``'points'`` (default when ``None``) or ``'even'``.

        Raises:
            ValueError: if ``mode`` is unknown or ``maxlen`` is below 1.
        """
        if mode is None:
            mode = 'points'
        if mode not in ('even', 'points'):
            raise ValueError(f"CutWords: mode = {mode} is unknown.")
        if self.maxlen < 1:
            raise ValueError(
                f"CutWords: maxlen = {self.maxlen} must be at least 1."
            )
        if mode == 'even':
            return self._findEvenSeperators()
        return self._findPointSeperators()

    def _findEvenSeperators(self):
        """Spread the words evenly over the minimal number of blocks."""
        nwords = len(self.words)
        # Smallest block count that keeps every block within maxlen.
        nblocks = (nwords - 1) // self.maxlen + 1
        return [nwords * i // nblocks for i in range(1, nblocks)]

    def _findPointSeperators(self):
        """Cut after sentence ends, falling back to a hard cut."""
        words = self.words
        maxlen = self.maxlen
        seperators = []
        start = 0
        while start + maxlen < len(words):
            cut = None
            # Search backwards from the last word that still fits into
            # the block; the first word of the block is not considered.
            for index in range(start + maxlen - 1, start, -1):
                if self._SENTENCE_END.search(words[index]):
                    cut = index + 1
                    break
            if cut is None:
                # With maxlen == 1 there is nothing to search, so the
                # hard cut is expected and not worth a warning.
                if maxlen > 1:
                    preview = ' '.join(words[start:start + 5])
                    print('WARNING CutWords: No points found')
                    print(f"WARNING: string = {self.string[:40]}")
                    print(f"WARNING: position {start} = {preview}...")
                cut = start + maxlen
            seperators.append(cut)
            start = cut
        return seperators

    def __getitem__(self, i):
        """Return block ``i`` as a string, or a list of strings for a slice."""
        if self.blocks is None:
            self.makeBlocks()
        if isinstance(i, slice):
            return [' '.join(block) for block in self.blocks[i]]
        return ' '.join(self.blocks[i])

    def __iter__(self):
        """Return an independent iterator over the block strings.

        Each call gets its own iterator, so nested or parallel loops over
        the same instance do not disturb each other.
        """
        if self.blocks is None:
            self.makeBlocks()
        # Reset the cursor used by the legacy __next__ protocol.
        self.indexlast = -1
        return iter([' '.join(block) for block in self.blocks])

    def __next__(self):
        """Return the next block string using the instance-wide cursor.

        Kept for callers that call ``next()`` on the instance itself.
        """
        position = getattr(self, 'indexlast', -1) + 1
        self.indexlast = position
        try:
            return self[position]
        except IndexError:
            raise StopIteration from None

    def __len__(self):
        """Return the number of blocks."""
        if self.blocks is None:
            self.makeBlocks()
        return len(self.blocks)