CutWords
This commit is contained in:
parent
134ea9e8ae
commit
ad4bf818fa
1 changed files with 113 additions and 0 deletions
113
cutwords.py
Normal file
113
cutwords.py
Normal file
|
@ -0,0 +1,113 @@
|
||||||
|
#
|
||||||
|
# Takes a string of any length and returns a list of blocks of at most 128 words each (128 is the default maxlen).
|
||||||
|
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git.jdmweb2.ch>
|
||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
import re
|
||||||
|
|
||||||
|
class CutWords:
    """Split a long string into blocks of at most ``maxlen`` words.

    The string is split on whitespace into words.  Cut positions
    ("seperators", spelling kept for API compatibility) are word indices
    at which a new block starts.  Two cutting modes exist:

    ``'points'`` (default)
        Cut after the last word inside the ``maxlen`` window that ends
        with ``.``, ``?`` or ``!``; if the window holds no such word, cut
        hard after ``maxlen`` words and print a warning.
    ``'even'``
        Use the minimal number of blocks and make them (almost) equally
        long, ignoring punctuation.

    Blocks are computed lazily and cached; every setter drops the cache.
    An empty input string yields exactly one empty block.
    """

    # A word made only of word characters followed by one '.', '?' or '!'.
    # Compiled once; raw string avoids invalid-escape warnings.
    _SENTENCE_END = re.compile(r'^[\w]*[\.\?!]$')

    def __init__(self, string, maxlen=128, mode=None):
        """Store the text, the block size limit and the cutting mode.

        string -- text to split.
        maxlen -- maximum number of words per block (integer >= 1).
        mode   -- 'points', 'even' or None (None means 'points').
        """
        self.indexlast = -1  # cursor of the legacy next(obj) protocol
        self.setMaxlen(maxlen)
        self.setString(string)
        self.setMode(mode)

    def __call__(self):
        """Return the blocks as lists of words (see getBlocks)."""
        return self.getBlocks()

    def _invalidate(self):
        """Drop cached cut positions and blocks after a setting changed."""
        self.seperators = None
        self.blocks = None

    def setMode(self, mode):
        """Set the cutting mode used when blocks are (re)built."""
        self.mode = mode
        self._invalidate()

    def setMaxlen(self, maxlen):
        """Set the maximum number of words per block.

        Raises ValueError if maxlen is smaller than 1, because no block
        could hold a word then.
        """
        if maxlen < 1:
            raise ValueError(
                f"CutWords: maxlen = {maxlen} must be at least 1.")
        self.maxlen = maxlen
        self._invalidate()

    def setString(self, string):
        """Set the text and split it into whitespace-separated words."""
        self.string = string
        self.words = string.split()
        self._invalidate()

    def getBlocks(self, mode=None):
        """Return all blocks, building them on first use.

        mode -- output format, unrelated to the cutting mode:
                'strings' returns each block joined by single spaces,
                anything else returns each block as a list of words.
        """
        if self.blocks is None:
            self.makeBlocks()
        if mode == 'strings':
            return [' '.join(b) for b in self.blocks]
        return self.blocks

    def makeBlocks(self):
        """Slice the words at the cut positions, cache and return them."""
        if self.seperators is None:
            # Honour the mode chosen in the constructor / setMode().
            self.makeSeperators(mode=self.mode)
        starts = [0] + list(self.seperators)
        ends = list(self.seperators) + [len(self.words)]
        self.blocks = [
            self.words[start:end] for start, end in zip(starts, ends)
        ]
        return self.blocks

    def makeSeperators(self, mode=None):
        """Compute the cut positions for ``mode`` and cache them."""
        self.seperators = self.findSeperators(
            mode=mode
        )

    def findSeperators(self, mode=None):
        """Return the sorted word indices at which a new block starts.

        mode -- 'points', 'even' or None (None means 'points').
        Raises ValueError for any other mode.
        """
        if mode is None:
            mode = 'points'
        nwords = len(self.words)
        if mode == 'even':
            nseperators = (nwords - 1) // self.maxlen
            # n cuts produce n + 1 blocks; dividing by the block count
            # keeps every block within maxlen and leaves no empty tail.
            nblocks = nseperators + 1
            return [
                nwords * (i + 1) // nblocks
                for i in range(nseperators)
            ]
        if mode == 'points':
            seperators = []
            lastSeperator = 0
            while lastSeperator + self.maxlen < nwords:
                # Scan the window backwards so the block gets as long as
                # possible; the first word of the window is never a cut
                # candidate.
                window = range(
                    lastSeperator + self.maxlen - 1, lastSeperator, -1)
                for index in window:
                    if self._SENTENCE_END.search(self.words[index]):
                        lastSeperator = index + 1
                        break
                else:
                    # No sentence end in the window: cut hard at maxlen.
                    self._warnNoPoint(lastSeperator)
                    lastSeperator += self.maxlen
                seperators.append(lastSeperator)
            return seperators
        raise ValueError(f"CutWords: mode = {mode} is unknown.")

    def _warnNoPoint(self, position):
        """Print a warning that no sentence end follows word ``position``."""
        # position is a word index, so the excerpt comes from self.words.
        excerpt = ' '.join(self.words[position:position + 5])
        print('WARNING CutWords: No points found')
        print(f"WARNING: string = {self.string[:40]}")
        print(f"WARNING: position {position} = {excerpt}...")

    def __getitem__(self, i):
        """Return block ``i`` as a string, or a list of strings for a slice."""
        if self.blocks is None:
            self.makeBlocks()
        if isinstance(i, slice):
            return [' '.join(b) for b in self.blocks[i]]
        return ' '.join(self.blocks[i])

    def __iter__(self):
        """Return a fresh, independent iterator over the block strings."""
        if self.blocks is None:
            self.makeBlocks()
        self.indexlast = -1  # also rewinds the legacy next(obj) cursor
        return (' '.join(b) for b in self.blocks)

    def __next__(self):
        """Legacy protocol: return the next block string of this object."""
        self.indexlast += 1
        try:
            return self[self.indexlast]
        except IndexError:
            raise StopIteration from None

    def __len__(self):
        """Return the number of blocks."""
        if self.blocks is None:
            self.makeBlocks()
        return len(self.blocks)
|
Loading…
Reference in a new issue