CutWords
This commit is contained in:
parent
134ea9e8ae
commit
ad4bf818fa
1 changed files with 113 additions and 0 deletions
113
cutwords.py
Normal file
113
cutwords.py
Normal file
|
@ -0,0 +1,113 @@
|
|||
#
|
||||
# Takes a string of any length and returns a list of word blocks, each holding at most maxlen (default 128) words.
|
||||
# SPDX-FileCopyrightText: 2022 Beat Jäckle <beat@git.jdmweb2.ch>
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
import re
|
||||
|
||||
class CutWords:
    """Cut a text into consecutive blocks of at most ``maxlen`` words.

    The text is split on whitespace. The word list is then divided at
    "seperators" (word indices; the spelling is kept for backward
    compatibility with existing callers). Two strategies are available:

    * ``'points'`` (default, also used when ``mode`` is ``None``): cut after
      the last word inside each ``maxlen`` window that ends a sentence, i.e.
      a word consisting only of word characters followed by a single
      ``.``, ``?`` or ``!``.  If a window holds no such word, a warning is
      printed and the block is cut at exactly ``maxlen`` words.
    * ``'even'``: use the smallest number of blocks that respects ``maxlen``
      and make those blocks as equal in size as possible.

    Results are cached and recomputed lazily after ``setString``,
    ``setMaxlen`` or ``setMode``.

    An empty input yields a single empty block (``[[]]``).
    """

    # Compiled once: a word made of word characters plus one trailing
    # '.', '?' or '!'. Raw string avoids the invalid-escape warning.
    _SENTENCE_END = re.compile(r'^[\w]*[\.\?!]$')

    def __init__(self, string, maxlen=128, mode=None):
        """Store the text and the cutting parameters.

        Args:
            string: The text to cut.
            maxlen: Maximum number of words per block (must be >= 1).
            mode: Seperator strategy, ``'points'``, ``'even'`` or ``None``
                (treated as ``'points'``).

        Raises:
            ValueError: If ``maxlen`` is smaller than 1.
        """
        # Lets next() be called even before iter() was ever used.
        self.indexlast = -1
        self.setMaxlen(maxlen)
        self.setString(string)
        self.setMode(mode)

    def __call__(self):
        """Return the blocks as lists of words (same as ``getBlocks()``)."""
        return self.getBlocks()

    def _invalidate(self):
        """Drop cached seperators and blocks so they are rebuilt on demand."""
        self.seperators = None
        self.blocks = None

    def setMode(self, mode):
        """Set the seperator strategy and invalidate cached results."""
        self.mode = mode
        self._invalidate()

    def setMaxlen(self, maxlen):
        """Set the maximum block length and invalidate cached results.

        Raises:
            ValueError: If ``maxlen`` is smaller than 1; such a value cannot
                produce blocks and previously failed later with an
                unrelated error.
        """
        if maxlen < 1:
            raise ValueError(f"CutWords: maxlen = {maxlen} must be >= 1.")
        self.maxlen = maxlen
        self._invalidate()

    def setString(self, string):
        """Set the text, split it into words and invalidate cached results."""
        self.string = string
        self.words = string.split()
        self._invalidate()

    def getBlocks(self, mode=None):
        """Return all blocks, building them first if necessary.

        Args:
            mode: Output format (unrelated to the seperator strategy).
                ``'strings'`` returns every block joined by single spaces;
                any other value returns the blocks as lists of words.
        """
        if self.blocks is None:
            self.makeBlocks()
        if mode == 'strings':
            return [' '.join(b) for b in self.blocks]
        return self.blocks

    def makeBlocks(self):
        """Slice the word list at the seperators, cache and return the blocks.

        Seperators are computed with the configured ``self.mode`` unless they
        were already set (e.g. by an explicit ``makeSeperators`` call).
        """
        if self.seperators is None:
            # Use the configured strategy; findSeperators maps None to
            # 'points', so the default behaviour is unchanged.
            self.makeSeperators(mode=self.mode)
        bounds = [0, *self.seperators, len(self.words)]
        blocks = [
            self.words[begin:end]
            for begin, end in zip(bounds, bounds[1:])
        ]
        self.blocks = blocks
        return blocks

    def makeSeperators(self, mode=None):
        """Compute the seperators with ``mode`` and store them."""
        self.seperators = self.findSeperators(mode=mode)

    def findSeperators(self, mode=None):
        """Return the word indices at which a new block starts.

        Args:
            mode: ``'points'``, ``'even'`` or ``None`` (treated as
                ``'points'``).

        Raises:
            ValueError: If ``mode`` is not a known strategy.
        """
        if mode is None:
            mode = 'points'
        if mode == 'even':
            return self._evenSeperators()
        if mode == 'points':
            return self._pointSeperators()
        raise ValueError(f"CutWords: mode = {mode} is unknown.")

    def _evenSeperators(self):
        """Return seperators that split the words into near-equal blocks."""
        nwords = len(self.words)
        nseperators = (nwords - 1) // self.maxlen
        # n seperators produce n + 1 blocks, so the cut positions must be
        # multiples of nwords / (n + 1); this keeps every block <= maxlen
        # and never leaves an empty trailing block.
        nblocks = nseperators + 1
        return [nwords * (i + 1) // nblocks for i in range(nseperators)]

    def _pointSeperators(self):
        """Return seperators placed after sentence-ending words."""
        words = self.words
        seperators = []
        start = 0
        while start + self.maxlen < len(words):
            # Search backwards through the window so the block becomes as
            # long as possible; the first word of the window is not tested.
            for index in range(start + self.maxlen - 1, start, -1):
                if self._SENTENCE_END.search(words[index]):
                    start = index + 1
                    break
            else:
                # No sentence end in this window: warn and cut hard.
                self._warnNoPoint(start)
                start += self.maxlen
            seperators.append(start)
        return seperators

    def _warnNoPoint(self, position):
        """Print a warning that no sentence end follows word ``position``."""
        print('WARNING CutWords: No points found')
        print(f"WARNING: string = {self.string[:40]}")
        # ``position`` is a word index, so the excerpt is taken from the
        # word list rather than from the raw character string.
        excerpt = ' '.join(self.words[position:position + self.maxlen])[:20]
        print(f"WARNING: position {position} = {excerpt}...")

    def __getitem__(self, i):
        """Return block ``i`` as a string, or a list of strings for a slice.

        Raises:
            IndexError: If an integer index is out of range.
        """
        if self.blocks is None:
            self.makeBlocks()
        if isinstance(i, slice):
            return [' '.join(block) for block in self.blocks[i]]
        return ' '.join(self.blocks[i])

    def __iter__(self):
        """Return an independent iterator over the blocks as strings.

        Each call yields its own iterator, so nested loops over the same
        object work. The cursor used by ``__next__`` is reset as well, so
        calling ``iter(obj)`` followed by ``next(obj)`` still works.
        """
        if self.blocks is None:
            self.makeBlocks()
        self.indexlast = -1
        return (' '.join(block) for block in self.blocks)

    def __next__(self):
        """Advance the object's own cursor and return the next block string.

        Raises:
            StopIteration: When the cursor moves past the last block.
        """
        self.indexlast += 1
        try:
            return self[self.indexlast]
        except IndexError:
            raise StopIteration from None

    def __len__(self):
        """Return the number of blocks."""
        if self.blocks is None:
            self.makeBlocks()
        return len(self.blocks)
|
Loading…
Reference in a new issue