algo.permutations
and algo.string_metrics
: string processing
algo.permutations
Note: names of methods in this module, if seem weird, are the same as in Hunspell’s suggest.cxx
to keep track of them.
-
replchars
(word, reptable)[source] Uses
aff.REP
table (typical misspellings) to replace in the word provided. If the pattern’s replacement contains “_”, it means replacing to ” ” and yielding _two_ different hypotheses: it was one (dictionary) word “foo bar” (and should be checked as such) or it was words [“foo”, “bar”] and should be checked separately.def replchars(word: str, reptable: List[aff.RepPattern]) -> Iterator[Union[str, List[str]]]: """ Uses :attr:`aff.REP <spylls.hunspell.data.aff.Aff.REP>` table (typical misspellings) to replace in the word provided. If the pattern's replacement contains "_", it means replacing to " " and yielding _two_ different hypotheses: it was one (dictionary) word "foo bar" (and should be checked as such) or it was words ["foo", "bar"] and should be checked separately. """ if len(word) < 2 or not reptable: return for pattern in reptable: # TODO: compiled at aff loading for match in pattern.regexp.finditer(word): suggestion = word[:match.start()] + pattern.replacement.replace('_', ' ') + word[match.end():] yield suggestion if ' ' in suggestion: yield suggestion.split(' ', 2)
- Parameters
word (str) –
reptable (List[spylls.hunspell.data.aff.RepPattern]) –
- Return type
Iterator[Union[str, List[str]]]
-
mapchars
(word, maptable)[source] Uses
aff.MAP
table ( sets of potentially similar chars) and tries to replace them recursively. E.g., assumingMAP
has entryaáã
, and we have a misspelling “anarchia”,mapchars
will do this:>>> [*pmt.mapchars("anarchia", ['aáã'])] ['ánarchia', 'ánárchia', 'ánárchiá', 'ánárchiã', 'ánãrchia', 'ánãrchiá', 'ánãrchiã', 'ãnarchia', 'ãnárchia', 'ãnárchiá', 'ãnárchiã', 'ãnãrchia', 'ãnãrchiá', 'ãnãrchiã']
def mapchars(word: str, maptable: List[Set[str]]) -> Iterator[str]: """ Uses :attr:`aff.MAP <spylls.hunspell.data.aff.Aff.MAP>` table ( sets of potentially similar chars) and tries to replace them recursively. E.g., assuming ``MAP`` has entry ``aáã``, and we have a misspelling "anarchia", ``mapchars`` will do this: >>> [*pmt.mapchars("anarchia", ['aáã'])] ['ánarchia', 'ánárchia', 'ánárchiá', 'ánárchiã', 'ánãrchia', 'ánãrchiá', 'ánãrchiã', 'ãnarchia', 'ãnárchia', 'ãnárchiá', 'ãnárchiã', 'ãnãrchia', 'ãnãrchiá', 'ãnãrchiã'] """ if len(word) < 2 or not maptable: return def mapchars_internal(word, start=0): if start >= len(word): return for options in maptable: for option in options: pos = word.find(option, start) if pos != -1: for other in options: if other == option: continue replaced = word[:pos] + other + word[pos+len(option):] yield replaced for variant in mapchars_internal(replaced, pos + 1): yield variant yield from mapchars_internal(word)
- Parameters
word (str) –
maptable (List[Set[str]]) –
- Return type
Iterator[str]
-
swapchar
(word)[source] Produces permutations with adjacent chars swapped. For short (4 or 5 letters) words produces also doubleswaps: ahev -> have.
def swapchar(word: str) -> Iterator[str]: """ Produces permutations with adjacent chars swapped. For short (4 or 5 letters) words produces also doubleswaps: ahev -> have. """ if len(word) < 2: return for i in range(len(word) - 1): yield word[:i] + word[i+1] + word[i] + word[i+2:] # try double swaps for short words # ahev -> have, owudl -> would if len(word) in [4, 5]: yield word[1] + word[0] + (word[2] if len(word) == 5 else '') + word[-1] + word[-2] if len(word) == 5: yield word[0] + word[2] + word[1] + word[-1] + word[-2]
- Parameters
word (str) –
- Return type
Iterator[str]
-
longswapchar
(word)[source] Produces permutations with non-adjacent chars swapped (up to 4 chars distance)
def longswapchar(word: str) -> Iterator[str]: """ Produces permutations with non-adjacent chars swapped (up to 4 chars distance) """ for first in range(len(word) - 2): for second in range(first + 2, min(first + MAX_CHAR_DISTANCE, len(word))): yield word[:first] + word[second] + word[first+1:second] + word[first] + word[second+1:]
- Parameters
word (str) –
- Return type
Iterator[str]
-
badcharkey
(word, layout)[source] Produces permutations with chars replaced by adjacent chars on keyboard layout (“vat -> cat”) or downcased (if it was accidental uppercase).
Uses
aff.KEY
def badcharkey(word: str, layout: str) -> Iterator[str]: """ Produces permutations with chars replaced by adjacent chars on keyboard layout ("vat -> cat") or downcased (if it was accidental uppercase). Uses :attr:`aff.KEY <spylls.hunspell.data.aff.Aff.KEY>` """ for i, c in enumerate(word): before = word[:i] after = word[i+1:] if c != c.upper(): yield before + c.upper() + after if not layout: continue pos = layout.find(c) while pos != -1: if pos > 0 and layout[pos-1] != '|': yield before + layout[pos-1] + after if pos + 1 < len(layout) and layout[pos+1] != '|': yield before + layout[pos+1] + after pos = layout.find(c, pos+1)
- Parameters
word (str) –
layout (str) –
- Return type
Iterator[str]
-
extrachar
(word)[source] Produces permutations with one char removed in all possible positions
def extrachar(word: str) -> Iterator[str]: """ Produces permutations with one char removed in all possible positions """ if len(word) < 2: return for i in range(len(word)): yield word[:i] + word[i+1:]
- Parameters
word (str) –
- Return type
Iterator[str]
-
forgotchar
(word, trystring)[source] Produces permutations with one char inserted in all possible possitions.
List of chars is taken from
aff.TRY
– if it is absent, doesn’t try anything! Chars there are expected to be sorted in order of chars usage in language (most used characters first).def forgotchar(word: str, trystring: str) -> Iterator[str]: """ Produces permutations with one char inserted in all possible possitions. List of chars is taken from :attr:`aff.TRY <spylls.hunspell.data.aff.Aff.TRY>` -- if it is absent, doesn't try anything! Chars there are expected to be sorted in order of chars usage in language (most used characters first). """ if not trystring: return for c in trystring: for i in range(len(word) + 1): yield word[:i] + c + word[i:]
- Parameters
word (str) –
trystring (str) –
- Return type
Iterator[str]
-
movechar
(word)[source] Produces permutations with one character moved by 2, 3 or 4 places forward or backward (not 1, because it is already handled by
swapchar()
)def movechar(word: str) -> Iterator[str]: """ Produces permutations with one character moved by 2, 3 or 4 places forward or backward (not 1, because it is already handled by :meth:`swapchar`) """ if len(word) < 2: return for frompos, char in enumerate(word): for topos in range(frompos + 3, min(len(word), frompos + MAX_CHAR_DISTANCE + 1)): yield word[:frompos] + word[frompos+1:topos] + char + word[topos:] for frompos in reversed(range(len(word))): for topos in reversed(range(max(0, frompos - MAX_CHAR_DISTANCE + 1), frompos - 1)): yield word[:topos] + word[frompos] + word[topos:frompos] + word[frompos+1:]
- Parameters
word (str) –
- Return type
Iterator[str]
-
badchar
(word, trystring)[source] Produces permutations with chars replaced by chars in
aff.TRY
set.def badchar(word: str, trystring: str) -> Iterator[str]: """ Produces permutations with chars replaced by chars in :attr:`aff.TRY <spylls.hunspell.data.aff.Aff.TRY>` set. """ if not trystring: return for c in trystring: for i in reversed(range(len(word))): if word[i] == c: continue yield word[:i] + c + word[i+1:]
- Parameters
word (str) –
trystring (str) –
- Return type
Iterator[str]
-
doubletwochars
(word)[source] Produces permutations with accidental two-letter-doubling fixed (vacation -> vacacation)
def doubletwochars(word: str) -> Iterator[str]: """ Produces permutations with accidental two-letter-doubling fixed (vacation -> vacacation) """ if len(word) < 5: return # TODO: 1) for vacacation yields "vacation" twice, hunspell's algo kinda wiser # 2) maybe just use regexp?.. for i in range(2, len(word)): if word[i-2] == word[i] and word[i-3] == word[i-1]: yield word[:i-1] + word[i+1:]
- Parameters
word (str) –
- Return type
Iterator[str]
-
twowords
(word)[source] Produces permutation of splitting in two words in all possible positions.
def twowords(word: str) -> Iterator[List[str]]: """ Produces permutation of splitting in two words in all possible positions. """ for i in range(1, len(word)): yield [word[:i], word[i:]]
- Parameters
word (str) –
- Return type
Iterator[List[str]]
algo.string_metrics
-
commoncharacters
(s1, s2)[source] Number of occurrences of the exactly same characters in exactly same position.
def commoncharacters(s1: str, s2: str) -> int: """ Number of occurrences of the exactly same characters in exactly same position. """ return sum(c1 == c2 for c1, c2 in zip(s1, s2))
- Parameters
s1 (str) –
s2 (str) –
- Return type
int
-
leftcommonsubstring
(s1, s2)[source] Size of the common start of two strings. “foo”, “bar” => 0, “built”, “build” => 4, “cat”, “cats” => 3
def leftcommonsubstring(s1: str, s2: str) -> int: """Size of the common start of two strings. "foo", "bar" => 0, "built", "build" => 4, "cat", "cats" => 3""" for (i, (c1, c2)) in enumerate(zip(s1, s2)): if c1 != c2: return i return min(len(s1), len(s2))
- Parameters
s1 (str) –
s2 (str) –
- Return type
int
-
ngram
(max_ngram_size, s1, s2, *, weighted=False, any_mismatch=False, longer_worse=False)[source] Calculates how many of n-grams of s1 are contained in s2 (the more the number, the more words are similar).
- Parameters
max_ngram_size (int) – n in ngram
s1 (str) – string to compare
s2 (str) – string to compare
weighted (bool) – substract from result for ngrams not contained
longer_worse (bool) – add a penalty when second string is longer
any_mismatch (bool) – add a penalty for any string length difference
- Return type
int
FIXME: Actually, the last two settings do NOT participate in ngram counting by themselves, they are just adjusting the final score, but that’s how it was structured in Hunspell.
def ngram(max_ngram_size: int, s1: str, s2: str, *, weighted: bool = False, any_mismatch: bool = False, longer_worse: bool = False) -> int: """ Calculates how many of n-grams of s1 are contained in s2 (the more the number, the more words are similar). Args: max_ngram_size: n in ngram s1: string to compare s2: string to compare weighted: substract from result for ngrams *not* contained longer_worse: add a penalty when second string is longer any_mismatch: add a penalty for any string length difference FIXME: Actually, the last two settings do NOT participate in ngram counting by themselves, they are just adjusting the final score, but that's how it was structured in Hunspell. """ l2 = len(s2) if l2 == 0: return 0 l1 = len(s1) nscore = 0 # For all sizes of ngram up to desired... for ngram_size in range(1, max_ngram_size + 1): ns = 0 # Check every position in the first string for pos in range(l1 - ngram_size + 1): # ...and if the ngram of current size in this position is present in ANY place in second string if s1[pos:pos+ngram_size] in s2: # increase score ns += 1 elif weighted: # For "weighted" ngrams, decrease score if ngram is not found, ns -= 1 if pos == 0 or pos + ngram_size == l1: # ...and decrease once more if it was the beginning or end of first string ns -= 1 nscore += ns # there is no need to check for 4-gram if there were only one 3-gram if ns < 2 and not weighted: break # longer_worse setting adds a penalty if the second string is longer than first if longer_worse: penalty = (l2 - l1) - 2 # any_mismatch adds a penalty for _any_ string length difference elif any_mismatch: penalty = abs(l2 - l1) - 2 else: penalty = 0 return nscore - penalty if penalty > 0 else nscore
-
lcslen
(s1, s2)[source] Classic “LCS (longest common subsequence) length” algorithm. This implementation is stolen shamelessly from https://gist.github.com/cgt/c0c47c100efda1d11854
def lcslen(s1: str, s2: str) -> int: """ Classic "LCS (longest common subsequence) length" algorithm. This implementation is stolen shamelessly from https://gist.github.com/cgt/c0c47c100efda1d11854 """ m = len(s1) n = len(s2) c = [[0 for j in range(n+1)] for i in range(m+1)] for i in range(m): for j in range(n): if s1[i] == s2[j]: c[i][j] = c[i-1][j-1] + 1 elif c[i-1][j] >= c[i][j-1]: c[i][j] = c[i-1][j] else: c[i][j] = c[i][j-1] return c[m-1][n-1]
- Parameters
s1 (str) –
s2 (str) –
- Return type
int