algo.permutations and algo.string_metrics: string processing
algo.permutations
Note: names of methods in this module, if seem weird, are the same as in Hunspell’s suggest.cxx
to keep track of them.
-
replchars(word, reptable)[source] Uses
aff.REPtable (typical misspellings) to replace in the word provided. If the pattern’s replacement contains “_”, it means replacing to ” ” and yielding _two_ different hypotheses: it was one (dictionary) word “foo bar” (and should be checked as such) or it was words [“foo”, “bar”] and should be checked separately.def replchars(word: str, reptable: List[aff.RepPattern]) -> Iterator[Union[str, List[str]]]: """ Uses :attr:`aff.REP <spylls.hunspell.data.aff.Aff.REP>` table (typical misspellings) to replace in the word provided. If the pattern's replacement contains "_", it means replacing to " " and yielding _two_ different hypotheses: it was one (dictionary) word "foo bar" (and should be checked as such) or it was words ["foo", "bar"] and should be checked separately. """ if len(word) < 2 or not reptable: return for pattern in reptable: # TODO: compiled at aff loading for match in pattern.regexp.finditer(word): suggestion = word[:match.start()] + pattern.replacement.replace('_', ' ') + word[match.end():] yield suggestion if ' ' in suggestion: yield suggestion.split(' ', 2)
- Parameters:
word (str) –
reptable (List[spylls.hunspell.data.aff.RepPattern]) –
- Return type:
Iterator[Union[str, List[str]]]
-
mapchars(word, maptable)[source] Uses
aff.MAPtable ( sets of potentially similar chars) and tries to replace them recursively. E.g., assumingMAPhas entryaáã, and we have a misspelling “anarchia”,mapcharswill do this:>>> [*pmt.mapchars("anarchia", ['aáã'])] ['ánarchia', 'ánárchia', 'ánárchiá', 'ánárchiã', 'ánãrchia', 'ánãrchiá', 'ánãrchiã', 'ãnarchia', 'ãnárchia', 'ãnárchiá', 'ãnárchiã', 'ãnãrchia', 'ãnãrchiá', 'ãnãrchiã']
def mapchars(word: str, maptable: List[Set[str]]) -> Iterator[str]: """ Uses :attr:`aff.MAP <spylls.hunspell.data.aff.Aff.MAP>` table ( sets of potentially similar chars) and tries to replace them recursively. E.g., assuming ``MAP`` has entry ``aáã``, and we have a misspelling "anarchia", ``mapchars`` will do this: >>> [*pmt.mapchars("anarchia", ['aáã'])] ['ánarchia', 'ánárchia', 'ánárchiá', 'ánárchiã', 'ánãrchia', 'ánãrchiá', 'ánãrchiã', 'ãnarchia', 'ãnárchia', 'ãnárchiá', 'ãnárchiã', 'ãnãrchia', 'ãnãrchiá', 'ãnãrchiã'] """ if len(word) < 2 or not maptable: return def mapchars_internal(word, start=0): if start >= len(word): return for options in maptable: for option in options: pos = word.find(option, start) if pos != -1: for other in options: if other == option: continue replaced = word[:pos] + other + word[pos+len(option):] yield replaced for variant in mapchars_internal(replaced, pos + 1): yield variant yield from mapchars_internal(word)
- Parameters:
word (str) –
maptable (List[Set[str]]) –
- Return type:
Iterator[str]
-
swapchar(word)[source] Produces permutations with adjacent chars swapped. For short (4 or 5 letters) words produces also doubleswaps: ahev -> have.
def swapchar(word: str) -> Iterator[str]: """ Produces permutations with adjacent chars swapped. For short (4 or 5 letters) words produces also doubleswaps: ahev -> have. """ if len(word) < 2: return for i in range(len(word) - 1): yield word[:i] + word[i+1] + word[i] + word[i+2:] # try double swaps for short words # ahev -> have, owudl -> would if len(word) in [4, 5]: yield word[1] + word[0] + (word[2] if len(word) == 5 else '') + word[-1] + word[-2] if len(word) == 5: yield word[0] + word[2] + word[1] + word[-1] + word[-2]
- Parameters:
word (str) –
- Return type:
Iterator[str]
-
longswapchar(word)[source] Produces permutations with non-adjacent chars swapped (up to 4 chars distance)
def longswapchar(word: str) -> Iterator[str]: """ Produces permutations with non-adjacent chars swapped (up to 4 chars distance) """ for first in range(len(word) - 2): for second in range(first + 2, min(first + MAX_CHAR_DISTANCE, len(word))): yield word[:first] + word[second] + word[first+1:second] + word[first] + word[second+1:]
- Parameters:
word (str) –
- Return type:
Iterator[str]
-
badcharkey(word, layout)[source] Produces permutations with chars replaced by adjacent chars on keyboard layout (“vat -> cat”) or downcased (if it was accidental uppercase).
Uses
aff.KEYdef badcharkey(word: str, layout: str) -> Iterator[str]: """ Produces permutations with chars replaced by adjacent chars on keyboard layout ("vat -> cat") or downcased (if it was accidental uppercase). Uses :attr:`aff.KEY <spylls.hunspell.data.aff.Aff.KEY>` """ for i, c in enumerate(word): before = word[:i] after = word[i+1:] if c != c.upper(): yield before + c.upper() + after if not layout: continue pos = layout.find(c) while pos != -1: if pos > 0 and layout[pos-1] != '|': yield before + layout[pos-1] + after if pos + 1 < len(layout) and layout[pos+1] != '|': yield before + layout[pos+1] + after pos = layout.find(c, pos+1)
- Parameters:
word (str) –
layout (str) –
- Return type:
Iterator[str]
-
extrachar(word)[source] Produces permutations with one char removed in all possible positions
def extrachar(word: str) -> Iterator[str]: """ Produces permutations with one char removed in all possible positions """ if len(word) < 2: return for i in range(len(word)): yield word[:i] + word[i+1:]
- Parameters:
word (str) –
- Return type:
Iterator[str]
-
forgotchar(word, trystring)[source] Produces permutations with one char inserted in all possible possitions.
List of chars is taken from
aff.TRY– if it is absent, doesn’t try anything! Chars there are expected to be sorted in order of chars usage in language (most used characters first).def forgotchar(word: str, trystring: str) -> Iterator[str]: """ Produces permutations with one char inserted in all possible possitions. List of chars is taken from :attr:`aff.TRY <spylls.hunspell.data.aff.Aff.TRY>` -- if it is absent, doesn't try anything! Chars there are expected to be sorted in order of chars usage in language (most used characters first). """ if not trystring: return for c in trystring: for i in range(len(word) + 1): yield word[:i] + c + word[i:]
- Parameters:
word (str) –
trystring (str) –
- Return type:
Iterator[str]
-
movechar(word)[source] Produces permutations with one character moved by 2, 3 or 4 places forward or backward (not 1, because it is already handled by
swapchar())def movechar(word: str) -> Iterator[str]: """ Produces permutations with one character moved by 2, 3 or 4 places forward or backward (not 1, because it is already handled by :meth:`swapchar`) """ if len(word) < 2: return for frompos, char in enumerate(word): for topos in range(frompos + 3, min(len(word), frompos + MAX_CHAR_DISTANCE + 1)): yield word[:frompos] + word[frompos+1:topos] + char + word[topos:] for frompos in reversed(range(len(word))): for topos in reversed(range(max(0, frompos - MAX_CHAR_DISTANCE + 1), frompos - 1)): yield word[:topos] + word[frompos] + word[topos:frompos] + word[frompos+1:]
- Parameters:
word (str) –
- Return type:
Iterator[str]
-
badchar(word, trystring)[source] Produces permutations with chars replaced by chars in
aff.TRYset.def badchar(word: str, trystring: str) -> Iterator[str]: """ Produces permutations with chars replaced by chars in :attr:`aff.TRY <spylls.hunspell.data.aff.Aff.TRY>` set. """ if not trystring: return for c in trystring: for i in reversed(range(len(word))): if word[i] == c: continue yield word[:i] + c + word[i+1:]
- Parameters:
word (str) –
trystring (str) –
- Return type:
Iterator[str]
-
doubletwochars(word)[source] Produces permutations with accidental two-letter-doubling fixed (vacation -> vacacation)
def doubletwochars(word: str) -> Iterator[str]: """ Produces permutations with accidental two-letter-doubling fixed (vacation -> vacacation) """ if len(word) < 5: return # TODO: 1) for vacacation yields "vacation" twice, hunspell's algo kinda wiser # 2) maybe just use regexp?.. for i in range(2, len(word)): if word[i-2] == word[i] and word[i-3] == word[i-1]: yield word[:i-1] + word[i+1:]
- Parameters:
word (str) –
- Return type:
Iterator[str]
-
twowords(word)[source] Produces permutation of splitting in two words in all possible positions.
def twowords(word: str) -> Iterator[List[str]]: """ Produces permutation of splitting in two words in all possible positions. """ for i in range(1, len(word)): yield [word[:i], word[i:]]
- Parameters:
word (str) –
- Return type:
Iterator[List[str]]
algo.string_metrics
-
commoncharacters(s1, s2)[source] Number of occurrences of the exactly same characters in exactly same position.
def commoncharacters(s1: str, s2: str) -> int: """ Number of occurrences of the exactly same characters in exactly same position. """ return sum(c1 == c2 for c1, c2 in zip(s1, s2))
- Parameters:
s1 (str) –
s2 (str) –
- Return type:
int
-
leftcommonsubstring(s1, s2)[source] Size of the common start of two strings. “foo”, “bar” => 0, “built”, “build” => 4, “cat”, “cats” => 3
def leftcommonsubstring(s1: str, s2: str) -> int: """Size of the common start of two strings. "foo", "bar" => 0, "built", "build" => 4, "cat", "cats" => 3""" for (i, (c1, c2)) in enumerate(zip(s1, s2)): if c1 != c2: return i return min(len(s1), len(s2))
- Parameters:
s1 (str) –
s2 (str) –
- Return type:
int
-
ngram(max_ngram_size, s1, s2, *, weighted=False, any_mismatch=False, longer_worse=False)[source] Calculates how many of n-grams of s1 are contained in s2 (the more the number, the more words are similar).
- Parameters:
max_ngram_size (int) – n in ngram
s1 (str) – string to compare
s2 (str) – string to compare
weighted (bool) – substract from result for ngrams not contained
longer_worse (bool) – add a penalty when second string is longer
any_mismatch (bool) – add a penalty for any string length difference
- Return type:
int
FIXME: Actually, the last two settings do NOT participate in ngram counting by themselves, they are just adjusting the final score, but that’s how it was structured in Hunspell.
def ngram(max_ngram_size: int, s1: str, s2: str, *, weighted: bool = False, any_mismatch: bool = False, longer_worse: bool = False) -> int: """ Calculates how many of n-grams of s1 are contained in s2 (the more the number, the more words are similar). Args: max_ngram_size: n in ngram s1: string to compare s2: string to compare weighted: substract from result for ngrams *not* contained longer_worse: add a penalty when second string is longer any_mismatch: add a penalty for any string length difference FIXME: Actually, the last two settings do NOT participate in ngram counting by themselves, they are just adjusting the final score, but that's how it was structured in Hunspell. """ l2 = len(s2) if l2 == 0: return 0 l1 = len(s1) nscore = 0 # For all sizes of ngram up to desired... for ngram_size in range(1, max_ngram_size + 1): ns = 0 # Check every position in the first string for pos in range(l1 - ngram_size + 1): # ...and if the ngram of current size in this position is present in ANY place in second string if s1[pos:pos+ngram_size] in s2: # increase score ns += 1 elif weighted: # For "weighted" ngrams, decrease score if ngram is not found, ns -= 1 if pos == 0 or pos + ngram_size == l1: # ...and decrease once more if it was the beginning or end of first string ns -= 1 nscore += ns # there is no need to check for 4-gram if there were only one 3-gram if ns < 2 and not weighted: break # longer_worse setting adds a penalty if the second string is longer than first if longer_worse: penalty = (l2 - l1) - 2 # any_mismatch adds a penalty for _any_ string length difference elif any_mismatch: penalty = abs(l2 - l1) - 2 else: penalty = 0 return nscore - penalty if penalty > 0 else nscore
-
lcslen(s1, s2)[source] Classic “LCS (longest common subsequence) length” algorithm. This implementation is stolen shamelessly from https://gist.github.com/cgt/c0c47c100efda1d11854
def lcslen(s1: str, s2: str) -> int: """ Classic "LCS (longest common subsequence) length" algorithm. This implementation is stolen shamelessly from https://gist.github.com/cgt/c0c47c100efda1d11854 """ m = len(s1) n = len(s2) c = [[0 for j in range(n+1)] for i in range(m+1)] for i in range(m): for j in range(n): if s1[i] == s2[j]: c[i][j] = c[i-1][j-1] + 1 elif c[i-1][j] >= c[i][j-1]: c[i][j] = c[i-1][j] else: c[i][j] = c[i][j-1] return c[m-1][n-1]
- Parameters:
s1 (str) –
s2 (str) –
- Return type:
int