algo.permutations and algo.string_metrics: string processing

algo.permutations

Note: names of methods in this module, if seem weird, are the same as in Hunspell’s suggest.cxx to keep track of them.

replchars(word, reptable)[source]

Uses aff.REP table (typical misspellings) to replace in the word provided. If the pattern’s replacement contains “_”, it means replacing to ” ” and yielding _two_ different hypotheses: it was one (dictionary) word “foo bar” (and should be checked as such) or it was words [“foo”, “bar”] and should be checked separately.

def replchars(word: str, reptable: List[aff.RepPattern]) -> Iterator[Union[str, List[str]]]:
    """
    Uses :attr:`aff.REP <spylls.hunspell.data.aff.Aff.REP>` table (typical misspellings) to replace
    in the word provided. If the pattern's replacement contains "_", it means replacing to " " and
    yielding _two_ different hypotheses: it was one (dictionary) word "foo bar" (and should be
    checked as such) or it was words ["foo", "bar"] and should be checked separately.
    """

    if len(word) < 2 or not reptable:
        return

    for pattern in reptable:
        # TODO: compiled at aff loading
        for match in pattern.regexp.finditer(word):
            suggestion = word[:match.start()] + pattern.replacement.replace('_', ' ') + word[match.end():]
            yield suggestion
            if ' ' in suggestion:
                yield suggestion.split(' ', 2)
Parameters:
Return type:

Iterator[Union[str, List[str]]]

mapchars(word, maptable)[source]

Uses aff.MAP table ( sets of potentially similar chars) and tries to replace them recursively. E.g., assuming MAP has entry aáã, and we have a misspelling “anarchia”, mapchars will do this:

>>> [*pmt.mapchars("anarchia", ['aáã'])]
['ánarchia',
 'ánárchia',
 'ánárchiá',
 'ánárchiã',
 'ánãrchia',
 'ánãrchiá',
 'ánãrchiã',
 'ãnarchia',
 'ãnárchia',
 'ãnárchiá',
 'ãnárchiã',
 'ãnãrchia',
 'ãnãrchiá',
 'ãnãrchiã']
def mapchars(word: str, maptable: List[Set[str]]) -> Iterator[str]:
    """
    Uses :attr:`aff.MAP <spylls.hunspell.data.aff.Aff.MAP>` table ( sets of potentially similar chars)
    and tries to replace them recursively. E.g., assuming ``MAP`` has entry ``aáã``, and we have
    a misspelling "anarchia", ``mapchars`` will do this:

    >>> [*pmt.mapchars("anarchia", ['aáã'])]
    ['ánarchia',
     'ánárchia',
     'ánárchiá',
     'ánárchiã',
     'ánãrchia',
     'ánãrchiá',
     'ánãrchiã',
     'ãnarchia',
     'ãnárchia',
     'ãnárchiá',
     'ãnárchiã',
     'ãnãrchia',
     'ãnãrchiá',
     'ãnãrchiã']
    """

    if len(word) < 2 or not maptable:
        return

    def mapchars_internal(word, start=0):
        if start >= len(word):
            return

        for options in maptable:
            for option in options:
                pos = word.find(option, start)
                if pos != -1:
                    for other in options:
                        if other == option:
                            continue
                        replaced = word[:pos] + other + word[pos+len(option):]
                        yield replaced
                        for variant in mapchars_internal(replaced, pos + 1):
                            yield variant

    yield from mapchars_internal(word)
Parameters:
  • word (str) –

  • maptable (List[Set[str]]) –

Return type:

Iterator[str]

swapchar(word)[source]

Produces permutations with adjacent chars swapped. For short (4 or 5 letters) words produces also doubleswaps: ahev -> have.

def swapchar(word: str) -> Iterator[str]:
    """
    Produces permutations with adjacent chars swapped. For short (4 or 5 letters) words produces
    also doubleswaps: ahev -> have.
    """

    if len(word) < 2:
        return

    for i in range(len(word) - 1):
        yield word[:i] + word[i+1] + word[i] + word[i+2:]

    # try double swaps for short words
    # ahev -> have, owudl -> would
    if len(word) in [4,  5]:
        yield word[1] + word[0] + (word[2] if len(word) == 5 else '') + word[-1] + word[-2]
        if len(word) == 5:
            yield word[0] + word[2] + word[1] + word[-1] + word[-2]
Parameters:

word (str) –

Return type:

Iterator[str]

longswapchar(word)[source]

Produces permutations with non-adjacent chars swapped (up to 4 chars distance)

def longswapchar(word: str) -> Iterator[str]:
    """
    Produces permutations with non-adjacent chars swapped (up to 4 chars distance)
    """

    for first in range(len(word) - 2):
        for second in range(first + 2, min(first + MAX_CHAR_DISTANCE, len(word))):
            yield word[:first] + word[second] + word[first+1:second] + word[first] + word[second+1:]
Parameters:

word (str) –

Return type:

Iterator[str]

badcharkey(word, layout)[source]

Produces permutations with chars replaced by adjacent chars on keyboard layout (“vat -> cat”) or downcased (if it was accidental uppercase).

Uses aff.KEY

def badcharkey(word: str, layout: str) -> Iterator[str]:
    """
    Produces permutations with chars replaced by adjacent chars on keyboard layout ("vat -> cat")
    or downcased (if it was accidental uppercase).

    Uses :attr:`aff.KEY <spylls.hunspell.data.aff.Aff.KEY>`
    """

    for i, c in enumerate(word):
        before = word[:i]
        after = word[i+1:]
        if c != c.upper():
            yield before + c.upper() + after

        if not layout:
            continue

        pos = layout.find(c)
        while pos != -1:
            if pos > 0 and layout[pos-1] != '|':
                yield before + layout[pos-1] + after
            if pos + 1 < len(layout) and layout[pos+1] != '|':
                yield before + layout[pos+1] + after
            pos = layout.find(c, pos+1)
Parameters:
  • word (str) –

  • layout (str) –

Return type:

Iterator[str]

extrachar(word)[source]

Produces permutations with one char removed in all possible positions

def extrachar(word: str) -> Iterator[str]:
    """
    Produces permutations with one char removed in all possible positions
    """
    if len(word) < 2:
        return

    for i in range(len(word)):
        yield word[:i] + word[i+1:]
Parameters:

word (str) –

Return type:

Iterator[str]

forgotchar(word, trystring)[source]

Produces permutations with one char inserted in all possible possitions.

List of chars is taken from aff.TRY – if it is absent, doesn’t try anything! Chars there are expected to be sorted in order of chars usage in language (most used characters first).

def forgotchar(word: str, trystring: str) -> Iterator[str]:
    """
    Produces permutations with one char inserted in all possible possitions.

    List of chars is taken from :attr:`aff.TRY <spylls.hunspell.data.aff.Aff.TRY>` -- if it is absent,
    doesn't try anything! Chars there are expected to be sorted in order of chars usage in language
    (most used characters first).
    """

    if not trystring:
        return

    for c in trystring:
        for i in range(len(word) + 1):
            yield word[:i] + c + word[i:]
Parameters:
  • word (str) –

  • trystring (str) –

Return type:

Iterator[str]

movechar(word)[source]

Produces permutations with one character moved by 2, 3 or 4 places forward or backward (not 1, because it is already handled by swapchar())

def movechar(word: str) -> Iterator[str]:
    """
    Produces permutations with one character moved by 2, 3 or 4 places forward or backward (not 1,
    because it is already handled by :meth:`swapchar`)
    """

    if len(word) < 2:
        return

    for frompos, char in enumerate(word):
        for topos in range(frompos + 3, min(len(word), frompos + MAX_CHAR_DISTANCE + 1)):
            yield word[:frompos] + word[frompos+1:topos] + char + word[topos:]

    for frompos in reversed(range(len(word))):
        for topos in reversed(range(max(0, frompos - MAX_CHAR_DISTANCE + 1), frompos - 1)):
            yield word[:topos] + word[frompos] + word[topos:frompos] + word[frompos+1:]
Parameters:

word (str) –

Return type:

Iterator[str]

badchar(word, trystring)[source]

Produces permutations with chars replaced by chars in aff.TRY set.

def badchar(word: str, trystring: str) -> Iterator[str]:
    """
    Produces permutations with chars replaced by chars in :attr:`aff.TRY <spylls.hunspell.data.aff.Aff.TRY>`
    set.
    """

    if not trystring:
        return

    for c in trystring:
        for i in reversed(range(len(word))):
            if word[i] == c:
                continue
            yield word[:i] + c + word[i+1:]
Parameters:
  • word (str) –

  • trystring (str) –

Return type:

Iterator[str]

doubletwochars(word)[source]

Produces permutations with accidental two-letter-doubling fixed (vacation -> vacacation)

def doubletwochars(word: str) -> Iterator[str]:
    """
    Produces permutations with accidental two-letter-doubling fixed (vacation -> vacacation)
    """

    if len(word) < 5:
        return

    # TODO: 1) for vacacation yields "vacation" twice, hunspell's algo kinda wiser
    # 2) maybe just use regexp?..
    for i in range(2, len(word)):
        if word[i-2] == word[i] and word[i-3] == word[i-1]:
            yield word[:i-1] + word[i+1:]
Parameters:

word (str) –

Return type:

Iterator[str]

twowords(word)[source]

Produces permutation of splitting in two words in all possible positions.

def twowords(word: str) -> Iterator[List[str]]:
    """
    Produces permutation of splitting in two words in all possible positions.
    """

    for i in range(1, len(word)):
        yield [word[:i], word[i:]]
Parameters:

word (str) –

Return type:

Iterator[List[str]]

algo.string_metrics

commoncharacters(s1, s2)[source]

Number of occurrences of the exactly same characters in exactly same position.

def commoncharacters(s1: str, s2: str) -> int:
    """
    Number of occurrences of the exactly same characters in exactly same position.
    """

    return sum(c1 == c2 for c1, c2 in zip(s1, s2))
Parameters:
  • s1 (str) –

  • s2 (str) –

Return type:

int

leftcommonsubstring(s1, s2)[source]

Size of the common start of two strings. “foo”, “bar” => 0, “built”, “build” => 4, “cat”, “cats” => 3

def leftcommonsubstring(s1: str, s2: str) -> int:
    """Size of the common start of two strings. "foo", "bar" => 0, "built", "build" => 4, "cat", "cats" => 3"""
    for (i, (c1, c2)) in enumerate(zip(s1, s2)):
        if c1 != c2:
            return i

    return min(len(s1), len(s2))
Parameters:
  • s1 (str) –

  • s2 (str) –

Return type:

int

ngram(max_ngram_size, s1, s2, *, weighted=False, any_mismatch=False, longer_worse=False)[source]

Calculates how many of n-grams of s1 are contained in s2 (the more the number, the more words are similar).

Parameters:
  • max_ngram_size (int) – n in ngram

  • s1 (str) – string to compare

  • s2 (str) – string to compare

  • weighted (bool) – substract from result for ngrams not contained

  • longer_worse (bool) – add a penalty when second string is longer

  • any_mismatch (bool) – add a penalty for any string length difference

Return type:

int

FIXME: Actually, the last two settings do NOT participate in ngram counting by themselves, they are just adjusting the final score, but that’s how it was structured in Hunspell.

def ngram(max_ngram_size: int, s1: str, s2: str, *,
          weighted: bool = False, any_mismatch: bool = False, longer_worse: bool = False) -> int:

    """
    Calculates how many of n-grams of s1 are contained in s2 (the more the number, the more words
    are similar).

    Args:
      max_ngram_size: n in ngram
      s1: string to compare
      s2: string to compare
      weighted: substract from result for ngrams *not* contained
      longer_worse: add a penalty when second string is longer
      any_mismatch: add a penalty for any string length difference

    FIXME: Actually, the last two settings do NOT participate in ngram counting by themselves, they
    are just adjusting the final score, but that's how it was structured in Hunspell.
    """

    l2 = len(s2)
    if l2 == 0:
        return 0
    l1 = len(s1)

    nscore = 0
    # For all sizes of ngram up to desired...
    for ngram_size in range(1, max_ngram_size + 1):
        ns = 0
        # Check every position in the first string
        for pos in range(l1 - ngram_size + 1):
            # ...and if the ngram of current size in this position is present in ANY place in second string
            if s1[pos:pos+ngram_size] in s2:
                # increase score
                ns += 1
            elif weighted:
                # For "weighted" ngrams, decrease score if ngram is not found,
                ns -= 1
                if pos == 0 or pos + ngram_size == l1:
                    # ...and decrease once more if it was the beginning or end of first string
                    ns -= 1
        nscore += ns
        # there is no need to check for 4-gram if there were only one 3-gram
        if ns < 2 and not weighted:
            break

    # longer_worse setting adds a penalty if the second string is longer than first
    if longer_worse:
        penalty = (l2 - l1) - 2
    # any_mismatch adds a penalty for _any_ string length difference
    elif any_mismatch:
        penalty = abs(l2 - l1) - 2
    else:
        penalty = 0

    return nscore - penalty if penalty > 0 else nscore
lcslen(s1, s2)[source]

Classic “LCS (longest common subsequence) length” algorithm. This implementation is stolen shamelessly from https://gist.github.com/cgt/c0c47c100efda1d11854

def lcslen(s1: str, s2: str) -> int:
    """
    Classic "LCS (longest common subsequence) length" algorithm.
    This implementation is stolen shamelessly from https://gist.github.com/cgt/c0c47c100efda1d11854
    """

    m = len(s1)
    n = len(s2)

    c = [[0 for j in range(n+1)] for i in range(m+1)]

    for i in range(m):
        for j in range(n):
            if s1[i] == s2[j]:
                c[i][j] = c[i-1][j-1] + 1
            elif c[i-1][j] >= c[i][j-1]:
                c[i][j] = c[i-1][j]
            else:
                c[i][j] = c[i][j-1]

    return c[m-1][n-1]
Parameters:
  • s1 (str) –

  • s2 (str) –

Return type:

int