Source code for geowatch.utils.util_stringalgo

"""
pip install pygtrie
"""
import numpy as np  # NOQA



[docs]
def shortest_unique_prefixes(items, sep=None, allow_simple=True, min_length=0, allow_end=False):
    r"""
    The shortest unique prefix algorithm.

    Args:
        items (List[str]): returned prefixes will be unique wrt this set
        sep (str): if specified, all characters between separators are treated
            as a single symbol. Makes the algo much faster.
        allow_simple (bool): if True tries to construct a simple feasible
            solution before resorting to the optimal trie algorithm.
        min_length (int): minimum length each prefix can be
        allow_end (bool): if True allows for string terminators to be
            considered in the prefix

    Returns:
        list of str: a prefix for each item that uniquely identifies it
           wrt to the original items.

    References:
        http://www.geeksforgeeks.org/find-all-shortest-unique-prefixes-to-represent-each-word-in-a-given-list/
        https://github.com/Briaares/InterviewBit/blob/master/Level6/Shortest%20Unique%20Prefix.cpp

    Requires:
        pip install pygtrie

    Example:
        >>> # xdoctest: +REQUIRES(module:pygtrie)
        >>> items = ["zebra", "dog", "duck", "dove"]
        >>> shortest_unique_prefixes(items)
        ['z', 'dog', 'du', 'dov']

    Example:
        >>> # xdoctest: +REQUIRES(module:pygtrie)
        >>> from geowatch.utils.util_stringalgo import *  # NOQA
        >>> smeti = ["params.metrics.foo.mean", "params.metrics.foo.std", "params.metrics.foo.count"]
        >>> items = [p[::-1] for p in smeti]
        >>> euqine = shortest_unique_prefixes(items, sep='.', min_length=2)
        >>> unique = [p[::-1] for p in euqine]
        >>> print(f'unique={unique}')
        unique=['foo.mean', 'foo.std', 'foo.count']

    Timeing:
        >>> # DISABLE_DOCTEST
        >>> # make numbers larger to stress test
        >>> # L = max length of a string, N = number of strings,
        >>> # C = smallest gaurenteed common length
        >>> # (the setting N=10000, L=100, C=20 is feasible we are good)
        >>> import timerit
        >>> import random
        >>> def make_data(N, L, C):
        >>>     rng = random.Random(0)
        >>>     return [''.join(['a' if i < C else chr(rng.randint(97, 122))
        >>>                      for i in range(L)]) for _ in range(N)]
        >>> items = make_data(N=1000, L=10, C=0)
        >>> timerit.Timerit(3).call(shortest_unique_prefixes, items).print()
        Timed for: 3 loops, best of 3
            time per loop: best=24.54 ms, mean=24.54 ± 0.0 ms
        >>> items = make_data(N=1000, L=100, C=0)
        >>> timerit.Timerit(3).call(shortest_unique_prefixes, items).print()
        Timed for: 3 loops, best of 3
            time per loop: best=155.4 ms, mean=155.4 ± 0.0 ms
        >>> items = make_data(N=1000, L=100, C=70)
        >>> timerit.Timerit(3).call(shortest_unique_prefixes, items).print()
        Timed for: 3 loops, best of 3
            time per loop: best=232.8 ms, mean=232.8 ± 0.0 ms
        >>> items = make_data(N=10000, L=250, C=20)
        >>> timerit.Timerit(3).call(shortest_unique_prefixes, items).print()
        Timed for: 3 loops, best of 3
            time per loop: best=4.063 s, mean=4.063 ± 0.0 s
    """
    import pygtrie
    if len(set(items)) != len(items):
        raise ValueError('inputs must be unique')

    # construct trie
    if sep is None:
        trie = pygtrie.CharTrie.fromkeys(items, value=0)
    else:
        # In some simple cases we can avoid constructing a trie
        if allow_simple and min_length <= 1:
            tokens = [item.split(sep) for item in items]
            simple_solution = [t[0] for t in tokens]
            if len(simple_solution) == len(set(simple_solution)):
                return simple_solution
            for i in range(2, 10):
                # print('return simple solution at i = {!r}'.format(i))
                simple_solution = [sep.join(t[:i]) for t in tokens]
                if len(simple_solution) == len(set(simple_solution)):
                    return simple_solution

        trie = pygtrie.StringTrie.fromkeys(items, value=0, separator=sep)

    # Set the value (frequency) of all nodes to zero.
    for node in _trie_iternodes(trie):
        node.value = 0

    # For each item trace its path and increment frequencies
    for item in items:
        final_node, trace = trie._get_node(item)
        for key, node in trace:
            node.value += 1

    # if not isinstance(node.value, int):
    #     node.value = 0

    # Query for the first prefix with frequency 1 for each item.
    # This is the shortest unique prefix over all items.
    unique = []
    for item in items:
        freq = None
        for prefix, freq in trie.prefixes(item):
            if freq == 1:
                if sep is None:
                    prefix_length = len(prefix)
                else:
                    prefix_length = prefix.count(sep) + bool(len(prefix))
                if prefix_length >= min_length:
                    break
        if not allow_end:
            assert freq == 1, 'item={} has no unique prefix. freq={}'.format(item, freq)
        # print('items = {!r}'.format(items))
        unique.append(prefix)
    return unique



def _trie_iternodes(self):
    """
    Generates all nodes in the trie

    # Hack into the internal structure and insert frequencies at each node

    Ignore:
            from geowatch.utils.util_dotdict import *  # NOQA
            dat = DotDict({
                'proc1.param1': 1,
                'proc1.param2': 2,
                'proc2.param1': 3,
                'proc4.part1.param2': 8,
                'proc4.part2.param2': 9,
                'proc4.part2.param2': 10,
            })
            self = dat._prefix_trie
            for n in list(_trie_iternodes(self)):
                print(f'n.value={n.value}')
                ...
    """
    from collections import deque
    stack = deque([[self._root]])
    while stack:
        for node in stack.pop():
            yield node
            try:
                # only works in pygtrie-2.2 broken in pygtrie-2.3.2
                stack.append(node.children.values())
            except AttributeError:
                stack.append([v for k, v in node.children.iteritems()])


def _trie_iteritems(self):
    """
    Generates all nodes in the trie

    # Hack into the internal structure and insert frequencies at each node

    Ignore:
        from geowatch.utils.util_dotdict import *  # NOQA
        from geowatch.utils.util_stringalgo import *  # NOQA
        dat = DotDict({
            'proc1.param1': 1,
            'proc1.param2': 2,
            'proc2.param1': 3,
            'proc4.part1.param2': 8,
            'proc4.part2.param2': 9,
            'proc4.part2.param2': 10,
        })
        self = dat._prefix_trie
        for k, v in list(_trie_iteritems(self)):
            print(f'k={k}')
    """
    from collections import deque
    sentinel = object()
    stack = deque([[(sentinel, self._root)]])
    while stack:
        level = stack.pop()
        for key, node in level:
            if key is not sentinel:
                yield key, node
            # only works in pygtrie-2.2 broken in pygtrie-2.3.2
            try:
                stack.append(list(node.children.items()))
            except AttributeError:
                stack.append(list(node.children.iteritems()))



[docs]
def shortest_unique_suffixes(items, sep=None, min_length=0):
    r"""
    Example:
        >>> # xdoctest: +REQUIRES(--pygtrie)
        >>> items = ["zebra", "dog", "duck", "dove"]
        >>> shortest_unique_suffixes(items)
        ['a', 'g', 'k', 'e']

    Example:
        >>> # xdoctest: +REQUIRES(--pygtrie)
        >>> items = ["aa/bb/cc", "aa/bb/bc", "aa/bb/dc", "aa/cc/cc"]
        >>> shortest_unique_suffixes(items)
    """
    snoitpo = [p[::-1] for p in items]
    sexiffus = shortest_unique_prefixes(snoitpo, sep=sep, min_length=min_length)
    suffixes = [s[::-1] for s in sexiffus]
    return suffixes