"""
pip install pygtrie
"""
import numpy as np # NOQA
[docs]
def shortest_unique_prefixes(items, sep=None, allow_simple=True, min_length=0, allow_end=False):
r"""
The shortest unique prefix algorithm.
Args:
items (List[str]): returned prefixes will be unique wrt this set
sep (str): if specified, all characters between separators are treated
as a single symbol. Makes the algo much faster.
allow_simple (bool): if True tries to construct a simple feasible
solution before resorting to the optimal trie algorithm.
min_length (int): minimum length each prefix can be
allow_end (bool): if True allows for string terminators to be
considered in the prefix
Returns:
list of str: a prefix for each item that uniquely identifies it
wrt to the original items.
References:
http://www.geeksforgeeks.org/find-all-shortest-unique-prefixes-to-represent-each-word-in-a-given-list/
https://github.com/Briaares/InterviewBit/blob/master/Level6/Shortest%20Unique%20Prefix.cpp
Requires:
pip install pygtrie
Example:
>>> # xdoctest: +REQUIRES(module:pygtrie)
>>> items = ["zebra", "dog", "duck", "dove"]
>>> shortest_unique_prefixes(items)
['z', 'dog', 'du', 'dov']
Example:
>>> # xdoctest: +REQUIRES(module:pygtrie)
>>> from geowatch.utils.util_stringalgo import * # NOQA
>>> smeti = ["params.metrics.foo.mean", "params.metrics.foo.std", "params.metrics.foo.count"]
>>> items = [p[::-1] for p in smeti]
>>> euqine = shortest_unique_prefixes(items, sep='.', min_length=2)
>>> unique = [p[::-1] for p in euqine]
>>> print(f'unique={unique}')
unique=['foo.mean', 'foo.std', 'foo.count']
Timeing:
>>> # DISABLE_DOCTEST
>>> # make numbers larger to stress test
>>> # L = max length of a string, N = number of strings,
>>> # C = smallest gaurenteed common length
>>> # (the setting N=10000, L=100, C=20 is feasible we are good)
>>> import timerit
>>> import random
>>> def make_data(N, L, C):
>>> rng = random.Random(0)
>>> return [''.join(['a' if i < C else chr(rng.randint(97, 122))
>>> for i in range(L)]) for _ in range(N)]
>>> items = make_data(N=1000, L=10, C=0)
>>> timerit.Timerit(3).call(shortest_unique_prefixes, items).print()
Timed for: 3 loops, best of 3
time per loop: best=24.54 ms, mean=24.54 ± 0.0 ms
>>> items = make_data(N=1000, L=100, C=0)
>>> timerit.Timerit(3).call(shortest_unique_prefixes, items).print()
Timed for: 3 loops, best of 3
time per loop: best=155.4 ms, mean=155.4 ± 0.0 ms
>>> items = make_data(N=1000, L=100, C=70)
>>> timerit.Timerit(3).call(shortest_unique_prefixes, items).print()
Timed for: 3 loops, best of 3
time per loop: best=232.8 ms, mean=232.8 ± 0.0 ms
>>> items = make_data(N=10000, L=250, C=20)
>>> timerit.Timerit(3).call(shortest_unique_prefixes, items).print()
Timed for: 3 loops, best of 3
time per loop: best=4.063 s, mean=4.063 ± 0.0 s
"""
import pygtrie
if len(set(items)) != len(items):
raise ValueError('inputs must be unique')
# construct trie
if sep is None:
trie = pygtrie.CharTrie.fromkeys(items, value=0)
else:
# In some simple cases we can avoid constructing a trie
if allow_simple and min_length <= 1:
tokens = [item.split(sep) for item in items]
simple_solution = [t[0] for t in tokens]
if len(simple_solution) == len(set(simple_solution)):
return simple_solution
for i in range(2, 10):
# print('return simple solution at i = {!r}'.format(i))
simple_solution = [sep.join(t[:i]) for t in tokens]
if len(simple_solution) == len(set(simple_solution)):
return simple_solution
trie = pygtrie.StringTrie.fromkeys(items, value=0, separator=sep)
# Set the value (frequency) of all nodes to zero.
for node in _trie_iternodes(trie):
node.value = 0
# For each item trace its path and increment frequencies
for item in items:
final_node, trace = trie._get_node(item)
for key, node in trace:
node.value += 1
# if not isinstance(node.value, int):
# node.value = 0
# Query for the first prefix with frequency 1 for each item.
# This is the shortest unique prefix over all items.
unique = []
for item in items:
freq = None
for prefix, freq in trie.prefixes(item):
if freq == 1:
if sep is None:
prefix_length = len(prefix)
else:
prefix_length = prefix.count(sep) + bool(len(prefix))
if prefix_length >= min_length:
break
if not allow_end:
assert freq == 1, 'item={} has no unique prefix. freq={}'.format(item, freq)
# print('items = {!r}'.format(items))
unique.append(prefix)
return unique
def _trie_iternodes(self):
"""
Generates all nodes in the trie
# Hack into the internal structure and insert frequencies at each node
Ignore:
from geowatch.utils.util_dotdict import * # NOQA
dat = DotDict({
'proc1.param1': 1,
'proc1.param2': 2,
'proc2.param1': 3,
'proc4.part1.param2': 8,
'proc4.part2.param2': 9,
'proc4.part2.param2': 10,
})
self = dat._prefix_trie
for n in list(_trie_iternodes(self)):
print(f'n.value={n.value}')
...
"""
from collections import deque
stack = deque([[self._root]])
while stack:
for node in stack.pop():
yield node
try:
# only works in pygtrie-2.2 broken in pygtrie-2.3.2
stack.append(node.children.values())
except AttributeError:
stack.append([v for k, v in node.children.iteritems()])
def _trie_iteritems(self):
"""
Generates all nodes in the trie
# Hack into the internal structure and insert frequencies at each node
Ignore:
from geowatch.utils.util_dotdict import * # NOQA
from geowatch.utils.util_stringalgo import * # NOQA
dat = DotDict({
'proc1.param1': 1,
'proc1.param2': 2,
'proc2.param1': 3,
'proc4.part1.param2': 8,
'proc4.part2.param2': 9,
'proc4.part2.param2': 10,
})
self = dat._prefix_trie
for k, v in list(_trie_iteritems(self)):
print(f'k={k}')
"""
from collections import deque
sentinel = object()
stack = deque([[(sentinel, self._root)]])
while stack:
level = stack.pop()
for key, node in level:
if key is not sentinel:
yield key, node
# only works in pygtrie-2.2 broken in pygtrie-2.3.2
try:
stack.append(list(node.children.items()))
except AttributeError:
stack.append(list(node.children.iteritems()))
[docs]
def shortest_unique_suffixes(items, sep=None, min_length=0):
r"""
Example:
>>> # xdoctest: +REQUIRES(--pygtrie)
>>> items = ["zebra", "dog", "duck", "dove"]
>>> shortest_unique_suffixes(items)
['a', 'g', 'k', 'e']
Example:
>>> # xdoctest: +REQUIRES(--pygtrie)
>>> items = ["aa/bb/cc", "aa/bb/bc", "aa/bb/dc", "aa/cc/cc"]
>>> shortest_unique_suffixes(items)
"""
snoitpo = [p[::-1] for p in items]
sexiffus = shortest_unique_prefixes(snoitpo, sep=sep, min_length=min_length)
suffixes = [s[::-1] for s in sexiffus]
return suffixes