Source code for geowatch.utils.util_kwutil

"""
Functions that may be moved to kwutil
"""
import itertools as it


# Backwards compatible variant used in geowatch util

[docs]
def distributed_subitems(items, max_num=None):
    """
    Return a subset of items maximally distributed over the input index space.
    I.e. the chosen indexes maximize the space between them.

    Args:
        items (List | Dict): an ordered indexable object

    Returns:
        List | Dict: a subset of the input with a length at most ``max_num``.

    Note:
        Prefer using the generator variant :func:`farthest_first` instead.

    TODO:
        - [X] Find a better name. ChatGPT suggests using "spread", which I
              like. Maybe spreadsort, spreadshuffle, spredtraverse?
              spreadtake, takespread?
        - [ ] Figure out where this lives, probably kwutil.
        - [X] maybe we should force this to be a generator? Or make a generator variant?


    Example:
        >>> from geowatch.utils.util_kwutil import *  # NOQA
        >>> items = list(range(100))
        >>> max_num = 5
        >>> sub_items = distributed_subitems(items, max_num)
        >>> print(sub_items)
        [0, 25, 50, 75, 99]

    Example:
        >>> from geowatch.utils.util_kwutil import *  # NOQA
        >>> items = {chr(i): i for i in range(ord('a'), ord('a') + 26)}
        >>> max_num = 5
        >>> sub_items = distributed_subitems(items, max_num)
        >>> print(sub_items)
        {'a': 97, 'g': 103, 'n': 110, 't': 116, 'z': 122}
    """
    sub_index_gen = _farthest_first_indices(0, len(items))
    sub_indices = sorted(it.islice(sub_index_gen, max_num))
    if isinstance(items, dict):
        item_keys = list(items.keys())
        sub_keys = [item_keys[idx] for idx in sub_indices]
        sub_items = {key: items[key] for key in sub_keys}
    else:
        sub_items = [items[idx] for idx in sub_indices]
    return sub_items




[docs]
def farthest_first(items, max_num=None, first=None):
    """
    Return a subset of items maximally distributed over the input index space.
    I.e. the chosen indexes maximize the space between them.

    Args:
        items (List | Dict): an ordered indexable object
        first (int | None): if specified, start with this index.

    Returns:
        List | Dict: a subset of the input with a length at most ``max_num``.

    Example:
        >>> from geowatch.utils.util_kwutil import *  # NOQA
        >>> items = list(range(100))
        >>> max_num = 5
        >>> sub_items = list(farthest_first(items, max_num))
        >>> print(list(sub_items))
        [99, 0, 50, 25, 75]

    Example:
        >>> from geowatch.utils.util_kwutil import *  # NOQA
        >>> items = {chr(i): i for i in range(ord('a'), ord('a') + 26)}
        >>> max_num = 5
        >>> sub_items = list(farthest_first(items, max_num))
        >>> print(dict(sub_items))
        {'z': 122, 'a': 97, 'n': 110, 'g': 103, 't': 116}
    """
    if first is None:
        sub_index_gen = _farthest_first_indices(0, len(items))
    else:
        sub_index_gen = _farthest_first_indices_generalized(0, len(items), start=first)
    sub_index_gen = it.islice(sub_index_gen, max_num)
    if isinstance(items, dict):
        item_keys = list(items.keys())
        for idx in sub_index_gen:
            key = item_keys[idx]
            value = items[key]
            yield (key, value)
    else:
        for idx in sub_index_gen:
            yield items[idx]



def _farthest_first_indices(start, stop):
    """
    Given a ordered list of items, incrementally yield indexes such that each
    new index maximizes the distance to all other previously chosen indexes.

    Args:
        start (int): The inclusive starting index (typically 0)
        stop (int): The exclusive maximum index (typically ``len(items)``)

    Yields:
        int: the next chosen index in the series

    TODO:
        - [ ] Find a Better Name,
                spread_indices?
                farthest_index_traversal
                spread_index_traversal

        - [ ] Allow the user to specify which point is first?
              Do we start at the beginning, end, or middle?
              Currently we default to the end.

    Notes:
        * This is an instance of farthest-point traversal in 1D.

    References:
        .. [CSSE_167943] https://cs.stackexchange.com/questions/167943/is-this-knapsack-variant-named-studied-online-algorithm-for-farthest-from-pr
        .. [WikiFarthestFirst] https://en.wikipedia.org/wiki/Farthest-first_traversal

    CommandLine:
        xdoctest -m geowatch.utils.util_kwutil _farthest_first_indices

    Example:
        >>> from geowatch.utils.util_kwutil import *  # NOQA
        >>> total = 10
        >>> start, stop = 0, 10
        >>> gen = _farthest_first_indices(start, stop)
        >>> result = list(gen)
        >>> assert set(result) == set(range(start, stop))
        >>> print(result)
        [9, 0, 5, 2, 7, 1, 6, 3, 8, 4]
    """
    if start < stop:
        yield stop - 1
        yield from _farthest_from_previous_helper(start, stop - 1)


def _farthest_from_previous_helper(start, stop):
    if start < stop:
        low_mid: int = (start + stop) // 2
        high_mid: int = (start + stop + 1) // 2

        left_gen = _farthest_from_previous_helper(start, low_mid)
        right_gen = _farthest_from_previous_helper(high_mid, stop)

        pairgen = it.zip_longest(left_gen, right_gen)
        flatgen = it.chain.from_iterable(pairgen)
        filtgen = filter(lambda x: x is not None, flatgen)
        yield from filtgen
        if low_mid < high_mid:
            yield low_mid


def _farthest_first_indices_generalized(start, stop, first=None, distance_fn=None):
    """
    Yield indices in a farthest-first traversal order for a 1D uniform grid [start, stop),
    with an optional user-specified starting index.

    This implementation uses a heap to maintain for each candidate the minimum distance
    to any selected index, so that we can always pick the candidate that maximizes that distance.

    Args:
        start (int): inclusive start index.
        stop (int): exclusive stop index.
        first (int, optional): the starting index (must be in [start, stop)). Defaults to stop - 1.
        distance_fn (callable, optional): A function that computes distance between two indices.
            Defaults to absolute difference (L1 norm).

    Yields:
        int: the next index in the farthest-first order.

    References:
        https://chatgpt.com/c/67a37e86-f89c-8013-8ef8-400c209de0e0

    TODO:
        - [ ] Allow the user to specify a custom distance metric. The logic
              should still work because we only care about the maximum distance
              of unselected elements to their closest selected index.

    Example:
        >>> from geowatch.utils.util_kwutil import *  # NOQA
        >>> import ubelt as ub
        >>> total = 10
        >>> start, stop = 0, 10
        >>> results = []
        >>> for first in range(start, stop):
        >>>     results += [list(_farthest_first_indices_generalized(start, stop, first=first))]
        >>> print(f'results = {ub.urepr(results, nl=1)}')
        results = [
            [0, 9, 4, 2, 6, 1, 3, 5, 7, 8],
            [1, 9, 5, 3, 7, 0, 2, 4, 6, 8],
            [2, 9, 5, 0, 7, 1, 3, 4, 6, 8],
            [3, 9, 0, 6, 1, 2, 4, 5, 7, 8],
            [4, 9, 0, 2, 6, 1, 3, 5, 7, 8],
            [5, 0, 9, 2, 7, 1, 3, 4, 6, 8],
            [6, 0, 3, 9, 1, 2, 4, 5, 7, 8],
            [7, 0, 3, 5, 9, 1, 2, 4, 6, 8],
            [8, 0, 4, 2, 6, 1, 3, 5, 7, 9],
            [9, 0, 4, 2, 6, 1, 3, 5, 7, 8],
        ]
        >>> # This distance function just reverses the metric, so we find the
        >>> # closest item instead of the furthest. It is a bit contrived, but
        >>> # demos how a specific distance function can modify the results.
        >>> results = []
        >>> def distance_fn(i, j):
        >>>     return -abs(i - j)
        >>> for first in range(start, stop):
        >>>     results += [list(_farthest_first_indices_generalized(start, stop, first=first, distance_fn=distance_fn))]
        >>> print(f'results = {ub.urepr(results, nl=1)}')
        results = [
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            [1, 0, 2, 3, 4, 5, 6, 7, 8, 9],
            [2, 1, 0, 3, 4, 5, 6, 7, 8, 9],
            [3, 2, 1, 0, 4, 5, 6, 7, 8, 9],
            [4, 3, 2, 1, 0, 5, 6, 7, 8, 9],
            [5, 4, 3, 2, 1, 0, 6, 7, 8, 9],
            [6, 5, 4, 3, 2, 1, 0, 7, 8, 9],
            [7, 6, 5, 4, 3, 2, 1, 0, 8, 9],
            [8, 7, 6, 5, 4, 3, 2, 1, 0, 9],
            [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
        ]
    """
    # Note: using sortedcontainers might have a more clear implementation this
    # is probably still O(n**2), but I think we must be able to do better.  not
    # going to think about it too hard, this code path is not very likely to be
    # needed, as the simple farther first implementation is what is mainly
    # needed.
    import heapq
    if first is None:
        first = stop - 1
    if not (start <= first < stop):
        raise ValueError("first must be in the range [start, stop)")

    if distance_fn is None:
        distance_fn = lambda a, b: abs(a - b)  # NOQA

    # Set of selected indices (kept sorted for convenience).
    selected = [first]
    yield first

    # For each candidate (not yet selected), store its best known distance to a selected index.
    best_dist = {
        i: distance_fn(i, first)
        for i in range(start, stop)
        if i != first
    }
    # Maintain a heap keyed by negative distance (since heapq is a min-heap).
    heap = []
    for i, d in best_dist.items():
        heapq.heappush(heap, (-d, i))

    # Greedily select the candidate with the maximum current distance.
    while heap:
        neg_d, candidate = heapq.heappop(heap)
        # If candidate was already selected, skip it.
        if candidate not in best_dist:
            continue
        # If the min distance in the heap is wrong, then the item is outdated,
        # and we need to skip it.
        current_d = best_dist[candidate]
        if -neg_d != current_d:
            # The stored distance is outdated, pop it and skip, the correct
            # value will exist in the queue later on.
            continue

        # Candidate is the farthest from any selected point.
        yield candidate
        selected.append(candidate)
        # Remove candidate from best_dist so it isn’t chosen again.
        best_dist.pop(candidate)

        # Update best distances for all remaining candidates.
        for other, old_d in best_dist.items():
            new_d = distance_fn(other, candidate)
            if new_d < old_d:
                best_dist[other] = new_d
                heapq.heappush(heap, (-new_d, other))