Source code for geowatch.cli.coco_clean_geotiffs

#!/usr/bin/env python3
import scriptconfig as scfg
import ubelt as ub



[docs]
class CleanGeotiffConfig(scfg.DataConfig):
    r"""
    Clean geotiff files inplace by masking bad pixels with NODATA.

    Replaces large contiguous regions of specific same-valued pixels as NODATA.

    Note:
        This is a destructive operation and overwrites the geotiff image data
        inplace. Make a copy of your dataset if there is any chance you need to
        go back. The underlying kwcoco file is not modified.

    Usage:
        # It is a good idea to do a dry run first to check for issues
        # This can be done at a smaller scale for speed.
        DVC_DATA_DPATH=$(geowatch_dvc --tags='phase2_data' --hardware=auto)
        geowatch clean_geotiffs \
            --src "$DVC_DATA_DPATH/Drop4-BAS/data.kwcoco.json" \
            --channels="red|green|blue|nir|swir16|swir22" \
            --prefilter_channels="red" \
            --min_region_size=256 \
            --nodata_value=-9999 \
            --workers="min(2,avail)" \
            --probe_scale=0.5 \
            --dry=True

        # Then execute a real run at full scale - optionally with a probe scale
        geowatch clean_geotiffs \
            --src "$DVC_DATA_DPATH/Drop4-BAS/data_vali.kwcoco.json" \
            --channels="red|green|blue|nir|swir16|swir22" \
            --prefilter_channels="red" \
            --min_region_size=256 \
            --nodata_value=-9999 \
            --workers="min(2,avail)" \
            --probe_scale=None \
            --dry=False


    Ignore:

        geowatch clean_geotiffs \
            --src=data.kwcoco.zip --dry=True --workers=8  \
            --probe_scale=0.25 --prefilter_channels=pan --channels=pan

        geowatch clean_geotiffs \
            --dry=False --workers=2  \
            --probe_scale=0.0625 --prefilter_channels="pan" \
            --channels="pan" \
            --src=data.kwcoco.zip

        geowatch clean_geotiffs \
            --dry=False --workers=avail  \
            --probe_scale=0.0625 --prefilter_channels="red" \
            --channels="red|green|blue|nir|swir16|swir22" \
            --src=data.kwcoco.zip

        geowatch clean_geotiffs \
            --dry=False --workers=avail  \
            --probe_scale=0.25 --prefilter_channels="swir22" \
            --channels="swir16|swir22" \
            --src=imgonly-BR_R005.kwcoco.json

    """
    src = scfg.Value(None, position=1, help='input coco dataset')

    workers = scfg.Value(0, type=str, help='number of workers')

    channels = scfg.Value('*', help=ub.paragraph(
        '''
        The channels to apply nodata fixes to. A value of * means all channels
        except the excluded ones.
        '''))

    exclude_channels = scfg.Value('quality|cloudmask', help=ub.paragraph(
        '''
        Channels to never apply fixes to.
        '''))

    prefilter_channels = scfg.Value('red', help=ub.paragraph(
        '''
        The channels to use when checking for bad nodata values.
        If unspecified, uses channels.
        '''))

    possible_nodata_values = scfg.Value([0], help=ub.paragraph(
        '''
        List of integer values that might represent nodata, but may also failed
        to be labeled as nodata.
        '''))

    min_region_size = scfg.Value(256, help=ub.paragraph(
        '''
        Minimum size of a connected region to be considered as a nodata
        candidate.
        '''))

    scale = scfg.Value(None, help=ub.paragraph(
        '''
        Scale at which to perform the check. E.g. 0.5 for half resolution.
        Speeds up checks, but should only be used in dry runs.
        TODO: we could fix the real run to work with scale checks, and that
        would speed it up and probably not generate many false negatives.
        '''))

    nodata_value = scfg.Value(-9999, help='the real nodata value to use')

    probe_scale = scfg.Value(None, help=ub.paragraph(
        '''
        If specified, we probe the image at smaller scale for bad values and if
        any are found we do a full scale computation. Speeds up computation if
        there are not many images with bad values, but may result in false
        negatives.

        Should be given as a float less than 1.0
        '''))

    use_fix_stamps = scfg.Value(False, help=ub.paragraph(
        '''
        if True, write a file next to every file that was fixed so we dont need
        to check it again.
        '''))

    export_bad_fpath = scfg.Value(None, help='if True, export paths to bad files to this newline separated file, also works in dry mode')

    dry = scfg.Value(False, help='if True, only do a dry run. Report issues but do not fix them')



__config__ = CleanGeotiffConfig



[docs]
def main(cmdline=1, **kwargs):
    """
    CommandLine:
        xdoctest -m geowatch.cli.coco_clean_geotiffs main

    Example:
        >>> # xdoctest: +REQUIRES(env:SLOW_DOCTESTS)
        >>> # Generate a dataset that has bad nodata values
        >>> from geowatch.cli.coco_clean_geotiffs import *  # NOQA
        >>> import kwimage
        >>> import geowatch
        >>> import kwarray
        >>> import numpy as np
        >>> # Create a copy of the test dataset to clean inplace
        >>> orig_dset = geowatch.coerce_kwcoco('geowatch-msi', geodata=True, bad_nodata=True, num_videos=1, num_frames=2)
        >>> orig_dpath = ub.Path(orig_dset.bundle_dpath)
        >>> dpath = orig_dpath.augment(stemsuffix='_cleaned')
        >>> dpath.delete()
        >>> orig_dpath.copy(dpath)
        >>> dset = geowatch.coerce_kwcoco(dpath / 'data.kwcoco.json')
        >>> coco_img = dset.images().coco_images[0]
        >>> kwargs = {
        >>>     'src': dset,
        >>>     'workers': 0,
        >>>     'channels': 'B11',
        >>>     'prefilter_channels': 'B11',
        >>>     'min_region_size': 32,
        >>>     'nodata_value': 2,  # because toydata is uint16
        >>> }
        >>> cmdline = 0
        >>> # Do a dry run first
        >>> main(cmdline=cmdline, **kwargs, dry=True)
        >>> # Then a real run.
        >>> main(cmdline=cmdline, **kwargs)
        >>> coco_img1 = orig_dset.images().coco_images[0]
        >>> coco_img2 = dset.coco_image(coco_img1.img['id'])
        >>> print(ub.urepr(list(coco_img1.iter_image_filepaths())))
        >>> print(ub.urepr(list(coco_img2.iter_image_filepaths())))
        >>> imdata1 = coco_img1.imdelay('B11', nodata_method='float').finalize()
        >>> imdata2 = coco_img2.imdelay('B11', nodata_method='float').finalize()
        >>> print(np.isnan(imdata1).sum())
        >>> print(np.isnan(imdata2).sum())
        >>> canvas1 = kwarray.robust_normalize(imdata1)
        >>> canvas2 = kwarray.robust_normalize(imdata2)
        >>> canvas1 = kwimage.nodata_checkerboard(canvas1)
        >>> canvas2 = kwimage.nodata_checkerboard(canvas2)
        >>> # xdoctest: +REQUIRES(--show)
        >>> # xdoctest: +REQUIRES(module:kwplot)
        >>> import kwplot
        >>> kwplot.autompl()
        >>> kwplot.imshow(canvas1, pnum=(1, 2, 1), title='before')
        >>> kwplot.imshow(canvas2, pnum=(1, 2, 2), title='after')
    """
    from kwutil import util_parallel
    import kwcoco

    config = CleanGeotiffConfig.cli(cmdline=cmdline, data=kwargs, strict=True)
    import rich
    rich.print('config = {}'.format(ub.urepr(config, nl=1)))

    print('Loading dataset')
    coco_dset = kwcoco.CocoDataset.coerce(config['src'])

    workers = util_parallel.coerce_num_workers(config['workers'])
    print('workers = {}'.format(ub.urepr(workers, nl=1)))
    jobs = ub.JobPool(mode='process', max_workers=workers, transient=True)
    print('Created job pool')

    if config['channels'] is None or config['channels'] == '*':
        channels = None
    else:
        channels = kwcoco.ChannelSpec.coerce(config['channels'])
    if config['prefilter_channels'] is None:
        prefilter_channels = None
    else:
        prefilter_channels = kwcoco.ChannelSpec.coerce(config['prefilter_channels'])

    possible_nodata_values = set(config['possible_nodata_values'])

    if config['scale'] is not None:
        assert config['dry'], 'only scale in dry runs'

    probe_kwargs = {
        'channels': channels,
        'exclude_channels': kwcoco.ChannelSpec.coerce(config['exclude_channels']).fuse(),
        'scale': config['scale'],
        'possible_nodata_values': possible_nodata_values,
        'prefilter_channels': prefilter_channels,
        'min_region_size': config['min_region_size'],
        'probe_scale': config['probe_scale'],
        'use_fix_stamps': config['use_fix_stamps'],
        'nodata_value': config['nodata_value'],
    }

    print('Grabbing coco images')
    coco_imgs = coco_dset.images().coco_images
    print('About to start looping')

    if config.export_bad_fpath is not None:
        export_fpath = ub.Path(config.export_bad_fpath)
        with open(export_fpath, 'w'):
            ...
        export_file = open(export_fpath, 'a')
    else:
        export_file = None

    from kwutil import util_progress
    mprog = util_progress.ProgressManager(backend='progiter')
    # mprog = util_progress.ProgressManager(backend='rich')
    with mprog, jobs:
        print('In the with statement')
        mprog.update_info('Looking for geotiff issues')

        # TODO: may need to use the blocking job queue to limit the maximum
        # number of unhandled but populated results
        for coco_img in mprog.new(coco_imgs, desc='Submit probe jobs'):
            coco_img.detach()
            jobs.submit(probe_image_issues, coco_img, **probe_kwargs)

        def collect_jobs(jobs):
            for job in mprog.new(jobs.as_completed(), total=len(jobs), desc='Collect probe jobs'):
                image_summary = job.result()
                yield image_summary

        def filter_fixable(summaries):

            num_asset_issues = 0
            num_images_issues = 0
            num_incorrect_nodata = 0
            seen_bad_values = set()

            mprog.update_info(ub.codeblock(
                f'''
                Discovered Issues
                -----------------
                Found num_images_issues={num_images_issues}
                Found num_asset_issues={num_asset_issues}

                num_incorrect_nodata={num_incorrect_nodata}
                seen_bad_values={seen_bad_values}
                '''
            ))

            for image_summary in summaries:
                if image_summary['needs_fix']:
                    num_images_issues += 1
                    for asset_summary in image_summary['chans']:
                        if asset_summary['needs_fix']:
                            asset_summary['coco_img'] = image_summary['coco_img']
                            seen_bad_values.update(asset_summary['bad_values'])
                            if not asset_summary.get('has_incorrect_nodata_value', False):
                                num_incorrect_nodata += 1
                            num_asset_issues += 1
                            mprog.update_info(ub.codeblock(
                                f'''
                                Discovered Issues
                                -----------------
                                Found num_images_issues={num_images_issues}
                                Found num_asset_issues={num_asset_issues}

                                seen_bad_values={seen_bad_values}
                                '''
                            ))
                            yield asset_summary

        EAGER = 0

        summaries = collect_jobs(jobs)
        if EAGER:
            summaries = list(summaries)
        else:
            summaries = iter(summaries)

        needs_fix = filter_fixable(summaries)
        if EAGER:
            needs_fix = list(needs_fix)
        else:
            needs_fix = iter(needs_fix)

        if not config['dry']:
            correct_nodata_value = config['nodata_value']
            for asset_summary in mprog.new(needs_fix, desc='Cleaning identified issues'):
                if export_file is not None:
                    export_file.write(asset_summary['fpath'] + '\n')
                fpath = ub.Path(asset_summary['fpath'])
                fix_geotiff_ondisk(asset_summary,
                                   correct_nodata_value=correct_nodata_value)
                if config['use_fix_stamps']:
                    # Mark that fixes have been applied to this asset
                    import json
                    fix_fpath = fpath.augment(tail='.fixes.stamp')
                    if fix_fpath.exists():
                        fixes = json.loads(fix_fpath.read_text())
                    else:
                        fixes = []
                    new_fixes = ub.udict(asset_summary) & {'band_idxs'}
                    fixes.append(new_fixes)
                    fix_fpath.write_text(json.dumps(fixes))
        else:
            for asset_summary in mprog.new(needs_fix, desc='Dry run: identifying issues'):
                if export_file is not None:
                    export_file.write(asset_summary['fpath'] + '\n')
                print(asset_summary['fpath'])
                ...

    if export_file is not None:
        export_file.close()


    # if 0:
    #     import xdev
    #     import kwplot
    #     kwplot.autompl()

    #     nonzero_chans = []
    #     for image_summary in has_nonzeros:
    #         coco_img = image_summary['coco_img']
    #         for asset_summary in image_summary['chans']:
    #             bad_values = asset_summary['bad_values']
    #             if len(set(bad_values) - {0}):
    #                 asset_summary['coco_img'] = coco_img
    #                 nonzero_chans.append(asset_summary)

    #     iter_ = xdev.InteractiveIter(nonzero_chans)
    #     for asset_summary in iter_:
    #         coco_img = asset_summary['coco_img']
    #         chan_canvas = draw_asset_summary(coco_img, asset_summary)
    #         kwplot.imshow(chan_canvas, doclf=1)
    #         xdev.InteractiveIter.draw()



[docs]
def probe_image_issues(coco_img, channels=None, prefilter_channels=None, scale=None,
                       possible_nodata_values=None, min_region_size=256,
                       exclude_channels=None, probe_scale=None,
                       use_fix_stamps=False, nodata_value=-9999):
    """
    Inspect a single image, possibily with multiple assets, each with possibily
    multiple bands for fixable nodata values.

    Args:
        coco_img : the coco image to check

        channels : the channels to check

        prefilter_channels :
            the channels to check first for efficiency. If they do not exist,
            then the all channels are checked.

        possible_nodata_values (set[int]):
            the values that may be nodata if known

        scale :
            use a downscaled overview to speed up the computation via
            approximation. Returns results at this scale. DO NOT USE RIGHT NOW.

        probe_scale :
            use a downscaled overview to speed up the computation via
            approximation. If the probe identifies an issue a full scale probe
            is done.

    Ignore:
        globals().update((ub.udict(xdev.get_func_kwargs(probe_image_issues)) | probe_kwargs))

    Example:
        >>> import geowatch
        >>> from geowatch.cli.coco_clean_geotiffs import *  # NOQA
        >>> import numpy as np
        >>> dset = geowatch.coerce_kwcoco('geowatch-msi', geodata=True, bad_nodata=True)
        >>> coco_img = dset.images().coco_images[4]
        >>> channels = 'B11|B10|X.1'
        >>> prefilter_channels = 'B11'
        >>> scale = None
        >>> possible_nodata_values = {0}
        >>> min_region_size = 128
        >>> image_summary = probe_image_issues(
        >>>     coco_img, channels=channels, prefilter_channels=prefilter_channels,
        >>>     scale=scale, possible_nodata_values=possible_nodata_values,
        >>>     min_region_size=min_region_size)
        >>> print(f'image_summary={image_summary}')
    """
    import kwcoco
    if channels is None:
        request_channels = coco_img.channels.fuse()
    else:
        request_channels = kwcoco.ChannelSpec.coerce(channels).fuse()
    if prefilter_channels is not None:
        prefilter_channels = kwcoco.ChannelSpec.coerce(prefilter_channels).fuse()

    if exclude_channels is not None:
        request_channels = request_channels.difference(exclude_channels)
        if prefilter_channels is not None:
            prefilter_channels = prefilter_channels.difference(exclude_channels)

    image_summary = {}
    asset_summaries = []

    prefilter_assets = []
    requested_assets = []
    for obj in coco_img.iter_asset_objs():
        chans = kwcoco.ChannelSpec.coerce(obj['channels']).fuse()
        request_common = (chans & request_channels).fuse()
        if request_common.numel():
            if prefilter_channels is None:
                prefilter_common = None
            else:
                prefilter_common = (chans & prefilter_channels).fuse()
            chans_ = chans.to_oset()
            band_idxs = [chans_.index(c) for c in request_common.to_list()]
            if prefilter_common is not None and prefilter_common.numel():
                prefilter_assets.append({
                    'obj': obj,
                    'band_idxs': band_idxs,
                })
            else:
                requested_assets.append({
                    'obj': obj,
                    'band_idxs': band_idxs,
                })

    should_continue = True
    for item in prefilter_assets:
        obj = item['obj']
        band_idxs = item['band_idxs']

        if probe_scale is not None:
            # Check if there is anything obvious at a smaller scale first if
            # requested.
            probe_asset_summary = probe_asset(
                coco_img, obj, band_idxs=band_idxs, scale=probe_scale,
                possible_nodata_values=possible_nodata_values,
                min_region_size=min_region_size, use_fix_stamps=use_fix_stamps,
                nodata_value=nodata_value)
            if not probe_asset_summary['bad_values']:
                should_continue = False

        if should_continue:
            # Check if there is anything obvious in a specific channel first if
            # requested.
            asset_summary = probe_asset(coco_img, obj, band_idxs=band_idxs,
                                        scale=scale,
                                        possible_nodata_values=possible_nodata_values,
                                        min_region_size=min_region_size,
                                        use_fix_stamps=use_fix_stamps,
                                        nodata_value=nodata_value)
            if not asset_summary['bad_values']:
                should_continue = False
            asset_summaries.append(asset_summary)

    if len(prefilter_assets) == 0:
        should_continue = True

    if should_continue:
        for item in requested_assets:
            obj = item['obj']
            band_idxs = item['band_idxs']
            asset_summary = probe_asset(coco_img, obj, band_idxs=band_idxs,
                                        scale=scale,
                                        possible_nodata_values=possible_nodata_values,
                                        min_region_size=min_region_size,
                                        use_fix_stamps=use_fix_stamps,
                                        nodata_value=nodata_value)
            asset_summaries.append(asset_summary)

    image_summary['chans'] = asset_summaries
    image_summary['coco_img'] = coco_img
    image_summary['bad_values'] = list(ub.unique(ub.flatten([c['bad_values'] for c in asset_summaries])))
    image_summary['needs_fix'] = any([c['needs_fix'] for c in asset_summaries])
    return image_summary




[docs]
def probe_asset(coco_img, obj, band_idxs=None, scale=None,
                possible_nodata_values=None, min_region_size=256,
                use_fix_stamps=False, nodata_value=-9999):
    """
    Inspect a specific single-file asset possibily with multiple bands for
    fixable nodata values.
    """
    import kwcoco
    asset_channels = kwcoco.FusedChannelSpec.coerce(obj['channels'])
    if band_idxs is None:
        band_idxs = list(range(asset_channels.numel()))

    bundle_dpath = ub.Path(coco_img.bundle_dpath)
    fpath = bundle_dpath / obj['file_name']

    asset_summary = {
        'channels': asset_channels,
        'fpath': fpath,
        'needs_fix': False,
    }

    if use_fix_stamps:
        import json
        fix_fpath = fpath.augment(tail='.fixes.stamp')
        if fix_fpath.exists():
            fixes = json.loads(fix_fpath.read_text())
            assert fixes
            return asset_summary
            # TODO: actually check the requested fix was applied, but for now
            # assume if the file exists we should skip the check.

    check_channels = asset_channels[band_idxs]
    delayed = coco_img.imdelay(
        channels=check_channels, space='asset', nodata_method='float')
    if scale is not None:
        delayed.prepare()
        delayed = delayed.scale(scale)
        delayed = delayed.optimize()

    imdata = delayed.finalize(interpolation='nearest')

    if scale is not None:
        min_region_size_ = int(min_region_size * scale)
    else:
        min_region_size_ = min_region_size

    if possible_nodata_values is not None:
        possible_nodata_values = list(possible_nodata_values)
        possible_nodata_values.append(nodata_value)

    asset_summary.update(probe_asset_imdata(
        imdata, band_idxs, min_region_size_=min_region_size_,
        possible_nodata_values=possible_nodata_values))

    asset_summary.update(_probe_correct_nodata_value(
        fpath, band_idxs, nodata_value))

    return asset_summary



def _probe_correct_nodata_value(fpath, band_idxs, nodata_value=-9999):
    asset_summary = {}
    from geowatch.utils import util_gdal
    gdal_dset = util_gdal.GdalDataset.open(fpath)
    band_infos = gdal_dset.info()['bands']
    gdal_dset = None
    asset_summary['has_incorrect_nodata_value'] = False
    for band_idx in band_idxs:
        band_info = band_infos[band_idx]
        if band_info.get('noDataValue', None) != nodata_value:
            asset_summary['has_incorrect_nodata_value'] = True
            asset_summary['needs_fix'] = True
    return asset_summary



[docs]
def probe_asset_imdata(imdata, band_idxs, min_region_size_=256,
                       possible_nodata_values=None):
    import kwarray
    asset_summary = {}
    asset_summary['band_idxs'] = band_idxs
    band_summaries = []
    imdata = kwarray.atleast_nd(imdata, n=3)
    for idx in range(len(band_idxs)):
        band_imdata = imdata[..., idx]
        band_summary = probe_band_imdata(
            band_imdata, min_region_size_=min_region_size_,
            possible_nodata_values=possible_nodata_values)
        band_summaries.append(band_summary)
    asset_summary['band_summaries'] = band_summaries
    asset_summary['bad_values'] = list(ub.unique(ub.flatten([
        c['bad_values'] for c in band_summaries])))
    if len(asset_summary['bad_values']) > 0:
        asset_summary['needs_fix'] = True
    return asset_summary




[docs]
def probe_band_imdata(band_imdata, min_region_size_=256,
                      possible_nodata_values=None):
    from geowatch.utils import util_kwimage
    import numpy as np

    band_imdata = np.ascontiguousarray(band_imdata)
    is_samecolor = util_kwimage.find_samecolor_regions(
        band_imdata, min_region_size=min_region_size_,
        values=possible_nodata_values, PRINT_STEPS=0)

    band_summary = {}
    if np.any(is_samecolor):
        unique_labels, unique_first_index, unique_counts = np.unique(is_samecolor, return_index=True, return_counts=True)
        flags = unique_labels > 0
        bad_labels = unique_labels[flags]
        first_index = unique_first_index[flags]
        bad_counts = unique_counts[flags]
        bad_values = band_imdata.ravel()[first_index]

        band_summary['is_samecolor'] = is_samecolor
        band_summary['bad_labels'] = bad_labels
        band_summary['bad_values'] = bad_values
        band_summary['bad_counts'] = bad_counts
    else:
        band_summary['is_samecolor'] = False
        band_summary['bad_values'] = []
    return band_summary




[docs]
def fix_single_asset(fpath, dry=False):
    from delayed_image import DelayedLoad
    asset_summary = {
        'fpath': fpath,
        'needs_fix': False,
    }

    min_region_size_ = 256
    nodata_value = -9999
    possible_nodata_values = [0, nodata_value]

    delayed = DelayedLoad(fpath)
    delayed.prepare()
    band_idxs = list(range(delayed.num_channels))

    imdata = delayed.finalize(nodata_method='nan', interpolation='nearest')

    asset_summary.update(probe_asset_imdata(
        imdata, band_idxs, min_region_size_=min_region_size_,
        possible_nodata_values=possible_nodata_values))

    asset_summary.update(_probe_correct_nodata_value(
        fpath, band_idxs, nodata_value))

    print('asset_summary = {}'.format(ub.urepr(asset_summary, nl=True)))
    if not dry:
        fix_geotiff_ondisk(asset_summary, correct_nodata_value=nodata_value)




[docs]
def fix_geotiff_ondisk(asset_summary, correct_nodata_value=-9999):
    """
    Updates the nodata value based on a mask inplace on disk.
    Attempts to preserve all other metadata, but this is not guarenteed or
    always possible.

    Args:
        asset_summary (Dict): an item from :func:`probe_asset`.

        correct_nodata_value (int): the nodata value to use in the
            modified geotiff.

    Assumptions:
        * The input image uses AVERAGE overview resampling
        * The input image is a tiled geotiff (ideally a COG)

    TODO:
        - [ ] Can restructure this as a more general context manager.

    Example:
        >>> from geowatch.cli.coco_clean_geotiffs import *  # NOQA
        >>> from geowatch.demo.metrics_demo.demo_rendering import write_demo_geotiff
        >>> import kwimage
        >>> import numpy as np
        >>> dpath = ub.Path.appdir('geowatch/tests/clean_geotiff').ensuredir()
        >>> fpath1 = dpath / 'test_geotiff.tif'
        >>> fpath2 = fpath1.augment(stemsuffix='_fixed')
        >>> fpath1.delete()
        >>> fpath2.delete()
        >>> imdata = kwimage.grab_test_image('amazon', dsize=(512, 512))
        >>> poly = kwimage.Polygon.random().scale(imdata.shape[0:2][::-1])
        >>> imdata = poly.draw_on(imdata, color='black')
        >>> imdata = imdata.astype(np.int16)
        >>> #imdata = poly.fill(imdata, value=(0, 0, 0), pixels_are='areas')
        >>> imdata = poly.fill(imdata, value=0, pixels_are='areas')
        >>> imdata[:256, :256, 0] = 0
        >>> write_demo_geotiff(img_fpath=fpath1, imdata=imdata)
        >>> fpath1.copy(fpath2)
        >>> asset_summary = probe_asset_imdata(imdata, band_idxs=[0, 2], possible_nodata_values={0})
        >>> asset_summary['fpath'] = fpath2
        >>> assert fpath1.stat().st_size == fpath2.stat().st_size
        >>> fix_geotiff_ondisk(asset_summary)
        >>> assert fpath1.stat().st_size != fpath2.stat().st_size
        >>> imdata1 = kwimage.imread(fpath1, nodata_method='ma')
        >>> imdata2 = kwimage.imread(fpath2, nodata_method='ma')
        >>> canvas1 = kwimage.normalize_intensity(imdata1)
        >>> canvas2 = kwimage.normalize_intensity(imdata2)
        >>> canvas1 = kwimage.nodata_checkerboard(canvas1)
        >>> canvas2 = kwimage.nodata_checkerboard(canvas2)
        >>> # xdoctest: +REQUIRES(--show)
        >>> # xdoctest: +REQUIRES(module:kwplot)
        >>> import kwplot
        >>> kwplot.autompl()
        >>> kwplot.imshow(canvas1.data, pnum=(2, 2, 1), title='norm imdata1 vals')
        >>> kwplot.imshow(canvas2.data, pnum=(2, 2, 2), title='norm imdata2 vals')
        >>> kwplot.imshow(imdata1.mask.any(axis=2), pnum=(2, 2, 3), title='imdata1.mask')
        >>> kwplot.imshow(imdata2.mask.any(axis=2), pnum=(2, 2, 4), title='imdata2.mask')
        >>> #kwplot.imshow((asset_summary['is_samecolor'] > 0), pnum=(3, 2, 5), title='is samecolor mask')

    Example:
        >>> from geowatch.cli.coco_clean_geotiffs import *  # NOQA
        >>> from geowatch.demo.metrics_demo.demo_rendering import write_demo_geotiff
        >>> import kwimage
        >>> import numpy as np
        >>> dpath = ub.Path.appdir('geowatch/tests/clean_geotiff').ensuredir()
        >>> fpath1 = dpath / 'test_geotiff.tif'
        >>> fpath2 = fpath1.augment(stemsuffix='_fixed')
        >>> fpath1.delete()
        >>> fpath2.delete()
        >>> imdata = kwimage.grab_test_image('amazon', dsize=(512, 512))[..., 0]
        >>> poly = kwimage.Polygon.random().scale(imdata.shape[0:2][::-1])
        >>> imdata = imdata.astype(np.int16)
        >>> imdata = poly.fill(imdata, value=0, pixels_are='areas')
        >>> imdata[:256, :256] = 0
        >>> write_demo_geotiff(img_fpath=fpath1, imdata=imdata)
        >>> fpath1.copy(fpath2)
        >>> asset_summary = probe_asset_imdata(imdata, band_idxs=[0], possible_nodata_values={0})
        >>> asset_summary['fpath'] = fpath2
        >>> assert fpath1.stat().st_size == fpath2.stat().st_size
        >>> fix_geotiff_ondisk(asset_summary)
        >>> assert fpath1.stat().st_size != fpath2.stat().st_size
        >>> imdata1 = kwimage.imread(fpath1, nodata_method='ma')
        >>> imdata2 = kwimage.imread(fpath2, nodata_method='ma')
        >>> canvas1 = kwimage.normalize_intensity(imdata1)
        >>> canvas2 = kwimage.normalize_intensity(imdata2)
        >>> canvas1 = kwimage.nodata_checkerboard(canvas1)
        >>> canvas2 = kwimage.nodata_checkerboard(canvas2)
        >>> # xdoctest: +REQUIRES(--show)
        >>> # xdoctest: +REQUIRES(module:kwplot)
        >>> import kwplot
        >>> kwplot.autompl()
        >>> kwplot.imshow(canvas1.data, pnum=(2, 2, 1), title='norm imdata1 vals')
        >>> kwplot.imshow(canvas2.data, pnum=(2, 2, 2), title='norm imdata2 vals')
        >>> kwplot.imshow(imdata1.mask, pnum=(2, 2, 3), title='imdata1.mask')
        >>> kwplot.imshow(imdata2.mask, pnum=(2, 2, 4), title='imdata2.mask')
        >>> #kwplot.imshow((asset_summary['is_samecolor'] > 0), pnum=(3, 2, 5), title='is samecolor mask')
    """
    import os
    from geowatch.utils import util_gdal
    from osgeo import gdal
    import numpy as np
    import tempfile
    fpath = ub.Path(asset_summary['fpath'])
    band_idxs = asset_summary['band_idxs']

    # We will write a modified version to a temporay file and then overwrite
    # the destination file to avoid race conditions.
    tmp_fpath = ub.Path(tempfile.mktemp(
        dir=fpath.parent, prefix='.tmp.' + fpath.stem, suffix='.tiff'))
    assert tmp_fpath.parent.exists()

    # Assume average overview resampling, there does not seem to be standard
    # way that a geotiff encodes what algorithm was used.
    overview_resample = 'AVERAGE'

    ### Read in source data and introspect it
    src_dset = util_gdal.GdalDataset.open(os.fspath(fpath))
    orig_driver_name = src_dset.GetDriver().ShortName
    assert orig_driver_name == 'GTiff'

    band_idx_to_orig_props = {}
    for band_idx in band_idxs:
        band = src_dset.GetRasterBand(band_idx + 1)
        # Have to read this here because it is clobbered in the copy
        band_idx_to_orig_props[band_idx] = {
            'blocksize': band.GetBlockSize(),
            'num_overviews': band.GetOverviewCount(),
        }

    ### Copy the source data to memory and modify it
    driver1 = gdal.GetDriverByName(str('MEM'))
    copy1 = driver1.CreateCopy(str(''), src_dset)
    src_dset.FlushCache()
    src_dset = None

    ### Pixel Modification ###
    # Modify the pixel contents
    for band_idx, band_summary in zip(band_idxs, asset_summary['band_summaries']):
        band = copy1.GetRasterBand(band_idx + 1)
        band_data = band.ReadAsArray()
        mask = band_summary['is_samecolor'] > 0

        curr_nodat_value = band.GetNoDataValue()

        if curr_nodat_value is not None:
            if curr_nodat_value != correct_nodata_value:
                is_curr_nodata = (band_data == curr_nodat_value)
                mask = (mask | is_curr_nodata)

        band_data[mask] = correct_nodata_value

        band.WriteArray(band_data)
        if curr_nodat_value != correct_nodata_value:
            band.SetNoDataValue(correct_nodata_value)

    if band_idx_to_orig_props:
        assert ub.allsame(band_idx_to_orig_props.values()), (
            'We expect input bands to have the same blocksize')
        bandprop = ub.peek(band_idx_to_orig_props.values())
        blocksize = bandprop['blocksize']
        num_overviews = bandprop['num_overviews']
    else:
        raise AssertionError('should have this info')

    ### Rebuild overviews
    overviewlist = (2 ** np.arange(1, num_overviews + 1)).tolist()
    copy1.BuildOverviews(overview_resample, overviewlist)

    ### Flush the in-memory dataset to an on-disk GeoTiff
    xsize, ysize = blocksize
    _options = [
        'BIGTIFF=YES',
        'TILED=YES',
        'BLOCKXSIZE={}'.format(xsize),
        'BLOCKYSIZE={}'.format(ysize),
    ]
    _options += ['COMPRESS={}'.format('DEFLATE')]
    _options.append('COPY_SRC_OVERVIEWS=YES')
    driver1 = None
    # driver2 = gdal.GetDriverByName(str('GTiff'))
    driver2 = gdal.GetDriverByName(str(orig_driver_name))
    copy2 = driver2.CreateCopy(os.fspath(tmp_fpath), copy1, options=_options)

    if copy2 is None:
        last_gdal_error = gdal.GetLastErrorMsg()
        if 'No such file or directory' in last_gdal_error:
            ex_cls = IOError
        else:
            ex_cls = Exception
        raise ex_cls(
            'Unable to create gtiff driver for fpath={}, options={}, last_gdal_error={}'.format(
                fpath, _options, last_gdal_error))
    copy2.FlushCache()
    copy1 = None
    copy2 = None  # NOQA
    driver2 = None

    assert ub.Path(tmp_fpath).exists()
    os.rename(tmp_fpath, fpath)




[docs]
def draw_asset_summary(coco_img, asset_summary):
    from geowatch.utils import util_kwimage
    import kwimage
    import numpy as np
    is_samecolor = asset_summary['is_samecolor']
    data = coco_img.imdelay(channels=asset_summary['channels'], space='asset',
                            nodata_method='float').finalize()

    is_samecolor = np.ascontiguousarray(is_samecolor)
    unique_labels, first_index = np.unique(is_samecolor, return_index=True)
    data_values = data.ravel()[first_index]

    # is_samecolor.ravel()[first_index]
    label_mapping = dict(zip(unique_labels, data_values))
    label_mapping[0] = 'valid data'
    labels = is_samecolor
    label_to_color = {0: 'black'}
    label_canvas = util_kwimage.colorize_label_image(
        labels, with_legend=True,
        label_mapping=label_mapping, label_to_color=label_to_color)

    poly = kwimage.Mask.coerce(is_samecolor > 0).to_multi_polygon()
    boxes = kwimage.Boxes.concatenate([p.to_boxes() for p in poly.data])

    im_canvas = kwimage.normalize_intensity(data)
    im_canvas = kwimage.fill_nans_with_checkers(im_canvas, on_value=0.3)
    im_canvas = boxes.draw_on(im_canvas)
    canvas = kwimage.stack_images([im_canvas, label_canvas], axis=1)
    title = coco_img.video['name'] + ' ' + asset_summary['channels'] + '\n' + coco_img.img['name']
    canvas = kwimage.draw_header_text(canvas, title)

    if 0:
        import kwplot
        kwplot.imshow(canvas)
    return canvas



if __name__ == '__main__':
    """
    CommandLine:
        python ~/code/watch/geowatch/cli/coco_clean_geotiffs.py
    """
    main()