# Source code for geowatch.cli.coco_remove_bad_images

#!/bin/env python
import ubelt as ub
import scriptconfig as scfg



# [docs]
class CocoRemoveBadImagesConfig(scfg.DataConfig):
    """
    Remove coco images that are mostly nodata.
    Can also delete the on-disk assets if specified.
    """
    # NOTE(review): the docstring above is left untouched in case
    # scriptconfig surfaces it as the command's help text -- confirm.
    __command__ = 'remove_bad_images'

    # Option table. Insertion order matches the original declaration;
    # ``src`` and ``dst`` are the two options declared with a ``position``.
    __default__ = dict(
        src=scfg.Value(
            'data.kwcoco.json', position=1,
            help='input kwcoco filepath'),

        dst=scfg.Value(
            None, position=2,
            help='output kwcoco filepath'),

        workers=scfg.Value(
            0, type=str,
            help='number of io threads'),

        mode=scfg.Value(
            'process',
            help='can be thread, process, or serial'),

        channels=scfg.Value(
            None,
            help='If specified, check only these channels for bad pixels'),

        delete_assets=scfg.Value(
            'auto',
            help='if True actually deletes the assets. If auto and interactive, will ask the user to choose'),

        interactive=scfg.Value(
            True, isflag=1,
            help='if true, ask the user to confirm deletion'),

        # The default is 0 (most accurate); 'coarsest' is the fast setting.
        overview=scfg.Value(
            0,
            help='set to "coarsest" for fastest method, and 0 for most accurate method, or a non negative integer for that level of overview'),
    )




# [docs]
def main(cmdline=True, **kwargs):
    """
    Find mostly-nodata images in a kwcoco file and write a copy without them.

    The images flagged by :func:`find_empty_images` are removed from the
    dataset, which is then written to ``dst``. When ``interactive`` is set
    the user is asked to confirm the removal (declining returns without
    writing anything) and, if ``delete_assets`` is ``'auto'``, whether the
    asset files on disk should be deleted as well. Without ``interactive``
    an ``'auto'`` value for ``delete_assets`` is treated as False.

    Args:
        cmdline (bool): forwarded to ``CocoRemoveBadImagesConfig.cli``.
        **kwargs: forwarded to ``CocoRemoveBadImagesConfig.cli`` as ``data``.

    Raises:
        ValueError: if ``dst`` is unset at the point where the filtered
            dataset is about to be produced. This is checked *before* any
            asset is deleted so a missing output path cannot cost data.

    Ignore:
        from geowatch.cli.coco_remove_bad_images import *  # NOQA
        kwargs = {}
        kwargs['src'] = 'imgonly_S2_L8_WV.kwcoco.json'
        kwargs['dst'] = 'imgonly_S2_L8_WV.kwcoco.json.tmp'
        kwargs['workers'] = 8
        # kwargs['channels'] = 'red|green|blue'
        kwargs['channels'] = 'red'
        cmdline = False
    """
    config = CocoRemoveBadImagesConfig.cli(cmdline=cmdline, data=kwargs,
                                           strict=True)
    mode = config['mode']

    import kwcoco
    from kwutil import util_parallel
    from rich.prompt import Confirm
    import safer
    workers = util_parallel.coerce_num_workers(config['workers'])

    main_channels = config['channels']
    if main_channels is not None:
        main_channels = kwcoco.FusedChannelSpec.coerce(main_channels)

    dset = kwcoco.CocoDataset.coerce(config['src'])

    delete_assets = config['delete_assets']
    if delete_assets == 'auto' and not config['interactive']:
        # Nobody to ask, so never delete files implicitly.
        delete_assets = False

    overview = config['overview']
    bad_gids = find_empty_images(dset, main_channels, mode=mode,
                                 workers=workers, overview=overview)

    if config['interactive']:
        if delete_assets == 'auto':
            # Show how much disk space is at stake before asking.
            total_bytes = compute_asset_disk_usage(dset, bad_gids, mode, workers)
            total_megabytes = total_bytes / 2 ** 20
            print(f'Total bad space: {total_megabytes:0.4f} MB')

        flag = Confirm.ask('Do you want to remove these empty images from the output kwcoco?')
        if not flag:
            return
        if delete_assets == 'auto':
            delete_assets = Confirm.ask('[red] Do you want to delete the on-disk assets too? (DESTRUCTIVE)')

    # Validate the output path before the destructive step below. It is
    # checked here, and not at the top, so that running without ``dst`` and
    # declining the prompt still works as a report-only run.
    dst_fpath = config['dst']
    if dst_fpath is None:
        raise ValueError(
            'An output path (dst) must be specified to write the filtered '
            'kwcoco file')

    if delete_assets:
        bad_fpaths = []
        for bad_gid in ub.ProgIter(bad_gids, desc='collect empty assets'):
            coco_img = dset.coco_image(bad_gid)
            bad_fpaths.extend(coco_img.iter_image_filepaths())

        for bad_fpath in ub.ProgIter(bad_fpaths, desc='delete empty assets'):
            ub.delete(bad_fpath)

    dset.remove_images(bad_gids)

    FIX_ASSET_ORDER = 1
    if FIX_ASSET_ORDER:
        # TODO: this should be part of the crop script
        # Sort each image's auxiliary assets by their channel spec.
        for img in dset.dataset['images']:
            if 'auxiliary' in img:
                img['auxiliary'] = sorted(img['auxiliary'], key=lambda aux: aux['channels'])

    print('Write to dst_fpath = {!r}'.format(dst_fpath))
    with safer.open(dst_fpath, 'w', temp_file=not ub.WIN32) as file:
        dset.dump(file, indent='    ', newlines=True)
    print('Wrote to dst_fpath = {!r}'.format(dst_fpath))




# [docs]
def compute_asset_disk_usage(dset, gids, mode, workers):
    """
    Sum the on-disk size of every asset file belonging to the given images.

    One ``stat`` job per resolved asset path is submitted to a
    ``ub.JobPool`` and the resulting ``st_size`` values are accumulated.

    Args:
        dset: dataset providing ``coco_image(gid)``
            (assumes a ``kwcoco.CocoDataset`` -- TODO confirm).
        gids (Iterable[int]): image ids whose assets are measured.
        mode (str): forwarded to ``ub.JobPool(mode=...)``.
        workers (int): forwarded to ``ub.JobPool(max_workers=...)``.

    Returns:
        int: total size in bytes. Asset paths that do not exist on disk
        contribute nothing.
    """
    from kwutil import util_progress

    calc_jobs = ub.JobPool(mode=mode, max_workers=workers)
    pman = util_progress.ProgressManager()

    total_bytes = 0
    # Entering the pool (as find_empty_images does) makes sure the
    # underlying executor is shut down when we are done with it.
    with pman, calc_jobs:
        for gid in pman.progiter(gids, desc='calc asset space'):
            coco_img = dset.coco_image(gid)
            for fpath in coco_img.iter_image_filepaths():
                fpath = ub.Path(fpath).resolve()
                calc_jobs.submit(fpath.stat)

        prog = pman.progiter(calc_jobs.as_completed(), desc='collect size jobs',
                             total=len(calc_jobs))
        for job in prog:
            try:
                stat = job.result()
            except FileNotFoundError:
                # A registered asset that is missing on disk uses no space;
                # do not abort the whole tally because of it.
                continue
            total_bytes += stat.st_size
            total_megabytes = total_bytes / 2 ** 20
            msg = f'Current size: {total_megabytes:0.4f} MB'
            prog.set_postfix_str(msg)
    return total_bytes




# [docs]
def is_image_empty(coco_img, main_channels=None, overview=-1):
    """
    Run heuristics to determine if a coco image is empty.

    Every asset whose channels overlap ``main_channels`` (or every asset
    when ``main_channels`` is None) is looked up under the bundle directory.
    Assets that exist are read as masked arrays and summarized by counting
    masked pixels, zero-valued pixels, and pixels equal to the minimum /
    maximum valid value.

    Args:
        coco_img: image wrapper providing ``bundle_dpath``,
            ``iter_asset_objs()`` and ``img``
            (assumes a ``kwcoco.CocoImage`` -- TODO confirm).
        main_channels: anything ``kwcoco.FusedChannelSpec.coerce`` accepts,
            or None to inspect all assets.
        overview: forwarded to ``kwimage.imread``. If reading with it raises,
            the asset is read again without the ``overview`` argument.

    Returns:
        dict: with the keys

            * ``chan_infos``: maps each inspected channel spec to a dict.
              It always has ``exists``; when the file exists it also has
              ``max_val``, ``min_val``, ``num_masked``, ``num_zero``,
              ``num_min``, ``num_max``, ``num_iffy``, ``frac_masked``,
              ``frac_zero`` and ``frac_iffy``.
            * ``gid``: the image id.
            * ``num_exist``: number of inspected assets found on disk.
            * ``num_bad``: how many of those have a maximum valid value that
              is masked (no valid pixels) or zero.
            * ``is_bad``: True when at least one asset exists and all
              existing assets are bad.
    """
    import kwimage
    import numpy as np
    import kwcoco
    bundle_dpath = ub.Path(coco_img.bundle_dpath)

    if main_channels is not None:
        main_channels = kwcoco.FusedChannelSpec.coerce(main_channels)

    chan_infos = {}
    for obj in coco_img.iter_asset_objs():
        chan = kwcoco.FusedChannelSpec.coerce(obj['channels'])
        if main_channels is not None and not (main_channels & chan).numel():
            # This asset has no channel of interest.
            continue

        gpath = bundle_dpath / obj['file_name']
        chan_infos[chan.spec] = chan_info = {}
        chan_info['exists'] = gpath.exists()
        if not chan_info['exists']:
            # Missing files only get the ``exists`` flag.
            continue

        try:
            imdata = kwimage.imread(gpath, backend='gdal', nodata='ma',
                                    overview=overview)
        except Exception:
            # Best-effort fallback: retry at full resolution, e.g. when the
            # requested overview level is unavailable for this file.
            imdata = kwimage.imread(gpath, backend='gdal', nodata='ma')

        # getmaskarray always yields a full boolean array, even if the
        # masked array carries the scalar ``nomask``.
        mask = np.ma.getmaskarray(imdata)
        valid_values = np.ma.getdata(imdata)[~mask]
        num_masked = mask.sum()
        num_zero = (valid_values == 0).sum()

        num_iffy = num_masked + num_zero
        total = mask.size

        if len(valid_values) == 0:
            max_val = np.ma.masked
            min_val = np.ma.masked
            num_min = 0
            num_max = 0
        else:
            max_val = valid_values.max()
            min_val = valid_values.min()
            # Count the pixels sitting exactly on each extreme. (These two
            # counters were previously assigned to each other's name.)
            num_min = (valid_values == min_val).sum()
            num_max = (valid_values == max_val).sum()
            if max_val != 0 and min_val != 0:
                # Neither extreme is zero: also count the pixels at the
                # minimum as suspicious.
                num_iffy += num_min

        chan_info['max_val'] = max_val
        chan_info['min_val'] = min_val

        chan_info['num_masked'] = num_masked
        chan_info['num_zero'] = num_zero
        chan_info['num_min'] = num_min
        chan_info['num_max'] = num_max
        chan_info['num_iffy'] = num_iffy

        chan_info['frac_masked'] = num_masked / total
        chan_info['frac_zero'] = num_zero / total
        chan_info['frac_iffy'] = num_iffy / total

    img_info = {
        'chan_infos': chan_infos,
        'gid': coco_img.img['id'],
    }
    num_exist = 0
    num_bad = 0
    for info in chan_infos.values():
        if info['exists']:
            num_exist += 1
            maxval = info['max_val']
            # An asset is bad when it has no valid pixel at all or when its
            # largest valid value is zero.
            if maxval is np.ma.masked or maxval == 0:
                num_bad += 1

    is_bad = (num_bad == num_exist and num_exist > 0)
    img_info['is_bad'] = is_bad
    img_info['num_bad'] = num_bad
    img_info['num_exist'] = num_exist
    return img_info




# [docs]
def find_empty_images(dset, main_channels, overview=-1, mode='process',
                      workers=0):
    """
    Inspect every image of a dataset and return the ids of the empty ones.

    :func:`is_image_empty` is run for each image in a ``ub.JobPool``. An
    image's ``frac_iffy`` is the minimum ``frac_iffy`` over its inspected
    channels that were found on disk (``-1`` if there are none), and the
    image is reported as bad when that value exceeds ``0.95``. Along the way
    a histogram of the fractions and per-sensor / per-video summary tables
    are printed.

    Args:
        dset: dataset to inspect
            (assumes a ``kwcoco.CocoDataset`` -- TODO confirm).
        main_channels: forwarded to :func:`is_image_empty`.
        overview: forwarded to :func:`is_image_empty`.
        mode (str): forwarded to ``ub.JobPool(mode=...)``.
        workers (int): forwarded to ``ub.JobPool(max_workers=...)``.

    Returns:
        List[int]: ids of the images considered bad.
    """
    import numpy as np
    import pandas as pd
    from kwutil import util_progress
    import rich

    all_gids = list(dset.index.imgs.keys())

    pool = ub.JobPool(mode=mode, max_workers=workers)
    pman = util_progress.ProgressManager()

    with pman, pool:
        for gid in pman.progiter(all_gids, desc='submit find empty image jobs',
                                 freq=1000, adjust=0):
            coco_img = dset.coco_image(gid).detach()
            pool.submit(is_image_empty, coco_img,
                        main_channels=main_channels, overview=overview)

        image_infos = []
        num_bad = 0
        prog = pman.progiter(pool.as_completed(), total=len(pool),
                             desc='collect find empty images', freq=1000,
                             adjust=False)
        for job in prog:
            img_info = job.result()
            # ``is_bad`` only drives this progress counter; the returned ids
            # are selected by the ``frac_iffy`` threshold further down.
            if img_info['is_bad']:
                num_bad += 1
                pman.update_info(f'num_empty = {num_bad} / {len(all_gids)}')
            image_infos.append(img_info)

    for img_info in image_infos:
        # Channels whose file is missing carry no pixel statistics, so only
        # the ones that were actually read take part.
        img_iffys = [
            chan_info['frac_iffy']
            for chan_info in img_info['chan_infos'].values()
            if 'frac_iffy' in chan_info
        ]
        img_info['frac_iffy'] = min(img_iffys) if img_iffys else -1

    iffy_fracs = np.array([d['frac_iffy'] for d in image_infos])
    iffy_bins = [-1, 0, 0.25, 0.5, 0.75, 0.85, .90, .95, 0.98, 1.0]
    iffy_freq, iffy_bins = np.histogram(iffy_fracs, bins=iffy_bins)
    iffy_hist = ub.dzip(ub.iter_window(iffy_bins, 2), iffy_freq)
    print('iffy_hist = {}'.format(ub.urepr(iffy_hist, nl=1)))

    # TODO: different iffy thresh per sensor
    iffy_thresh = 0.95
    bad_img_infos = [
        img_info for img_info in image_infos
        if img_info['frac_iffy'] > iffy_thresh
    ]
    bad_gids = [b['gid'] for b in bad_img_infos]

    all_images = dset.images()
    bad_images = dset.images(bad_gids)
    sensor_to_num_bad = ub.dict_hist(bad_images.lookup("sensor_coarse"))
    sensor_to_total = ub.dict_hist(all_images.lookup("sensor_coarse"))
    sensor_bad_df = pd.DataFrame({'num_bad': sensor_to_num_bad, 'num_total': sensor_to_total})

    print(f'{len(bad_images)=}')

    print('Sensor Versus num bad / total')
    rich.print(sensor_bad_df.to_string())

    vidname_to_num_bad = ub.dict_hist(dset.videos(bad_images.lookup("video_id")).lookup("name"))
    vidname_to_num_total = ub.dict_hist(dset.videos(all_images.lookup("video_id")).lookup("name"))
    vidname_bad_df = pd.DataFrame({'num_bad': vidname_to_num_bad, 'num_total': vidname_to_num_total})
    vidname_bad_df = vidname_bad_df.fillna(0)
    vidname_bad_df = vidname_bad_df.sort_index()
    print('Video Versus num bad')
    rich.print(vidname_bad_df.to_string())

    # Tally, per sensor and channel, why each bad image looked empty.
    bad_stats = ub.ddict(lambda: 0)
    for bad in bad_img_infos:
        coco_img = dset.coco_image(bad['gid'])
        sensor = coco_img.img["sensor_coarse"]
        for chan, chan_info in bad['chan_infos'].items():
            if 'max_val' not in chan_info:
                # The file for this channel was missing; nothing to report.
                continue
            if chan_info["max_val"] is np.ma.masked:
                bad_stats[f'{sensor}:{chan}.max_masked'] += 1
            elif chan_info["max_val"] == 0:
                bad_stats[f'{sensor}:{chan}.max_zero'] += 1
    rich.print('bad_stats = {}'.format(ub.urepr(bad_stats, nl=1)))

    return bad_gids



# Expose the config class under the module-level name ``__config__``.
# NOTE(review): presumably looked up by the package's CLI machinery to
# discover this command -- confirm against the caller.
__config__ = CocoRemoveBadImagesConfig


if __name__ == '__main__':
    """
    CommandLine:
        python ~/code/watch/geowatch/cli/coco_bad_empty_images.py
    """
    # NOTE(review): the path above names ``coco_bad_empty_images.py`` while
    # this module appears to be ``geowatch.cli.coco_remove_bad_images``; the
    # example looks stale -- confirm and update.
    # Run the command with arguments taken from the command line.
    main()