#!/bin/env python
import ubelt as ub
import scriptconfig as scfg
class CocoRemoveBadImagesConfig(scfg.DataConfig):
    """
    Remove coco images that are mostly nodata.
    Can also delete the on-disk assets if specified.
    """
    __command__ = 'remove_bad_images'

    # Each entry maps an option name to its default value and help text.
    __default__ = dict(
        # Input / output kwcoco paths (the two positional arguments).
        src=scfg.Value('data.kwcoco.json', position=1,
                       help='input kwcoco filepath'),
        dst=scfg.Value(None, position=2, help='output kwcoco filepath'),

        # Parallelism used for reading images and stat-ing asset files.
        workers=scfg.Value(0, type=str, help='number of io threads'),
        mode=scfg.Value('process',
                        help='can be thread, process, or serial'),

        # Restrict the emptiness check to a subset of channels.
        channels=scfg.Value(
            None,
            help='If specified, check only these channels for bad pixels'),

        # Controls for the (destructive) on-disk deletion step.
        delete_assets=scfg.Value(
            'auto',
            help=('if True actually deletes the assets. '
                  'If auto and interactive, will ask the user to choose')),
        interactive=scfg.Value(
            True, isflag=1,
            help='if true, ask the user to confirm deletion'),

        # Which image overview level is read when checking pixels.
        # An alternative default of 'coarsest' is described in the help text.
        overview=scfg.Value(
            0,
            help=('set to "coarsest" for fastest method, and 0 for most '
                  'accurate method, or a non negative integer for that '
                  'level of overview')),
    )
def main(cmdline=True, **kwargs):
    """
    Find mostly-empty images in a kwcoco file and write a filtered copy.

    The images flagged by :func:`find_empty_images` are removed from the
    dataset, optionally their asset files are deleted from disk, and the
    resulting dataset is written to ``dst``.

    Args:
        cmdline (bool): passed to ``CocoRemoveBadImagesConfig.cli``.
        **kwargs: explicit config values, see ``CocoRemoveBadImagesConfig``.

    Returns:
        None. Returns early, without writing anything, when the user
        declines the removal prompt in interactive mode.

    Raises:
        ValueError: if images are about to be removed but no ``dst`` was
            given. This is raised *before* any asset is deleted.

    Ignore:
        from geowatch.cli.coco_bad_empty_images import *  # NOQA
        kwargs = {}
        kwargs['src'] = 'imgonly_S2_L8_WV.kwcoco.json'
        kwargs['dst'] = 'imgonly_S2_L8_WV.kwcoco.json.tmp'
        kwargs['workers'] = 8
        # kwargs['channels'] = 'red|green|blue'
        kwargs['channels'] = 'red'
        cmdline = False
    """
    config = CocoRemoveBadImagesConfig.cli(cmdline=cmdline, data=kwargs,
                                           strict=True)
    mode = config['mode']

    import kwcoco
    from kwutil import util_parallel
    from rich.prompt import Confirm
    import safer

    workers = util_parallel.coerce_num_workers(config['workers'])

    main_channels = config['channels']
    if main_channels is not None:
        main_channels = kwcoco.FusedChannelSpec.coerce(main_channels)

    dset = kwcoco.CocoDataset.coerce(config['src'])

    delete_assets = config['delete_assets']
    if delete_assets == 'auto':
        if not config['interactive']:
            # Nobody can be asked, so take the non-destructive choice.
            delete_assets = False

    overview = config['overview']
    bad_gids = find_empty_images(dset, main_channels, mode=mode,
                                 workers=workers, overview=overview)

    if config['interactive']:
        if delete_assets == 'auto':
            # Tell the user how much space a deletion would free before
            # asking about it.
            total_bytes = compute_asset_disk_usage(dset, bad_gids, mode, workers)
            total_megabytes = total_bytes / 2 ** 20
            print(f'Total bad space: {total_megabytes:0.4f} MB')

        flag = Confirm.ask('Do you want to remove these empty images from the output kwcoco?')
        if not flag:
            return

        if delete_assets == 'auto':
            delete_assets = Confirm.ask('[red] Do you want to delete the on-disk assets too? (DESTRUCTIVE)')

    # Validate the output path before the destructive step. Previously a
    # missing ``dst`` was only noticed at write time, i.e. after the assets
    # had already been deleted.
    dst_fpath = config['dst']
    if dst_fpath is None:
        raise ValueError(
            'No output path given: specify "dst" to write the filtered '
            'kwcoco file')

    if delete_assets:
        bad_fpaths = []
        for bad_gid in ub.ProgIter(bad_gids, desc='collect empty assets'):
            coco_img = dset.coco_image(bad_gid)
            bad_fpaths.extend(coco_img.iter_image_filepaths())
        for bad_fpath in ub.ProgIter(bad_fpaths, desc='delete empty assets'):
            ub.delete(bad_fpath)

    dset.remove_images(bad_gids)

    FIX_ASSET_ORDER = 1
    if FIX_ASSET_ORDER:
        # TODO: this should be part of the crop script
        # Put the auxiliary items of every image in a deterministic order.
        for img in dset.dataset['images']:
            if 'auxiliary' in img:
                img['auxiliary'] = sorted(img['auxiliary'], key=lambda aux: aux['channels'])

    print('Write to dst_fpath = {!r}'.format(dst_fpath))
    with safer.open(dst_fpath, 'w', temp_file=not ub.WIN32) as file:
        dset.dump(file, indent='    ', newlines=True)
    print('Wrote to dst_fpath = {!r}'.format(dst_fpath))
def compute_asset_disk_usage(dset, gids, mode, workers):
    """
    Sum the on-disk size of every asset file of the given images.

    Args:
        dset: dataset providing ``coco_image(gid)``.
        gids: iterable of image ids whose assets are measured.
        mode (str): executor mode passed to ``ub.JobPool``.
        workers (int): ``max_workers`` passed to ``ub.JobPool``.

    Returns:
        int: total size in bytes. Asset files that do not exist on disk
        contribute zero bytes instead of aborting the computation.
    """
    from kwutil import util_progress
    calc_jobs = ub.JobPool(mode=mode, max_workers=workers)
    pman = util_progress.ProgressManager()
    total_bytes = 0
    # Entering the pool as a context manager (as find_empty_images does)
    # ensures its workers are shut down when we are done.
    with pman, calc_jobs:
        for gid in pman.progiter(gids, desc='calc asset space'):
            coco_img = dset.coco_image(gid)
            for fpath in coco_img.iter_image_filepaths():
                fpath = ub.Path(fpath).resolve()
                calc_jobs.submit(fpath.stat)

        prog = pman.progiter(calc_jobs.as_completed(), desc='collect size jobs',
                             total=len(calc_jobs))
        for job in prog:
            try:
                stat = job.result()
            except FileNotFoundError:
                # A missing asset occupies no space; keep summing the rest.
                continue
            total_bytes += stat.st_size
            total_megabytes = total_bytes / 2 ** 20
            msg = f'Current size: {total_megabytes:0.4f} MB'
            prog.set_postfix_str(msg)
    return total_bytes
def is_image_empty(coco_img, main_channels=None, overview=-1):
    """
    Run heuristics to determine if a coco image is empty.

    Every asset whose channels overlap ``main_channels`` (or every asset
    when ``main_channels`` is None) is read and summarized.

    Args:
        coco_img: image object providing ``bundle_dpath``,
            ``iter_asset_objs()`` and ``img``.
        main_channels: optional channel spec; assets sharing no channel
            with it are skipped.
        overview: overview level passed to ``kwimage.imread``. If that read
            fails, the image is re-read without an overview.

    Returns:
        dict: with keys

            * ``gid``: the image id.
            * ``chan_infos``: maps a channel spec to a dict that always has
              ``exists`` and, for files that exist, ``max_val``,
              ``min_val``, ``num_masked``, ``num_zero``, ``num_min``,
              ``num_max``, ``num_iffy``, ``frac_masked``, ``frac_zero`` and
              ``frac_iffy``.
            * ``num_exist``: number of checked assets that exist on disk.
            * ``num_bad``: how many of those have a maximum that is masked
              or zero.
            * ``is_bad``: True when every existing asset is bad and at
              least one asset exists.
    """
    import kwimage
    import numpy as np
    import kwcoco
    bundle_dpath = ub.Path(coco_img.bundle_dpath)

    if main_channels is not None:
        main_channels = kwcoco.FusedChannelSpec.coerce(main_channels)

    chan_infos = {}
    for obj in coco_img.iter_asset_objs():
        chan = kwcoco.FusedChannelSpec.coerce(obj['channels'])
        if main_channels is not None and not (main_channels & chan).numel():
            # This asset has none of the requested channels.
            continue

        gpath = bundle_dpath / obj['file_name']
        chan_infos[chan.spec] = chan_info = {}
        chan_info['exists'] = gpath.exists()
        if not chan_info['exists']:
            # Missing files are recorded but carry no pixel statistics.
            continue

        try:
            imdata = kwimage.imread(gpath, backend='gdal', nodata='ma',
                                    overview=overview)
        except Exception:
            # Best effort: retry without an overview when the requested
            # level cannot be read.
            imdata = kwimage.imread(gpath, backend='gdal', nodata='ma')

        # Normalize to a full boolean mask so the counts below are correct
        # even if the reader hands back an array without an explicit mask.
        mask = np.ma.getmaskarray(imdata)
        valid_values = np.ma.getdata(imdata)[~mask]

        num_masked = mask.sum()
        num_zero = (valid_values == 0).sum()
        num_iffy = num_masked + num_zero
        total = mask.size

        if len(valid_values) == 0:
            max_val = np.ma.masked
            min_val = np.ma.masked
            num_min = 0
            num_max = 0
        else:
            max_val = valid_values.max()
            min_val = valid_values.min()
            # These two counts used to be swapped (num_min counted the
            # pixels equal to the maximum and vice versa).
            num_min = (valid_values == min_val).sum()
            num_max = (valid_values == max_val).sum()
            if max_val != 0 and min_val != 0:
                # Zeros are already counted above; when the extrema are
                # nonzero the pixels at the minimum are added as well.
                num_iffy += num_min

        chan_info['max_val'] = max_val
        chan_info['min_val'] = min_val
        chan_info['num_masked'] = num_masked
        chan_info['num_zero'] = num_zero
        chan_info['num_min'] = num_min
        chan_info['num_max'] = num_max
        chan_info['num_iffy'] = num_iffy
        chan_info['frac_masked'] = num_masked / total
        chan_info['frac_zero'] = num_zero / total
        chan_info['frac_iffy'] = num_iffy / total

    img_info = {
        'chan_infos': chan_infos,
        'gid': coco_img.img['id'],
    }

    num_exist = 0
    num_bad = 0
    for info in chan_infos.values():
        if info['exists']:
            num_exist += 1
            maxval = info['max_val']
            if maxval is np.ma.masked or maxval == 0:
                num_bad += 1

    is_bad = (num_bad == num_exist and num_exist > 0)
    img_info['is_bad'] = is_bad
    img_info['num_bad'] = num_bad
    img_info['num_exist'] = num_exist
    return img_info
def find_empty_images(dset, main_channels, overview=-1, mode='process',
                      workers=0):
    """
    Scan every image of a dataset and return the ids of the bad ones.

    :func:`is_image_empty` is run on each image through a job pool. An
    image is reported as bad when the smallest ``frac_iffy`` over its
    checked assets exceeds 0.95. Summary tables (histogram of iffy
    fractions, counts per sensor and per video) are printed on the way.

    Args:
        dset: the dataset to scan.
        main_channels: channel restriction forwarded to ``is_image_empty``.
        overview: overview level forwarded to ``is_image_empty``.
        mode (str): executor mode passed to ``ub.JobPool``.
        workers (int): ``max_workers`` passed to ``ub.JobPool``.

    Returns:
        list: image ids considered bad.
    """
    import numpy as np
    import pandas as pd
    from kwutil import util_progress
    import rich

    all_gids = list(dset.index.imgs.keys())

    pool = ub.JobPool(mode=mode, max_workers=workers)
    pman = util_progress.ProgressManager()
    image_infos = []
    with pman, pool:
        for gid in pman.progiter(all_gids, desc='submit find empty image jobs',
                                 freq=1000, adjust=0):
            # Detach so the image can be handed to a worker on its own.
            coco_img = dset.coco_image(gid).detach()
            pool.submit(is_image_empty, coco_img,
                        main_channels=main_channels, overview=overview)

        num_bad = 0
        prog = pman.progiter(pool.as_completed(), total=len(pool),
                             desc='collect find empty images', freq=1000,
                             adjust=False)
        for job in prog:
            img_info = job.result()
            if img_info['is_bad']:
                num_bad += 1
                pman.update_info(f'num_empty = {num_bad} / {len(all_gids)}')
            image_infos.append(img_info)

    for img_info in image_infos:
        # Assets whose file is missing only carry the 'exists' flag, so
        # they have no 'frac_iffy' and must be skipped here.
        img_iffys = [
            chan_info['frac_iffy']
            for chan_info in img_info['chan_infos'].values()
            if 'frac_iffy' in chan_info
        ]
        # -1 marks images for which nothing could be measured.
        img_info['frac_iffy'] = min(img_iffys) if img_iffys else -1

    iffy_fracs = np.array([d['frac_iffy'] for d in image_infos])
    iffy_bins = [-1, 0, 0.25, 0.5, 0.75, 0.85, .90, .95, 0.98, 1.0]
    iffy_freq, iffy_bins = np.histogram(iffy_fracs, bins=iffy_bins)
    iffy_hist = ub.dzip(ub.iter_window(iffy_bins, 2), iffy_freq)
    print('iffy_hist = {}'.format(ub.urepr(iffy_hist, nl=1)))

    # TODO: different iffy thresh per sensor
    iffy_thresh = 0.95
    bad_img_infos = [
        img_info for img_info in image_infos
        if img_info['frac_iffy'] > iffy_thresh
    ]
    bad_gids = [b['gid'] for b in bad_img_infos]

    all_images = dset.images()
    bad_images = dset.images(bad_gids)

    sensor_to_num_bad = ub.dict_hist(bad_images.lookup("sensor_coarse"))
    sensor_to_total = ub.dict_hist(all_images.lookup("sensor_coarse"))
    sensor_bad_df = pd.DataFrame({'num_bad': sensor_to_num_bad, 'num_total': sensor_to_total})
    print(f'{len(bad_images)=}')
    print('Sensor Versus num bad / total')
    rich.print(sensor_bad_df.to_string())

    vidname_to_num_bad = ub.dict_hist(dset.videos(bad_images.lookup("video_id")).lookup("name"))
    vidname_to_num_total = ub.dict_hist(dset.videos(all_images.lookup("video_id")).lookup("name"))
    vidname_bad_df = pd.DataFrame({'num_bad': vidname_to_num_bad, 'num_total': vidname_to_num_total})
    vidname_bad_df = vidname_bad_df.fillna(0)
    vidname_bad_df = vidname_bad_df.sort_index()
    print('Video Versus num bad')
    rich.print(vidname_bad_df.to_string())

    # Count, per sensor and channel, why the bad images look empty.
    bad_stats = ub.ddict(lambda: 0)
    for bad in bad_img_infos:
        sensor = dset.coco_image(bad['gid']).img["sensor_coarse"]
        for chan, chan_info in bad['chan_infos'].items():
            if not chan_info['exists']:
                # No statistics were gathered for missing files.
                continue
            max_val = chan_info["max_val"]
            if max_val is np.ma.masked:
                bad_stats[f'{sensor}:{chan}.max_masked'] += 1
            elif max_val == 0:
                bad_stats[f'{sensor}:{chan}.max_zero'] += 1
    rich.print('bad_stats = {}'.format(ub.urepr(bad_stats, nl=1)))
    return bad_gids
# Module-level alias of the CLI config class.
# NOTE(review): presumably looked up by external CLI tooling — confirm.
__config__ = CocoRemoveBadImagesConfig
# Run the command line tool when this file is executed as a script.
if __name__ == '__main__':
    """
    CommandLine:
        python ~/code/watch/geowatch/cli/coco_bad_empty_images.py
    """
    main()