#!/usr/bin/env python3
import scriptconfig as scfg
import ubelt as ub
[docs]
class CleanGeotiffConfig(scfg.DataConfig):
r"""
Clean geotiff files inplace by masking bad pixels with NODATA.
Replaces large contiguous regions of specific same-valued pixels as NODATA.
Note:
This is a destructive operation and overwrites the geotiff image data
inplace. Make a copy of your dataset if there is any chance you need to
go back. The underlying kwcoco file is not modified.
Usage:
# It is a good idea to do a dry run first to check for issues
# This can be done at a smaller scale for speed.
DVC_DATA_DPATH=$(geowatch_dvc --tags='phase2_data' --hardware=auto)
geowatch clean_geotiffs \
--src "$DVC_DATA_DPATH/Drop4-BAS/data.kwcoco.json" \
--channels="red|green|blue|nir|swir16|swir22" \
--prefilter_channels="red" \
--min_region_size=256 \
--nodata_value=-9999 \
--workers="min(2,avail)" \
--probe_scale=0.5 \
--dry=True
# Then execute a real run at full scale - optionally with a probe scale
geowatch clean_geotiffs \
--src "$DVC_DATA_DPATH/Drop4-BAS/data_vali.kwcoco.json" \
--channels="red|green|blue|nir|swir16|swir22" \
--prefilter_channels="red" \
--min_region_size=256 \
--nodata_value=-9999 \
--workers="min(2,avail)" \
--probe_scale=None \
--dry=False
Ignore:
geowatch clean_geotiffs \
--src=data.kwcoco.zip --dry=True --workers=8 \
--probe_scale=0.25 --prefilter_channels=pan --channels=pan
geowatch clean_geotiffs \
--dry=False --workers=2 \
--probe_scale=0.0625 --prefilter_channels="pan" \
--channels="pan" \
--src=data.kwcoco.zip
geowatch clean_geotiffs \
--dry=False --workers=avail \
--probe_scale=0.0625 --prefilter_channels="red" \
--channels="red|green|blue|nir|swir16|swir22" \
--src=data.kwcoco.zip
geowatch clean_geotiffs \
--dry=False --workers=avail \
--probe_scale=0.25 --prefilter_channels="swir22" \
--channels="swir16|swir22" \
--src=imgonly-BR_R005.kwcoco.json
"""
src = scfg.Value(None, position=1, help='input coco dataset')
workers = scfg.Value(0, type=str, help='number of workers')
channels = scfg.Value('*', help=ub.paragraph(
'''
The channels to apply nodata fixes to. A value of * means all channels
except the excluded ones.
'''))
exclude_channels = scfg.Value('quality|cloudmask', help=ub.paragraph(
'''
Channels to never apply fixes to.
'''))
prefilter_channels = scfg.Value('red', help=ub.paragraph(
'''
The channels to use when checking for bad nodata values.
If unspecified, uses channels.
'''))
possible_nodata_values = scfg.Value([0], help=ub.paragraph(
'''
List of integer values that might represent nodata, but may also failed
to be labeled as nodata.
'''))
min_region_size = scfg.Value(256, help=ub.paragraph(
'''
Minimum size of a connected region to be considered as a nodata
candidate.
'''))
scale = scfg.Value(None, help=ub.paragraph(
'''
Scale at which to perform the check. E.g. 0.5 for half resolution.
Speeds up checks, but should only be used in dry runs.
TODO: we could fix the real run to work with scale checks, and that
would speed it up and probably not generate many false negatives.
'''))
nodata_value = scfg.Value(-9999, help='the real nodata value to use')
probe_scale = scfg.Value(None, help=ub.paragraph(
'''
If specified, we probe the image at smaller scale for bad values and if
any are found we do a full scale computation. Speeds up computation if
there are not many images with bad values, but may result in false
negatives.
Should be given as a float less than 1.0
'''))
use_fix_stamps = scfg.Value(False, help=ub.paragraph(
'''
if True, write a file next to every file that was fixed so we dont need
to check it again.
'''))
export_bad_fpath = scfg.Value(None, help='if True, export paths to bad files to this newline separated file, also works in dry mode')
dry = scfg.Value(False, help='if True, only do a dry run. Report issues but do not fix them')
__config__ = CleanGeotiffConfig
[docs]
def main(cmdline=1, **kwargs):
"""
CommandLine:
xdoctest -m geowatch.cli.coco_clean_geotiffs main
Example:
>>> # xdoctest: +REQUIRES(env:SLOW_DOCTESTS)
>>> # Generate a dataset that has bad nodata values
>>> from geowatch.cli.coco_clean_geotiffs import * # NOQA
>>> import kwimage
>>> import geowatch
>>> import kwarray
>>> import numpy as np
>>> # Create a copy of the test dataset to clean inplace
>>> orig_dset = geowatch.coerce_kwcoco('geowatch-msi', geodata=True, bad_nodata=True, num_videos=1, num_frames=2)
>>> orig_dpath = ub.Path(orig_dset.bundle_dpath)
>>> dpath = orig_dpath.augment(stemsuffix='_cleaned')
>>> dpath.delete()
>>> orig_dpath.copy(dpath)
>>> dset = geowatch.coerce_kwcoco(dpath / 'data.kwcoco.json')
>>> coco_img = dset.images().coco_images[0]
>>> kwargs = {
>>> 'src': dset,
>>> 'workers': 0,
>>> 'channels': 'B11',
>>> 'prefilter_channels': 'B11',
>>> 'min_region_size': 32,
>>> 'nodata_value': 2, # because toydata is uint16
>>> }
>>> cmdline = 0
>>> # Do a dry run first
>>> main(cmdline=cmdline, **kwargs, dry=True)
>>> # Then a real run.
>>> main(cmdline=cmdline, **kwargs)
>>> coco_img1 = orig_dset.images().coco_images[0]
>>> coco_img2 = dset.coco_image(coco_img1.img['id'])
>>> print(ub.urepr(list(coco_img1.iter_image_filepaths())))
>>> print(ub.urepr(list(coco_img2.iter_image_filepaths())))
>>> imdata1 = coco_img1.imdelay('B11', nodata_method='float').finalize()
>>> imdata2 = coco_img2.imdelay('B11', nodata_method='float').finalize()
>>> print(np.isnan(imdata1).sum())
>>> print(np.isnan(imdata2).sum())
>>> canvas1 = kwarray.robust_normalize(imdata1)
>>> canvas2 = kwarray.robust_normalize(imdata2)
>>> canvas1 = kwimage.nodata_checkerboard(canvas1)
>>> canvas2 = kwimage.nodata_checkerboard(canvas2)
>>> # xdoctest: +REQUIRES(--show)
>>> # xdoctest: +REQUIRES(module:kwplot)
>>> import kwplot
>>> kwplot.autompl()
>>> kwplot.imshow(canvas1, pnum=(1, 2, 1), title='before')
>>> kwplot.imshow(canvas2, pnum=(1, 2, 2), title='after')
"""
from kwutil import util_parallel
import kwcoco
config = CleanGeotiffConfig.cli(cmdline=cmdline, data=kwargs, strict=True)
import rich
rich.print('config = {}'.format(ub.urepr(config, nl=1)))
print('Loading dataset')
coco_dset = kwcoco.CocoDataset.coerce(config['src'])
workers = util_parallel.coerce_num_workers(config['workers'])
print('workers = {}'.format(ub.urepr(workers, nl=1)))
jobs = ub.JobPool(mode='process', max_workers=workers, transient=True)
print('Created job pool')
if config['channels'] is None or config['channels'] == '*':
channels = None
else:
channels = kwcoco.ChannelSpec.coerce(config['channels'])
if config['prefilter_channels'] is None:
prefilter_channels = None
else:
prefilter_channels = kwcoco.ChannelSpec.coerce(config['prefilter_channels'])
possible_nodata_values = set(config['possible_nodata_values'])
if config['scale'] is not None:
assert config['dry'], 'only scale in dry runs'
probe_kwargs = {
'channels': channels,
'exclude_channels': kwcoco.ChannelSpec.coerce(config['exclude_channels']).fuse(),
'scale': config['scale'],
'possible_nodata_values': possible_nodata_values,
'prefilter_channels': prefilter_channels,
'min_region_size': config['min_region_size'],
'probe_scale': config['probe_scale'],
'use_fix_stamps': config['use_fix_stamps'],
'nodata_value': config['nodata_value'],
}
print('Grabbing coco images')
coco_imgs = coco_dset.images().coco_images
print('About to start looping')
if config.export_bad_fpath is not None:
export_fpath = ub.Path(config.export_bad_fpath)
with open(export_fpath, 'w'):
...
export_file = open(export_fpath, 'a')
else:
export_file = None
from kwutil import util_progress
mprog = util_progress.ProgressManager(backend='progiter')
# mprog = util_progress.ProgressManager(backend='rich')
with mprog, jobs:
print('In the with statement')
mprog.update_info('Looking for geotiff issues')
# TODO: may need to use the blocking job queue to limit the maximum
# number of unhandled but populated results
for coco_img in mprog.new(coco_imgs, desc='Submit probe jobs'):
coco_img.detach()
jobs.submit(probe_image_issues, coco_img, **probe_kwargs)
def collect_jobs(jobs):
for job in mprog.new(jobs.as_completed(), total=len(jobs), desc='Collect probe jobs'):
image_summary = job.result()
yield image_summary
def filter_fixable(summaries):
num_asset_issues = 0
num_images_issues = 0
num_incorrect_nodata = 0
seen_bad_values = set()
mprog.update_info(ub.codeblock(
f'''
Discovered Issues
-----------------
Found num_images_issues={num_images_issues}
Found num_asset_issues={num_asset_issues}
num_incorrect_nodata={num_incorrect_nodata}
seen_bad_values={seen_bad_values}
'''
))
for image_summary in summaries:
if image_summary['needs_fix']:
num_images_issues += 1
for asset_summary in image_summary['chans']:
if asset_summary['needs_fix']:
asset_summary['coco_img'] = image_summary['coco_img']
seen_bad_values.update(asset_summary['bad_values'])
if not asset_summary.get('has_incorrect_nodata_value', False):
num_incorrect_nodata += 1
num_asset_issues += 1
mprog.update_info(ub.codeblock(
f'''
Discovered Issues
-----------------
Found num_images_issues={num_images_issues}
Found num_asset_issues={num_asset_issues}
seen_bad_values={seen_bad_values}
'''
))
yield asset_summary
EAGER = 0
summaries = collect_jobs(jobs)
if EAGER:
summaries = list(summaries)
else:
summaries = iter(summaries)
needs_fix = filter_fixable(summaries)
if EAGER:
needs_fix = list(needs_fix)
else:
needs_fix = iter(needs_fix)
if not config['dry']:
correct_nodata_value = config['nodata_value']
for asset_summary in mprog.new(needs_fix, desc='Cleaning identified issues'):
if export_file is not None:
export_file.write(asset_summary['fpath'] + '\n')
fpath = ub.Path(asset_summary['fpath'])
fix_geotiff_ondisk(asset_summary,
correct_nodata_value=correct_nodata_value)
if config['use_fix_stamps']:
# Mark that fixes have been applied to this asset
import json
fix_fpath = fpath.augment(tail='.fixes.stamp')
if fix_fpath.exists():
fixes = json.loads(fix_fpath.read_text())
else:
fixes = []
new_fixes = ub.udict(asset_summary) & {'band_idxs'}
fixes.append(new_fixes)
fix_fpath.write_text(json.dumps(fixes))
else:
for asset_summary in mprog.new(needs_fix, desc='Dry run: identifying issues'):
if export_file is not None:
export_file.write(asset_summary['fpath'] + '\n')
print(asset_summary['fpath'])
...
if export_file is not None:
export_file.close()
# if 0:
# import xdev
# import kwplot
# kwplot.autompl()
# nonzero_chans = []
# for image_summary in has_nonzeros:
# coco_img = image_summary['coco_img']
# for asset_summary in image_summary['chans']:
# bad_values = asset_summary['bad_values']
# if len(set(bad_values) - {0}):
# asset_summary['coco_img'] = coco_img
# nonzero_chans.append(asset_summary)
# iter_ = xdev.InteractiveIter(nonzero_chans)
# for asset_summary in iter_:
# coco_img = asset_summary['coco_img']
# chan_canvas = draw_asset_summary(coco_img, asset_summary)
# kwplot.imshow(chan_canvas, doclf=1)
# xdev.InteractiveIter.draw()
[docs]
def probe_image_issues(coco_img, channels=None, prefilter_channels=None, scale=None,
possible_nodata_values=None, min_region_size=256,
exclude_channels=None, probe_scale=None,
use_fix_stamps=False, nodata_value=-9999):
"""
Inspect a single image, possibily with multiple assets, each with possibily
multiple bands for fixable nodata values.
Args:
coco_img : the coco image to check
channels : the channels to check
prefilter_channels :
the channels to check first for efficiency. If they do not exist,
then the all channels are checked.
possible_nodata_values (set[int]):
the values that may be nodata if known
scale :
use a downscaled overview to speed up the computation via
approximation. Returns results at this scale. DO NOT USE RIGHT NOW.
probe_scale :
use a downscaled overview to speed up the computation via
approximation. If the probe identifies an issue a full scale probe
is done.
Ignore:
globals().update((ub.udict(xdev.get_func_kwargs(probe_image_issues)) | probe_kwargs))
Example:
>>> import geowatch
>>> from geowatch.cli.coco_clean_geotiffs import * # NOQA
>>> import numpy as np
>>> dset = geowatch.coerce_kwcoco('geowatch-msi', geodata=True, bad_nodata=True)
>>> coco_img = dset.images().coco_images[4]
>>> channels = 'B11|B10|X.1'
>>> prefilter_channels = 'B11'
>>> scale = None
>>> possible_nodata_values = {0}
>>> min_region_size = 128
>>> image_summary = probe_image_issues(
>>> coco_img, channels=channels, prefilter_channels=prefilter_channels,
>>> scale=scale, possible_nodata_values=possible_nodata_values,
>>> min_region_size=min_region_size)
>>> print(f'image_summary={image_summary}')
"""
import kwcoco
if channels is None:
request_channels = coco_img.channels.fuse()
else:
request_channels = kwcoco.ChannelSpec.coerce(channels).fuse()
if prefilter_channels is not None:
prefilter_channels = kwcoco.ChannelSpec.coerce(prefilter_channels).fuse()
if exclude_channels is not None:
request_channels = request_channels.difference(exclude_channels)
if prefilter_channels is not None:
prefilter_channels = prefilter_channels.difference(exclude_channels)
image_summary = {}
asset_summaries = []
prefilter_assets = []
requested_assets = []
for obj in coco_img.iter_asset_objs():
chans = kwcoco.ChannelSpec.coerce(obj['channels']).fuse()
request_common = (chans & request_channels).fuse()
if request_common.numel():
if prefilter_channels is None:
prefilter_common = None
else:
prefilter_common = (chans & prefilter_channels).fuse()
chans_ = chans.to_oset()
band_idxs = [chans_.index(c) for c in request_common.to_list()]
if prefilter_common is not None and prefilter_common.numel():
prefilter_assets.append({
'obj': obj,
'band_idxs': band_idxs,
})
else:
requested_assets.append({
'obj': obj,
'band_idxs': band_idxs,
})
should_continue = True
for item in prefilter_assets:
obj = item['obj']
band_idxs = item['band_idxs']
if probe_scale is not None:
# Check if there is anything obvious at a smaller scale first if
# requested.
probe_asset_summary = probe_asset(
coco_img, obj, band_idxs=band_idxs, scale=probe_scale,
possible_nodata_values=possible_nodata_values,
min_region_size=min_region_size, use_fix_stamps=use_fix_stamps,
nodata_value=nodata_value)
if not probe_asset_summary['bad_values']:
should_continue = False
if should_continue:
# Check if there is anything obvious in a specific channel first if
# requested.
asset_summary = probe_asset(coco_img, obj, band_idxs=band_idxs,
scale=scale,
possible_nodata_values=possible_nodata_values,
min_region_size=min_region_size,
use_fix_stamps=use_fix_stamps,
nodata_value=nodata_value)
if not asset_summary['bad_values']:
should_continue = False
asset_summaries.append(asset_summary)
if len(prefilter_assets) == 0:
should_continue = True
if should_continue:
for item in requested_assets:
obj = item['obj']
band_idxs = item['band_idxs']
asset_summary = probe_asset(coco_img, obj, band_idxs=band_idxs,
scale=scale,
possible_nodata_values=possible_nodata_values,
min_region_size=min_region_size,
use_fix_stamps=use_fix_stamps,
nodata_value=nodata_value)
asset_summaries.append(asset_summary)
image_summary['chans'] = asset_summaries
image_summary['coco_img'] = coco_img
image_summary['bad_values'] = list(ub.unique(ub.flatten([c['bad_values'] for c in asset_summaries])))
image_summary['needs_fix'] = any([c['needs_fix'] for c in asset_summaries])
return image_summary
[docs]
def probe_asset(coco_img, obj, band_idxs=None, scale=None,
possible_nodata_values=None, min_region_size=256,
use_fix_stamps=False, nodata_value=-9999):
"""
Inspect a specific single-file asset possibily with multiple bands for
fixable nodata values.
"""
import kwcoco
asset_channels = kwcoco.FusedChannelSpec.coerce(obj['channels'])
if band_idxs is None:
band_idxs = list(range(asset_channels.numel()))
bundle_dpath = ub.Path(coco_img.bundle_dpath)
fpath = bundle_dpath / obj['file_name']
asset_summary = {
'channels': asset_channels,
'fpath': fpath,
'needs_fix': False,
}
if use_fix_stamps:
import json
fix_fpath = fpath.augment(tail='.fixes.stamp')
if fix_fpath.exists():
fixes = json.loads(fix_fpath.read_text())
assert fixes
return asset_summary
# TODO: actually check the requested fix was applied, but for now
# assume if the file exists we should skip the check.
check_channels = asset_channels[band_idxs]
delayed = coco_img.imdelay(
channels=check_channels, space='asset', nodata_method='float')
if scale is not None:
delayed.prepare()
delayed = delayed.scale(scale)
delayed = delayed.optimize()
imdata = delayed.finalize(interpolation='nearest')
if scale is not None:
min_region_size_ = int(min_region_size * scale)
else:
min_region_size_ = min_region_size
if possible_nodata_values is not None:
possible_nodata_values = list(possible_nodata_values)
possible_nodata_values.append(nodata_value)
asset_summary.update(probe_asset_imdata(
imdata, band_idxs, min_region_size_=min_region_size_,
possible_nodata_values=possible_nodata_values))
asset_summary.update(_probe_correct_nodata_value(
fpath, band_idxs, nodata_value))
return asset_summary
def _probe_correct_nodata_value(fpath, band_idxs, nodata_value=-9999):
asset_summary = {}
from geowatch.utils import util_gdal
gdal_dset = util_gdal.GdalDataset.open(fpath)
band_infos = gdal_dset.info()['bands']
gdal_dset = None
asset_summary['has_incorrect_nodata_value'] = False
for band_idx in band_idxs:
band_info = band_infos[band_idx]
if band_info.get('noDataValue', None) != nodata_value:
asset_summary['has_incorrect_nodata_value'] = True
asset_summary['needs_fix'] = True
return asset_summary
[docs]
def probe_asset_imdata(imdata, band_idxs, min_region_size_=256,
possible_nodata_values=None):
import kwarray
asset_summary = {}
asset_summary['band_idxs'] = band_idxs
band_summaries = []
imdata = kwarray.atleast_nd(imdata, n=3)
for idx in range(len(band_idxs)):
band_imdata = imdata[..., idx]
band_summary = probe_band_imdata(
band_imdata, min_region_size_=min_region_size_,
possible_nodata_values=possible_nodata_values)
band_summaries.append(band_summary)
asset_summary['band_summaries'] = band_summaries
asset_summary['bad_values'] = list(ub.unique(ub.flatten([
c['bad_values'] for c in band_summaries])))
if len(asset_summary['bad_values']) > 0:
asset_summary['needs_fix'] = True
return asset_summary
[docs]
def probe_band_imdata(band_imdata, min_region_size_=256,
possible_nodata_values=None):
from geowatch.utils import util_kwimage
import numpy as np
band_imdata = np.ascontiguousarray(band_imdata)
is_samecolor = util_kwimage.find_samecolor_regions(
band_imdata, min_region_size=min_region_size_,
values=possible_nodata_values, PRINT_STEPS=0)
band_summary = {}
if np.any(is_samecolor):
unique_labels, unique_first_index, unique_counts = np.unique(is_samecolor, return_index=True, return_counts=True)
flags = unique_labels > 0
bad_labels = unique_labels[flags]
first_index = unique_first_index[flags]
bad_counts = unique_counts[flags]
bad_values = band_imdata.ravel()[first_index]
band_summary['is_samecolor'] = is_samecolor
band_summary['bad_labels'] = bad_labels
band_summary['bad_values'] = bad_values
band_summary['bad_counts'] = bad_counts
else:
band_summary['is_samecolor'] = False
band_summary['bad_values'] = []
return band_summary
[docs]
def fix_single_asset(fpath, dry=False):
from delayed_image import DelayedLoad
asset_summary = {
'fpath': fpath,
'needs_fix': False,
}
min_region_size_ = 256
nodata_value = -9999
possible_nodata_values = [0, nodata_value]
delayed = DelayedLoad(fpath)
delayed.prepare()
band_idxs = list(range(delayed.num_channels))
imdata = delayed.finalize(nodata_method='nan', interpolation='nearest')
asset_summary.update(probe_asset_imdata(
imdata, band_idxs, min_region_size_=min_region_size_,
possible_nodata_values=possible_nodata_values))
asset_summary.update(_probe_correct_nodata_value(
fpath, band_idxs, nodata_value))
print('asset_summary = {}'.format(ub.urepr(asset_summary, nl=True)))
if not dry:
fix_geotiff_ondisk(asset_summary, correct_nodata_value=nodata_value)
[docs]
def fix_geotiff_ondisk(asset_summary, correct_nodata_value=-9999):
"""
Updates the nodata value based on a mask inplace on disk.
Attempts to preserve all other metadata, but this is not guarenteed or
always possible.
Args:
asset_summary (Dict): an item from :func:`probe_asset`.
correct_nodata_value (int): the nodata value to use in the
modified geotiff.
Assumptions:
* The input image uses AVERAGE overview resampling
* The input image is a tiled geotiff (ideally a COG)
TODO:
- [ ] Can restructure this as a more general context manager.
Example:
>>> from geowatch.cli.coco_clean_geotiffs import * # NOQA
>>> from geowatch.demo.metrics_demo.demo_rendering import write_demo_geotiff
>>> import kwimage
>>> import numpy as np
>>> dpath = ub.Path.appdir('geowatch/tests/clean_geotiff').ensuredir()
>>> fpath1 = dpath / 'test_geotiff.tif'
>>> fpath2 = fpath1.augment(stemsuffix='_fixed')
>>> fpath1.delete()
>>> fpath2.delete()
>>> imdata = kwimage.grab_test_image('amazon', dsize=(512, 512))
>>> poly = kwimage.Polygon.random().scale(imdata.shape[0:2][::-1])
>>> imdata = poly.draw_on(imdata, color='black')
>>> imdata = imdata.astype(np.int16)
>>> #imdata = poly.fill(imdata, value=(0, 0, 0), pixels_are='areas')
>>> imdata = poly.fill(imdata, value=0, pixels_are='areas')
>>> imdata[:256, :256, 0] = 0
>>> write_demo_geotiff(img_fpath=fpath1, imdata=imdata)
>>> fpath1.copy(fpath2)
>>> asset_summary = probe_asset_imdata(imdata, band_idxs=[0, 2], possible_nodata_values={0})
>>> asset_summary['fpath'] = fpath2
>>> assert fpath1.stat().st_size == fpath2.stat().st_size
>>> fix_geotiff_ondisk(asset_summary)
>>> assert fpath1.stat().st_size != fpath2.stat().st_size
>>> imdata1 = kwimage.imread(fpath1, nodata_method='ma')
>>> imdata2 = kwimage.imread(fpath2, nodata_method='ma')
>>> canvas1 = kwimage.normalize_intensity(imdata1)
>>> canvas2 = kwimage.normalize_intensity(imdata2)
>>> canvas1 = kwimage.nodata_checkerboard(canvas1)
>>> canvas2 = kwimage.nodata_checkerboard(canvas2)
>>> # xdoctest: +REQUIRES(--show)
>>> # xdoctest: +REQUIRES(module:kwplot)
>>> import kwplot
>>> kwplot.autompl()
>>> kwplot.imshow(canvas1.data, pnum=(2, 2, 1), title='norm imdata1 vals')
>>> kwplot.imshow(canvas2.data, pnum=(2, 2, 2), title='norm imdata2 vals')
>>> kwplot.imshow(imdata1.mask.any(axis=2), pnum=(2, 2, 3), title='imdata1.mask')
>>> kwplot.imshow(imdata2.mask.any(axis=2), pnum=(2, 2, 4), title='imdata2.mask')
>>> #kwplot.imshow((asset_summary['is_samecolor'] > 0), pnum=(3, 2, 5), title='is samecolor mask')
Example:
>>> from geowatch.cli.coco_clean_geotiffs import * # NOQA
>>> from geowatch.demo.metrics_demo.demo_rendering import write_demo_geotiff
>>> import kwimage
>>> import numpy as np
>>> dpath = ub.Path.appdir('geowatch/tests/clean_geotiff').ensuredir()
>>> fpath1 = dpath / 'test_geotiff.tif'
>>> fpath2 = fpath1.augment(stemsuffix='_fixed')
>>> fpath1.delete()
>>> fpath2.delete()
>>> imdata = kwimage.grab_test_image('amazon', dsize=(512, 512))[..., 0]
>>> poly = kwimage.Polygon.random().scale(imdata.shape[0:2][::-1])
>>> imdata = imdata.astype(np.int16)
>>> imdata = poly.fill(imdata, value=0, pixels_are='areas')
>>> imdata[:256, :256] = 0
>>> write_demo_geotiff(img_fpath=fpath1, imdata=imdata)
>>> fpath1.copy(fpath2)
>>> asset_summary = probe_asset_imdata(imdata, band_idxs=[0], possible_nodata_values={0})
>>> asset_summary['fpath'] = fpath2
>>> assert fpath1.stat().st_size == fpath2.stat().st_size
>>> fix_geotiff_ondisk(asset_summary)
>>> assert fpath1.stat().st_size != fpath2.stat().st_size
>>> imdata1 = kwimage.imread(fpath1, nodata_method='ma')
>>> imdata2 = kwimage.imread(fpath2, nodata_method='ma')
>>> canvas1 = kwimage.normalize_intensity(imdata1)
>>> canvas2 = kwimage.normalize_intensity(imdata2)
>>> canvas1 = kwimage.nodata_checkerboard(canvas1)
>>> canvas2 = kwimage.nodata_checkerboard(canvas2)
>>> # xdoctest: +REQUIRES(--show)
>>> # xdoctest: +REQUIRES(module:kwplot)
>>> import kwplot
>>> kwplot.autompl()
>>> kwplot.imshow(canvas1.data, pnum=(2, 2, 1), title='norm imdata1 vals')
>>> kwplot.imshow(canvas2.data, pnum=(2, 2, 2), title='norm imdata2 vals')
>>> kwplot.imshow(imdata1.mask, pnum=(2, 2, 3), title='imdata1.mask')
>>> kwplot.imshow(imdata2.mask, pnum=(2, 2, 4), title='imdata2.mask')
>>> #kwplot.imshow((asset_summary['is_samecolor'] > 0), pnum=(3, 2, 5), title='is samecolor mask')
"""
import os
from geowatch.utils import util_gdal
from osgeo import gdal
import numpy as np
import tempfile
fpath = ub.Path(asset_summary['fpath'])
band_idxs = asset_summary['band_idxs']
# We will write a modified version to a temporay file and then overwrite
# the destination file to avoid race conditions.
tmp_fpath = ub.Path(tempfile.mktemp(
dir=fpath.parent, prefix='.tmp.' + fpath.stem, suffix='.tiff'))
assert tmp_fpath.parent.exists()
# Assume average overview resampling, there does not seem to be standard
# way that a geotiff encodes what algorithm was used.
overview_resample = 'AVERAGE'
### Read in source data and introspect it
src_dset = util_gdal.GdalDataset.open(os.fspath(fpath))
orig_driver_name = src_dset.GetDriver().ShortName
assert orig_driver_name == 'GTiff'
band_idx_to_orig_props = {}
for band_idx in band_idxs:
band = src_dset.GetRasterBand(band_idx + 1)
# Have to read this here because it is clobbered in the copy
band_idx_to_orig_props[band_idx] = {
'blocksize': band.GetBlockSize(),
'num_overviews': band.GetOverviewCount(),
}
### Copy the source data to memory and modify it
driver1 = gdal.GetDriverByName(str('MEM'))
copy1 = driver1.CreateCopy(str(''), src_dset)
src_dset.FlushCache()
src_dset = None
### Pixel Modification ###
# Modify the pixel contents
for band_idx, band_summary in zip(band_idxs, asset_summary['band_summaries']):
band = copy1.GetRasterBand(band_idx + 1)
band_data = band.ReadAsArray()
mask = band_summary['is_samecolor'] > 0
curr_nodat_value = band.GetNoDataValue()
if curr_nodat_value is not None:
if curr_nodat_value != correct_nodata_value:
is_curr_nodata = (band_data == curr_nodat_value)
mask = (mask | is_curr_nodata)
band_data[mask] = correct_nodata_value
band.WriteArray(band_data)
if curr_nodat_value != correct_nodata_value:
band.SetNoDataValue(correct_nodata_value)
if band_idx_to_orig_props:
assert ub.allsame(band_idx_to_orig_props.values()), (
'We expect input bands to have the same blocksize')
bandprop = ub.peek(band_idx_to_orig_props.values())
blocksize = bandprop['blocksize']
num_overviews = bandprop['num_overviews']
else:
raise AssertionError('should have this info')
### Rebuild overviews
overviewlist = (2 ** np.arange(1, num_overviews + 1)).tolist()
copy1.BuildOverviews(overview_resample, overviewlist)
### Flush the in-memory dataset to an on-disk GeoTiff
xsize, ysize = blocksize
_options = [
'BIGTIFF=YES',
'TILED=YES',
'BLOCKXSIZE={}'.format(xsize),
'BLOCKYSIZE={}'.format(ysize),
]
_options += ['COMPRESS={}'.format('DEFLATE')]
_options.append('COPY_SRC_OVERVIEWS=YES')
driver1 = None
# driver2 = gdal.GetDriverByName(str('GTiff'))
driver2 = gdal.GetDriverByName(str(orig_driver_name))
copy2 = driver2.CreateCopy(os.fspath(tmp_fpath), copy1, options=_options)
if copy2 is None:
last_gdal_error = gdal.GetLastErrorMsg()
if 'No such file or directory' in last_gdal_error:
ex_cls = IOError
else:
ex_cls = Exception
raise ex_cls(
'Unable to create gtiff driver for fpath={}, options={}, last_gdal_error={}'.format(
fpath, _options, last_gdal_error))
copy2.FlushCache()
copy1 = None
copy2 = None # NOQA
driver2 = None
assert ub.Path(tmp_fpath).exists()
os.rename(tmp_fpath, fpath)
[docs]
def draw_asset_summary(coco_img, asset_summary):
from geowatch.utils import util_kwimage
import kwimage
import numpy as np
is_samecolor = asset_summary['is_samecolor']
data = coco_img.imdelay(channels=asset_summary['channels'], space='asset',
nodata_method='float').finalize()
is_samecolor = np.ascontiguousarray(is_samecolor)
unique_labels, first_index = np.unique(is_samecolor, return_index=True)
data_values = data.ravel()[first_index]
# is_samecolor.ravel()[first_index]
label_mapping = dict(zip(unique_labels, data_values))
label_mapping[0] = 'valid data'
labels = is_samecolor
label_to_color = {0: 'black'}
label_canvas = util_kwimage.colorize_label_image(
labels, with_legend=True,
label_mapping=label_mapping, label_to_color=label_to_color)
poly = kwimage.Mask.coerce(is_samecolor > 0).to_multi_polygon()
boxes = kwimage.Boxes.concatenate([p.to_boxes() for p in poly.data])
im_canvas = kwimage.normalize_intensity(data)
im_canvas = kwimage.fill_nans_with_checkers(im_canvas, on_value=0.3)
im_canvas = boxes.draw_on(im_canvas)
canvas = kwimage.stack_images([im_canvas, label_canvas], axis=1)
title = coco_img.video['name'] + ' ' + asset_summary['channels'] + '\n' + coco_img.img['name']
canvas = kwimage.draw_header_text(canvas, title)
if 0:
import kwplot
kwplot.imshow(canvas)
return canvas
if __name__ == '__main__':
"""
CommandLine:
python ~/code/watch/geowatch/cli/coco_clean_geotiffs.py
"""
main()