#!/usr/bin/env python3
"""
Combine kwcoco files with different "auxiliary" / "asset" features into a
single kwcoco file.
"""
import ubelt as ub
import scriptconfig as scfg
[docs]
class CocoCombineFeatures(scfg.DataConfig):
    """
    Combine kwcoco files with different "auxiliary" / "asset" features into a
    single kwcoco file.

    The names of the kwcoco images in all of the input ``src`` datasets must be
    the same.

    TODO:
        - [ ] This might go in kwcoco proper? This could be folded into "union"
    """
    # NOTE(review): the docstring wording above is kept as-is because
    # scriptconfig presumably surfaces it as the CLI description -- confirm
    # before rewording it.

    # Input datasets (first positional argument, one or more paths).
    src = scfg.Value(
        [],
        nargs='+',
        position=1,
        help=ub.paragraph(
            '''
            Paths to the input kwcoco datasets.
            The first one will be the "base"
            '''),
    )

    # Output location of the combined dataset.
    dst = scfg.Value(
        None,
        help=ub.paragraph(
            '''
            Path to the destination combined kwcoco dataset to write.
            '''),
    )

    # Parallelism used while loading the ``src`` datasets.
    io_workers = scfg.Value(
        'avail',
        help=ub.paragraph(
            '''
            Number of workers used to read multiple datasets.
            Can be numeric or a string code like "avail",
            which uses all available CPUs.
            '''),
    )

    # Boolean flag: reroot every loaded dataset with ``absolute=True``.
    absolute = scfg.Value(
        False,
        isflag=True,
        help=ub.paragraph(
            '''
            if True, reroot all inputs to use absolute paths
            '''),
    )
[docs]
def main(cmdline=True, **kwargs):
    """
    Combine the assets of several kwcoco files and write the result to disk.

    Parses a :class:`CocoCombineFeatures` config, loads every ``src``
    dataset, uses the first one as the base and fills in the assets it is
    missing from the remaining ones (see
    :func:`combine_auxiliary_features`). Afterwards it prints a histogram of
    the channels that are still absent from some images and dumps the
    combined dataset to ``dst``.

    Args:
        cmdline (bool | int): forwarded as the ``cmdline`` argument of
            ``CocoCombineFeatures.cli``.
        **kwargs: configuration values, forwarded as the ``data`` argument
            of ``CocoCombineFeatures.cli``.

    Example:
        >>> from geowatch.cli import coco_combine_features
        >>> import geowatch
        >>> dset = geowatch.coerce_kwcoco('geowatch-msi')
        >>> dpath = ub.Path.appdir('geowatch/tests/combine_fetures').ensuredir()
        >>> # Breakup the data into two parts with different features
        >>> dset1 = dset.copy()
        >>> dset2 = dset.copy()
        >>> dset1.fpath = dpath / 'part1.kwcoco.json'
        >>> dset2.fpath = dpath / 'part2.kwcoco.json'
        >>> # Remove all but the first asset from dset1
        >>> for coco_img in dset1.images().coco_images:
        ...     del coco_img.img['auxiliary'][1:]
        >>> # Remove the first asset from dset2
        >>> for coco_img in dset2.images().coco_images:
        ...     del coco_img.img['auxiliary'][0]
        >>> dset1.dump()
        >>> dset2.dump()
        >>> from geowatch.utils import kwcoco_extensions
        >>> chan_stats0 = kwcoco_extensions.coco_channel_stats(dset)['chan_hist']
        >>> chan_stats1 = kwcoco_extensions.coco_channel_stats(dset1)['chan_hist']
        >>> chan_stats2 = kwcoco_extensions.coco_channel_stats(dset2)['chan_hist']
        >>> assert chan_stats1 != chan_stats0, 'channels should be different'
        >>> # Combining the two modified kwcoco files should result in the original
        >>> dst_fpath = dpath / 'combo.kwcoco.json'
        >>> kwargs = {
        >>>     'src': [str(dset1.fpath), str(dset2.fpath)],
        >>>     'dst': str(dst_fpath),
        >>> }
        >>> cmdline = 0
        >>> coco_combine_features.main(cmdline=cmdline, **kwargs)
        >>> dst_dset = geowatch.coerce_kwcoco(dst_fpath)
        >>> chan_stats3 = kwcoco_extensions.coco_channel_stats(dst_dset)['chan_hist']
        >>> assert chan_stats3 == chan_stats0, (
        >>>     'combine features should have the same as the original dset')

    Example:
        >>> # xdoctest: +REQUIRES(env:DVC_DPATH)
        >>> # xdoctest: +SKIP
        >>> # drop1-S2-L8-aligned-old deprecated
        >>> from geowatch.cli.coco_combine_features import *  # NOQA
        >>> import os
        >>> _default = ub.expandpath('$HOME/data/dvc-repos/smart_watch_dvc')
        >>> dvc_dpath = ub.Path(os.environ.get('DVC_DPATH', _default))
        >>> fpath1 = dvc_dpath / 'drop1-S2-L8-aligned/data.kwcoco.json'
        >>> #fpath1 = dvc_dpath / 'drop1-S2-L8-aligned-old/data.kwcoco.json'
        >>> fpath2 = dvc_dpath / 'drop1-S2-L8-aligned-old/uky_invariants.kwcoco.json'
        >>> fpath3 = dvc_dpath / 'drop1-S2-L8-aligned/_testcombo.kwcoco.json'
        >>> assert fpath1.exists()
        >>> assert fpath2.exists()
        >>> cmdline = False
        >>> kwargs = {
        >>>     'src': [str(fpath1), str(fpath2)],
        >>>     'dst': str(fpath3),
        >>> }
        >>> main(cmdline, **kwargs)
    """
    import kwcoco
    import rich

    config = CocoCombineFeatures.cli(data=kwargs, cmdline=cmdline)
    rich.print(ub.urepr(config))

    dset_iter = kwcoco.CocoDataset.coerce_multiple(
        config.src, workers=config.io_workers)

    dset_list = []
    for dset in dset_iter:
        if config['absolute']:
            dset.reroot(absolute=True)
        dset_list.append(dset)

    # The first dataset is the base that receives assets from all the others.
    dst_dset = dset_list[0]
    src_dsets = dset_list[1:]
    dst_dset.fpath = config['dst']
    dst_dset = combine_auxiliary_features(dst_dset, src_dsets)

    # Check which images have which features (did we miss any?)
    channel_specs = []
    image_iter = ub.ProgIter(dst_dset.index.imgs.values(),
                             total=dst_dset.n_images,
                             desc='checking features')
    for dst_img in image_iter:
        # An image may have no 'auxiliary' entry at all; treat it as having
        # no assets so this diagnostic cannot abort the run before the dump.
        assets = dst_img.get('auxiliary') or []
        channel_specs.append({aux['channels'] for aux in assets})

    # ``set().union(*specs)`` also works when there are zero images, whereas
    # the unbound ``set.union(*specs)`` raises a TypeError in that case.
    all_channels = set().union(*channel_specs)

    # Plain dict so the printed report is not cluttered by a factory repr.
    missing_hist = {}
    for spec in channel_specs:
        for chan in all_channels - spec:
            missing_hist[chan] = missing_hist.get(chan, 0) + 1

    if missing_hist:
        print('missing_hist = {!r}'.format(missing_hist))

    print('dump dst_dset.fpath = {!r}'.format(dst_dset.fpath))
    dst_dset.dump(newlines=True)
[docs]
def combine_auxiliary_features(dst_dset, src_dsets):
    """
    Fill in assets that ``dst_dset`` lacks using the images of ``src_dsets``.

    Each source dataset is paired with ``dst_dset`` image-by-image (via
    :func:`associate_images`). For every pair, each source ``'auxiliary'``
    entry whose ``'channels'`` value is not yet present on the destination
    image is appended to the destination image's ``'auxiliary'`` list.

    Args:
        dst_dset (kwcoco.CocoDataset): modified inplace

        src_dsets (List[kwcoco.CocoDataset]):
            datasets to pull additional assets from, processed in order

    Returns:
        kwcoco.CocoDataset: returns input ``dst_dset``.

    Example:
        >>> from geowatch.cli.coco_combine_features import *  # NOQA
        >>> import kwcoco
        >>> base = kwcoco.CocoDataset.demo('vidshapes8-multispectral')
        >>> dset1 = base.copy()
        >>> dset2 = base.copy()
        >>> dset3 = base.copy()
        >>> dset4 = base.copy()
        >>> for img in dset1.index.imgs.values():
        >>>     del img['auxiliary'][0::3]
        >>> for img in dset2.index.imgs.values():
        >>>     del img['auxiliary'][1::3]
        >>> dset2.remove_images([2, 3])
        >>> for img in dset3.index.imgs.values():
        >>>     del img['auxiliary'][2::3]
        >>> dset3.remove_images([2, 3])
        >>> for img in dset4.index.imgs.values():
        >>>     del img['auxiliary'][0::2]
        >>> dset4.remove_images([2, 3])
        >>> dst_dset = dset1
        >>> src_dsets = [dset2, dset3, dset4]
        >>> for img in dset1.index.imgs.values():
        ...     assert len(img['auxiliary']) != 5
        >>> dst_dset = combine_auxiliary_features(dst_dset, src_dsets)
        >>> lens1 = list(map(len, dset1.images(set(dset1.imgs) - {2, 3}).lookup('auxiliary')))
        >>> assert ub.allsame([5] + lens1)
        >>> lens2 = list(map(len, dset1.images({2, 3}).lookup('auxiliary')))
        >>> assert ub.allsame([3] + lens2)
    """
    for other_dset in src_dsets:
        dst_gids, other_gids, report = associate_images(dst_dset, other_dset)
        print(f'report = {report!r}')

        for dst_gid, other_gid in zip(dst_gids, other_gids):
            dst_img = dst_dset.index.imgs[dst_gid]
            other_img = other_dset.index.imgs[other_gid]

            dst_assets = dst_img.get('auxiliary')
            other_assets = other_img.get('auxiliary')

            # A source image without assets contributes nothing.
            other_assets = [] if other_assets is None else other_assets

            # Make sure the destination image has a list to append to.
            if dst_assets is None:
                dst_assets = []
                dst_img['auxiliary'] = dst_assets

            seen_channels = {asset.get('channels') for asset in dst_assets}

            # The pairing is name based, so the names must agree.
            assert other_img['name'] == dst_img['name']

            for asset in other_assets:
                chan = asset['channels']
                if chan in seen_channels:
                    continue
                seen_channels.add(chan)
                dst_assets.append(asset)

    return dst_dset
[docs]
def associate_images(dset1, dset2):
    """
    Get image ids for images in two datasets that share the same name.

    This is a heuristic for getting pairs of images that correspond between
    two datasets.

    Args:
        dset1 (kwcoco.CocoDataset):
        dset2 (kwcoco.CocoDataset):

    Returns:
        Tuple[List[int], List[int], Dict]:
            ``(gids1, gids2, report)`` where ``gids1[i]`` and ``gids2[i]``
            are the ids of the images with the same name in ``dset1`` and
            ``dset2`` respectively. The pairs follow the iteration order of
            ``dset1.index.name_to_img``. ``report`` holds the counts
            ``num_name_common``, ``num_name_missing1`` (names only in
            ``dset1``) and ``num_name_missing2`` (names only in ``dset2``).
    """
    name_to_img1 = dset1.index.name_to_img
    name_to_img2 = dset2.index.name_to_img

    names1 = set(name_to_img1)
    names2 = set(name_to_img2)
    common_names = names1 & names2

    report = {
        'num_name_common': len(common_names),
        'num_name_missing1': len(names1 - common_names),
        'num_name_missing2': len(names2 - common_names),
    }

    # Walk dset1's own mapping instead of the ``common_names`` set: the
    # iteration order of a set of strings can change from run to run (hash
    # randomization), which would make the returned pairing order unstable.
    ordered_common = [name for name in name_to_img1 if name in common_names]

    gids1 = [name_to_img1[name]['id'] for name in ordered_common]
    gids2 = [name_to_img2[name]['id'] for name in ordered_common]
    return gids1, gids2, report
if __name__ == '__main__':
    # CommandLine:
    #     python ~/code/watch/geowatch/cli/coco_combine_features.py
    #     python -m geowatch.cli.coco_combine_features
    main(cmdline=True)