#!/usr/bin/env python3
import scriptconfig as scfg
import ubelt as ub
class WatchCocoStats(scfg.DataConfig):
    """
    Print geowatch-relevant information about a kwcoco dataset.

    This provides summary information about:

        * Basic kwcoco stats (number of annotations / images / videos / categories)
        * Average GSDs
        * sensor / channel histograms
        * image / annotation / video attribute histograms
        * Breakdowns over sensor / channel / video / dataset
        * Per video summaries

    CommandLine:
        geowatch stats special:shapes8 vidshapes vidshapes-msi vidshapes-geowatch

    TODO:
        - [ ] Add other useful geowatch stats to this script

    SeeAlso:
        kwcoco stats
    """
    __command__ = 'stats'
    __default__ = {
        'src': scfg.Value(
            None, nargs='+', help=ub.paragraph(
                '''
                one or more datasets coercables, i.e. a path, live dataset, or
                demodata code. Example demo codes are:
                special:watch_msi
                special:vidshapes8
                special:vidshapes8-msi
                '''), position=1),
        'io_workers': scfg.Value('avail', help='number of workers used to read multiple datasets'),
        'with_video_info': scfg.Value(
            False, isflag=True, help='Show more per-video details')
    }

    @classmethod
    def main(cls, cmdline=True, **kw):
        """
        Print stats for every requested dataset, then a combined summary.

        Each dataset in ``src`` is loaded, passed to :func:`coco_watch_stats`
        (which prints the per-dataset report), and its basic stats and
        channel / sensor histograms are collected into one row of a final
        summary table that is printed at the end.

        Args:
            cmdline (bool | int): forwarded to ``cls.cli``.
            **kw: configuration values forwarded to ``cls.cli`` as ``data``.

        Raises:
            ValueError: if no source dataset was specified.

        Example:
            >>> from geowatch.cli import watch_coco_stats
            >>> import geowatch
            >>> dset1 = geowatch.coerce_kwcoco('geowatch-msi', geodata=True, dates=True, heatmap=True)
            >>> dset2 = geowatch.coerce_kwcoco('vidshapes8')
            >>> kw = dict(src=[dset1.fpath, dset2.fpath])
            >>> cmdline = 0
            >>> watch_coco_stats.__cli__.main(cmdline=cmdline, **kw)

        Example:
            >>> from geowatch.cli import watch_coco_stats
            >>> import geowatch
            >>> dset1 = geowatch.coerce_kwcoco('geowatch-msi', geodata=True, dates=True, heatmap=True)
            >>> kw = dict(src=dset1.fpath)
            >>> cmdline = 0
            >>> watch_coco_stats.__cli__.main(cmdline=cmdline, **kw)
        """
        # Use ``cls`` (not the hard-coded class name) so subclasses that
        # extend the configuration are parsed correctly. The config is parsed
        # before the heavier imports below are paid for.
        config = cls.cli(data=kw, cmdline=cmdline, strict=True)
        import rich
        rich.print('config = {}'.format(ub.urepr(config, nl=1, sort=0)))

        import pandas as pd
        import kwcoco
        import math
        import os
        from kwutil import slugify_ext

        # Normalize ``src`` to a list of dataset specifiers.
        fpaths = config['src']
        if isinstance(fpaths, os.PathLike):
            fpaths = [fpaths]
        if isinstance(fpaths, str):
            if ',' in fpaths:
                print('warning: might not handle this case well')
            fpaths = [fpaths]

        if fpaths is None or len(fpaths) == 0:
            raise ValueError('no files to compute stats on')

        # TODO: tabulate stats when possible.
        collatables = []        # one summary row per dataset
        video_sensor_rows = []  # one sensor-frequency row per (dataset, video)
        all_sensors = set()

        dset_iter = kwcoco.CocoDataset.coerce_multiple(
            fpaths, workers=config.io_workers)
        for dset in dset_iter:
            print('\n--- Single Dataset Stats ---')
            print('dset = {!r}'.format(dset))

            stat_info = coco_watch_stats(
                dset, with_video_info=config['with_video_info'])

            collatable = {
                'dset': stat_info['dset'],
                **stat_info['basic_stats'],
                **stat_info['chan_hist'],
                **stat_info['sensor_hist'],
                **stat_info['sensorchan_hist2'],
            }
            collatables.append(collatable)

            for video_info_row in stat_info['video_summary_rows']:
                video_sensor_freq = video_info_row['sensor_freq']
                all_sensors.update(set(video_sensor_freq))
                video_sensor_row = {
                    'dset': stat_info['dset'],
                    'name': video_info_row['name'],
                    **video_sensor_freq,
                }
                video_sensor_rows.append(video_sensor_row)

        print('\n--- Multi Dataset Stats --')

        try:
            all_sensors = sorted(all_sensors)
        except TypeError:
            # Sensor names that cannot be ordered (e.g. a mix of None and
            # str) keep an arbitrary order, but must still be a list because
            # pandas does not accept a set as a column indexer.
            all_sensors = list(all_sensors)

        if video_sensor_rows:
            if config['with_video_info']:
                video_sensor_df = pd.DataFrame(video_sensor_rows)
                piv = video_sensor_df.pivot(index=['name', 'dset'], columns=[], values=all_sensors)
                piv = piv.sort_index()
                piv = piv.astype(object)

                def _nan_to_none(x):
                    return None if math.isnan(x) else int(x)

                # ``DataFrame.applymap`` was deprecated in favor of
                # ``DataFrame.map``; prefer the new name when it exists.
                if hasattr(pd.DataFrame, 'map'):
                    piv = piv.map(_nan_to_none)
                else:
                    piv = piv.applymap(_nan_to_none)

                piv['total'] = piv.sum(axis=1)
                print('Per-Video Sensor Frequency')
                rich.print(piv.to_string(float_format='%0.0f', max_rows=500))
        else:
            print('No per-video stats')

        print('collatables = {}'.format(ub.urepr(collatables, nl=2, sort=0)))
        summary = pd.DataFrame(collatables)

        # Shorten long column names so the combined table stays readable, and
        # print the mapping so the full names can be recovered.
        col_name_map = {}
        for cname in summary.columns:
            new_cname = slugify_ext.smart_truncate(
                cname, max_length=10, trunc_loc=1.0)
            if cname != new_cname:
                col_name_map[cname] = new_cname

        if col_name_map:
            print('Remap names for readability:')
            print('col_name_map = {}'.format(ub.urepr(
                ub.invert_dict(col_name_map), nl=1, sort=0)))
            summary = summary.rename(col_name_map, axis=1)

        # Very wide tables are handed to rich as a DataFrame; otherwise the
        # fully expanded string form is printed.
        summary_string = summary.to_string(max_rows=500)
        max_colwidth = max(map(len, summary_string.split('\n')))
        COLWIDTH_LIMIT = 1600
        if max_colwidth > COLWIDTH_LIMIT:
            rich.print(summary)
        else:
            rich.print(summary_string)
def coco_watch_stats(dset, with_video_info=False):
    """
    Print a report about a single kwcoco dataset and return the key tables.

    Args:
        dset (kwcoco.CocoDataset): the dataset to summarize.

        with_video_info (bool): if True, also print a (truncated) info
            dictionary for every video.

    Returns:
        Dict[str, Any]: stat_info with the keys ``dset`` (last two components
            of the dataset path), ``basic_stats``, ``chan_hist``,
            ``sensor_hist``, ``sensorchan_hist2``, ``video_summary_rows`` and
            ``year_pivot`` (None when the year summary cannot be built).

    Example:
        >>> from geowatch.cli.watch_coco_stats import *  # NOQA
        >>> import geowatch
        >>> dset = geowatch.coerce_kwcoco('geowatch-msi-geodata-heatmap-dates')
        >>> stat_info = coco_watch_stats(dset)
    """
    from kwutil import slugify_ext
    from kwutil import util_time
    from geowatch.utils import kwcoco_extensions
    import rich
    import pandas as pd

    num_videos = len(dset.index.videos)
    rich.print('num_videos = {!r}'.format(num_videos))

    print('Per-video stats summary')
    video_summary_rows = []
    image_rows = []
    all_image_ids = set(dset.images())
    all_image_ids_with_video = set()
    all_sensor_entries = []

    for vidid, gids in dset.index.vidid_to_gids.items():
        all_image_ids_with_video.update(gids)
        video = dset.index.videos[vidid]
        # Work on a copy without the bulky keys.
        video = ub.dict_diff(video, ['regions', 'properties'])

        images = dset.images(gids)
        annots_per_img = images.annots
        flat_annots = dset.annots(list(ub.flatten(annots_per_img)))

        unique_trackids = set(flat_annots.lookup('track_id', None))
        num_tracks = len(unique_trackids - {None})
        num_annots = len(flat_annots)

        avail_sensors = images.lookup('sensor_coarse', None)
        frame_dates = images.lookup('date_captured', None)
        sensor_freq = ub.dict_hist(avail_sensors)

        frame_dt = sorted([util_time.coerce_datetime(d) for d in frame_dates if d])
        if frame_dt:
            # The list is sorted, so the ends are the earliest / latest frame.
            date_range = (frame_dt[0].date().isoformat(),
                          frame_dt[-1].date().isoformat())
        else:
            date_range = None

        for img in images.objs:
            dt = util_time.coerce_datetime(img.get('date_captured', None))
            image_rows.append({
                'video': video['name'],
                'year': None if dt is None else dt.year,
                'sensor': img.get('sensor_coarse', None)
            })

        # Computed fields come first so they lead the printed order; on a key
        # collision the value stored in the video dictionary wins.
        video_info = ub.udict({
            'name': video['name'],
            **ub.dict_isect(video, ['width', 'height']),
            'num_frames': len(gids),
            'num_tracks': num_tracks,
            'num_annots': num_annots,
            'sensor_freq': sensor_freq,
            'date_range': date_range,
        }) | video

        if with_video_info:
            # Only build the (potentially large) repr when it is shown.
            vid_info_str = ub.urepr(video_info, nl=-1, sort=False)
            vid_info_str = slugify_ext.smart_truncate(
                vid_info_str, max_length=512, trunc_loc=0.6)
            print('video_info = {}'.format(vid_info_str))

        all_sensor_entries.extend(avail_sensors)
        video_summary_rows.append(video_info - {'warp_wld_to_vid'})

    print('dset.tag = {!r}'.format(dset.tag))

    basic_stats = dset.basic_stats()
    # Note: kwcoco should be doing this.
    basic_stats['n_tracks'] = len(dset.index.trackid_to_aids)
    ext_stats = dset.extended_stats()
    rich.print('basic_stats = {}'.format(ub.urepr(basic_stats, nl=1, sort=0)))
    rich.print('ext_stats = {}'.format(ub.urepr(ext_stats, nl=1, align=':', precision=3)))

    attrs = dset.videos().attribute_frequency()
    rich.print('histogram(video_attrs) = {}'.format(ub.urepr(attrs, nl=1, sort=0)))
    attrs = dset.images().attribute_frequency()
    rich.print('histogram(image_attrs) = {}'.format(ub.urepr(attrs, nl=1, sort=0)))
    attrs = dset.annots().attribute_frequency()
    rich.print('histogram(annot_attrs) = {}'.format(ub.urepr(attrs, nl=1, sort=0)))

    # Images that do not belong to any video.
    loose_image_ids = sorted(all_image_ids - all_image_ids_with_video)
    rich.print('len(loose_image_ids) = {!r}'.format(len(loose_image_ids)))

    video_summary = pd.DataFrame(video_summary_rows)
    video_summary = video_summary.drop(video_summary.columns.intersection([
        'valid_region_geos', 'wld_crs_info', 'valid_region']), axis=1)
    rich.print(video_summary)

    image_df = pd.DataFrame(image_rows)
    try:
        year_pivot = build_year_summary(image_df)
    except (TypeError, KeyError, ValueError):
        # The year summary is best-effort. ValueError is included because a
        # mix of dated and undated images produces NaN years, which
        # numpy.histogram rejects with a ValueError (non-finite range).
        print('unable to build year analysis')
        year_pivot = None
    else:
        rich.print('Sensor Date Range Histograms')
        rich.print(year_pivot.to_string(max_rows=500))

    sensorchan_gsd_stats = coco_sensorchan_gsd_stats(dset)
    rich.print(sensorchan_gsd_stats.to_string(max_rows=500))

    sensor_hist = ub.dict_hist(all_sensor_entries)
    rich.print('Sensor Histogram = {}'.format(ub.urepr(sensor_hist, nl=1, sort=0)))

    print('MSI channel stats')
    info = kwcoco_extensions.coco_channel_stats(dset)
    rich.print(ub.urepr(info, nl=4, sort=0))

    # Identify the dataset by the last two components of its path.
    dset_bundle_suffix = '/'.join(ub.Path(dset.fpath).parts[-2:])

    stat_info = {
        'dset': dset_bundle_suffix,
        'basic_stats': basic_stats,
        'chan_hist': info['chan_hist'],
        'sensor_hist': info['sensor_hist'],
        'sensorchan_hist2': info['sensorchan_hist2'],
        'video_summary_rows': video_summary_rows,
        'year_pivot': year_pivot,
    }
    return stat_info
def build_year_summary(image_df):
    """
    Count images per year-bin for every (video, sensor) pair.

    Args:
        image_df (pandas.DataFrame): one row per image with the columns
            ``video``, ``sensor`` and ``year``.

    Returns:
        pandas.DataFrame: integer table indexed by ``(video, sensor)`` whose
            columns are a two-level index ``('count', <bin label>)``. A bin
            label is a single year (e.g. ``'2019'``) or a range
            (e.g. ``'2010 - 2015'``).

    Note:
        Errors raised by pandas / numpy for unusable input (a missing
        ``year`` column, or years that are not finite numbers) are
        propagated to the caller.
    """
    import pandas as pd
    import numpy as np

    years = image_df['year']

    # Derive integer bin edges from numpy's default histogram edges, then add
    # one more edge so the final year gets a bin of its own.
    _, year_bins = np.histogram(years)
    year_bins = sorted(np.unique(np.ceil(year_bins)))
    year_bins = year_bins + [year_bins[-1] + 1]

    # Cap on the number of bin edges. With numpy's default of 10 bins the
    # list above has at most 12 edges, so this only acts as a safeguard.
    max_bins = 15
    if len(year_bins) > max_bins:
        _, year_bins = np.histogram(years, bins=max_bins)
        year_bins = sorted(np.unique(np.ceil(year_bins)))

    # One label per pair of consecutive edges.
    bin_labels = []
    for a, b in zip(year_bins, year_bins[1:]):
        a = int(a)
        b = int(b)
        if a == b or (a + 1) == b:
            bin_labels.append(str(a))
        else:
            bin_labels.append(f'{a} - {b}')

    year_summaries = []
    group_keys = ['video', 'sensor']
    for group_vals, group in image_df.groupby(group_keys):
        counts = np.histogram(group['year'], bins=year_bins)[0]
        group_id = dict(zip(group_keys, group_vals))
        year_summaries.extend(
            {'count': c, 'time': label, **group_id}
            for c, label in zip(counts, bin_labels)
        )

    year_summary_df = pd.DataFrame(year_summaries)
    year_summary_df = year_summary_df.sort_values('time')
    year_pivot = year_summary_df.pivot(
        index=['video', 'sensor'], columns=['time'], values=['count'])
    # Fill with a real integer zero (not the string '0') before the int cast.
    year_pivot = year_pivot.fillna(0).astype(int)
    return year_pivot
def coco_sensorchan_gsd_stats(coco_dset):
    """
    Checks the GSD of each band.

    Collects the ``approx_meter_gsd`` of every asset of every image. When an
    image has a GSD for some but not all of its assets, the missing values
    are estimated from the first asset that has one, using the relative
    scale between the two assets' warps into image space.

    Args:
        coco_dset (kwcoco.CocoDataset): dataset whose assets are inspected.

    Returns:
        pandas.DataFrame: ``describe()`` statistics of the GSD grouped by
            sensor and channels, or the (empty) raw table when there are no
            assets to group.
    """
    import pandas as pd
    import math
    import numpy as np
    import kwimage
    from geowatch.utils import util_pandas

    longform_rows = []
    for image_id in coco_dset.images():
        coco_img = coco_dset.coco_image(image_id)
        asset_rows = []
        assets = list(coco_img.iter_asset_objs())
        missing_gsd_idxs = []
        for idx, asset in enumerate(assets):
            gsd = asset.get('approx_meter_gsd', float('nan'))
            sensor = asset.get('sensor_coarse', '*')
            channels = asset.get('channels', '?')
            asset_rows.append({
                'sensor': sensor,
                'channels': channels,
                'gsd': gsd,
            })
            if math.isnan(gsd):
                missing_gsd_idxs.append(idx)

        if missing_gsd_idxs:
            # If we have a GSD for some but not all assets,
            # we can relate them.
            flags = ~np.array(ub.boolmask(missing_gsd_idxs, len(assets)))
            if np.any(flags):
                # Index of the first asset that does have a GSD. The whole
                # mask must be searched; looking only at ``flags[0]`` breaks
                # whenever the first asset is the one missing its GSD.
                reference_idx = int(np.flatnonzero(flags)[0])
                ref_asset = assets[reference_idx]
                img_from_ref = kwimage.Affine.coerce(
                    ref_asset.get('warp_aux_to_img', ref_asset.get('warp_asset_to_img')))
                for miss_idx in missing_gsd_idxs:
                    mis_asset = assets[miss_idx]
                    img_from_mis = kwimage.Affine.coerce(
                        mis_asset.get('warp_aux_to_img', mis_asset.get('warp_asset_to_img')))
                    mis_from_img = img_from_mis.inv()
                    mis_from_ref = mis_from_img @ img_from_ref
                    # Scale the reference GSD by the relative resolution of
                    # the two assets.
                    approx_scale = np.mean(mis_from_ref.decompose()['scale'])
                    mis_gsd = ref_asset['approx_meter_gsd'] / approx_scale
                    asset_rows[miss_idx]['gsd'] = mis_gsd

        longform_rows.extend(asset_rows)

    gsd_table = pd.DataFrame(longform_rows)
    groupers = list(gsd_table.columns.intersection(['sensor', 'channels']))
    if len(groupers) == 0:
        # No rows means no columns to group on; return the table as is.
        sensorchan_gsd_stats = gsd_table
    else:
        print(f'groupers={groupers}')
        groups = util_pandas.pandas_fixed_groupby(gsd_table, groupers)
        sensorchan_gsd_stats = groups.describe()
    return sensorchan_gsd_stats
# Module-level handles: ``__cli__`` exposes the config class and ``main`` its
# entry point, so callers can use ``watch_coco_stats.__cli__.main(...)`` or
# ``watch_coco_stats.main(...)``.
__cli__ = WatchCocoStats
main = WatchCocoStats.main


if __name__ == '__main__':
    """
    CommandLine:
        python -m geowatch.cli.watch_coco_stats --src=special:vidshapes8-multispectral
        geowatch stats drop1/data.kwcoco.json
    """
    main()