#!/usr/bin/env python3
r"""
Given a set of site summaries, clusters them into groups, ideally with small
overlap. Writes new regions to a specified directory using the hash of the
contained sites as a subregion identifier.
Limitations:
- The clustering algorithm is overly simple
Example:
DVC_DATA_DPATH=$(geowatch_dvc --tags='phase2_data' --hardware='auto')
python -m geowatch.cli.cluster_sites \
--src "$DVC_DATA_DPATH/annotations/drop6/region_models/KR_R002.geojson" \
--dst_dpath $DVC_DATA_DPATH/ValiRegionSmall/geojson \
--draw_clusters True
DVC_DATA_DPATH=$(geowatch_dvc --tags='phase2_data' --hardware='auto')
python -m geowatch.cli.coco_align \
--src $DVC_DATA_DPATH/Drop6/combo_imganns-KR_R002_L.kwcoco.json \
--dst $DVC_DATA_DPATH/ValiRegionSmall/small_KR_R002_odarcigm.kwcoco.zip \
--regions $DVC_DATA_DPATH/ValiRegionSmall/geojson/SUB_KR_R002_n007_odarcigm.geojson \
--minimum_size="128x128@10GSD" \
--context_factor=1 \
--geo_preprop=auto \
--force_nodata=-9999 \
--site_summary=False \
--target_gsd=5 \
--aux_workers=8 \
--workers=8
DVC_DATA_DPATH=$(geowatch_dvc --tags='phase2_data' --hardware='auto')
python -m geowatch.cli.cluster_sites \
--src "$DVC_DATA_DPATH/annotations/drop6/region_models/NZ_R001.geojson" \
--dst_dpath $DVC_DATA_DPATH/ValiRegionSmall/geojson/NZ_R001 \
--draw_clusters True
DVC_DATA_DPATH=$(geowatch_dvc --tags='phase2_data' --hardware='auto')
python -m geowatch.cli.coco_align \
--src $DVC_DATA_DPATH/Drop6/combo_imganns-NZ_R001_L.kwcoco.json \
--dst $DVC_DATA_DPATH/ValiRegionSmall/small_NZ_R001_swnykmah.kwcoco.zip \
--regions $DVC_DATA_DPATH/ValiRegionSmall/geojson/NZ_R001/SUB_NZ_R001_n031_swnykmah.geojson \
--minimum_size="128x128@10GSD" \
--context_factor=1 \
--geo_preprop=auto \
--force_nodata=-9999 \
--site_summary=False \
--target_gsd=5 \
--aux_workers=8 \
--workers=8
"""
import scriptconfig as scfg
import ubelt as ub
[docs]
class ClusterSiteConfig(scfg.DataConfig):
"""
Creates a new region file that groups nearby sites.
"""
src = scfg.Value(None, help='input region files with site summaries', alias=['regions'])
dst_dpath = scfg.Value(None, help='output path to store the resulting region files')
dst_region_fpath = scfg.Value(None, help=ub.paragraph(
'''
geojson output path for site summaries. Only works if there is a single
input region, otherwise an error is raised
'''))
io_workers = scfg.Value(10, help='number of io workers')
draw_clusters = scfg.Value(False, isflag=True, help='if True draw the clusters in the specified dst_dpath')
crop_time = scfg.Value(True, isflag=True, help=ub.paragraph(
'''
if True also crops temporal extent to the sites, otherwise uses the region extent
'''))
minimum_size = scfg.Value('128x128@2GSD', help='minimum size of a cluster box')
maximum_size = scfg.Value('1024x1024@2GSD', help='minimum size of a cluster box')
context_factor = scfg.Value(1.5, help='extra padding around each site')
ignore_status = scfg.Value("['system_rejected']", type=str, help=ub.paragraph(
'''
A YAML list of status values that should be ignored by the clustering
algorithm. Defaults to ["system_rejected"].
'''))
[docs]
def main(cmdline=1, **kwargs):
"""
CommandLine:
xdoctest -m geowatch.cli.cluster_sites main:0
xdoctest -m geowatch.cli.cluster_sites main:2
Example:
>>> from geowatch.cli.cluster_sites import * # NOQA
>>> from geowatch.cli import cluster_sites
>>> import ubelt as ub
>>> dpath = ub.Path.appdir('geowatch', 'doctests', 'cluster_sites1').ensuredir()
>>> src_dpath = (dpath / 'src').ensuredir()
>>> dst_dpath = (dpath / 'dst')
>>> dst_region_fpath = dst_dpath / 'cluster.geojson'
>>> from geowatch.geoannots import geomodels
>>> region = geomodels.RegionModel.random(num_sites=10)
>>> src_fpath = src_dpath / 'demo_region.geojson'
>>> src_fpath.write_text(region.dumps())
>>> kwargs = {
>>> 'src': src_fpath,
>>> 'dst_dpath': dst_dpath,
>>> 'dst_region_fpath': dst_region_fpath,
>>> 'io_workers': 0,
>>> 'draw_clusters': 0,
>>> 'crop_time': True,
>>> }
>>> cmdline = 0
>>> cluster_sites.main(cmdline=cmdline, **kwargs)
Example:
>>> from geowatch.cli.cluster_sites import * # NOQA
>>> from geowatch.cli import cluster_sites
>>> import ubelt as ub
>>> dpath = ub.Path.appdir('geowatch', 'doctests', 'cluster_sites2').ensuredir()
>>> src_dpath = (dpath / 'src').ensuredir()
>>> dst_dpath = (dpath / 'dst')
>>> from geowatch.geoannots import geomodels
>>> region = geomodels.RegionModel.random(num_sites=10)
>>> src_fpath = src_dpath / 'demo_region.geojson'
>>> src_fpath.write_text(region.dumps())
>>> dst_region_fpath = dst_dpath / 'cluster.geojson'
>>> kwargs = {
>>> 'src': src_fpath,
>>> 'dst_dpath': dst_dpath,
>>> 'dst_region_fpath': dst_region_fpath,
>>> 'io_workers': 0,
>>> 'draw_clusters': 1,
>>> 'crop_time': True,
>>> }
>>> cmdline = 0
>>> cluster_sites.main(cmdline=cmdline, **kwargs)
Example:
>>> # Test empty case
>>> from geowatch.cli.cluster_sites import * # NOQA
>>> from geowatch.cli import cluster_sites
>>> import ubelt as ub
>>> dpath = ub.Path.appdir('geowatch', 'doctests', 'cluster_sites3').ensuredir()
>>> src_dpath = (dpath / 'src').ensuredir()
>>> dst_dpath = (dpath / 'dst')
>>> from geowatch.geoannots import geomodels
>>> region = geomodels.RegionModel.random(num_sites=0)
>>> src_fpath = src_dpath / 'demo_region.geojson'
>>> src_fpath.write_text(region.dumps())
>>> dst_region_fpath = dst_dpath / 'cluster.geojson'
>>> kwargs = {
>>> 'src': src_fpath,
>>> 'dst_dpath': dst_dpath,
>>> 'dst_region_fpath': dst_region_fpath,
>>> 'io_workers': 0,
>>> 'draw_clusters': 1,
>>> 'crop_time': True,
>>> }
>>> cmdline = 0
>>> cluster_sites.main(cmdline=cmdline, **kwargs)
Ignore:
import xdev
xdev.profile_now(cluster_sites.main)(cmdline=cmdline, **kwargs)
Ignore:
import sys, ubelt
sys.path.append(ubelt.expandpath('~/code/watch'))
from geowatch.cli.cluster_sites import * # NOQA
import geowatch
data_dpath = geowatch.find_dvc_dpath(tags='phase2_data', hardware='auto')
src = data_dpath / 'annotations/drop6/region_models/KR_R002.geojson'
dst_dpath = data_dpath / 'ValiRegionSmall/geojson'
kwargs = dict(src=src, dst_dpath=dst_dpath, draw_clusters=True)
main(**kwargs)
"""
config = ClusterSiteConfig.cli(cmdline=cmdline, data=kwargs)
import rich
from rich.markup import escape
rich.print('config = {}'.format(escape(ub.urepr(config, nl=1))))
from kwutil import util_yaml
from geowatch.geoannots import geomodels
from geowatch.utils import process_context
from geowatch.utils import util_resolution
config.ignore_status = util_yaml.Yaml.coerce(config.ignore_status)
print(f'config.ignore_status = {ub.urepr(config.ignore_status, nl=1)}')
# import pandas as pd
if config.dst_dpath is None:
raise ValueError('Destination path is required')
dst_dpath = config.dst_dpath = ub.Path(config.dst_dpath)
config.dst_dpath.ensuredir()
rich.print(f'Will write to: [link={dst_dpath}]{dst_dpath}[/link]')
proc_context = process_context.ProcessContext(
name='geowatch.cli.cluster_sites', type='process',
config=process_context.jsonify_config(config),
track_emissions=False,
)
input_region_models = list(geomodels.RegionModel.coerce_multiple(
config.src, workers=config.io_workers))
if config.dst_region_fpath is not None:
if len(input_region_models) != 1:
raise ValueError(
'we assume only 1 input region when output fpath given')
# Fixme: process context is weird here when we allow for multiple
# region inputs
proc_context.start()
scale = config.context_factor
minimum_size = util_resolution.ResolvedWindow.coerce(config.minimum_size)
maximum_size = util_resolution.ResolvedWindow.coerce(config.maximum_size)
# Convert to meters
meter = util_resolution.ResolvedUnit(1, 'GSD')
min_box_dim = minimum_size.at_resolution(meter).window[0]
max_box_dim = maximum_size.at_resolution(meter).window[0]
print(f'Looping over {len(input_region_models)} region')
# all_final_site_summaries = []
# Note: this should only ever be a single item in this loop. outputs will
# clobber each other. The code is setup to allow flexability to draw
# multiple region clusters at once, but we might want to disable this.
for input_region_model in input_region_models:
cluster_single_region_sites(input_region_model, scale, min_box_dim,
max_box_dim, proc_context, config)
[docs]
def cluster_single_region_sites(input_region_model, scale, min_box_dim, max_box_dim, proc_context, config):
import kwimage
from kwgis.utils import util_gis
from geowatch.utils import util_kwimage
input_region_model.fixup()
if 1:
try:
input_region_model.validate(strict=0)
except Exception:
input_region_model.fixup()
input_region_model.validate(strict=0)
# Create the set of input boxes to the clustering algorithm.
# Filter out any that have an ignorable status, and convert to UTM
# coordinates.
region_header = input_region_model.pandas_region()
region_sites = input_region_model.pandas_summaries()
assert len(region_header) == 1
if config.ignore_status:
# Remove any sites that have a status marked as ignored.
if 'status' in region_sites.columns:
keep_flags = region_sites['status'].apply(lambda s: s not in config.ignore_status)
# For some reason this returns a dataframe instead of a
# geodataframe if all flags are false Using an index with loc seems
# to work around this.
keep_indexes = keep_flags.index[keep_flags.index]
region_sites = region_sites.loc[keep_indexes]
region_sites_utm = util_gis.project_gdf_to_local_utm(region_sites, mode=1, tolerance=None)
polygons = kwimage.PolygonList([kwimage.Polygon.from_shapely(s)
for s in region_sites_utm.geometry])
# Run the clustering algorithm.
# TODO: would be good to do this in 3D with temporal extent too.
keep_bbs, overlap_idxs = util_kwimage.find_low_overlap_covering_boxes(
polygons, scale, min_box_dim, max_box_dim, max_iters=100)
# Sort bbs so the largest spatial ones are first.
keep_bbs = keep_bbs[keep_bbs.area.ravel().argsort()[::-1]]
# Convert the UTM bounding boxes into proper region models.
clustered_region, subregions = build_clustered_models(
input_region_model, region_header, region_sites, region_sites_utm,
keep_bbs, config)
# Write output to disk
# Awkward backwards compat, dump individal cluster groups
for subregion in subregions:
fpath = config.dst_dpath / (subregion.region_id + '.geojson')
fpath.write_text(subregion.dumps())
if config.dst_region_fpath is not None:
obj = proc_context.stop()
clustered_region.header['properties']['cache']['proc_context'] = obj
region_fpath = ub.Path(config.dst_region_fpath)
region_fpath.write_text(clustered_region.dumps())
if config.draw_clusters:
_draw_clusters(input_region_model, region_sites_utm, polygons,
clustered_region, keep_bbs, config)
def _draw_clusters(input_region_model, region_sites_utm, polygons,
clustered_region, keep_bbs, config):
from geowatch import heuristics
from geowatch.utils import util_kwplot
import kwimage
import kwplot
import rich
if 'status' in region_sites_utm.columns:
color_list = []
status_list = region_sites_utm['status']
for status in status_list:
info = heuristics.IARPA_STATUS_TO_INFO.get(status, {})
color = kwimage.Color.coerce(info.get('color', 'pink')).as255()
color_list.append(color)
else:
color_list = ['blue'] * len(polygons)
plt = kwplot.autoplt()
kwplot.figure(fnum=1, doclf=1)
# polygons.draw(color='pink')
for poly, color in zip(polygons, color_list):
edgecolor = kwimage.Color.coerce(color).adjust(saturate=-.1, lighten=.1)
poly.draw(color=color, edgecolor=edgecolor, linewidth=1, alpha=0.7)
# candidate_bbs.draw(color='blue', setlim=1)
subregion_suffix_list = [
ss['properties']['cache']['cluster_suffix']
for ss in clustered_region.site_summaries()
]
if len(keep_bbs):
keep_bbs.draw(color='orange', setlim=1, labels=subregion_suffix_list)
plt.gca().set_title('find_low_overlap_covering_boxes')
fig = plt.gcf()
viz_dpath = (config.dst_dpath / '_viz_clusters').ensuredir()
rich.print(f'Viz dpath: [link={viz_dpath}]{viz_dpath}[/link]')
finalizer = util_kwplot.FigureFinalizer(
size_inches=(16, 16),
dpath=viz_dpath,
)
finalizer(fig, 'clusters_' + input_region_model.region_id + '.png')
[docs]
def build_clustered_models(input_region_model, region_header, region_sites,
region_sites_utm, keep_bbs, config):
"""
Given the clustering output, construct new region models.
Returns:
Tuple[RegionModel, List[RegionModle]] :
1. A new "clustered region" region model with the original region
bounds and the clusters as site summaries
2. A set of region models for each cluster, with the original site
summaries that were assigned to that cluster
"""
from geowatch.geoannots import geomodels
import geopandas as gpd
region_id = input_region_model.region_id
utm_crs = region_sites_utm.crs
crs84 = region_header.crs
# Start an empty region model with the original bounds
# we will add clusters as site summaries to this data structure.
new_region_header = geomodels.RegionHeader(
properties={
"type": 'region',
"region_id": input_region_model.region_id,
"version": '2.4.3',
"mgrs": None,
"start_date": input_region_model.start_date.date().isoformat(),
"end_date": input_region_model.end_date.date().isoformat(),
"originator": "kit-cluster",
"model_content": "annotation",
"comments": '',
"cache": {}
},
geometry=input_region_model.geometry,
)
new_region_header.ensure_isodates()
new_region_header.infer_mgrs()
clustered_region = geomodels.RegionModel(features=[new_region_header])
print(f'Looping over {len(keep_bbs)} clusters')
subregions = []
for bb_idx, utm_box in enumerate(keep_bbs.to_shapely()):
ISECT_BOXES = 0
if ISECT_BOXES:
region_utm = region_header.to_crs(utm_crs)
region_bounds = region_utm.iloc[0]['geometry']
final_geom_utm = region_bounds.intersection(utm_box)
else:
final_geom_utm = utm_box
# Convert the UTM cluster into CRS84
final_geom_df_crs84 = gpd.GeoDataFrame(
{'geometry': [final_geom_utm]}, crs=utm_crs).to_crs(crs84)
final_geom_crs84 = final_geom_df_crs84.iloc[0]['geometry']
cluster_geometry = final_geom_crs84
if (cluster_geometry is None or
cluster_geometry.is_empty or
not cluster_geometry.is_valid):
raise AssertionError
# Assign original sites to this cluster.
is_contained = region_sites_utm.intersects(final_geom_utm)
contained_sites = region_sites[is_contained]
cluster_suffix = f'{bb_idx:03d}'
cluster_id = f'{region_id}_CLUSTER_{cluster_suffix}'
start_dates = contained_sites['start_date'].dropna()
end_dates = contained_sites['end_date'].dropna()
site_summaries = list(geomodels.SiteSummary.from_geopandas_frame(contained_sites))
if 1:
for s in site_summaries:
s.fixup()
s.validate(strict=0)
# The start / end date of the cluster is determined by the original
# sites that intersect it.
if len(start_dates) and config.crop_time:
start_date = start_dates.min()
else:
start_date = input_region_model.start_date
if len(end_dates) and config.crop_time:
end_date = end_dates.max()
else:
end_date = input_region_model.end_date
# Add this cluster as a site summary in the new region
cluster_summary = geomodels.SiteSummary(
properties={
"type": 'site_summary',
"status": "system_confirmed",
"site_id": cluster_id,
"version": '2.4.3',
"mgrs": None,
"start_date": start_date,
"end_date": end_date,
"originator": "kit-cluster",
"model_content": "annotation",
"cache": {"cluster_suffix": cluster_suffix},
},
geometry=cluster_geometry,
)
cluster_summary.ensure_isodates()
cluster_summary.infer_mgrs()
clustered_region.add_site_summary(cluster_summary)
# Create a new region model where the cluster is the region boundary
# that contatain the original site that were assigned to it.
subregion_header = geomodels.RegionHeader(
properties={
"type": 'region',
"region_id": cluster_id,
"version": '2.4.3',
"mgrs": None,
"start_date": start_date,
"end_date": end_date,
"originator": "kit-cluster",
"model_content": "annotation",
"comments": '',
},
geometry=cluster_geometry,
)
assert subregion_header.geometry is not None
subregion_header.ensure_isodates()
subregion_header.infer_mgrs()
subregion = geomodels.RegionModel(features=[
subregion_header
] + site_summaries)
try:
subregion.validate(strict=False)
except Exception:
subregion.fixup()
subregion.validate(strict=False)
subregions.append(subregion)
return clustered_region, subregions
__cli__ = ClusterSiteConfig
__cli__.main = main
if __name__ == '__main__':
"""
CommandLine:
python ~/code/watch/geowatch/cli/cluster_sites.py
"""
main(cmdline=True)