Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions schedview/compute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
"compute_scalar_metric_at_one_mjd",
"compute_scalar_metric_at_mjds",
"compute_mixed_scalar_metric",
"compute_obs_sim_offsets",
"compute_offset_stats",
"offsets_of_coord_band",
"often_repeated_fields",
"assign_field_hpids",
"count_visits_by_sim",
"match_visits_across_sims",
"compute_matched_visit_delta_statistics",
Expand All @@ -25,6 +29,12 @@

from .astro import compute_sun_moon_positions, convert_evening_date_to_night_of_survey, night_events
from .camera import LsstCameraFootprintPerimeter
from .comparesim import (
assign_field_hpids,
compute_obs_sim_offsets,
compute_offset_stats,
offsets_of_coord_band,
)
from .multisim import (
compute_matched_visit_delta_statistics,
count_visits_by_sim,
Expand Down
254 changes: 254 additions & 0 deletions schedview/compute/comparesim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
from functools import partial
from typing import Optional, Tuple

import astropy.units as u
import healpy as hp
import numpy as np
import pandas as pd
from astropy.coordinates import Angle, SkyCoord

from .multisim import match_visits_across_sims


def assign_field_hpids(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This probably needs a different name, to reduce confusion about what it does. "assign_field_hpids" doesn't imply matching, but one of the requirements for this function is two different data frames to match (and this is what drives the hpid assignment, if I'm getting it right).

Perhaps it could simply be "match_pointings_hpid" ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or maybe "assign_matching_field_hpids"?
I don't know .. just assign_field_hpids is accurate but makes me think you could have done what you probably did first (just assign each pointing to the nearest hpid).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about I modify this to just assign each pointing to the nearest hpid, and then have a separate function, snap_to_referece_hpid_when_close, that changes assigned hpids to one those from a reference set when the coordinates are close, and then have the notebook call both in succession?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the time you get rid of the matching, there's really nothing left but the call to hp.ang2pix, so maybe we should just drop the assign_field_hpids function entierly and have the notebook call hp.ang2pix directly, followed by snap_to_reference_hpid_when_close.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you're maybe just (at the end of the day) proposing the new name here then?

would then "assign_matching_reference_hpids" or something work?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mostly, with the difference that the initial setting of the id is by a direct call to hp.ang2pix in the notebook, and a separate call to snap to the reference. This does result in more being done in the notebook, but has the advantage that in the notebook it makes it clear what's going on. I'm happy either with your proposed renaming, or with pulling hp.ang2pix out.

simulated_visits: pd.DataFrame,
completed_visits: pd.DataFrame,
nside: int = 262144,
coord_match_tolerance_deg: float = 0.002277777778,
*,
ra_col: str = "fieldRA",
decl_col: str = "fieldDec",
hpid_col: str = "fieldHpid",
inplace: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Populate an id column for simulated and completed visits and
propagate the id from the nearest simulated healpix to completed
visits that lie within a specified angular tolerance.

Parameters
----------
simulated_visits : `pd.DataFrame`
Simulated visit table containing visits from all simulations to be
compared. It must contain columns identified by ``ra_col``
and ``dec_col`` (both in degrees). An integer column named
``hpid_col`` will be added or overwritten. Pointings in this
``DataFrame`` are the reference pointings to which pointings from
completed visits will be matched.

completed_visits : `pd.DataFrame`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will the completed visits hpid be overwritten, when compared against a new simulation?

Copy link
Collaborator Author

@ehneilsen ehneilsen Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it is set already (and inplace=True), yes. I'll note that in the docstring.

Completed visit table; same column requirements as
``simulated_visits``. If ``inplace`` is ``True`` and the column
designated by ``hpid_col`` already exists, values in that column
will be replaced.

field_hpix_nside : `int`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reasonable useful default for this?
I imagine that your testing has shown that some values tend to work pretty well, while others are overkill and others are way too coarse. I'm guessing that either 32 or at the outside 64 are best choices?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've been using 2*18=262144, or about 0.8" resolution, but highter would be just as good. My goal here was to make sure that it is high enough that simulated pointings that are different should have different hpids, and I wasn't concerned with it being too high because matching sims that match each other should have exactly matching coordinates.

HEALPix nside that defines the pixel resolution. Defaults to
nside=2**18 (about 0.8 arcseconds), so pointings with significantly
different ditherings are considered distinct.

field_coord_tolerance_deg : `float`
Maximum angular separation (deg) for a completed visit to be considered
a match to a simulated healpix center. Defaults to 0.002277777778,
or about 10 arcsecords, which is large enough to match across observed
pointing offsets, but not usually across different dithers.

ra_col : str, optional, default ``"fieldRA"``
Column name for right‑ascension values (degrees).

dec_col : str, optional, default ``"fieldDec"``
Column name for declination values (degrees).

hpid_col : str, optional, default ``"fieldHpid"``
Column name that will store the healpix index.

inplace : bool, optional, default ``False``
If ``True`` the input DataFrames are modified in place. If ``False``
(default) shallow copies are made so the originals stay untouched.

Returns
-------
result : Tuple[pd.DataFrame, pd.DataFrame]
``(simulated_visits_out, completed_visits_out)`` the (possibly copied)
DataFrames with the ``hpid_col`` column populated.
"""

if not inplace:
simulated_visits = simulated_visits.copy()
completed_visits = completed_visits.copy()

simulated_visits[hpid_col] = hp.ang2pix(
nside=nside,
theta=simulated_visits[ra_col],
phi=simulated_visits[decl_col],
lonlat=True,
)

if completed_visits.empty:
# If completed_visits is empty, all we have to do is create the column
# in the empty DataFrame and we are done.
completed_visits[hpid_col] = []
completed_visits[hpid_col] = completed_visits[hpid_col].astype(int)
else:
# Values assigned here will be replaced by closest matches
# in simulated_visit[hpid_col] if there are any within
# field_coord_tolerance_deg
completed_visits[hpid_col] = hp.ang2pix(
nside=nside,
theta=completed_visits[ra_col],
phi=completed_visits[decl_col],
lonlat=True,
)

# Find all hpids actually used in the simulation.
sim_hpids = simulated_visits[hpid_col].unique()
sim_hp_ra, sim_hp_dec = hp.pix2ang(nside=nside, ipix=sim_hpids, lonlat=True)
sim_hp_coords = SkyCoord(ra=sim_hp_ra * u.deg, dec=sim_hp_dec * u.deg, frame="icrs")

# Match completed visits to the nearest hpid used in simulations.
completed_coords = SkyCoord(
ra=completed_visits[ra_col].values * u.deg,
dec=completed_visits[decl_col].values * u.deg,
frame="icrs",
)
sim_hp_match_idx, sim_hp_sep, _ = completed_coords.match_to_catalog_sky(sim_hp_coords)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that you're using a really high side by default for assigning the sim visits to healpix ids -- could there be a potential failure where:
sim 1 has field centres which are slightly different than the field centres in sim 2
then the completed visits would only match either the healpix in sim 1 or sim 2, but if you'd used a lower nside they would both have actually matched and both been within the tolerance?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See my comment on the corresponding comment on the docstring. My intention was to distinguish dithers, so completed visits in one dither not match simulated visits in a different dither, but rather only accommodate differences between the coordinates requested by the FBS and the coordinates reported in ConsDB.


# Apply tolerance mask and replace the healpix id where appropriate.
within_tol = sim_hp_sep.deg < coord_match_tolerance_deg
completed_visits.loc[within_tol, hpid_col] = sim_hpids[sim_hp_match_idx[within_tol]]

return simulated_visits, completed_visits
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was wondering why we couldn't pass in multiple simulated_visits data frames at the same time, but I think it's because you're matching the completed visits to those simulated visits, and then the completed_visits can end up with different healpix ids depending on which simulation it's been matched against.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I think you noticed elsewhere, I pass visits from all simulations in the same DataFrame.



def offsets_of_coord_band(sim_index: int, visits: pd.DataFrame, obs_index: int = 0) -> pd.DataFrame:
"""
Compute the time offset between a set of observations and a
single simulated visit ``sim_index`` for a given (fieldHpid, band)
coordinate pair.

Parameters
----------
sim_index : `int`
The simulation index to compare against the observation (index 0).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is sim_index? Is it part of how the "visits" data frame referenced next was created?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's an id that indentifies sequences of visits. That is, the visits DataFrame contains all visits, including the simulations and completed visits. The sim_index column of the visits dataframe identifies which sim (or completed) that specific visit came from.

visits : `pd.DataFrame`
Table of visits that contains at least the columns ``sim_index`` and
``start_timestamp``.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand from this docstring what "visits" is .. it seems like it's not just the information from either the completed visits or a single simulation, but potentially both? How are they combined?
Is there some function somewhere you'd use to create the "visits" data frame and could that be referenced here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, visits is the DataFrame with all visits, both from completed and simulated simulations.


Returns
-------
offsets: `pd.DataFrame`
A DataFrame with columns ``obs_time``, ``sim_time`` and ``delta`` and
an index level ``sim_index``.
"""
# This function is intended to be run on a DataFrame on a single
# field/band combination, not on the whole set of visits, e.g. for a night.
for col in ("band", "fieldHpid"):
if col in visits.columns:
assert len(visits[col].unique()) == 1

offsets = match_visits_across_sims(visits.set_index("sim_index").start_timestamp, (obs_index, sim_index))

# Normalize the order and sense of the offset
if offsets.columns[0] == obs_index:
offsets["delta"] = -1 * offsets["delta"]
else:
offsets = offsets[[obs_index, sim_index, "delta"]]

assert offsets.columns[0] == obs_index
assert offsets.columns[1] == sim_index
assert offsets.columns[2] == "delta"
assert len(offsets.columns) == 3
offsets.columns = ["obs_time", "sim_time", "delta"]
offsets["sim_index"] = sim_index
offsets = offsets.set_index("sim_index")

return offsets


def compute_obs_sim_offsets(
visits: pd.DataFrame,
obs_index: int = 0,
) -> pd.DataFrame:
"""
Build a table of offsets for all simulated/completed pairs of visits.

Parameters
----------
visits : `pd.DataFrame`
Table of visits with at least the columns ``sim_index``,
``fieldHpid`` and ``band``.
obs_index : `int`, optional
``sim_index`` value for completed (observed) visits (default = 0).

Returns
-------
sim_offsets : `pd.DataFrame`
Offsets for every simulated visit, indexed by ``sim_index``,
``fieldHpid`` and ``band``.
"""
sim_indexes = visits.sim_index.unique()
sim_indexes = sim_indexes[sim_indexes != obs_index]

offsets = pd.concat(
visits.groupby(["fieldHpid", "band"]).apply(partial(offsets_of_coord_band, i), include_groups=False)
for i in sim_indexes
)
offsets.index = offsets.index.reorder_levels(["sim_index", "fieldHpid", "band"])
return offsets


def compute_offset_stats(
offsets: pd.DataFrame,
visits: Optional[pd.DataFrame] = None,
hhmmss: bool = False,
) -> pd.DataFrame:
"""
Produce a summary table of time offsets between completed and simulated
visits.

Parameters
----------
offsets : `pd.DataFrame`
Result of :func:`compute_obs_sim_offsets`. Must contain a ``delta``
column and a ``sim_index`` level in the index.
visits : `pd.DataFrame`, optional
The original visits table, required only if observation counts or
labels should be included in the returned ``DataFrame``.
hhmmss : `bool`, optional
If ``True``, convert the numeric statistics (mean, std, etc.) from
seconds to an ``HH:MM:SS`` string representation.

Returns
-------
offset_stats : `pd.DataFrame`
A table where each row corresponds to a ``sim_index`` and columns
include match counts, MAD, and the usual descriptive statistics.
"""
abs_delta = np.abs(offsets["delta"])
abs_delta.name = "abs_delta"

offset_stats = offsets.groupby("sim_index")["delta"].describe()
offset_stats.insert(0, "match count", offset_stats["count"].astype(int))
offset_stats.insert(1, "MAD", abs_delta.to_frame().groupby("sim_index")["abs_delta"].median())

if visits is not None:
visit_counts = visits.groupby("sim_index").agg({"label": "count"})
offset_stats.insert(
0, "obs count", np.full_like(offset_stats["count"], visit_counts.loc[0]).astype(int)
)
offset_stats.insert(1, "sim count", visit_counts)
offset_stats.insert(3, "#match/#obs", (offset_stats["count"] / offset_stats["obs count"]).round(2))
offset_stats.insert(4, "#match/#sim", (offset_stats["count"] / offset_stats["sim count"]).round(2))

offset_stats.drop(columns="count", inplace=True)

if hhmmss:
for column in ["MAD", "mean", "std", "min", "25%", "50%", "75%", "max"]:
offset_stats.loc[:, column] = Angle(
(offset_stats.loc[:, column].astype(int).values / 3600) * u.hour
).to_string(unit=u.hour, sep=":")

if visits is not None and "label" in visits.columns:
offset_stats.insert(0, "label", visits.groupby("sim_index")["label"].first().to_frame())

return offset_stats
2 changes: 2 additions & 0 deletions schedview/plot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"plot_infeasible",
"plot_airmass_vs_time",
"plot_alt_vs_time",
"plot_obs_vs_sim_time",
"plot_polar_alt_az",
"plot_survey_rewards",
"create_survey_reward_plot",
Expand Down Expand Up @@ -42,6 +43,7 @@

from .cadence import create_cadence_plot
from .colors import LIGHT_EXTRA_COLORS, LIGHT_PLOT_BAND_COLORS, PLOT_BAND_COLORS, make_band_cmap
from .comparesim import plot_obs_vs_sim_time
from .multisim import generate_sim_indicators, overplot_kernel_density_estimates
from .nightbf import plot_infeasible, plot_rewards
from .nightly import plot_airmass_vs_time, plot_alt_vs_time, plot_polar_alt_az
Expand Down
87 changes: 87 additions & 0 deletions schedview/plot/comparesim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from typing import List, Optional, Tuple

import bokeh
import bokeh.layouts
import bokeh.models
import bokeh.plotting
import numpy as np
import pandas as pd

from schedview.plot.colors import make_band_cmap


def plot_obs_vs_sim_time(
offsets: pd.DataFrame, tooltips: List[Tuple[str, str]], plot: Optional[bokeh.plotting.figure] = None
) -> bokeh.models.UIElement:
"""
Plot observation times vs simulation times for visits.

Parameters
----------
offsets : `pd.DataFrame`
DataFrame containing visit timing information with columns including
'obs_time', 'sim_time', 'sim_index', and 'label' as well as any
columns needed for the tooltips.
tooltips : `List[Tuple[str, str]]`
List of tuples defining the tooltip content for hover interactions.
plot : `bokeh.plotting.figure`, optional
Existing bokeh figure to use. If ``None``, a new figure is created.

Returns
-------
obs_vs_sim_time_plot : `bokeh.models.UIElement`
A bokeh UI element containing the simulation selector and the plot.
"""
if plot is None:
plot = bokeh.plotting.figure(frame_width=1024, frame_height=512)

offsets_plot_df = offsets.reset_index().set_index("sim_index")
offsets_plot_df.reset_index(inplace=True)
default_sim_id = offsets_plot_df.sim_index.min()
offsets_plot_df["sim_alpha"] = np.where(offsets_plot_df.sim_index == default_sim_id, 1, 0)

source = bokeh.models.ColumnDataSource(offsets_plot_df)

matching_line = bokeh.models.Slope(gradient=1, y_intercept=0, line_color="gray", line_width=1)
plot.add_layout(matching_line)

scatter_renderer = plot.scatter(
"sim_time", "obs_time", color=make_band_cmap("band"), alpha="sim_alpha", source=source
)
plot.xaxis[0].formatter = bokeh.models.DatetimeTickFormatter(hours="%H:%M")
plot.xaxis[0].axis_label = "visit start time in simulation"
plot.yaxis[0].formatter = bokeh.models.DatetimeTickFormatter(hours="%H:%M")
plot.yaxis[0].axis_label = "visit start time as completed"

hover_tool = bokeh.models.HoverTool(
renderers=[scatter_renderer],
tooltips=tooltips,
formatters={"@obs_time": "datetime", "@sim_time": "datetime"},
)
plot.add_tools(hover_tool)

if "label" not in source.column_names:
raise ValueError("A sim selector needs the label column")
sim_labels = offsets.groupby("sim_index")["label"].first().to_frame()
default_sim = sim_labels.loc[default_sim_id, "label"]
sim_selector = bokeh.models.Select(
value=default_sim, options=sim_labels.label.to_list(), name="simselect"
)

sim_selector_callback = bokeh.models.CustomJS(
args={"source": source},
code="""
for (let i = 0; i < source.data['label'].length; i++) {
if (['All', source.data['label'][i]].includes(this.value)) {
source.data['sim_alpha'][i] = 1.0;
} else {
source.data['sim_alpha'][i] = 0.0;
}
}
source.change.emit()
""",
)
sim_selector.js_on_change("value", sim_selector_callback)

ui_element = bokeh.layouts.column([sim_selector, plot])
return ui_element
Loading
Loading