diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py index 68a8a3b9b..e6dbee8b6 100644 --- a/activitysim/abm/models/joint_tour_participation.py +++ b/activitysim/abm/models/joint_tour_participation.py @@ -429,9 +429,8 @@ def joint_tour_participation( if i not in model_settings.compute_settings.protect_columns: model_settings.compute_settings.protect_columns.append(i) - # TODO EET: this is related to the difference in nested logit and logit choice as per comment in - # make_choices_utility_based. As soon as alt_order_array is removed from arguments to - # make_choices_explicit_error_term_nl this guard can be removed + # This is related to the difference in nested logit and logit choice. As soon as alt_order_array + # is removed from arguments to make_choices_explicit_error_term_nl this guard can be removed. if state.settings.use_explicit_error_terms: assert ( nest_spec is None diff --git a/activitysim/abm/models/location_choice.py b/activitysim/abm/models/location_choice.py index 7f032a8ae..7c8ef16db 100644 --- a/activitysim/abm/models/location_choice.py +++ b/activitysim/abm/models/location_choice.py @@ -15,10 +15,10 @@ TourLocationComponentSettings, TourModeComponentSettings, ) +from activitysim.core.exceptions import DuplicateWorkflowTableError from activitysim.core.interaction_sample import interaction_sample from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.util import reindex -from activitysim.core.exceptions import DuplicateWorkflowTableError """ The school/workplace location model predicts the zones in which various people will @@ -1031,6 +1031,17 @@ def iterate_location_choice( ] persons_merged_df_ = persons_merged_df_.sort_index() + # reset rng offsets to identical state on each iteration. This ensures that the same set of random numbers is + # used on each iteration. 
Note this has to happen AFTER updating shadow prices because the simulation method + # draws random numbers. + # Only applying when using EET for now because this will need changes to integration + # tests, but it's probably a good idea for MC simulation as well. + if state.settings.use_explicit_error_terms and iteration > 1: + logger.debug( + f"{trace_label} resetting random number generator offsets for iteration {iteration}" + ) + state.get_rn_generator().reset_offsets_for_step(state.current_model_name) + choices_df_, save_sample_df = run_location_choice( state, persons_merged_df_, diff --git a/activitysim/abm/models/util/test/test_cdap.py b/activitysim/abm/models/util/test/test_cdap.py index 20dc6b241..20d68f2dd 100644 --- a/activitysim/abm/models/util/test/test_cdap.py +++ b/activitysim/abm/models/util/test/test_cdap.py @@ -5,6 +5,7 @@ import os.path +import numpy as np import pandas as pd import pandas.testing as pdt import pytest @@ -176,3 +177,84 @@ def test_build_cdap_spec_hhsize2(people, model_settings): ).astype("float") pdt.assert_frame_equal(utils, expected, check_names=False) + + +def test_cdap_explicit_error_terms_parity(people, model_settings): + person_type_map = model_settings.get("PERSON_TYPE_MAP", {}) + + # Increase population to get more stable distribution for parity check + # We'll just duplicate the existing people a few times + large_people = pd.concat([people] * 500).reset_index(drop=True) + large_people.index.name = "person_id" + + assert people.household_id.is_monotonic_increasing + large_people["hhid_diff"] = large_people.household_id.diff().fillna(0).astype(int) + large_people.loc[large_people["hhid_diff"] < 0, "hhid_diff"] = 1 + large_people["household_id"] = large_people.hhid_diff.cumsum() + + assert large_people["household_id"].is_monotonic_increasing + + # Run without explicit error terms + state_no_eet = workflow.State.make_default(__file__) + cdap_indiv_spec = state_no_eet.filesystem.read_model_spec( + 
file_name="cdap_indiv_and_hhsize1.csv" + ) + interaction_coefficients = pd.read_csv( + state_no_eet.filesystem.get_config_file_path( + "cdap_interaction_coefficients.csv" + ), + comment="#", + ) + interaction_coefficients = cdap.preprocess_interaction_coefficients( + interaction_coefficients + ) + cdap_fixed_relative_proportions = pd.DataFrame( + {"activity": ["M", "N", "H"], "coefficient": [0.33, 0.33, 0.34]} + ) + + state_no_eet.settings.use_explicit_error_terms = False + state_no_eet.rng().set_base_seed(42) + state_no_eet.rng().begin_step("test_no_eet") + state_no_eet.rng().add_channel("person_id", large_people) + state_no_eet.rng().add_channel( + "household_id", + large_people.drop_duplicates("household_id").set_index("household_id"), + ) + + choices_no_eet = cdap.run_cdap( + state_no_eet, + large_people, + person_type_map, + cdap_indiv_spec, + interaction_coefficients, + cdap_fixed_relative_proportions, + locals_d=None, + ) + + # Run with explicit error terms + state_eet = workflow.State.make_default(__file__) + state_eet.settings.use_explicit_error_terms = True + state_eet.rng().set_base_seed(42) + state_eet.rng().begin_step("test_eet") + state_eet.rng().add_channel("person_id", large_people) + state_eet.rng().add_channel( + "household_id", + large_people.drop_duplicates("household_id").set_index("household_id"), + ) + + choices_eet = cdap.run_cdap( + state_eet, + large_people, + person_type_map, + cdap_indiv_spec, + interaction_coefficients, + cdap_fixed_relative_proportions, + locals_d=None, + ) + + # Compare distributions + dist_no_eet = choices_no_eet.value_counts(normalize=True).sort_index() + dist_eet = choices_eet.value_counts(normalize=True).sort_index() + + # Check that they are reasonably close + pdt.assert_series_equal(dist_no_eet, dist_eet, atol=0.05, check_names=False) diff --git a/activitysim/abm/test/test_misc/configs_test_misc/joint_tour_participation.csv b/activitysim/abm/test/test_misc/configs_test_misc/joint_tour_participation.csv new file 
mode 100644 index 000000000..d81df1ab1 --- /dev/null +++ b/activitysim/abm/test/test_misc/configs_test_misc/joint_tour_participation.csv @@ -0,0 +1,2 @@ +Description,Expression,participate,not_participate +Adult participation,adult,0.5,-0.5 diff --git a/activitysim/abm/test/test_misc/configs_test_misc/joint_tour_participation.yaml b/activitysim/abm/test/test_misc/configs_test_misc/joint_tour_participation.yaml new file mode 100644 index 000000000..8db2410c0 --- /dev/null +++ b/activitysim/abm/test/test_misc/configs_test_misc/joint_tour_participation.yaml @@ -0,0 +1,5 @@ +SPEC: joint_tour_participation.csv +COEFFICIENTS: joint_tour_participation_coefficients.csv +participation_choice: participate +max_participation_choice_iterations: 100 +FORCE_PARTICIPATION: True diff --git a/activitysim/abm/test/test_misc/configs_test_misc/joint_tour_participation_coefficients.csv b/activitysim/abm/test/test_misc/configs_test_misc/joint_tour_participation_coefficients.csv new file mode 100644 index 000000000..237d51917 --- /dev/null +++ b/activitysim/abm/test/test_misc/configs_test_misc/joint_tour_participation_coefficients.csv @@ -0,0 +1,2 @@ +expression,coefficient +adult,1.0 diff --git a/activitysim/abm/test/test_misc/test_joint_tour_participation.py b/activitysim/abm/test/test_misc/test_joint_tour_participation.py new file mode 100644 index 000000000..5aa15c6e8 --- /dev/null +++ b/activitysim/abm/test/test_misc/test_joint_tour_participation.py @@ -0,0 +1,158 @@ +import numpy as np +import pandas as pd +import pandas.testing as pdt +import pytest + +from activitysim.abm.models import joint_tour_participation +from activitysim.core import logit, workflow + +from .test_trip_departure_choice import add_canonical_dirs + + +@pytest.fixture +def candidates(): + # Create synthetic candidates for Joint Tour Participation + # JTP chooses whether each candidate participates in a joint tour. 
+ # We include varied compositions and preschoolers to exercise the + # get_tour_satisfaction logic properly. + num_tours_per_comp = 500 + compositions = ["MIXED", "ADULTS", "CHILDREN"] + num_candidates_per_tour = 4 + + total_tours = num_tours_per_comp * len(compositions) + num_candidates = total_tours * num_candidates_per_tour + + # Ensure reproducibility + rng = np.random.default_rng(42) + + tour_ids = np.repeat(np.arange(total_tours), num_candidates_per_tour) + comp_values = np.repeat(compositions, num_tours_per_comp * num_candidates_per_tour) + + df = pd.DataFrame( + { + "tour_id": tour_ids, + "household_id": tour_ids, # simplified for mock + "person_id": np.arange(num_candidates), + "composition": comp_values, + }, + index=pd.Index(np.arange(num_candidates), name="participant_id"), + ) + + # Assign adult and preschooler status based on composition + # MIXED: at least one adult and one child + # ADULTS: all adults + # CHILDREN: all children + df["adult"] = False + df["person_is_preschool"] = False + + for i, comp in enumerate(compositions): + mask = df.composition == comp + indices = df[mask].index + + if comp == "ADULTS": + df.loc[indices, "adult"] = True + elif comp == "CHILDREN": + df.loc[indices, "adult"] = False + # Some children are preschoolers + df.loc[ + rng.choice(indices, len(indices) // 4, replace=False), + "person_is_preschool", + ] = True + elif comp == "MIXED": + # For each tour, make the first person an adult, rest children + tour_start_indices = indices[::num_candidates_per_tour] + df.loc[tour_start_indices, "adult"] = True + # Other members are children, some might be preschoolers + other_indices = indices[~indices.isin(tour_start_indices)] + df.loc[ + rng.choice(other_indices, len(other_indices) // 3, replace=False), + "person_is_preschool", + ] = True + + return df + + +@pytest.fixture +def model_spec(): + # Simple spec with two alternatives: 'participate' and 'not_participate' + return pd.DataFrame( + {"participate": [0.8, -0.2], 
"not_participate": [0.0, 0.0]}, + index=pd.Index(["adult", "person_is_preschool"], name="Expression"), + ) + + +def test_jtp_explicit_error_terms_parity(candidates, model_spec): + """ + Test that joint tour participation results are statistically similar + between MNL and Explicit Error Terms (EET) using realistic candidate scenarios. + """ + # Create random utilities for the candidates that vary by attribute + rng = np.random.default_rng(42) + + # Base utility + some noise + base_util = (candidates.adult * 0.5) - (candidates.person_is_preschool * 1.0) + utils = pd.DataFrame( + { + "participate": base_util + rng.standard_normal(len(candidates)), + "not_participate": 0, + }, + index=candidates.index, + ) + + # Run without EET (MNL) + state_no_eet = add_canonical_dirs("configs_test_misc").default_settings() + state_no_eet.settings.use_explicit_error_terms = False + state_no_eet.rng().set_base_seed(42) + state_no_eet.rng().begin_step("test_no_eet") + state_no_eet.rng().add_channel("participant_id", candidates) + + # MNL path expects probabilities + probs_no_eet = logit.utils_to_probs(state_no_eet, utils, trace_label="test_no_eet") + choices_no_eet, _ = joint_tour_participation.participants_chooser( + state_no_eet, + probs_no_eet, + candidates, + model_spec, + trace_label="test_no_eet", + ) + + # Run with EET + state_eet = add_canonical_dirs("configs_test_misc").default_settings() + state_eet.settings.use_explicit_error_terms = True + state_eet.rng().set_base_seed(42) + state_eet.rng().begin_step("test_eet") + state_eet.rng().add_channel("participant_id", candidates) + + # EET path expects raw utilities + choices_eet, _ = joint_tour_participation.participants_chooser( + state_eet, + utils.copy(), + candidates, + model_spec, + trace_label="test_eet", + ) + + # Compare distributions of number of participants per tour + # Choice 0 is 'participate' + no_eet_participation_counts = ( + (choices_no_eet == 0).groupby(candidates.tour_id).sum() + ) + eet_participation_counts = 
(choices_eet == 0).groupby(candidates.tour_id).sum() + + dist_no_eet = no_eet_participation_counts.value_counts(normalize=True).sort_index() + dist_eet = eet_participation_counts.value_counts(normalize=True).sort_index() + + # Check that the distribution of participation counts is close + pdt.assert_series_equal(dist_no_eet, dist_eet, atol=0.05, check_names=False) + + # Also check average participation by composition for deeper parity check + comp_parity_no_eet = no_eet_participation_counts.groupby( + candidates.groupby("tour_id")["composition"].first() + ).mean() + comp_parity_eet = eet_participation_counts.groupby( + candidates.groupby("tour_id")["composition"].first() + ).mean() + + pdt.assert_series_equal( + comp_parity_no_eet, comp_parity_eet, atol=0.1, check_names=False + ) diff --git a/activitysim/abm/test/test_misc/test_trip_departure_choice.py b/activitysim/abm/test/test_misc/test_trip_departure_choice.py index 94d47f57a..d6645ce94 100644 --- a/activitysim/abm/test/test_misc/test_trip_departure_choice.py +++ b/activitysim/abm/test/test_misc/test_trip_departure_choice.py @@ -187,3 +187,60 @@ def test_apply_stage_two_model(model_spec, trips): pd.testing.assert_index_equal(departures.index, trips.index) departures = pd.concat([trips, departures], axis=1) + + +def test_tdc_explicit_error_terms_parity(model_spec, trips): + setup_dirs() + model_settings = tdc.TripDepartureChoiceSettings() + + # Increase population for statistical convergence + large_trips = pd.concat([trips] * 500).reset_index(drop=True) + large_trips.index.name = "trip_id" + # Ensure tour_ids are distinct for the expanded set + large_trips["tour_id"] = ( + large_trips.groupby("tour_id").cumcount() * 1000 + large_trips["tour_id"] + ) + + # Trip departure choice uses tour_leg_id as the random channel index + tour_legs = tdc.get_tour_legs(large_trips) + + # Run without explicit error terms + state_no_eet = add_canonical_dirs("configs_test_misc").default_settings() + 
state_no_eet.settings.use_explicit_error_terms = False + state_no_eet.rng().set_base_seed(42) + state_no_eet.rng().begin_step("test_no_eet") + state_no_eet.rng().add_channel("trip_id", large_trips) + state_no_eet.rng().add_channel("tour_leg_id", tour_legs) + + departures_no_eet = tdc.apply_stage_two_model( + state_no_eet, + model_spec, + large_trips, + 0, + "TEST Trip Departure No EET", + model_settings=model_settings, + ) + + # Run with explicit error terms + state_eet = add_canonical_dirs("configs_test_misc").default_settings() + state_eet.settings.use_explicit_error_terms = True + state_eet.rng().set_base_seed(42) + state_eet.rng().begin_step("test_eet") + state_eet.rng().add_channel("trip_id", large_trips) + state_eet.rng().add_channel("tour_leg_id", tour_legs) + + departures_eet = tdc.apply_stage_two_model( + state_eet, + model_spec, + large_trips, + 0, + "TEST Trip Departure EET", + model_settings=model_settings, + ) + + # Compare distributions + dist_no_eet = departures_no_eet.value_counts(normalize=True).sort_index() + dist_eet = departures_eet.value_counts(normalize=True).sort_index() + + # Check that they are reasonably close (within 5% for this sample size) + pd.testing.assert_series_equal(dist_no_eet, dist_eet, atol=0.05, check_names=False) diff --git a/activitysim/core/interaction_sample.py b/activitysim/core/interaction_sample.py index 93834c690..4241fd693 100644 --- a/activitysim/core/interaction_sample.py +++ b/activitysim/core/interaction_sample.py @@ -58,33 +58,21 @@ def make_sample_choices_utility_based( utilities = utilities[~zero_probs] choosers = choosers[~zero_probs] - utils_array = utilities.to_numpy() - chunk_sizer.log_df(trace_label, "utils_array", utils_array) - chosen_destinations = [] - - rands = state.get_rn_generator().gumbel_for_df(utilities, n=alternative_count) + rands = state.get_rn_generator().gumbel_for_df( + utilities, n=alternative_count * sample_size + ) chunk_sizer.log_df(trace_label, "rands", rands) - # TODO-EET [janzill 
Jun2022]: using for-loop to keep memory usage low, an array of dimension - # (len(choosers), alternative_count, sample_size) can get very large. Probably better to - # use chunking for this. - for i in range(sample_size): - # created this once for memory logging - if i > 0: - rands = state.get_rn_generator().gumbel_for_df( - utilities, n=alternative_count - ) - chosen_destinations.append(np.argmax(utils_array + rands, axis=1)) - chosen_destinations = np.concatenate(chosen_destinations, axis=0) + rands = rands.reshape((utilities.shape[0], alternative_count, sample_size)) + rands += utilities.to_numpy()[:, :, np.newaxis] + # choose maximum along all alternatives (axis 1) for all choosers and samples + chosen_destinations = np.argmax(rands, axis=1).reshape(-1) chunk_sizer.log_df(trace_label, "chosen_destinations", chosen_destinations) - - del utils_array - chunk_sizer.log_df(trace_label, "utils_array", None) del rands chunk_sizer.log_df(trace_label, "rands", None) - chooser_idx = np.tile(np.arange(utilities.shape[0]), sample_size) + chooser_idx = np.repeat(np.arange(utilities.shape[0]), sample_size) chunk_sizer.log_df(trace_label, "chooser_idx", chooser_idx) probs = logit.utils_to_probs( diff --git a/activitysim/core/logit.py b/activitysim/core/logit.py index 0030168bb..5cb7774f4 100644 --- a/activitysim/core/logit.py +++ b/activitysim/core/logit.py @@ -22,7 +22,6 @@ EXP_UTIL_MIN = 1e-300 EXP_UTIL_MAX = np.inf -# TODO-EET: Figure out what type we want UTIL_MIN to be, currently np.float64 UTIL_MIN = np.log(EXP_UTIL_MIN, dtype=np.float64) UTIL_UNAVAILABLE = 1000.0 * (UTIL_MIN - 1.0) @@ -344,8 +343,21 @@ def utils_to_probs( return probs -# TODO-EET: add doc string, tracing def add_ev1_random(state: workflow.State, df: pd.DataFrame): + """ + Add iid EV1 (Gumbel) random error terms to utilities for EET choice. + + Parameters + ---------- + state : workflow.State + df : pandas.DataFrame + Utilities indexed by chooser and with alternatives as columns. 
+ + Returns + ------- + pandas.DataFrame + Utilities with EV1 errors added. + """ nest_utils_for_choice = df.copy() nest_utils_for_choice += state.get_rn_generator().gumbel_for_df( nest_utils_for_choice, n=nest_utils_for_choice.shape[1] @@ -367,11 +379,39 @@ def choose_from_tree( raise ValueError("This should never happen - no alternative found") -# TODO-EET: add doc string, tracing def make_choices_explicit_error_term_nl( - state, nested_utilities, alt_order_array, nest_spec, trace_label + state, + nested_utilities, + alt_order_array, + nest_spec, + trace_label, + trace_choosers=None, + allow_bad_utils=False, ): - """walk down the nesting tree and make choice at each level, which is the root of the next level choice.""" + """ + Walk down the nesting tree and make a choice at each level using EET. + + Parameters + ---------- + state : workflow.State + nested_utilities : pandas.DataFrame + Utilities for nest and leaf nodes. + alt_order_array : numpy.ndarray + Leaf alternatives in the original ordering. + nest_spec : dict or LogitNestSpec + Nest specification for the choice model. + trace_label : str + Trace label for logging and tracing. + + Returns + ------- + pandas.Series + Choice indices aligned to `alt_order_array`. + """ + if trace_label: + state.tracing.trace_df( + nested_utilities, tracing.extend_trace_label(trace_label, "nested_utils") + ) nest_utils_for_choice = add_ev1_random(state, nested_utilities) all_alternatives = set(nest.name for nest in each_nest(nest_spec, type="leaf")) @@ -389,8 +429,17 @@ def make_choices_explicit_error_term_nl( ), axis=1, ) - # TODO-EET: reporting like for zero probs - assert not choices.isnull().any(), f"No choice for {trace_label}" + missing_choices = choices.isnull() # TODO: should we check for infs here too? 
+ if missing_choices.any() and not allow_bad_utils: + report_bad_choices( + state, + missing_choices, + nested_utilities, + trace_label=tracing.extend_trace_label(trace_label, "bad_utils"), + msg="no alternative selected", + # raise_error=False, + trace_choosers=trace_choosers, + ) choices = pd.Series(choices, index=nest_utils_for_choice.index) # In order for choice indexing to be consistent with MNL and cumsum MC choices, we need to index in the order @@ -400,25 +449,74 @@ def make_choices_explicit_error_term_nl( return choices -# TODO-EET: add doc string, tracing -def make_choices_explicit_error_term_mnl(state, utilities, trace_label): +def make_choices_explicit_error_term_mnl( + state, utilities, trace_label, trace_choosers=None, allow_bad_utils=False +) -> pd.Series: + """ + Make EET choices for a multinomial logit model by adding EV1 errors. + + Parameters + ---------- + state : workflow.State + utilities : pandas.DataFrame + Utilities with choosers as rows and alternatives as columns. + trace_label : str + Trace label for logging and tracing. + + Returns + ------- + pandas.Series + Choice indices aligned to the utilities columns order. + """ + if trace_label: + state.tracing.trace_df( + utilities, tracing.extend_trace_label(trace_label, "utilities") + ) utilities_incl_unobs = add_ev1_random(state, utilities) + if trace_label: + state.tracing.trace_df( + utilities_incl_unobs, + tracing.extend_trace_label(trace_label, "utilities_eet"), + ) choices = np.argmax(utilities_incl_unobs.to_numpy(), axis=1) - # TODO-EET: reporting like for zero probs - assert not np.isnan(choices).any(), f"No choice for {trace_label}" + missing_choices = np.isnan(choices) # TODO: should we check for infs here too? 
+ if missing_choices.any() and not allow_bad_utils: + report_bad_choices( + state, + missing_choices, + utilities, + trace_label=tracing.extend_trace_label(trace_label, "bad_utils"), + msg="no alternative selected", + # raise_error=False, + trace_choosers=trace_choosers, + ) choices = pd.Series(choices, index=utilities_incl_unobs.index) return choices def make_choices_explicit_error_term( - state, utilities, alt_order_array, nest_spec=None, trace_label=None -): + state, + utilities, + alt_order_array, + nest_spec=None, + trace_label=None, + trace_choosers=None, + allow_bad_utils=False, +) -> pd.Series: trace_label = tracing.extend_trace_label(trace_label, "make_choices_eet") if nest_spec is None: - choices = make_choices_explicit_error_term_mnl(state, utilities, trace_label) + choices = make_choices_explicit_error_term_mnl( + state, utilities, trace_label, trace_choosers, allow_bad_utils + ) else: choices = make_choices_explicit_error_term_nl( - state, utilities, alt_order_array, nest_spec, trace_label + state, + utilities, + alt_order_array, + nest_spec, + trace_label, + trace_choosers, + allow_bad_utils, ) return choices @@ -430,17 +528,24 @@ def make_choices_utility_based( nest_spec=None, trace_label: str = None, trace_choosers=None, - allow_bad_probs=False, + allow_bad_utils=False, ) -> tuple[pd.Series, pd.Series]: trace_label = tracing.extend_trace_label(trace_label, "make_choices_utility_based") - # TODO-EET: index of choices for nested utilities is different than unnested - this needs to be consistent for - # turning indexes into alternative names to keep code changes to minimum for now + # For nested models, choices are mapped to `name_mapping` ordering inside the + # EET helper. For MNL, choices already follow the utilities column order. 
choices = make_choices_explicit_error_term( - state, utilities, name_mapping, nest_spec, trace_label + state, + utilities, + name_mapping, + nest_spec, + trace_label, + trace_choosers=trace_choosers, + allow_bad_utils=allow_bad_utils, ) - # TODO-EET: rands - log all zeros for now + # EET does not expose per-row random draws; return zeros for compatibility. rands = pd.Series(np.zeros_like(utilities.index.values), index=utilities.index) + return choices, rands diff --git a/activitysim/core/random.py b/activitysim/core/random.py index 5541fcd41..ea42b2411 100644 --- a/activitysim/core/random.py +++ b/activitysim/core/random.py @@ -9,8 +9,8 @@ import numpy as np import pandas as pd -from activitysim.core.util import reindex from activitysim.core.exceptions import DuplicateLoadableObjectError, TableIndexError +from activitysim.core.util import reindex from .tracing import print_elapsed_time @@ -445,7 +445,38 @@ def get_channel_for_df(self, df): raise TableIndexError("No channel with index name '%s'" % df.index.name) return self.channels[channel_name] - # step handling + def reset_offsets_for_step(self, step_name): + """ + Reset offsets for all channels for a step + + Parameters + ---------- + step_name : str + pipeline step name for this step + """ + + assert self.step_name == step_name + + for c in self.channels: + self.channels[c].row_states["offset"] = 0 + + def reset_offsets_for_df(self, df): + """ + Reset offsets for all choosers in df if the channel for a step + + Parameters + ---------- + step_name : str + pipeline step name for this step + df : pandas.DataFrame + df with index name and values corresponding to a registered channel + """ + channel = self.get_channel_for_df(df) + channel.row_states.loc[df.index, "offset"] = 0 + logger.info( + f"RNG: resetting random number generator offsets for channel '{channel.channel_name}' for {len(df)} rows" + + f" with index name '{df.index.name}'. 
Total lenght df: {len(channel.row_states)}" + ) def begin_step(self, step_name): """ diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index ed0b34452..6268c5174 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -9,7 +9,7 @@ from collections.abc import Callable from datetime import timedelta from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING, Any import numpy as np import pandas as pd @@ -32,7 +32,9 @@ LogitNestSpec, TemplatedLogitComponentSettings, ) -from activitysim.core.estimation import Estimator + +if TYPE_CHECKING: + from activitysim.core.estimation import Estimator from activitysim.core.fast_eval import fast_eval from activitysim.core.simulate_consts import ( ALT_LOSER_UTIL, @@ -1503,7 +1505,6 @@ def eval_nl( ) if state.settings.use_explicit_error_terms: - # TODO-EET: Nested utility zero choice probability raw_utilities = logit.validate_utils( state, raw_utilities, allow_zero_probs=True, trace_label=trace_label ) @@ -1512,21 +1513,13 @@ def eval_nl( nested_utilities = compute_nested_utilities(raw_utilities, nest_spec) chunk_sizer.log_df(trace_label, "nested_utilities", nested_utilities) - # TODO-EET: use nested_utiltites directly to compute logsums? 
if want_logsums: - # logsum of nest root - # exponentiated utilities of leaves and nests - nested_exp_utilities = compute_nested_exp_utilities( - raw_utilities, nest_spec - ) - chunk_sizer.log_df( - trace_label, "nested_exp_utilities", nested_exp_utilities - ) - logsums = pd.Series(np.log(nested_exp_utilities.root), index=choosers.index) + logsums = pd.Series(nested_utilities.root, index=choosers.index) chunk_sizer.log_df(trace_label, "logsums", logsums) - # TODO-EET: index of choices for nested utilities is different than unnested - this needs to be consistent for - # turning indexes into alternative names to keep code changes to minimum for now + # Index of choices for nested utilities is different than unnested - this needs to be consistent for + # turning indexes into alternative names to keep code changes to minimum for now. Might want to look + # into changing this in the future when revisiting nested logit EET code. name_mapping = raw_utilities.columns.values del raw_utilities diff --git a/activitysim/core/test/test_interaction_sample.py b/activitysim/core/test/test_interaction_sample.py new file mode 100644 index 000000000..623b1622f --- /dev/null +++ b/activitysim/core/test/test_interaction_sample.py @@ -0,0 +1,297 @@ +# ActivitySim +# See full license in LICENSE.txt. + +import numpy as np +import pandas as pd +import pytest + +from activitysim.core import interaction_sample, workflow + + +@pytest.fixture +def state() -> workflow.State: + state = workflow.State().default_settings() + state.settings.check_for_variability = False + return state + + +def test_interaction_sample_parity(state): + # Run interaction_sample with and without explicit error terms and check that results are similar. 
+ + num_choosers = 100_000 + num_alts = 100 + sample_size = 10 + + # Create random choosers and alternatives + rng = np.random.default_rng(42) + choosers = pd.DataFrame( + {"chooser_attr": rng.random(num_choosers)}, + index=pd.Index(range(num_choosers), name="person_id"), + ) + + alternatives = pd.DataFrame( + {"alt_attr": rng.random(num_alts)}, + index=pd.Index(range(num_alts), name="alt_id"), + ) + + # Simple spec: utility = chooser_attr * alt_attr + spec = pd.DataFrame( + {"coefficient": [1.0]}, + index=pd.Index(["chooser_attr * alt_attr"], name="Expression"), + ) + + # Run _without_ explicit error terms + state.settings.use_explicit_error_terms = False + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", choosers) + state.rng().begin_step("test_step_mnl") + + choices_mnl = interaction_sample.interaction_sample( + state, + choosers, + alternatives, + spec, + sample_size=sample_size, + alt_col_name="alt_id", + ) + + # Run _with_ explicit error terms + state.init_state() # reset the state to rerun with same seed + state.settings.use_explicit_error_terms = True + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", choosers) + state.rng().begin_step("test_step_explicit") + + choices_explicit = interaction_sample.interaction_sample( + state, + choosers, + alternatives, + spec, + sample_size=sample_size, + alt_col_name="alt_id", + ) + + assert "alt_id" in choices_mnl.columns + assert "alt_id" in choices_explicit.columns + assert not choices_mnl["alt_id"].isna().any() + assert not choices_explicit["alt_id"].isna().any() + assert choices_mnl["alt_id"].isin(alternatives.index).all() + assert choices_explicit["alt_id"].isin(alternatives.index).all() + + # In interaction_sample, choices_explicit and choices_mnl are DataFrames with sampled alternatives. + # The statistics of chosen alternatives should be similar. 
+ mnl_counts = choices_mnl["alt_id"].value_counts(normalize=True).sort_index() + explicit_counts = ( + choices_explicit["alt_id"].value_counts(normalize=True).sort_index() + ) + + # Check top choices overlap significantly or shares are close + all_alts = set(mnl_counts.index) | set(explicit_counts.index) + for alt in all_alts: + share_mnl = mnl_counts.get(alt, 0) + share_explicit = explicit_counts.get(alt, 0) + diff = abs(share_mnl - share_explicit) + assert diff < 0.01, ( + f"Large discrepancy at alt {alt}: " + f"mnl={share_mnl:.4f}, explicit={share_explicit:.4f}, diff={diff:.4f}" + ) + + +def test_interaction_sample_eet_unavailable_alternatives(state): + # Test that EET handles unavailable alternatives in sampling + num_choosers = 100 + num_alts = 10 + sample_size = 2 + choosers = pd.DataFrame( + {"chooser_attr": np.ones(num_choosers)}, + index=pd.Index(range(num_choosers), name="person_id"), + ) + + # Alt 0-4 are attractive, Alt 5-9 are "unavailable" + alternatives = pd.DataFrame( + {"alt_attr": [10.0] * 5 + [-1000.0] * 5}, + index=pd.Index(range(num_alts), name="alt_id"), + ) + + spec = pd.DataFrame( + {"coefficient": [1.0]}, + index=pd.Index(["alt_attr"], name="Expression"), + ) + + # Run with EET + state.settings.use_explicit_error_terms = True + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", choosers) + state.rng().begin_step("test_unavailable_eet") + + choices_eet = interaction_sample.interaction_sample( + state, + choosers, + alternatives, + spec, + sample_size=sample_size, + alt_col_name="alt_id", + ) + + # Sampled alternatives should only be from Alt 0-4 + assert choices_eet["alt_id"].isin([0, 1, 2, 3, 4]).all() + assert not choices_eet["alt_id"].isin([5, 6, 7, 8, 9]).any() + + +def test_interaction_sample_parity_peaked_utilities(state): + # Stress parity under a highly peaked utility profile: + # one dominant alternative, one secondary, and many tiny utilities. 
+ num_choosers = 20_000 + num_alts = 100 + sample_size = 5 + + choosers = pd.DataFrame( + {"chooser_attr": np.ones(num_choosers)}, + index=pd.Index(range(num_choosers), name="person_id"), + ) + + alt_utils = np.array([10.0, 1.0] + [0.0] * (num_alts - 2), dtype=np.float64) + alternatives = pd.DataFrame( + {"alt_attr": alt_utils}, + index=pd.Index(range(num_alts), name="alt_id"), + ) + + spec = pd.DataFrame( + {"coefficient": [1.0]}, + index=pd.Index(["alt_attr"], name="Expression"), + ) + + # Run non-EET path. + state.settings.use_explicit_error_terms = False + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", choosers) + state.rng().begin_step("test_peaked_mnl") + choices_mnl = interaction_sample.interaction_sample( + state, + choosers, + alternatives, + spec, + sample_size=sample_size, + alt_col_name="alt_id", + ) + + # Run EET path with the same seed. + state.init_state() + state.settings.use_explicit_error_terms = True + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", choosers) + state.rng().begin_step("test_peaked_explicit") + choices_explicit = interaction_sample.interaction_sample( + state, + choosers, + alternatives, + spec, + sample_size=sample_size, + alt_col_name="alt_id", + ) + + def weighted_shares(df: pd.DataFrame) -> pd.Series: + counts = df.groupby("alt_id")["pick_count"].sum() + return (counts / counts.sum()).sort_index() + + mnl_shares = weighted_shares(choices_mnl) + explicit_shares = weighted_shares(choices_explicit) + + all_alts = set(mnl_shares.index) | set(explicit_shares.index) + for alt in all_alts: + diff = abs(mnl_shares.get(alt, 0.0) - explicit_shares.get(alt, 0.0)) + assert diff < 0.005, ( + f"Peaked utility parity mismatch at alt {alt}: " + f"mnl={mnl_shares.get(alt, 0.0):.6f}, " + f"explicit={explicit_shares.get(alt, 0.0):.6f}, diff={diff:.6f}" + ) + + # The dominant alternative should absorb almost all mass in both paths. 
+ assert mnl_shares.get(0, 0.0) > 0.99 + assert explicit_shares.get(0, 0.0) > 0.99 + + +class _DummyChunkSizer: + def log_df(self, *_args, **_kwargs): + return None + + +class _DummyState: + def __init__(self, rng): + self._rng = rng + + def get_rn_generator(self): + return self._rng + + +class _DummyRngUtilityBased: + def __init__(self, rands_3d): + self.rands_3d = rands_3d + + def gumbel_for_df(self, _utilities, n): + assert n == self.rands_3d.shape[1] * self.rands_3d.shape[2] + return self.rands_3d.reshape(-1) + + +def test_make_sample_choices_utility_based_repeat_alignment_chooser_dominant_heterogeneity(): + # Edge case: utilities are close across alternatives but vary strongly by chooser. + # This is where wrong chooser/sample alignment can hide in aggregate checks. + chooser_index = pd.Index([101, 102, 103, 104, 105, 106], name="person_id") + choosers = pd.DataFrame(index=chooser_index) + alternatives = pd.DataFrame(index=pd.Index([0, 1, 2, 3], name="alt_id")) + + n_choosers = len(choosers) + n_alts = len(alternatives) + sample_size = 3 + + # Very small alternative differences... + alt_signal = np.array([0.00, 0.01, 0.02, 0.03], dtype=np.float64) + # ...but very large chooser sensitivity differences. + chooser_scale = np.array([-500.0, -200.0, -50.0, 50.0, 200.0, 500.0]) + + utilities = pd.DataFrame( + chooser_scale[:, np.newaxis] * alt_signal[np.newaxis, :], + index=chooser_index, + ) + + # No random noise: chosen alternative is deterministic argmax of utilities. 
+ rands_3d = np.zeros((n_choosers, n_alts, sample_size), dtype=np.float64) + state = _DummyState(_DummyRngUtilityBased(rands_3d)) + + out = interaction_sample.make_sample_choices_utility_based( + state=state, + choosers=choosers, + utilities=utilities, + alternatives=alternatives, + sample_size=sample_size, + alternative_count=n_alts, + alt_col_name="alt_id", + allow_zero_probs=False, + trace_label="test_repeat_alignment_chooser_heterogeneity", + chunk_sizer=_DummyChunkSizer(), + ) + + # Reconstruct expected indexing behavior. + chosen_2d = np.argmax( + rands_3d + utilities.to_numpy()[:, :, np.newaxis], + axis=1, + ) + chosen_flat = chosen_2d.reshape(-1) + + chooser_repeat = np.repeat(np.arange(n_choosers), sample_size) + chooser_tile = np.tile(np.arange(n_choosers), sample_size) + + probs = interaction_sample.logit.utils_to_probs( + state, + utilities, + allow_zero_probs=False, + trace_label="test_repeat_alignment_chooser_heterogeneity", + overflow_protection=True, + trace_choosers=choosers, + ).to_numpy() + + expected_prob_repeat = probs[chooser_repeat, chosen_flat] + wrong_prob_tile = probs[chooser_tile, chosen_flat] + + assert np.array_equal(out["prob"].to_numpy(), expected_prob_repeat) + assert not np.array_equal(out["prob"].to_numpy(), wrong_prob_tile) diff --git a/activitysim/core/test/test_interaction_sample_simulate.py b/activitysim/core/test/test_interaction_sample_simulate.py new file mode 100644 index 000000000..1be795417 --- /dev/null +++ b/activitysim/core/test/test_interaction_sample_simulate.py @@ -0,0 +1,151 @@ +# ActivitySim +# See full license in LICENSE.txt. 
+ +import numpy as np +import pandas as pd +import pytest + +from activitysim.core import interaction_sample_simulate, workflow + + +@pytest.fixture +def state() -> workflow.State: + state = workflow.State().default_settings() + state.settings.check_for_variability = False + return state + + +def test_interaction_sample_simulate_parity(state): + # Run interaction_sample_simulate with and without explicit error terms and check that results are similar. + + num_choosers = 100_000 + num_alts_per_chooser = 5 # small sample size to keep things simple + + # Create random choosers + rng = np.random.default_rng(42) + choosers = pd.DataFrame( + {"chooser_attr": rng.random(num_choosers)}, + index=pd.Index(range(num_choosers), name="person_id"), + ) + + # Create random alternatives for each chooser + # In interaction_sample_simulate, alternatives is typically a DataFrame with the same index as choosers + # but repeated for each alternative in the sample. + alt_ids = np.tile(np.arange(num_alts_per_chooser), num_choosers) + alternatives = pd.DataFrame( + { + "alt_attr": rng.random(num_choosers * num_alts_per_chooser), + "alt_id": alt_ids, + "tdd": alt_ids, + }, + index=np.repeat(choosers.index, num_alts_per_chooser), + ) + alternatives.index.name = "person_id" + + # Simple spec: utility = chooser_attr * alt_attr + spec = pd.DataFrame( + {"coefficient": [1.0]}, + index=pd.Index(["chooser_attr * alt_attr"], name="Expression"), + ) + + # Run _without_ explicit error terms + state.settings.use_explicit_error_terms = False + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", choosers) + state.rng().begin_step("test_step_mnl") + + choices_mnl = interaction_sample_simulate.interaction_sample_simulate( + state, + choosers, + alternatives, + spec, + choice_column="tdd", + ) + + # Run _with_ explicit error terms + state.init_state() + state.settings.use_explicit_error_terms = True + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", choosers) + 
state.rng().begin_step("test_step_explicit") + + choices_explicit = interaction_sample_simulate.interaction_sample_simulate( + state, + choosers, + alternatives, + spec, + choice_column="tdd", + ) + + assert len(choices_mnl) == num_choosers + assert len(choices_explicit) == num_choosers + assert choices_mnl.index.equals(choosers.index) + assert choices_explicit.index.equals(choosers.index) + assert not choices_mnl.isna().any() + assert not choices_explicit.isna().any() + + # choices are series with the same index as choosers and containing the choice (from choice_column) + mnl_counts = choices_mnl.value_counts(normalize=True).sort_index() + explicit_counts = choices_explicit.value_counts(normalize=True).sort_index() + + for alt in range(num_alts_per_chooser): + share_mnl = mnl_counts.get(alt, 0) + share_explicit = explicit_counts.get(alt, 0) + diff = abs(share_mnl - share_explicit) + assert diff < 0.01, ( + f"Large discrepancy at alt {alt}: " + f"mnl={share_mnl:.4f}, explicit={share_explicit:.4f}, diff={diff:.4f}" + ) + + +def test_interaction_sample_simulate_eet_unavailable_alternatives(state): + # Test that EET handles unavailable alternatives in sample simulation + + num_choosers = 10 + num_alts_per_chooser = 5 + + choosers = pd.DataFrame( + {"chooser_attr": np.ones(num_choosers)}, + index=pd.Index(range(num_choosers), name="person_id"), + ) + + # For each chooser, 2 attractive alts, 3 unavailable + alt_attrs = [10.0, 10.0, -1000.0, -1000.0, -1000.0] * num_choosers + alt_ids = [0, 1, 2, 3, 4] * num_choosers + + alternatives = pd.DataFrame( + { + "alt_attr": alt_attrs, + "alt_id": alt_ids, + "tdd": alt_ids, + }, + index=np.repeat(choosers.index, num_alts_per_chooser), + ) + alternatives.index.name = "person_id" + + spec = pd.DataFrame( + {"coefficient": [1.0]}, + index=pd.Index(["alt_attr"], name="Expression"), + ) + + # Run with EET + state.settings.use_explicit_error_terms = True + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", choosers) + 
state.rng().begin_step("test_unavailable_eet") + + choices_eet = interaction_sample_simulate.interaction_sample_simulate( + state, + choosers, + alternatives, + spec, + choice_column="tdd", + ) + + assert len(choices_eet) == num_choosers + assert choices_eet.index.equals(choosers.index) + assert not choices_eet.isna().any() + + # Choices should only be 0 or 1 + assert choices_eet.isin([0, 1]).all() + assert not choices_eet.isin([2, 3, 4]).any() diff --git a/activitysim/core/test/test_interaction_simulate.py b/activitysim/core/test/test_interaction_simulate.py new file mode 100644 index 000000000..af9442e22 --- /dev/null +++ b/activitysim/core/test/test_interaction_simulate.py @@ -0,0 +1,174 @@ +# ActivitySim +# See full license in LICENSE.txt. + +import numpy as np +import pandas as pd +import pytest + +from activitysim.core import interaction_simulate, workflow + + +@pytest.fixture +def state() -> workflow.State: + state = workflow.State().default_settings() + state.settings.check_for_variability = False + return state + + +def test_interaction_simulate_explicit_error_terms_parity(state): + # Run interaction_simulate with and without explicit error terms and check that results are similar. + + # Keep this large enough for stable parity checks without overloading CI. 
+ num_choosers = 100_000 + num_alts = 5 + sample_size = num_alts + + # Create random choosers and alternatives + rng = np.random.default_rng(42) + choosers = pd.DataFrame( + {"chooser_attr": rng.random(num_choosers)}, + index=pd.Index(range(num_choosers), name="person_id"), + ) + + alternatives = pd.DataFrame( + {"alt_attr": rng.random(num_alts)}, + index=pd.Index(range(num_alts), name="alt_id"), + ) + + spec = pd.DataFrame( + {"coefficient": [1.0]}, + index=pd.Index(["chooser_attr * alt_attr"], name="Expression"), + ) + + # Run _without_ explicit error terms + state.settings.use_explicit_error_terms = False + state.rng().set_base_seed(42) # Set seed BEFORE adding channels or steps + state.rng().add_channel("person_id", choosers) + state.rng().begin_step("test_step_mnl") + + choices_mnl = interaction_simulate.interaction_simulate( + state, + choosers, + alternatives, + spec, + sample_size=sample_size, + ) + + # Run _with_ explicit error terms + state.init_state() # reset the state to rerun with same seed + state.settings.use_explicit_error_terms = True + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", choosers) + state.rng().begin_step("test_step_explicit") + + choices_explicit = interaction_simulate.interaction_simulate( + state, + choosers, + alternatives, + spec, + sample_size=sample_size, + ) + + assert len(choices_mnl) == num_choosers + assert len(choices_explicit) == num_choosers + assert choices_mnl.index.equals(choosers.index) + assert choices_explicit.index.equals(choosers.index) + assert not choices_mnl.isna().any() + assert not choices_explicit.isna().any() + + mnl_counts = choices_mnl.value_counts(normalize=True).sort_index() + explicit_counts = choices_explicit.value_counts(normalize=True).sort_index() + + # Check that they are close, relative to the number of draws + assert np.allclose( + mnl_counts.to_numpy(), explicit_counts.to_numpy(), atol=0.01, rtol=0.001 + ) + + +def 
test_interaction_simulate_eet_unavailable_alternatives(state): + # Test that EET handles unavailable alternatives (very low utilities) + # similarly to MNL (zero probabilities). + + num_choosers = 100 + num_alts = 5 + + choosers = pd.DataFrame( + {"chooser_attr": np.ones(num_choosers)}, + index=pd.Index(range(num_choosers), name="person_id"), + ) + + # Alt 0 and 1 are attractive, Alt 2, 3, 4 are "unavailable" (very low utility) + alternatives = pd.DataFrame( + {"alt_attr": [10.0, 10.0, -1000.0, -1000.0, -1000.0]}, + index=pd.Index(range(num_alts), name="alt_id"), + ) + + spec = pd.DataFrame( + {"coefficient": [1.0]}, + index=pd.Index(["alt_attr"], name="Expression"), + ) + + # Run with EET + state.settings.use_explicit_error_terms = True + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", choosers) + state.rng().begin_step("test_unavailable_eet") + + choices_eet = interaction_simulate.interaction_simulate( + state, + choosers, + alternatives, + spec, + sample_size=num_alts, + ) + + assert len(choices_eet) == num_choosers + assert choices_eet.index.equals(choosers.index) + assert not choices_eet.isna().any() + + # Choices should only be from Alt 0 or 1 + assert choices_eet.isin( + [0, 1] + ).all(), f"EET picked an 'unavailable' alternative: {choices_eet[~choices_eet.isin([0, 1])]}" + + +def test_interaction_simulate_eet_large_utilities(state): + # Test that EET handles very large utilities without overflow issues + # that might occur in exp(util) calculations in standard MNL. 
+ + num_choosers = 10 + num_alts = 2 + + choosers = pd.DataFrame( + {"chooser_attr": np.ones(num_choosers)}, + index=pd.Index(range(num_choosers), name="person_id"), + ) + + # Standard MNL might struggle with exp(700) or exp(800) depending on float precision + alternatives = pd.DataFrame( + {"alt_attr": [700.0, 800.0]}, + index=pd.Index(range(num_alts), name="alt_id"), + ) + + spec = pd.DataFrame( + {"coefficient": [1.0]}, + index=pd.Index(["alt_attr"], name="Expression"), + ) + + state.settings.use_explicit_error_terms = True + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", choosers) + state.rng().begin_step("test_large_utils_eet") + + # This should run without crashing or returning NaNs + choices_eet = interaction_simulate.interaction_simulate( + state, + choosers, + alternatives, + spec, + sample_size=num_alts, + ) + + assert not choices_eet.isna().any() + # With such a large difference, Alt 1 should be the dominant choice + assert (choices_eet == 1).all() diff --git a/activitysim/core/test/test_logit.py b/activitysim/core/test/test_logit.py index c82606981..e381cd85a 100644 --- a/activitysim/core/test/test_logit.py +++ b/activitysim/core/test/test_logit.py @@ -9,7 +9,8 @@ import pandas.testing as pdt import pytest -from activitysim.core import logit, workflow +from activitysim.core import logit, simulate, workflow +from activitysim.core.exceptions import InvalidTravelError from activitysim.core.simulate import eval_variables @@ -70,7 +71,122 @@ def utilities(choosers, spec, test_data): ) -# TODO-EET: Add tests here! 
+@pytest.fixture(scope="module") +def interaction_choosers(): + return pd.DataFrame({"attr": ["a", "b", "c", "b"]}, index=["w", "x", "y", "z"]) + + +@pytest.fixture(scope="module") +def interaction_alts(): + return pd.DataFrame({"prop": [10, 20, 30, 40]}, index=[1, 2, 3, 4]) + + +# +# Utility Validation Tests +# +def test_validate_utils_replaces_unavailable_values(): + state = workflow.State().default_settings() + utils = pd.DataFrame([[0.0, logit.UTIL_MIN - 1.0], [1.0, 2.0]]) + + validated = logit.validate_utils(state, utils, allow_zero_probs=False) + + assert validated.iloc[0, 0] == pytest.approx(0.0) + assert validated.iloc[0, 1] == pytest.approx(logit.UTIL_UNAVAILABLE) + assert validated.iloc[1, 0] == pytest.approx(1.0) + assert validated.iloc[1, 1] == pytest.approx(2.0) + + +def test_validate_utils_raises_when_all_unavailable(): + state = workflow.State().default_settings() + utils = pd.DataFrame([[logit.UTIL_MIN - 1.0, logit.UTIL_MIN - 2.0]]) + + with pytest.raises(InvalidTravelError) as excinfo: + logit.validate_utils(state, utils, allow_zero_probs=False) + + assert "all probabilities are zero" in str(excinfo.value) + + +def test_validate_utils_allows_zero_probs(): + state = workflow.State().default_settings() + utils = pd.DataFrame([[0.5, logit.UTIL_MIN - 1.0]]) + + validated = logit.validate_utils(state, utils, allow_zero_probs=True) + + assert validated.iloc[0, 0] == 0.5 + assert validated.iloc[0, 1] == logit.UTIL_UNAVAILABLE + + +# +# `utils_to_probs` Tests +# +def test_utils_to_probs_logsums_with_overflow_protection(): + state = workflow.State().default_settings() + utils = pd.DataFrame( + [[1000.0, 1001.0, 999.0], [-1000.0, -1001.0, -999.0]], + columns=["a", "b", "c"], + ) + original_utils = utils.copy() + + probs, logsums = logit.utils_to_probs( + state, + utils, + trace_label=None, + overflow_protection=True, + return_logsums=True, + ) + + utils_np = original_utils.to_numpy() + row_max = utils_np.max(axis=1, keepdims=True) + exp_shifted = 
np.exp(utils_np - row_max) + expected_probs = exp_shifted / exp_shifted.sum(axis=1, keepdims=True) + expected_logsums = pd.Series( + np.log(exp_shifted.sum(axis=1)) + row_max.squeeze(), + index=utils.index, + ) + + pdt.assert_frame_equal( + probs, + pd.DataFrame(expected_probs, index=utils.index, columns=utils.columns), + rtol=1.0e-7, + atol=0.0, + ) + pdt.assert_series_equal(logsums, expected_logsums, rtol=1.0e-7, atol=0.0) + + +def test_utils_to_probs_warns_on_zero_probs_overflow(): + state = workflow.State().default_settings() + utils = pd.DataFrame( + [[logit.UTIL_MIN - 1.0, logit.UTIL_MIN - 2.0], [0.0, 0.0]], + columns=["a", "b"], + ) + + with pytest.warns(UserWarning, match="cannot set overflow_protection"): + probs = logit.utils_to_probs( + state, + utils, + trace_label=None, + allow_zero_probs=True, + overflow_protection=True, + ) + + assert (probs.iloc[0] == 0.0).all() + assert probs.iloc[1].sum() == pytest.approx(1.0) + assert probs.iloc[1].iloc[0] == pytest.approx(0.5) + assert probs.iloc[1].iloc[1] == pytest.approx(0.5) + + +def test_utils_to_probs_raises_on_float32_zero_probs_overflow(): + state = workflow.State().default_settings() + utils = pd.DataFrame(np.array([[90.0, 0.0]], dtype=np.float32)) + + with pytest.raises(ValueError, match="cannot prevent expected overflow"): + logit.utils_to_probs( + state, + utils, + trace_label=None, + allow_zero_probs=True, + overflow_protection=True, + ) def test_utils_to_probs(utilities, test_data): @@ -119,6 +235,9 @@ def test_utils_to_probs_raises(): assert np.asarray(z).ravel() == pytest.approx(np.asarray([0.0, 0.0, 1.0, 0.0])) +# +# `make_choices` Tests +# def test_make_choices_only_one(): state = workflow.State().default_settings() probs = pd.DataFrame( @@ -143,16 +262,423 @@ def test_make_choices_real_probs(utilities): ) -@pytest.fixture(scope="module") -def interaction_choosers(): - return pd.DataFrame({"attr": ["a", "b", "c", "b"]}, index=["w", "x", "y", "z"]) +def test_different_order_make_choices(): + # 
check if, when we shuffle utilities, make_choices chooses the same alternatives + state = workflow.State().default_settings() + # increase number of choosers and alternatives for realism + n_choosers = 100 + n_alts = 50 + data = np.random.rand(n_choosers, n_alts) + chooser_ids = np.arange(n_choosers) + alt_ids = [f"alt_{i}" for i in range(n_alts)] + + utilities = pd.DataFrame( + data, + index=pd.Index(chooser_ids, name="chooser_id"), + columns=alt_ids, + ) -@pytest.fixture(scope="module") -def interaction_alts(): - return pd.DataFrame({"prop": [10, 20, 30, 40]}, index=[1, 2, 3, 4]) + # We need a stable RNG that gives the same random numbers for the same chooser_id + # regardless of row order. ActivitySim's random.Random does this. + state.get_rn_generator().add_channel("chooser_id", utilities) + state.get_rn_generator().begin_step("test_step") + + probs = logit.utils_to_probs(state, utilities, trace_label=None) + choices, rands = logit.make_choices(state, probs) + + # shuffle utilities (rows) and make_choices again + # We must reset the step offset so the RNG produces the same sequence for the same IDs + state.get_rn_generator().end_step("test_step") + state.get_rn_generator().begin_step("test_step") + utilities_shuffled = utilities.sample(frac=1, random_state=42) + probs_shuffled = logit.utils_to_probs(state, utilities_shuffled, trace_label=None) + choices_shuffled, rands_shuffled = logit.make_choices(state, probs_shuffled) + + # sorting both to ensure comparison is on the same index order + pdt.assert_series_equal( + choices.sort_index(), choices_shuffled.sort_index(), check_dtype=False + ) + + +def test_make_choices_matches_random_draws(): + class DummyRNG: + def random_for_df(self, df, n=1): + assert n == 1 + return np.array([[0.05], [0.6], [0.95]]) + + class DummyState: + @staticmethod + def get_rn_generator(): + return DummyRNG() + + state = DummyState() + probs = pd.DataFrame( + [[0.1, 0.2, 0.7], [0.4, 0.4, 0.2], [0.05, 0.9, 0.05]], + index=["a", "b", "c"], 
+ columns=["x", "y", "z"], + ) + choices, rands = logit.make_choices(state, probs) + + expected_rands = np.array([0.05, 0.6, 0.95]) + expected_choices = np.array([0, 1, 1]) + + pdt.assert_series_equal( + rands, + pd.Series(expected_rands, index=probs.index), + check_names=False, + ) + pdt.assert_series_equal( + choices, + pd.Series(expected_choices, index=probs.index), + check_dtype=False, + ) +# +# EV1 Random Tests +# +def test_add_ev1_random(): + class DummyRNG: + def gumbel_for_df(self, df, n): + # Deterministic, non-constant draws make it easy to verify + # correct per-row/per-column addition behavior. + row_component = df.index.to_numpy(dtype=float).reshape(-1, 1) / 10.0 + col_component = np.arange(n, dtype=float).reshape(1, -1) + return row_component + col_component + + rng = DummyRNG() + + class DummyState: + @staticmethod + def get_rn_generator(): + return rng + + utilities = pd.DataFrame( + [[1.0, 2.0], [3.0, 4.0]], + index=[10, 11], + columns=["a", "b"], + ) + + randomized = logit.add_ev1_random(DummyState(), utilities) + + expected = pd.DataFrame( + [[2.0, 4.0], [4.1, 6.1]], + index=[10, 11], + columns=["a", "b"], + ) + + # check that the random component was added correctly, and that the original utilities were not mutated + pdt.assert_frame_equal(randomized, expected) + pdt.assert_index_equal(randomized.index, utilities.index) + pdt.assert_index_equal(randomized.columns, utilities.columns) + pdt.assert_frame_equal( + utilities, + pd.DataFrame( + [[1.0, 2.0], [3.0, 4.0]], + index=[10, 11], + columns=["a", "b"], + ), + ) + + +# +# Nested Logit Structure Tests +# +def test_group_nest_names_by_level(): + nest_spec = { + "name": "root", + "coefficient": 1.0, + "alternatives": [ + {"name": "motorized", "coefficient": 0.7, "alternatives": ["car", "bus"]}, + "walk", + ], + } + + grouped = logit.group_nest_names_by_level(nest_spec) + + assert grouped == {1: ["root"], 2: ["motorized", "walk"], 3: ["car", "bus"]} + + +def test_choose_from_tree_selects_leaf(): + 
nest_utils = pd.Series( + { + "motorized": 2.0, + "walk": 1.0, + "car": 5.0, + "bus": 3.0, + } + ) + all_alternatives = {"walk", "car", "bus"} + logit_nest_groups = {1: ["root"], 2: ["motorized", "walk"], 3: ["car", "bus"]} + nest_alternatives_by_name = { + "root": ["motorized", "walk"], + "motorized": ["car", "bus"], + } + + choice = logit.choose_from_tree( + nest_utils, all_alternatives, logit_nest_groups, nest_alternatives_by_name + ) + + assert choice == "car" + + +def test_choose_from_tree_raises_on_missing_leaf(): + nest_utils = pd.Series({"motorized": 2.0, "walk": 1.0}) + all_alternatives = {"car", "bus"} + logit_nest_groups = {1: ["root"], 2: ["motorized", "walk"]} + nest_alternatives_by_name = { + "root": ["motorized", "walk"], + "motorized": ["car", "bus"], + } + + with pytest.raises(ValueError, match="no alternative found"): + logit.choose_from_tree( + nest_utils, all_alternatives, logit_nest_groups, nest_alternatives_by_name + ) + + +# +# EET Choice Behavior Tests +# +def test_make_choices_eet_mnl(monkeypatch): + def fake_add_ev1_random(_state, _df): + return pd.DataFrame( + [[1.0, 3.0], [4.0, 2.0]], + index=[100, 101], + columns=["a", "b"], + ) + + monkeypatch.setattr(logit, "add_ev1_random", fake_add_ev1_random) + + choices = logit.make_choices_explicit_error_term_mnl( + workflow.State().default_settings(), + pd.DataFrame([[0.0, 0.0], [0.0, 0.0]], index=[100, 101], columns=["a", "b"]), + trace_label=None, + ) + + pdt.assert_series_equal(choices, pd.Series([1, 0], index=[100, 101])) + + +def test_make_choices_eet_nl(monkeypatch): + def fake_add_ev1_random(_state, _df): + return pd.DataFrame( + [[5.0, 1.0, 4.0, 2.0], [3.0, 4.0, 1.0, 2.0]], + index=[10, 11], + columns=["motorized", "walk", "car", "bus"], + ) + + monkeypatch.setattr(logit, "add_ev1_random", fake_add_ev1_random) + + nest_spec = { + "name": "root", + "coefficient": 1.0, + "alternatives": [ + {"name": "motorized", "coefficient": 0.7, "alternatives": ["car", "bus"]}, + "walk", + ], + } + 
alt_order_array = np.array(["walk", "car", "bus"]) + + choices = logit.make_choices_explicit_error_term_nl( + workflow.State().default_settings(), + pd.DataFrame( + [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]], + index=[10, 11], + columns=["motorized", "walk", "car", "bus"], + ), + alt_order_array, + nest_spec, + trace_label=None, + ) + + pdt.assert_series_equal(choices, pd.Series([1, 0], index=[10, 11])) + + +def test_make_choices_utility_based_sets_zero_rands(monkeypatch): + def fake_add_ev1_random(_state, df): + return pd.DataFrame( + [[2.0, 1.0], [0.5, 2.5]], + index=df.index, + columns=df.columns, + ) + + monkeypatch.setattr(logit, "add_ev1_random", fake_add_ev1_random) + + utilities = pd.DataFrame([[3.0, 2.0], [1.0, 4.0]], index=[11, 12]) + choices, rands = logit.make_choices_utility_based( + workflow.State().default_settings(), + utilities, + name_mapping=np.array(["a", "b"]), + nest_spec=None, + trace_label=None, + ) + + expected_choices = pd.Series([0, 1], index=[11, 12]) + pdt.assert_series_equal(choices, expected_choices) + pdt.assert_series_equal(rands, pd.Series([0, 0], index=[11, 12])) + + +# +# EET vs non-EET Choice Behavior Tests +# +def test_make_choices_vs_eet_same_distribution(): + """With many draws, make_choices (probability-based) and + make_choices_explicit_error_term_mnl (EET) should produce roughly the + same empirical choice-frequency distribution for the same utilities.""" + n_draws = 1_000_000 + a_tol = 0.001 + r_tol = 0.01 + utils_values = [5.0, 6.0, 7.0, 8.0, 9.0] + n_alts = len(utils_values) + columns = ["a", "b", "c", "d", "e"] + + utils = pd.DataFrame([utils_values] * n_draws, columns=columns) + + # Probability-based (Monte Carlo) path — independent RNG + mc_rng = np.random.default_rng(42) + + class MCDummyRNG: + def random_for_df(self, df, n=1): + return mc_rng.random((len(df), n)) + + class MCDummyState: + @staticmethod + def get_rn_generator(): + return MCDummyRNG() + + probs = logit.utils_to_probs( + MCDummyState(), utils, 
trace_label=None, overflow_protection=True + ) + choices_mc, _ = logit.make_choices(MCDummyState(), probs, trace_label=None) + + # Explicit-error-term (EET) path — independent RNG + eet_rng = np.random.default_rng(123) + + class EETDummyRNG: + def gumbel_for_df(self, df, n): + return eet_rng.gumbel(size=(len(df), n)) + + class EETDummyState: + @staticmethod + def get_rn_generator(): + return EETDummyRNG() + + choices_eet = logit.make_choices_explicit_error_term_mnl( + EETDummyState(), utils, trace_label=None + ) + + mc_fracs = np.bincount(choices_mc.values.astype(int), minlength=n_alts) / n_draws + eet_fracs = np.bincount(choices_eet.values.astype(int), minlength=n_alts) / n_draws + + np.testing.assert_allclose(mc_fracs, eet_fracs, atol=a_tol, rtol=r_tol) + np.testing.assert_allclose( + mc_fracs, probs.iloc[0].to_numpy(), atol=a_tol, rtol=r_tol + ) + np.testing.assert_allclose( + eet_fracs, probs.iloc[0].to_numpy(), atol=a_tol, rtol=r_tol + ) + + +def test_make_choices_vs_eet_nl_same_distribution(): + """With many draws, nested logit choices via probabilities and + nested logit choices via EET should produce the same empirical distribution.""" + n_draws = 100_000 + a_tol = 0.01 + + nest_spec = { + "name": "root", + "coefficient": 1.0, + "alternatives": [ + {"name": "motorized", "coefficient": 0.5, "alternatives": ["car", "bus"]}, + "walk", + ], + } + # Utilities for car, bus, walk + # For NL, we need utilities for all nodes in the tree for EET, + # but for probability-based choice we usually use the flattened/logsummed probabilities. + # To compare them fairly, we use the same base utilities. + # car=0.5, bus=0.2, walk=0.4 + utils_df = pd.DataFrame( + [[0.5, 0.2, 0.4, 0.0, 0.0]], + columns=["car", "bus", "walk", "motorized", "root"], + ) + utils_df = pd.concat([utils_df] * n_draws, ignore_index=True) + alt_order_array = np.array(["car", "bus", "walk"]) + + # 1. 
Probability-based Nested Logit choices + mc_rng = np.random.default_rng(42) + + class MCDummyRNG: + def random_for_df(self, df, n=1): + return mc_rng.random((len(df), n)) + + class MCDummyState: + @staticmethod + def get_rn_generator(): + return MCDummyRNG() + + def default_settings(self): + return self + + # Compute probabilities for NL using simulation logic + nested_exp_utilities = simulate.compute_nested_exp_utilities( + utils_df[["car", "bus", "walk"]], nest_spec + ) + nested_probabilities = simulate.compute_nested_probabilities( + MCDummyState(), nested_exp_utilities, nest_spec, trace_label=None + ) + probs = simulate.compute_base_probabilities( + nested_probabilities, nest_spec, utils_df[["car", "bus", "walk"]] + ) + choices_mc, _ = logit.make_choices(MCDummyState(), probs, trace_label=None) + + # 2. EET-based Nested Logit choices + eet_rng = np.random.default_rng(123) + + class EETDummyRNG: + def gumbel_for_df(self, df, n): + return eet_rng.gumbel(size=(len(df), n)) + + class EETDummyState: + @staticmethod + def get_rn_generator(): + return EETDummyRNG() + + def default_settings(self): + return self + + @property + def tracing(self): + import activitysim.core.tracing as tracing + + return tracing + + # For EET NL, we provide the utilities for all nodes. + # compute_nested_utilities handles the division by nesting coefficients for leaves + # and the logsum * coefficient for internal nodes. 
+ nested_utilities = simulate.compute_nested_utilities( + utils_df[["car", "bus", "walk"]], nest_spec + ) + + choices_eet = logit.make_choices_explicit_error_term_nl( + EETDummyState(), + nested_utilities, + alt_order_array, + nest_spec, + trace_label=None, + ) + + mc_fracs = np.bincount(choices_mc.values.astype(int), minlength=3) / n_draws + eet_fracs = np.bincount(choices_eet.values.astype(int), minlength=3) / n_draws + + # They should be close + np.testing.assert_allclose(mc_fracs, eet_fracs, atol=a_tol) + + +# +# Interaction Dataset Tests +# def test_interaction_dataset_no_sample(interaction_choosers, interaction_alts): expected = pd.DataFrame( { @@ -167,9 +693,6 @@ def test_interaction_dataset_no_sample(interaction_choosers, interaction_alts): ) interacted, expected = interacted.align(expected, axis=1) - - print("interacted\n", interacted) - print("expected\n", expected) pdt.assert_frame_equal(interacted, expected) diff --git a/activitysim/core/test/test_simulate.py b/activitysim/core/test/test_simulate.py index 17d4ba2cd..21e0f90e7 100644 --- a/activitysim/core/test/test_simulate.py +++ b/activitysim/core/test/test_simulate.py @@ -10,7 +10,7 @@ import pandas.testing as pdt import pytest -from activitysim.core import simulate, workflow +from activitysim.core import chunk, simulate, workflow @pytest.fixture @@ -42,6 +42,19 @@ def data(data_dir): return pd.read_csv(os.path.join(data_dir, "data.csv")) +@pytest.fixture +def nest_spec(): + nest_spec = { + "name": "root", + "coefficient": 1.0, + "alternatives": [ + {"name": "alt0", "coefficient": 0.5, "alternatives": ["alt0.0", "alt0.1"]}, + "alt1", + ], + } + return nest_spec + + def test_read_model_spec(state, spec_name): spec = state.filesystem.read_model_spec(file_name=spec_name) @@ -88,3 +101,234 @@ def test_simple_simulate_chunked(state, data, spec): ) expected = pd.Series([1, 1, 1], index=data.index) pdt.assert_series_equal(choices, expected, check_dtype=False) + + +def test_eval_mnl_eet(state): + # Check 
that the same counts are returned by eval_mnl when using EET and when not. + + num_choosers = 100_000 + + np.random.seed(42) + data2 = pd.DataFrame( + { + "chooser_attr": np.random.rand(num_choosers), + }, + index=pd.Index(range(num_choosers), name="person_id"), + ) + + spec2 = pd.DataFrame( + {"alt0": [1.0], "alt1": [2.0]}, + index=pd.Index(["chooser_attr"], name="Expression"), + ) + + # Set up a state with EET enabled + state.settings.use_explicit_error_terms = True + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", data2) + state.rng().begin_step("test_step_mnl") + + chunk_sizer = chunk.ChunkSizer(state, "", "", num_choosers) + + # run eval_mnl with EET enabled + choices_eet = simulate.eval_mnl( + state=state, + choosers=data2, + spec=spec2, + locals_d=None, + custom_chooser=None, + estimator=None, + chunk_sizer=chunk_sizer, + ) + + # Reset the state, without EET enabled + state.settings.use_explicit_error_terms = False + + state.rng().end_step("test_step_mnl") + state.rng().begin_step("test_step_mnl") + + choices_mnl = simulate.eval_mnl( + state=state, + choosers=data2, + spec=spec2, + locals_d=None, + custom_chooser=None, + estimator=None, + chunk_sizer=chunk_sizer, + ) + + # Compare counts + mnl_counts = choices_mnl.value_counts(normalize=True) + explicit_counts = choices_eet.value_counts(normalize=True) + assert np.allclose(mnl_counts, explicit_counts, atol=0.01) + + +def test_eval_nl_eet(state, nest_spec): + # Check that the same counts are returned by eval_nl when using EET and when not. 
+ + num_choosers = 100_000 + + np.random.seed(42) + data2 = pd.DataFrame( + { + "chooser_attr": np.random.rand(num_choosers), + }, + index=pd.Index(range(num_choosers), name="person_id"), + ) + + spec2 = pd.DataFrame( + {"alt1": [2.0], "alt0.0": [0.5], "alt0.1": [0.2]}, + index=pd.Index(["chooser_attr"], name="Expression"), + ) + + # Set up a state with EET enabled + state.settings.use_explicit_error_terms = True + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", data2) + state.rng().begin_step("test_step_mnl") + + chunk_sizer = chunk.ChunkSizer(state, "", "", num_choosers) + + # run eval_nl with EET enabled + choices_eet = simulate.eval_nl( + state=state, + choosers=data2, + spec=spec2, + nest_spec=nest_spec, + locals_d={}, + custom_chooser=None, + estimator=None, + trace_label="test", + chunk_sizer=chunk_sizer, + ) + + # Reset the state, without EET enabled + state.settings.use_explicit_error_terms = False + + state.rng().end_step("test_step_mnl") + state.rng().begin_step("test_step_mnl") + + choices_mnl = simulate.eval_nl( + state=state, + choosers=data2, + spec=spec2, + nest_spec=nest_spec, + locals_d={}, + custom_chooser=None, + trace_label="test", + estimator=None, + chunk_sizer=chunk_sizer, + ) + + # Compare counts + mnl_counts = choices_mnl.value_counts(normalize=True) + explicit_counts = choices_eet.value_counts(normalize=True) + assert np.allclose(mnl_counts, explicit_counts, atol=0.01) + + +def test_compute_nested_utilities(nest_spec): + # computes nested utilities manually and using the function and checks that + # the utilities are the same + + num_choosers = 2 + raw_utilities = pd.DataFrame( + {"alt1": [1, 10], "alt0.0": [2, 3], "alt0.1": [4, 5]}, + index=pd.Index(range(num_choosers)), + ) + + nested_utilities = simulate.compute_nested_utilities(raw_utilities, nest_spec) + + # these are from the definition of nest_spec + alt0_nest_coefficient = nest_spec["alternatives"][0]["coefficient"] + alt0_leaf_product_of_coefficients = 
nest_spec["coefficient"] * alt0_nest_coefficient + assert alt0_leaf_product_of_coefficients == 0.5 # 1 * 0.5 + + product_of_coefficientss = pd.DataFrame( + { + "alt1": [nest_spec["coefficient"]], + "alt0.0": [alt0_leaf_product_of_coefficients], + "alt0.1": [alt0_leaf_product_of_coefficients], + }, + index=[0], + ) + leaf_utilities = raw_utilities / product_of_coefficientss.iloc[0] + + constructed_nested_utilities = pd.DataFrame(index=raw_utilities.index) + + constructed_nested_utilities[leaf_utilities.columns] = leaf_utilities + constructed_nested_utilities["alt0"] = alt0_nest_coefficient * np.log( + np.exp(leaf_utilities[["alt0.0", "alt0.1"]]).sum(axis=1) + ) + constructed_nested_utilities["root"] = nest_spec["coefficient"] * np.log( + np.exp(constructed_nested_utilities[["alt1", "alt0"]]).sum(axis=1) + ) + + assert np.allclose( + nested_utilities, constructed_nested_utilities[nested_utilities.columns] + ), "Mismatch in nested utilities" + + +def test_eval_nl_logsums_eet_vs_non_eet(state, nest_spec): + """eval_nl with want_logsums=True must produce identical logsums under + EET and non-EET modes""" + + num_choosers = 100 + + np.random.seed(42) + data2 = pd.DataFrame( + {"chooser_attr": np.random.rand(num_choosers)}, + index=pd.Index(range(num_choosers), name="person_id"), + ) + + spec2 = pd.DataFrame( + {"alt1": [2.0], "alt0.0": [0.5], "alt0.1": [0.2]}, + index=pd.Index(["chooser_attr"], name="Expression"), + ) + + chunk_sizer = chunk.ChunkSizer(state, "", "", num_choosers) + + state.settings.use_explicit_error_terms = True + state.rng().set_base_seed(42) + state.rng().add_channel("person_id", data2) + state.rng().begin_step("test_step_logsums") + + result_eet = simulate.eval_nl( + state=state, + choosers=data2, + spec=spec2, + nest_spec=nest_spec, + locals_d={}, + custom_chooser=None, + estimator=None, + want_logsums=True, + trace_label="test", + chunk_sizer=chunk_sizer, + ) + + state.rng().end_step("test_step_logsums") + + state.settings.use_explicit_error_terms 
= False + state.rng().begin_step("test_step_logsums") + + result_non_eet = simulate.eval_nl( + state=state, + choosers=data2, + spec=spec2, + nest_spec=nest_spec, + locals_d={}, + custom_chooser=None, + estimator=None, + want_logsums=True, + trace_label="test", + chunk_sizer=chunk_sizer, + ) + + state.rng().end_step("test_step_logsums") + + # Both paths should return a DataFrame with 'choice' and 'logsum' columns + assert "logsum" in result_eet.columns, "EET result missing logsum column" + assert "logsum" in result_non_eet.columns, "non-EET result missing logsum column" + + # Logsums are deterministic — they must be identical across paths + assert np.allclose( + result_eet["logsum"].values, result_non_eet["logsum"].values, rtol=1e-10 + ) diff --git a/activitysim/examples/production_semcog/test/configs_eet/settings.yaml b/activitysim/examples/production_semcog/test/configs_eet/settings.yaml new file mode 100644 index 000000000..dcff83f5a --- /dev/null +++ b/activitysim/examples/production_semcog/test/configs_eet/settings.yaml @@ -0,0 +1,5 @@ +inherit_settings: True + +use_explicit_error_terms: True + +rng_base_seed: 42 diff --git a/activitysim/examples/production_semcog/test/regress/final_eet_trips.csv b/activitysim/examples/production_semcog/test/regress/final_eet_trips.csv new file mode 100644 index 000000000..cc98fe5d6 --- /dev/null +++ b/activitysim/examples/production_semcog/test/regress/final_eet_trips.csv @@ -0,0 +1,116 @@ +"person_id","household_id","primary_purpose","trip_num","outbound","trip_count","destination","origin","tour_id","purpose","destination_logsum","original_school_zone_id","parked_at_university","depart","tour_includes_parking","trip_id_pre_parking","trip_mode","mode_choice_logsum","trip_id" +2632461,1066212,"eatout",1,true,1,22688,22687,107930907,"eatout",,,false,24,0,863447257,"WALK",0.3324082937283966,1726894513 +2632461,1066212,"eatout",1,false,1,22687,22688,107930907,"home",,,false,32,0,863447261,"WALK",0.3324082937283966,1726894521 
+2632461,1066212,"social",1,true,1,22676,22687,107930937,"social",,,false,38,0,863447497,"WALK",-0.372506247777352,1726894993 +2632461,1066212,"social",1,false,1,22687,22676,107930937,"home",,,false,38,0,863447501,"WALK",-0.372506247777352,1726895001 +2632461,1066212,"work",1,true,1,22770,22687,107930940,"work",,,false,11,0,863447521,"DRIVEALONE",-0.9006268476080008,1726895041 +2632461,1066212,"work",1,false,1,22687,22770,107930940,"home",,,false,23,0,863447525,"DRIVEALONE",-0.5528040173584109,1726895049 +2632746,1066390,"school",1,true,2,22684,22688,107942617,"shopping",10.301822957849977,,false,13,0,863540937,"SHARED3",0.08788155056513884,1727081873 +2632746,1066390,"school",2,true,2,22716,22684,107942617,"school",,,false,13,0,863540938,"SHARED3",0.21128282107010274,1727081874 +2632746,1066390,"school",1,false,1,22688,22716,107942617,"home",,,false,20,0,863540941,"SHARED3",-0.12094657865851986,1727081881 +2632746,1066390,"work",1,true,2,22798,22688,107942625,"parking",,,false,21,1,863541001,"DRIVEALONE",-1.0935617741756212,1727082001 +2632746,1066390,"work",2,true,2,22798,22798,107942625,"work",,,true,21,1,863541001,"WALK",2.688813549798029,1727082002 +2632746,1066390,"work",1,false,2,22798,22798,107942625,"parking",,,true,26,1,863541005,"WALK",2.6888134385754383,1727082009 +2632746,1066390,"work",2,false,2,22688,22798,107942625,"home",,,false,26,1,863541005,"DRIVEALONE",-1.285961232813202,1727082010 +2643231,1070862,"work",1,true,2,22767,22701,108372510,"parking",,,false,12,1,866980081,"DRIVEALONE",-2.254013060998411,1733960161 +2643231,1070862,"work",2,true,2,22767,22767,108372510,"work",,,true,12,1,866980081,"WALK",3.750337710238621,1733960162 +2643231,1070862,"work",1,false,2,22767,22767,108372510,"parking",,,true,27,1,866980085,"WALK",3.75033686292241,1733960169 +2643231,1070862,"work",2,false,2,22701,22767,108372510,"home",,,false,27,1,866980085,"DRIVEALONE",-1.0195938099395256,1733960170 
+2851663,1151807,"work",1,true,2,22808,22768,116918222,"parking",,,false,8,1,935345777,"WALK",0.5794744566652396,1870691553 +2851663,1151807,"work",2,true,2,22808,22808,116918222,"work",,,true,8,1,935345777,"WALK",3.9202266680627016,1870691554 +2851663,1151807,"work",1,false,2,22808,22808,116918222,"parking",,,true,23,1,935345781,"WALK",3.9202264187221654,1870691561 +2851663,1151807,"work",2,false,2,22768,22808,116918222,"home",,,false,23,1,935345781,"WALK",0.5811901896672964,1870691562 +2851664,1151807,"atwork",1,true,1,22795,22795,116918247,"atwork",,,false,8,0,935345977,"WALK",0,1870691953 +2851664,1151807,"atwork",1,false,2,22807,22795,116918247,"eatout",11.697803529864785,,false,9,0,935345981,"WALK",-0.6403075075080801,1870691961 +2851664,1151807,"atwork",2,false,2,22795,22807,116918247,"work",,,false,9,0,935345982,"WALK",1.9742275881306344,1870691962 +2851664,1151807,"work",1,true,2,22795,22768,116918263,"parking",,,false,8,1,935346105,"DRIVEALONE",-0.1700734379058779,1870692209 +2851664,1151807,"work",2,true,2,22795,22795,116918263,"work",,,true,8,1,935346105,"WALK",2.014596847010505,1870692210 +2851664,1151807,"work",1,false,2,22795,22795,116918263,"parking",,,true,9,1,935346109,"WALK",2.014596847010505,1870692217 +2851664,1151807,"work",2,false,2,22768,22795,116918263,"home",,,false,9,1,935346109,"DRIVEALONE",-0.17669442402412502,1870692218 +2851664,1151807,"work",1,true,2,22795,22768,116918264,"parking",,,false,10,1,935346113,"SHARED2",0.18223026147932736,1870692225 +2851664,1151807,"work",2,true,2,22795,22795,116918264,"work",,,true,10,1,935346113,"WALK",3.0721786555313417,1870692226 +2851664,1151807,"work",1,false,3,22767,22795,116918264,"eatout",13.361606283751318,,true,12,1,935346117,"WALK",2.1699105206573512,1870692233 +2851664,1151807,"work",2,false,3,22795,22767,116918264,"parking",,,true,12,1,935346118,"WALK",3.660264542941122,1870692234 
+2851664,1151807,"work",3,false,3,22768,22795,116918264,"home",,,false,12,1,935346118,"DRIVEALONE",0.19501777547255042,1870692235 +2851665,1151807,"school",1,true,1,22738,22768,116918296,"school",,,false,9,0,935346369,"WALK",-0.3380929737459932,1870692737 +2851665,1151807,"school",1,false,1,22768,22738,116918296,"home",,,false,25,0,935346373,"WALK",-0.3380929737459932,1870692745 +2851666,1151807,"school",1,true,1,22738,22768,116918337,"school",,,false,9,0,935346697,"WALK",-0.23394837977299351,1870693393 +2851666,1151807,"school",1,false,2,22768,22738,116918337,"eatout",12.976839556161908,,false,26,0,935346701,"WALK",-0.30724534671072457,1870693401 +2851666,1151807,"school",2,false,2,22768,22768,116918337,"home",,,false,26,0,935346702,"WALK",1.4569271228419698,1870693402 +2853258,1152693,"work",1,true,1,22808,22767,116983617,"work",,,false,20,0,935868937,"WALK",4.2361228435911125,1871737873 +2853258,1152693,"work",1,false,1,22767,22808,116983617,"home",,,false,42,0,935868941,"WALK",4.2355632459345705,1871737881 +2864033,1157863,"work",1,true,1,22766,22818,117425392,"work",,,false,22,0,939403137,"WALK",-0.5747999444276104,1878806273 +2864033,1157863,"work",1,false,3,22801,22766,117425392,"othmaint",11.425225674825322,,false,43,0,939403141,"WALK",-0.7024510798800492,1878806281 +2864033,1157863,"work",2,false,3,22802,22801,117425392,"othmaint",13.28624241505493,,false,43,0,939403142,"WALK",0.28664476657433274,1878806282 +2864033,1157863,"work",3,false,3,22818,22802,117425392,"home",,,false,44,0,939403143,"WALK",1.5286197350024198,1878806283 +2867650,1159450,"work",1,true,1,22740,22791,117573689,"work",,,false,5,0,940589513,"DRIVEALONE",-0.670801522478196,1881179025 +2867650,1159450,"work",1,false,1,22791,22740,117573689,"home",,,false,28,0,940589517,"SHARED2",0.03856943979091073,1881179033 +2867652,1159450,"school",1,true,1,22798,22791,117573763,"school",,,false,11,0,940590105,"WALK",-0.14197028764914804,1881180209 
+2867652,1159450,"school",1,false,2,22807,22798,117573763,"escort",12.102989575726829,,false,26,0,940590109,"WALK",0.3099529390965043,1881180217 +2867652,1159450,"school",2,false,2,22791,22807,117573763,"home",,,false,27,0,940590110,"WALK",1.1921458680932129,1881180218 +2867653,1159450,"school",1,true,1,22716,22791,117573804,"school",,,false,9,0,940590433,"SHARED3",-0.7165798080815713,1881180865 +2867653,1159450,"school",1,false,1,22791,22716,117573804,"home",,,false,23,0,940590437,"SHARED3",-0.7056869394647015,1881180873 +2869308,1160345,"escort",1,true,4,22806,22788,117641637,"parking",,,false,37,1,941133097,"SHARED2",-0.35468797889700127,1882266193 +2869308,1160345,"escort",2,true,4,22761,22806,117641637,"escort",9.809199303175808,,true,37,1,941133097,"WALK",1.1693447862605972,1882266194 +2869308,1160345,"escort",3,true,4,22806,22761,117641637,"parking",,,true,38,1,941133098,"WALK",1.0527105195710942,1882266195 +2869308,1160345,"escort",4,true,4,22738,22806,117641637,"escort",,,false,38,1,941133098,"SHARED2",-0.7899590349500466,1882266196 +2869308,1160345,"escort",1,false,2,22762,22738,117641637,"escort",11.267844899645352,,false,39,1,941133101,"DRIVEALONE",-0.33121883758411125,1882266201 +2869308,1160345,"escort",2,false,2,22788,22762,117641637,"home",,,false,40,1,941133102,"SHARED2",-0.21686205931765942,1882266202 +2869308,1160345,"work",1,true,1,22769,22788,117641667,"work",,,false,11,1,941133337,"SHARED2",-0.24887791851324914,1882266673 +2869308,1160345,"work",1,false,6,22769,22769,117641667,"othmaint",11.968949912548455,,false,27,1,941133341,"SHARED3",-0.004404805067726633,1882266681 +2869308,1160345,"work",2,false,6,22761,22769,117641667,"parking",,,false,28,1,941133342,"WALK",-0.6678721152911544,1882266682 +2869308,1160345,"work",3,false,6,22767,22761,117641667,"shopping",10.633629340799134,,true,28,1,941133342,"WALK",3.0199993221581605,1882266683 
+2869308,1160345,"work",4,false,6,22807,22767,117641667,"escort",13.512213256227986,,true,29,1,941133343,"WALK",4.2137726609909425,1882266684 +2869308,1160345,"work",5,false,6,22761,22807,117641667,"parking",,,true,30,1,941133344,"WALK",3.869947742844953,1882266685 +2869308,1160345,"work",6,false,6,22788,22761,117641667,"home",,,false,30,1,941133344,"SHARED3",-0.41885728895985064,1882266686 +2869309,1160345,"univ",1,true,2,22795,22788,117641700,"parking",,,false,13,1,941133601,"DRIVEALONE",-0.15235107523409816,1882267201 +2869309,1160345,"univ",2,true,2,22766,22795,117641700,"univ",,,true,13,1,941133601,"WALK_LOC",1.202786557349171,1882267202 +2869309,1160345,"univ",1,false,3,22766,22766,117641700,"othdiscr",12.456311079956105,,true,24,1,941133605,"WALK",2.0068506545834075,1882267209 +2869309,1160345,"univ",2,false,3,22795,22766,117641700,"parking",,,true,24,1,941133606,"WALK_LOC",1.142188272503556,1882267210 +2869309,1160345,"univ",3,false,3,22788,22795,117641700,"home",,,false,24,1,941133606,"DRIVEALONE",-0.15842120768012627,1882267211 +2869392,1160408,"shopping",1,true,1,22769,22784,117645105,"shopping",,,false,26,0,941160841,"DRIVEALONE",-0.6680935247002481,1882321681 +2869392,1160408,"shopping",1,false,2,22770,22769,117645105,"othmaint",11.503374294479649,,false,36,0,941160845,"WALK",-0.5869025084004701,1882321689 +2869392,1160408,"shopping",2,false,2,22784,22770,117645105,"home",,,false,37,0,941160846,"WALK",-0.14561343082958378,1882321690 +2871041,1161101,"work",1,true,1,22770,22747,117712720,"work",,,false,10,0,941701761,"WALK",4.37274480605373,1883403521 +2871041,1161101,"work",1,false,1,22747,22770,117712720,"home",,,false,30,0,941701765,"WALK",4.374474053696968,1883403529 +2871042,1161101,"work",1,true,2,22802,22747,117712761,"parking",,,false,6,1,941702089,"DRIVEALONE",0.31437493739186884,1883404177 +2871042,1161101,"work",2,true,2,22802,22802,117712761,"work",,,true,6,1,941702089,"WALK",3.98103278438962,1883404178 
+2871042,1161101,"work",1,false,2,22802,22802,117712761,"parking",,,true,31,1,941702093,"WALK",3.9810287626204213,1883404185 +2871042,1161101,"work",2,false,2,22747,22802,117712761,"home",,,false,31,1,941702093,"WALK",0.29964022247838484,1883404186 +4717826,1936565,"univ",1,true,1,22809,22808,193430897,"univ",,,false,25,0,1547447177,"WALK",2.48948699138067,3094894353 +4717826,1936565,"univ",1,false,4,22809,22809,193430897,"univ",10.85837416878764,22809,false,42,0,1547447181,"WALK",3.0000160707611045,3094894361 +4717826,1936565,"univ",2,false,4,22802,22809,193430897,"social",14.420134553925665,,false,43,0,1547447182,"WALK",2.8898362057163802,3094894362 +4717826,1936565,"univ",3,false,4,22807,22802,193430897,"eatout",18.598339591406937,,false,44,0,1547447183,"WALK_LOC",5.851209408094483,3094894363 +4717826,1936565,"univ",4,false,4,22808,22807,193430897,"home",,,false,44,0,1547447184,"WALK",5.537675529040812,3094894364 +4718747,1937486,"univ",1,true,3,22807,22765,193468658,"eatout",25.835053255003054,,false,14,0,1547749265,"WALK_LOC",5.394119748970986,3095498529 +4718747,1937486,"univ",2,true,3,22807,22807,193468658,"social",26.07487490221835,,false,16,0,1547749266,"WALK",5.765967272606238,3095498530 +4718747,1937486,"univ",3,true,3,22809,22807,193468658,"univ",,,false,19,0,1547749267,"WALK",3.0089831584168625,3095498531 +4718747,1937486,"univ",1,false,1,22765,22809,193468658,"home",,,false,42,0,1547749269,"WALK",2.48457681340577,3095498537 +4718747,1937486,"shopping",1,true,2,22767,22765,193468660,"shopping",30.837861614853992,,false,12,0,1547749281,"WALK",6.438600267913209,3095498561 +4718747,1937486,"shopping",2,true,2,22770,22767,193468660,"shopping",,,false,13,0,1547749282,"WALK",5.192455869479483,3095498562 +4718747,1937486,"shopping",1,false,1,22765,22770,193468660,"home",,,false,13,0,1547749285,"WALK",4.807792080345957,3095498569 +4720352,1939091,"univ",1,true,1,22809,22765,193534463,"univ",,,false,9,0,1548275705,"WALK",-0.9117642771058314,3096551409 
+4720352,1939091,"univ",1,false,3,22767,22809,193534463,"shopping",11.843847663623558,,false,9,0,1548275709,"WALK",-0.50518420043921,3096551417 +4720352,1939091,"univ",2,false,3,22760,22767,193534463,"othdiscr",19.589050848806597,,false,9,0,1548275710,"WALK",2.07708617142782,3096551418 +4720352,1939091,"univ",3,false,3,22765,22760,193534463,"home",,,false,9,0,1548275711,"WALK",2.8041844809824235,3096551419 +4720352,1939091,"univ",1,true,1,22809,22765,193534464,"univ",,,false,23,0,1548275713,"WALK",2.507472441202307,3096551425 +4720352,1939091,"univ",1,false,2,22766,22809,193534464,"univ",10.595098453730076,22766,false,27,0,1548275717,"WALK",2.554225976269817,3096551433 +4720352,1939091,"univ",2,false,2,22765,22766,193534464,"home",,,false,28,0,1548275718,"WALK",2.711686716364389,3096551434 +4722297,1942003,"univ",1,true,1,22809,22810,193614208,"univ",,,false,11,0,1548913665,"WALK",2.4667125356379236,3097827329 +4722297,1942003,"univ",1,false,1,22810,22809,193614208,"home",,,false,37,0,1548913669,"WALK",2.4563973988486754,3097827337 +4726458,1946164,"eatout",1,true,1,22770,22808,193784784,"eatout",,,false,27,0,1550278273,"WALK",0.3756438367025996,3100556545 +4726458,1946164,"eatout",1,false,1,22808,22770,193784784,"home",,,false,29,0,1550278277,"WALK",0.3756438367025996,3100556553 +4726458,1946164,"eatout",1,true,1,22771,22808,193784785,"eatout",,,false,29,0,1550278281,"WALK",0.6461148549373952,3100556561 +4726458,1946164,"eatout",1,false,1,22808,22771,193784785,"home",,,false,30,0,1550278285,"WALK",0.6461148549373952,3100556569 +4726458,1946164,"shopping",1,true,1,22770,22808,193784811,"shopping",,,false,14,0,1550278489,"WALK",0.3756438367025996,3100556977 +4726458,1946164,"shopping",1,false,1,22808,22770,193784811,"home",,,false,17,0,1550278493,"WALK",0.3756438367025996,3100556985 +4727363,1947069,"univ",1,true,1,22809,22765,193821914,"univ",,,false,14,0,1550575313,"WALK",-0.9117642771058314,3101150625 
+4727363,1947069,"univ",1,false,3,22767,22809,193821914,"escort",13.861849979093286,,false,26,0,1550575317,"WALK",-0.50518420043921,3101150633 +4727363,1947069,"univ",2,false,3,22767,22767,193821914,"shopping",18.14486120913688,,false,26,0,1550575318,"WALK",2.62825193059268,3101150634 +4727363,1947069,"univ",3,false,3,22765,22767,193821914,"home",,,false,27,0,1550575319,"WALK",2.1708672114306493,3101150635 +4729458,1949164,"univ",1,true,2,22767,22745,193907809,"eatout",13.431035125581994,,false,11,0,1551262473,"WALK",2.0891749086454086,3102524945 +4729458,1949164,"univ",2,true,2,22764,22767,193907809,"univ",,,false,11,0,1551262474,"WALK",-0.5148347167335139,3102524946 +4729458,1949164,"univ",1,false,2,22767,22764,193907809,"othdiscr",14.563044668763776,,false,27,0,1551262477,"WALK",-0.5148347167335139,3102524953 +4729458,1949164,"univ",2,false,2,22745,22767,193907809,"home",,,false,28,0,1551262478,"WALK",2.0891749086454086,3102524954 +4729679,1949385,"eatout",1,true,1,22745,22745,193916845,"eatout",,,false,26,0,1551334761,"WALK",0.7839251911505445,3102669521 +4729679,1949385,"eatout",1,false,1,22745,22745,193916845,"home",,,false,27,0,1551334765,"WALK",0.7839251911505445,3102669529 diff --git a/activitysim/examples/production_semcog/test/test_semcog.py b/activitysim/examples/production_semcog/test/test_semcog.py index e247fd645..8b77a4e3a 100644 --- a/activitysim/examples/production_semcog/test/test_semcog.py +++ b/activitysim/examples/production_semcog/test/test_semcog.py @@ -11,7 +11,7 @@ from activitysim.core.test._tools import assert_frame_substantively_equal -def run_test_semcog(multiprocess=False): +def run_test_semcog(multiprocess=False, use_explicit_error_terms=False): def example_path(dirname): resource = os.path.join("examples", "production_semcog", dirname) return str(importlib.resources.files("activitysim").joinpath(resource)) @@ -19,9 +19,12 @@ def example_path(dirname): def test_path(dirname): return os.path.join(os.path.dirname(__file__), dirname) - 
def regress(): + def regress(use_explicit_error_terms=False): regress_trips_df = pd.read_csv( - test_path("regress/final_trips.csv"), dtype={"depart": int} + test_path( + f"regress/final{'_eet' if use_explicit_error_terms else ''}_trips.csv" + ), + dtype={"depart": int}, ) final_trips_df = pd.read_csv( test_path("output/final_trips.csv"), dtype={"depart": int} @@ -30,6 +33,12 @@ def regress(): file_path = os.path.join(os.path.dirname(__file__), "../simulation.py") + test_config_files = [] + if use_explicit_error_terms: + test_config_files = [ + "-c", + test_path("configs_eet"), + ] if multiprocess: subprocess.run( [ @@ -37,6 +46,7 @@ def regress(): "run", "-a", file_path, + *test_config_files, "-c", test_path("configs_mp"), "-c", @@ -59,6 +69,7 @@ def regress(): "run", "-a", file_path, + *test_config_files, "-c", test_path("configs"), "-c", @@ -73,7 +84,7 @@ def regress(): check=True, ) - regress() + regress(use_explicit_error_terms=use_explicit_error_terms) def test_semcog(): @@ -84,6 +95,16 @@ def test_semcog_mp(): run_test_semcog(multiprocess=True) +def test_semcog_eet(): + run_test_semcog(multiprocess=False, use_explicit_error_terms=True) + + +def test_semcog_mp_eet(): + run_test_semcog(multiprocess=True, use_explicit_error_terms=True) + + if __name__ == "__main__": run_test_semcog(multiprocess=False) run_test_semcog(multiprocess=True) + run_test_semcog(multiprocess=False, use_explicit_error_terms=True) + run_test_semcog(multiprocess=True, use_explicit_error_terms=True) diff --git a/docs/core.rst b/docs/core.rst index 687e8f956..a7a9ba59d 100644 --- a/docs/core.rst +++ b/docs/core.rst @@ -323,6 +323,20 @@ To specify and solve an NL model: * specify the nesting structure via the NESTS setting in the model configuration YAML file. An example nested logit NESTS entry can be found in ``example/configs/tour_mode_choice.yaml`` * call ``simulate.simple_simulate()``. The ``simulate.interaction_simulate()`` functionality is not yet supported for NL. 
+Explicit Error Terms +^^^^^^^^^^^^^^^^^^^^ + +By default, ActivitySim makes choices by calculating analytical probabilities and then drawing once from +the cumulative distribution for each chooser. With Explicit Error Terms (EET), enabled by setting +``use_explicit_error_terms: True`` in ``settings.yaml``, ActivitySim instead draws a standard EV1 (Gumbel) error +term for each chooser-alternative pair, adds it to the observed utility, and chooses the maximum total utility. + +EET changes the final simulation step, not the utility expressions, availability logic, or nesting +structure. In practice, it can reduce Monte Carlo noise in scenario comparisons. + +For configuration guidance see :ref:`explicit_error_terms_ways_to_run`. For detailed implementation notes +see :doc:`/dev-guide/explicit-error-terms`. + API ^^^ diff --git a/docs/dev-guide/explicit-error-terms.md b/docs/dev-guide/explicit-error-terms.md new file mode 100644 index 000000000..da80fe450 --- /dev/null +++ b/docs/dev-guide/explicit-error-terms.md @@ -0,0 +1,147 @@ +(explicit-error-terms-dev)= +# Explicit Error Terms + +Explicit Error Terms (EET) is an alternative way to simulate choices from ActivitySim's +logit models. It keeps the same systematic utilities and the same random-utility +interpretation as the standard method, but changes how the final simulated choice is +drawn. + +For user-facing guidance on when to use EET, see {ref}`explicit_error_terms_ways_to_run`. + +## Enabling EET + +Enable EET globally in `settings.yaml`: + +```yaml +use_explicit_error_terms: True +``` + +The top-level switch is defined in +`activitysim.core.configuration.top.SimulationSettings.use_explicit_error_terms`. +Choice simulation code reads that setting through the model compute settings and routes +supported logit simulations through the EET path. + +## Default Draw Versus EET + +Under the default ActivitySim simulation path, choice drawing works like this: + +1. Compute systematic utilities. +2. 
Convert those utilities into analytical probabilities. +3. Draw one uniform random number per chooser. +4. Select the alternative whose cumulative probability interval contains that draw. + +With EET enabled, the final draw step changes: + +1. Compute systematic utilities. +2. Draw one iid EV1 error term for each chooser-alternative pair. +3. Add that error term to the systematic utility. +4. Choose the alternative with the highest total utility. + +For multinomial logit, ActivitySim adds Gumbel draws to the utility table and takes the +row-wise maximum. For nested logit, ActivitySim applies the same idea while walking the +nest tree, preserving the configured nesting structure. For details, see +[this ATRF paper](https://australasiantransportresearchforum.org.au/frozen-randomness-at-the-individual-utility-level/). + +The model being simulated does not change. EET changes how the random utility model is +sampled, not the underlying utility specification. + +## Practical Effects + +### Comparisons and Simulation Noise + +For EET to reduce simulation noise, it is important that alternatives of a choice situation +keep the same unobserved error term in different scenario runs. This is intimately tied +to how random numbers are generated; see {ref}`random_in_detail` for the underlying +random-number stream design and the `activitysim.core.random` API. +Because unchanged alternatives can keep the same unobserved draws, changes to choices between +scenarios can only happen when the observed utility of an alternative increases. This is not +the case for the Monte Carlo simulation method, where the draws are based on probabilities, +which necessarily change for all alternatives if any observed utility changes. + +This also means that it is advisable to use the same setting in all runs. Comparing a baseline +run with EET to a scenario run without EET mixes two simulation methods and can make differences +harder to interpret. 
Aggregate choice patterns should remain statistically the same +as for the default probability-based method. The project test suite includes parity tests for +MNL, NL, and interaction-based simulations. + +### Numerical and Debugging Behavior + +EET changes the final simulation step, not the utility calculation itself. Utility +expressions, availability logic, nesting structure, and utility validation still matter in +the same way as in the default method. + +In practice, EET can make some comparisons easier to interpret because the selected +alternative is the one with the highest total utility after adding the explicit error term, +rather than the one reached by a cumulative-probability threshold. That can reduce +sensitivity to small differences in the final CDF draw when comparing nearby scenarios. +It does not eliminate the need to inspect invalid or unavailable alternatives, and it does +not guarantee identical results across different RNG seeds or different model +configurations. + +For shadow-priced location choice, ActivitySim resets RNG offsets between iterations when +EET is enabled so each shadow-pricing iteration uses the same sequence of random numbers. +That keeps the comparison across iterations focused on the shadow price updates instead of +changing random draws between iterations. + +### Runtime + +EET is slower than the default probability-based draw because it generates and processes +one random error term per chooser-alternative pair, rather than one uniform draw per +chooser after probabilities are computed. The exact runtime impact depends on the number +of alternatives, nesting structure, and interaction size. Current runtime increases are on the +order of 100% per demand model run, which is due to the non-optimized way in which location +choice is currently handled. 
Runtime improvement work is under way, but large improvements can +also be obtained by using Monte Carlo simulation for the sampling part of location choice, see +{ref}`explicit_error_terms_ways_to_run`. + +(explicit_error_terms_memory)= +### Memory usage + +EET in its current implementation also increases memory pressure during location sampling. +During the sampling step, an array of size (number of choosers, number of alternatives, +number of samples) is allocated for all random error terms. This can quickly become unwieldy +for machines with limited memory, and [chunking](../users-guide/performance/chunking.md) will +likely be needed. + +When chunking is needed and [explicit chunking](../users-guide/performance/chunking.md#explicit-chunking) +is used, using fractional values for the chunk size rather than absolute numbers of choosers is +often a better fit. This is because the individual steps of location choice models +(location sampling, location logsums, and location choice from the sampled choice set) all have +very different chooser characteristics, but the chunk size currently can only be set at the model +level. Using absolute values for the explicit chunk size would lead to a large number of chunks +for the logsum calculations, which is relatively slow. + + +## Implementation Details and Adding New Models + +The core simulation is implemented in `activitysim.core.logit.make_choices_utility_based`. Most +calls to this function are wrapped in one of the following methods: + +- `activitysim.core.simulate` +- `activitysim.core.interaction_simulate` +- `activitysim.core.interaction_sample` +- `activitysim.core.interaction_sample_simulate` + +These methods have consistent implementations of EET and therefore any model using these will +automatically have EET implemented. Some models call the underlying choice simulation method +`activitysim.core.logit.make_choices` directly. 
For EET to work in that case, the developer has
+to add a corresponding call to `logit.make_choices_utility_based`, see, e.g.,
+`activitysim.abm.models.util.cdap.household_activity_choices`. Note that models that draw directly
+from probability distributions, like `activitysim.abm.models.util.cdap.extra_hh_member_choices`,
+do not have a corresponding EET implementation because there are no utilities to work with.
+
+
+### Unavailable choices utility convention
+
+For EET, only utility differences matter and therefore the choice between two utilities that are
+very small, say -10000 and -10001, is identical to a choice between 0 and 1. For MC, utilities
+have to be exponentiated and therefore floating point precision dictates the smallest and largest
+utility that can be used in practice. ActivitySim models historically often use a utility of
+-999 to make alternatives practically unavailable. That value is below the utility threshold
+used in the probability-based path, which is about -691 because ActivitySim clips
+exponentiated utilities at 1e-300. To keep behavior consistent, EET treats alternatives with
+utilities at or below that threshold as unavailable; see `activitysim.core.logit.validate_utils`.
+
+### Scale of the distribution
+Error terms are drawn from standard Gumbel distributions, i.e., the scale of the error term is
+fixed to one. 
diff --git a/docs/dev-guide/index.rst b/docs/dev-guide/index.rst index da6c64973..82051ff08 100644 --- a/docs/dev-guide/index.rst +++ b/docs/dev-guide/index.rst @@ -33,6 +33,7 @@ Contents component-configs components/index ../core + explicit-error-terms ../benchmarking build-docs changes diff --git a/docs/users-guide/ways_to_run.rst b/docs/users-guide/ways_to_run.rst index 1b2122107..385367999 100644 --- a/docs/users-guide/ways_to_run.rst +++ b/docs/users-guide/ways_to_run.rst @@ -80,7 +80,7 @@ Refer to the :ref:`Run the Primary Example` section to learn how to run the prim Using Jupyter Notebook ______________________ -ActivitySim includes a `Jupyter Notebook `__ recipe book with interactive examples. +ActivitySim includes a `Jupyter Notebook `__ recipe book with interactive examples. * To start JupyterLab, from the ActivitySim project directory run ``uv run jupyter lab``. This will start the JupyterLab server and pop up a browser window with the interactive development environment. * Navigate to the ``examples/prototype_mtc/notebooks`` folder and select a notebook to learn more: @@ -283,3 +283,46 @@ With the set of output CSV files, the user can trace ActivitySim calculations in help debug data and/or logic errors. Refer to :ref:`trace` for more details on configuring tracing and the various output files. + +.. _explicit_error_terms_ways_to_run : + +Explicit Error Terms +____________________ + +ActivitySim makes heavy use of micro-simulation. Most model components are discrete choice models with an inherent +random component, and for each choice situation a single outcome is generated. +With the default Monte Carlo draw method, ActivitySim first calculates analytical probabilities from the +systematic utilities of a multinomial or nested logit model and then makes one draw from the +cumulative distribution for each chooser. 
Explicit Error Terms (EET) replaces that final draw with a direct
+random-utility simulation by drawing an independent standard EV1 (Gumbel) error term for each
+chooser-alternative pair, adding it to the systematic utility, and selecting the alternative with the highest
+total utility. Both methods simulate the same underlying model, but EET can be less affected by Monte Carlo
+noise when comparing scenarios. For more details see :doc:`/dev-guide/explicit-error-terms`.
+
+To enable EET for a model run, set the global switch in ``settings.yaml``:
+
+.. code-block:: yaml
+
+    use_explicit_error_terms: True
+
+When comparing runs, enable or disable this setting consistently across the runs you want to compare.
+
+Using EET changes the simulation method, not the underlying model. Aggregate behavior should remain statistically
+comparable to the default method, but individual simulated choices will not usually match record-by-record.
+EET is also slower than the default probability-based draw because it generates and processes one random error
+term per chooser-alternative pair, rather than one uniform draw per chooser after probabilities are computed.
+Most of the current slowdown comes from location choice models, where the number of alternatives is large and
+the current importance-sampling workflow still requires many repeated simulations. Work to reduce that overhead is
+ongoing. Until then, it is also possible to turn off EET for the sampling part of these models by adding the following
+lines to the settings of all models where location choice sampling is used (currently all location and destination
+choice models as well as disaggregate accessibilities):
+
+.. code-block:: yaml
+
+    compute_settings:
+      use_explicit_error_terms:
+        sample: false
+
+If you keep EET enabled for the sampling step, also consider memory usage during location sampling.
+In that case, explicit chunking with a fractional ``explicit_chunk`` setting is often the most +practical approach; see :ref:`explicit_error_terms_memory` for details.