@@ -103,6 +103,7 @@ def __init__(
103103 clone_only = False ,
104104 reentrant = False ,
105105 steps_to_rerun = None ,
106+ step_only = False ,
106107 max_workers = MAX_WORKERS ,
107108 max_num_splits = MAX_NUM_SPLITS ,
108109 max_log_size = MAX_LOG_SIZE ,
@@ -145,14 +146,51 @@ def __init__(
145146 self ._skip_decorator_hooks = skip_decorator_hooks
146147
147148 # If steps_to_rerun is specified, we will not clone them in resume mode.
148- self ._steps_to_rerun = steps_to_rerun or {}
149+ self ._steps_to_rerun = steps_to_rerun or set ()
150+ self ._steps_can_clone = set ()
151+ self ._steps_ran = set ()
152+ self ._step_only = step_only
153+ all_steps = set ()
154+ cannot_clone_steps = set (self ._steps_to_rerun )
149155 # sorted_nodes are in topological order already, so we only need to
150156 # iterate through the nodes once to get a stable set of rerun steps.
157+ # A few modes:
158+ # - no steps_to_rerun:
159+ # - not clone_only and not step_only: clone all previously executed steps and
160+ # continue execution.
161+ # - clone_only and not step_only: clone all steps that have previously executed
162+ # and stop
163+ # - not clone_only and step_only: NOT possible (requires a steps_to_rerun)
164+ # - clone_only and step_only: NOT possible (requires a steps_to_rerun)
165+ # => in all these cases, _steps_to_rerun is empty and so _steps_can_clone is
166+ # all_steps
167+ # - steps_to_rerun:
168+ # - not clone_only and not step_only: clone all previously executed steps *except*
169+ # any of the steps in steps_to_rerun and the subsequent steps. Continue execution.
170+ # => _steps_to_rerun contains the steps to rerun and all descendants. _steps_can_clone
171+ # contains all other steps
172+ # - clone_only and not step_only: clone all steps that have previously executed
173+ # up to (but not including) any of the steps in steps_to_rerun and
174+ # subsequent steps.
175+ # => same as above but steps_to_rerun is not used to run anything
176+ # - not clone_only and step_only: clone all steps that have previously executed
177+ # up to (but not including) any of the steps in steps_to_rerun and
178+ # subsequent steps. Execute *only* the steps in steps_to_rerun if possible
179+ # and stop.
180+ # - clone_only and step_only: NOT possible (if step_only is specified, we turn
181+ # off clone_only -- clone_only implies no further execution since task
182+ # objects will not be generated).
183+ # => with step_only, _steps_to_rerun contains *only* the initially passed steps to run
184+ # (their descendants are not added) and _steps_can_clone contains the same as in the other cases.
151185 for step_name in self ._graph .sorted_nodes :
152- if step_name in self ._steps_to_rerun :
186+ all_steps .add (step_name )
187+ if step_name in cannot_clone_steps :
153188 out_funcs = self ._graph [step_name ].out_funcs or []
154189 for next_step in out_funcs :
155- self ._steps_to_rerun .add (next_step )
190+ cannot_clone_steps .add (next_step )
191+ self ._steps_can_clone = all_steps - cannot_clone_steps
192+ if not self ._step_only :
193+ self ._steps_to_rerun = cannot_clone_steps
156194
157195 self ._origin_ds_set = None
158196 if clone_run_id :
@@ -399,7 +437,7 @@ def clone_original_run(self, generate_task_obj=False, verbose=True):
399437 if (
400438 task_ds ["_task_ok" ]
401439 and step_name != "_parameters"
402- and (step_name not in self ._steps_to_rerun )
440+ and (step_name in self ._steps_can_clone )
403441 ):
404442 # "_unbounded_foreach" is a special flag to indicate that the transition
405443 # is an unbounded foreach.
@@ -677,6 +715,19 @@ def execute(self):
677715 system_msg = True ,
678716 )
679717 self ._params_task .mark_resume_done ()
718+ elif self ._step_only :
719+ # Check that we ran all the steps in self._steps_to_rerun
720+ steps_missing = self ._steps_to_rerun - self ._steps_ran
721+ if steps_missing :
722+ raise MetaflowInternalError (
723+ "The following steps were not executed: {0}" .format (
724+ ", " .join (steps_missing )
725+ )
726+ )
727+ self ._logger (
728+ "Step-only resume complete -- all specified steps were executed!" ,
729+ system_msg = True ,
730+ )
680731 else :
681732 raise MetaflowInternalError (
682733 "The *end* step was not successful by the end of flow."
@@ -1073,6 +1124,8 @@ def _queue_task_foreach(self, task, next_steps):
10731124 def _queue_tasks (self , finished_tasks ):
10741125 # finished tasks include only successful tasks
10751126 for task in finished_tasks :
1127+ step_name , _ , _ = task .finished_id
1128+ self ._steps_ran .add (step_name )
10761129 self ._finished [task .finished_id ] = task .path
10771130 self ._is_cloned [task .path ] = task .is_cloned
10781131
@@ -1137,6 +1190,15 @@ def _queue_tasks(self, finished_tasks):
11371190 )
11381191 )
11391192
1193+ if self ._step_only :
1194+ # We need to filter next_steps to only include steps that are in
1195+ # self._steps_to_rerun
1196+ next_steps = [
1197+ step for step in next_steps if step in self ._steps_to_rerun
1198+ ]
1199+ if not next_steps :
1200+ # No steps to execute, so we can stop
1201+ return
11401202 # Different transition types require different treatment
11411203 if any (self ._graph [f ].type == "join" for f in next_steps ):
11421204 # Next step is a join
0 commit comments