diff --git a/.proj.toml b/.proj.toml index 463aa0bb07..38690f710b 100644 --- a/.proj.toml +++ b/.proj.toml @@ -78,12 +78,12 @@ has-cpu-only-benchmarks = false has-cuda-tests = false has-cuda-benchmarks = false -# [targets.local-execution] -# type = "lib" -# has-cpu-only-tests = true -# has-cpu-only-benchmarks = false -# has-cuda-tests = true -# has-cuda-benchmarks = false +[targets.local-execution] +type = "lib" +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = true +has-cuda-benchmarks = false # [targets.local-pcg-execution] # type = "lib" diff --git a/lib/local-execution/include/local-execution/README.md b/lib/local-execution/include/local-execution/README.md new file mode 100644 index 0000000000..cc68162afc --- /dev/null +++ b/lib/local-execution/include/local-execution/README.md @@ -0,0 +1,13 @@ +The primary external-facing interface of local-execution. + +Major components: + +* `computation_graph_instance.h`: is the main external facing interface + * Takes a `ComputationGraph` as input, expands and initializes it + * Provides various methods to run all or a subset of passes +* `local_task_registry.h`: functions to retrieve task implementations + * Not a dynamic registry: tasks are all static now +* `local_task_argument_accessor.h`: local wrapper for `ITaskArgumentAccessor` + * Stores all of the necessary data required for a task to execute +* `task_execution.h`: utilities to prepare and execute tasks +* `tensor_allocation.h`: a pass for the dataflow graph that allocates all tensors diff --git a/lib/local-execution/include/local-execution/atomic_training_tensor_guid_t.dtg.toml b/lib/local-execution/include/local-execution/atomic_training_tensor_guid_t.dtg.toml deleted file mode 100644 index 12380d80ba..0000000000 --- a/lib/local-execution/include/local-execution/atomic_training_tensor_guid_t.dtg.toml +++ /dev/null @@ -1,17 +0,0 @@ -namespace = "FlexFlow" -name = "atomic_training_tensor_guid_t" -type = "struct" -features = [ - "eq", - 
"ord", - "hash", - "fmt", -] - -includes = [ - "utils/nonnegative_int/nonnegative_int.h" -] - -[[fields]] -name = "raw_index" -type = "::FlexFlow::nonnegative_int" diff --git a/lib/local-execution/include/local-execution/computation_graph_instance/README.md b/lib/local-execution/include/local-execution/computation_graph_instance/README.md deleted file mode 100644 index 6b7f4b43db..0000000000 --- a/lib/local-execution/include/local-execution/computation_graph_instance/README.md +++ /dev/null @@ -1 +0,0 @@ -The primary external-facing interface of local-execution diff --git a/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h b/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h index f28552603f..8e09b7960b 100644 --- a/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h +++ b/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h @@ -2,13 +2,21 @@ #define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COMPUTATION_GRAPH_INSTANCE_H #include "kernels/accessor.h" -#include "local-execution/computation_graph_training_tensor_ref_t.dtg.h" -#include "local-execution/local_task_registry.dtg.h" -#include "local-execution/local_tensor_backing.dtg.h" +#include "kernels/allocation.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/profiling_settings.dtg.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" -#include "pcg/layer_guid_t.dtg.h" -#include "task-spec/symbolic/training_symbolic_computation_graph_from_cg_conversion.dtg.h" +#include "pcg/device_id_t.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h" +#include 
"task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h" +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "task-spec/ff_iteration_config.dtg.h" #include "utils/units/milliseconds_t.h" +#include #include namespace FlexFlow { @@ -16,30 +24,65 @@ namespace FlexFlow { struct ComputationGraphInstance { public: ComputationGraphInstance() = delete; - explicit ComputationGraphInstance( - TrainingSymbolicComputationGraphFromCgConversion const &, - LocalTensorBacking const &, - LocalTaskRegistry const &); - -public: - TrainingSymbolicComputationGraphFromCgConversion const & - get_symbolic_training_graph_for_cg() const; - LocalTensorBacking const &get_tensor_backing() const; - LocalTaskRegistry const &get_task_registry() const; + std::vector const &execution_order, + Allocator &allocator, + OptimizerAttrs const &optimizer_attrs, + std::optional logit_grad_tensor); + std::vector const &get_execution_order() const; + Allocator &get_allocator() const; + OptimizerAttrs const &get_optimizer_attrs() const; + void update_optimizer_attrs_for_next_iter(); + std::optional get_loss_tensor_accessor() const; private: - TrainingSymbolicComputationGraphFromCgConversion - symbolic_training_graph_for_cg; - LocalTensorBacking tensor_backing; - LocalTaskRegistry task_registry; + std::vector execution_order; + Allocator &allocator; + OptimizerAttrs optimizer_attrs; + std::optional logit_grad_tensor; }; ComputationGraphInstance create_computation_graph_instance( - ComputationGraph const &, - bidict> const - &); + ComputationGraph const &cg, + OptimizerAttrs const &optimizer_attrs, + std::optional const &loss_attrs, + std::optional label_tensor, + std::optional logit_tensor, + std::unordered_map const + &input_tensors, + Allocator &allocator, + ProfilingSettings const &profiling_settings, + device_handle_t const &device_handle, + FFIterationConfig const &iteration_config, + device_id_t device_idx); + +std::unordered_map> + perform_all_passes_for_computation_graph_instance( + 
ComputationGraphInstance &instance, + ProfilingSettings const &profiling_settings, + device_handle_t const &ff_handle, + FFIterationConfig iteration_config, + device_id_t device_idx); +std::unordered_map> + perform_forward_pass_for_computation_graph_instance( + ComputationGraphInstance const &instance, + ProfilingSettings const &profiling_settings, + device_handle_t const &ff_handle, + FFIterationConfig iteration_config, + device_id_t device_idx); +std::unordered_map> + perform_backward_pass_for_computation_graph_instance( + ComputationGraphInstance const &instance, + ProfilingSettings const &profiling_settings, + device_handle_t const &ff_handle, + FFIterationConfig iteration_config, + device_id_t device_idx); +void perform_update_pass_for_computation_graph_instance( + ComputationGraphInstance &instance, + ProfilingSettings const &profiling_settings, + device_handle_t const &ff_handle, + FFIterationConfig iteration_config, + device_id_t device_idx); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/computation_graph_instance/initialized_computation_graph_instance.dtg.toml b/lib/local-execution/include/local-execution/computation_graph_instance/initialized_computation_graph_instance.dtg.toml deleted file mode 100644 index 8589d5edec..0000000000 --- a/lib/local-execution/include/local-execution/computation_graph_instance/initialized_computation_graph_instance.dtg.toml +++ /dev/null @@ -1,35 +0,0 @@ -namespace = "FlexFlow" -name = "InitializedComputationGraphInstance" -type = "struct" -features = [ - "eq", - "ord", - "hash", - "json", - "fmt", - "rapidcheck", -] - -includes = [ - # "local-execution/computation_graph_instance.dtg.h", - # "local-execution/local_device_states_backing.dtg.h", -] - -src_includes = [] - -fields = [] -# [[fields]] -# name = "per_device_op_states" -# type = "::FlexFlow::LocalDeviceStatesBacking" -# -# [[fields]] -# name = "allocator" -# type = "::FlexFlow::Allocator" -# -# [[fields]] -# name = 
"atomic_tensor_backing" -# type = "::FlexFlow::LocalAtomicTensorBacking" -# -# [[fields]] -# name = "computation_graph_instance" -# type = "::FlexFlow::ComputationGraphInstance" diff --git a/lib/local-execution/include/local-execution/computation_graph_instance/initialized_computation_graph_instance.h b/lib/local-execution/include/local-execution/computation_graph_instance/initialized_computation_graph_instance.h deleted file mode 100644 index a014ff596d..0000000000 --- a/lib/local-execution/include/local-execution/computation_graph_instance/initialized_computation_graph_instance.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_INITIALIZED_COMPUTATION_GRAPH_INSTANCE_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_INITIALIZED_COMPUTATION_GRAPH_INSTANCE_H - -#include "local-execution/computation_graph_instance/computation_graph_instance.h" -#include "local-execution/local_atomic_tensor_backing.dtg.h" -#include "local-execution/local_device_states_backing.dtg.h" -#include "local-execution/local_task_registry.dtg.h" -#include "local-execution/local_tensor_backing.dtg.h" -#include "task-spec/runtime_task_invocation/runtime_arg_config.dtg.h" -#include "task-spec/symbolic/training_symbolic_computation_graph_from_cg_conversion.dtg.h" -#include "utils/units/milliseconds_t.h" - -namespace FlexFlow { - -struct InitializedComputationGraphInstance { -public: - LocalTensorBacking const &get_tensor_backing() const; - LocalTaskRegistry const &get_task_registry() const; - TrainingSymbolicComputationGraphFromCgConversion const & - get_symbolic_training_graph_for_cg() const; - LocalAtomicTensorBacking const &get_atomic_tensor_backing() const; - Allocator &get_allocator() const; - RuntimeArgConfig const &get_runtime_arg_config() const; - -private: - LocalDeviceStatesBacking per_device_op_states; - Allocator &allocator; - LocalAtomicTensorBacking atomic_tensor_backing; - ComputationGraphInstance computation_graph_instance; 
-}; - -InitializedComputationGraphInstance - initialize_computation_graph_instance(ComputationGraphInstance const &, - Allocator &); - -std::unordered_map> - perform_forward_pass_for_computation_graph_instance( - InitializedComputationGraphInstance const &); - -std::unordered_map> - perform_backward_pass_for_computation_graph_instance( - InitializedComputationGraphInstance const &); - -void perform_update_pass_for_computation_graph_instance( - InitializedComputationGraphInstance const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/computation_graph_training_tensor_ref_t.dtg.toml b/lib/local-execution/include/local-execution/computation_graph_training_tensor_ref_t.dtg.toml deleted file mode 100644 index d25dc407e2..0000000000 --- a/lib/local-execution/include/local-execution/computation_graph_training_tensor_ref_t.dtg.toml +++ /dev/null @@ -1,24 +0,0 @@ -namespace = "FlexFlow" -name = "computation_graph_training_tensor_ref_t" -type = "struct" -features = [ - "eq", - "ord", - "hash", - "json", - "fmt", - "rapidcheck", -] - -includes = [ - "pcg/tensor_guid_t.dtg.h" , - "task-spec/op_training_tensor_type.dtg.h", -] - -[[fields]] -name = "tensor_guid" -type = "::FlexFlow::tensor_guid_t" - -[[fields]] -name = "tensor_type" -type = "::FlexFlow::OpTrainingTensorType" diff --git a/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h b/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h index ba5b511227..d07a8b731b 100644 --- a/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h +++ b/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h @@ -1,3 +1,5 @@ +#if 0 // FIXME (Elliott): fix cost estimator + #ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COST_ESTIMATOR_LOCAL_COST_ESTIMATOR_H #define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COST_ESTIMATOR_LOCAL_COST_ESTIMATOR_H @@ -33,3 
+35,5 @@ CostEstimator get_local_cost_estimator(RuntimeArgConfig const &); } // namespace FlexFlow #endif + +#endif diff --git a/lib/local-execution/include/local-execution/cost_estimator/tracked_allocator.h b/lib/local-execution/include/local-execution/cost_estimator/tracked_allocator.h index 0b531f9b3d..79a62b628a 100644 --- a/lib/local-execution/include/local-execution/cost_estimator/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/cost_estimator/tracked_allocator.h @@ -1,3 +1,5 @@ +#if 0 // FIXME (Elliott): fix cost estimator + #ifndef _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H @@ -33,3 +35,5 @@ size_t get_tracked_memory_usage(Allocator &wrapped_allocator); } // namespace FlexFlow #endif + +#endif diff --git a/lib/local-execution/include/local-execution/device_state_initialization.h b/lib/local-execution/include/local-execution/device_state_initialization.h new file mode 100644 index 0000000000..6abd58a32c --- /dev/null +++ b/lib/local-execution/include/local-execution/device_state_initialization.h @@ -0,0 +1,39 @@ +#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_DEVICE_STATE_INITIALIZATION_H +#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_DEVICE_STATE_INITIALIZATION_H + +#include "kernels/allocation.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/profiling_settings.dtg.h" +#include "pcg/device_id_t.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h" +#include "task-spec/ff_iteration_config.dtg.h" + +namespace FlexFlow { + +bool no_nodes_are_initialized(DynamicOpenDataflowGraph const &g); + +DynamicNodeInvocation + initialize_node(DynamicNodeInvocation const &i, + Allocator &allocator, + ProfilingSettings const &profiling_settings, + device_handle_t const &device_handle, + FFIterationConfig const &iteration_config, + OptimizerAttrs const &optimizer_attrs, + device_id_t 
device_idx); + +/** + * @brief Initialize all operators and save the per-device op state + */ +DynamicOpenDataflowGraph perform_device_state_initialization( + DynamicOpenDataflowGraph const &, + Allocator &allocator, + ProfilingSettings const &profiling_settings, + device_handle_t const &device_handle, + FFIterationConfig const &iteration_config, + OptimizerAttrs const &optimizer_attrs, + device_id_t device_idx); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/execute_task_for_layer.h b/lib/local-execution/include/local-execution/execute_task_for_layer.h deleted file mode 100644 index 587ff96687..0000000000 --- a/lib/local-execution/include/local-execution/execute_task_for_layer.h +++ /dev/null @@ -1,87 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_EXECUTE_TASK_FOR_LAYER_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_EXECUTE_TASK_FOR_LAYER_H - -#include "local-execution/local_atomic_tensor_backing.dtg.h" -#include "local-execution/local_ready_to_launch_task.dtg.h" -#include "local-execution/local_task_registry.dtg.h" -#include "local-execution/local_tensor_backing.dtg.h" -#include "pcg/layer_guid_t.dtg.h" -#include "task-spec/runtime_task_invocation/runtime_arg_config.dtg.h" -#include "task-spec/runtime_task_invocation/runtime_task_invocation.dtg.h" -#include "task-spec/symbolic/symbolic_cg_op_attrs_and_training_signature_with_shapes.dtg.h" -#include "task-spec/symbolic/training_symbolic_computation_graph.dtg.h" -#include "task-spec/symbolic/training_symbolic_computation_graph_from_cg_conversion.dtg.h" -#include "utils/units/milliseconds_t.h" - -namespace FlexFlow { - -LocalReadyToLaunchTask - prepare_runtime_task_invocation(RuntimeTaskInvocation const &, - LocalTensorBacking const &, - LocalAtomicTensorBacking const &, - Allocator &, - RuntimeArgConfig const &); - -std::optional execute_init_for_layer( - symbolic_layer_guid_t, - SymbolicCgOpAttrsAndTrainingSignatureWithShapes 
const &, - LocalTensorBacking const &, - LocalAtomicTensorBacking const &, - Allocator &, - LocalTaskRegistry const &, - RuntimeArgConfig const &); - -std::optional execute_forward_for_layer( - symbolic_layer_guid_t, - SymbolicCgOpAttrsAndTrainingSignatureWithShapes const &, - LocalTensorBacking const &, - LocalAtomicTensorBacking const &, - Allocator &, - LocalTaskRegistry const &, - RuntimeArgConfig const &); - -std::optional execute_backward_for_layer( - symbolic_layer_guid_t, - SymbolicCgOpAttrsAndTrainingSignatureWithShapes const &, - LocalTensorBacking const &, - LocalAtomicTensorBacking const &, - Allocator &, - LocalTaskRegistry const &, - RuntimeArgConfig const &); - -void execute_compute_loss(TrainingSymbolicComputationGraph const &, - LocalTensorBacking const &, - LocalAtomicTensorBacking const &, - Allocator &, - LocalTaskRegistry const &, - RuntimeArgConfig const &); - -void execute_update_for_layer(symbolic_layer_guid_t, - TrainingSymbolicComputationGraph const &, - LocalTensorBacking const &, - LocalAtomicTensorBacking const &, - OptimizerAttrs const &, - Allocator &, - RuntimeArgConfig const &); - -std::unordered_map> - execute_forward_pass( - TrainingSymbolicComputationGraphFromCgConversion const &training_cg, - LocalTensorBacking const &local_tensor_backing, - LocalAtomicTensorBacking const &local_atomic_tensor_backing, - Allocator &, - LocalTaskRegistry const &, - RuntimeArgConfig const &); - -std::unordered_map> - execute_backward_pass( - TrainingSymbolicComputationGraphFromCgConversion const &training_cg, - LocalTensorBacking const &local_tensor_backing, - LocalAtomicTensorBacking const &local_atomic_tensor_backing, - Allocator &, - LocalTaskRegistry const &, - RuntimeArgConfig const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/local_atomic_tensor_backing.dtg.toml b/lib/local-execution/include/local-execution/local_atomic_tensor_backing.dtg.toml deleted file mode 100644 index 
5fe6b05b52..0000000000 --- a/lib/local-execution/include/local-execution/local_atomic_tensor_backing.dtg.toml +++ /dev/null @@ -1,21 +0,0 @@ -namespace = "FlexFlow" -name = "LocalAtomicTensorBacking" -type = "struct" -features = [ - "eq", - "fmt", -] - -includes = [ - "kernels/accessor.h", - "local-execution/atomic_training_tensor_guid_t.dtg.h", -] - -src_includes = [ - "utils/fmt/unordered_map.h", -] - - -[[fields]] -name = "accessor_from_atomic_tensor_map" -type = "std::unordered_map<::FlexFlow::atomic_training_tensor_guid_t, ::FlexFlow::GenericTensorAccessorW>" diff --git a/lib/local-execution/include/local-execution/local_atomic_tensor_backing.h b/lib/local-execution/include/local-execution/local_atomic_tensor_backing.h deleted file mode 100644 index 11f9f3e56a..0000000000 --- a/lib/local-execution/include/local-execution/local_atomic_tensor_backing.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_ATOMIC_TENSOR_BACKING_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_ATOMIC_TENSOR_BACKING_H - -#include "kernels/allocation.h" -#include "local-execution/atomic_task_invocation.dtg.h" -#include "local-execution/local_atomic_tensor_backing.dtg.h" -#include "local-execution/tensor_slot_backing.dtg.h" -#include "task-spec/runtime_task_invocation/runtime_arg_config.dtg.h" -#include "task-spec/task_argument_accessor/task_argument_accessor.h" - -namespace FlexFlow { - -std::unordered_map - construct_tensor_slots_backing_for_binding(LocalAtomicTensorBacking const &, - AtomicTaskBinding const &); - -TaskArgumentAccessor get_task_arg_accessor_for_atomic_task_binding( - LocalAtomicTensorBacking const &, AtomicTaskBinding const &, Allocator &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/local_concrete_task_graph.dtg.toml b/lib/local-execution/include/local-execution/local_concrete_task_graph.dtg.toml deleted file mode 100644 index 
8dde33a49a..0000000000 --- a/lib/local-execution/include/local-execution/local_concrete_task_graph.dtg.toml +++ /dev/null @@ -1,24 +0,0 @@ -namespace = "FlexFlow" -name = "LocalConcreteTaskGraph" -type = "struct" -features = [ - "eq", - "ord", - "hash", - "json", - "fmt", - "rapidcheck", -] - -includes = [ - "local-execution/local_concrete_task_invocation.dtg.h", -] - -src_includes = [ - "utils/hash/unordered_set.h", - "utils/fmt/unordered_set.h", -] - -[[fields]] -name = "task_invocations" -type = "std::unordered_set<::FlexFlow::LocalConcreteTaskInvocation>" diff --git a/lib/local-execution/include/local-execution/local_concrete_task_graph.h b/lib/local-execution/include/local-execution/local_concrete_task_graph.h deleted file mode 100644 index c2f8c405b0..0000000000 --- a/lib/local-execution/include/local-execution/local_concrete_task_graph.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_CONCRETE_TASK_GRAPH_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_CONCRETE_TASK_GRAPH_H - -#include "local-execution/local_concrete_task_graph.dtg.h" - -namespace FlexFlow { - -std::vector - local_concrete_task_graph_topological_ordering( - LocalConcreteTaskGraph const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/local_concrete_task_invocation.dtg.toml b/lib/local-execution/include/local-execution/local_concrete_task_invocation.dtg.toml deleted file mode 100644 index ce0b64dd6b..0000000000 --- a/lib/local-execution/include/local-execution/local_concrete_task_invocation.dtg.toml +++ /dev/null @@ -1,17 +0,0 @@ -namespace = "FlexFlow" -name = "LocalConcreteTaskInvocation" -type = "struct" -features = [] - -includes = [ - "task-spec/task_id_t.dtg.h", - "task-spec/task_argument_accessor/task_argument_accessor.h", -] - -[[fields]] -name = "task_id" -type = "::FlexFlow::task_id_t" - -[[fields]] -name = "task_arg_accessor" -type = 
"::FlexFlow::TaskArgumentAccessor" diff --git a/lib/local-execution/include/local-execution/local_device_states_backing.dtg.toml b/lib/local-execution/include/local-execution/local_device_states_backing.dtg.toml deleted file mode 100644 index 350bf7756f..0000000000 --- a/lib/local-execution/include/local-execution/local_device_states_backing.dtg.toml +++ /dev/null @@ -1,14 +0,0 @@ -namespace = "FlexFlow" -name = "LocalDeviceStatesBacking" -type = "struct" -features = [] - -includes = [ - "task-spec/device_specific_per_device_op_state.dtg.h", - "pcg/layer_guid_t.dtg.h", - "", -] - -[[fields]] -name = "per_device_op_states" -type = "std::unordered_map<::FlexFlow::layer_guid_t, std::optional<::FlexFlow::DeviceSpecificPerDeviceOpState>>" diff --git a/lib/local-execution/include/local-execution/local_device_states_backing.h b/lib/local-execution/include/local-execution/local_device_states_backing.h deleted file mode 100644 index 5650197e44..0000000000 --- a/lib/local-execution/include/local-execution/local_device_states_backing.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_DEVICE_STATES_BACKING_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_DEVICE_STATES_BACKING_H - -#include "local-execution/local_device_states_backing.dtg.h" -#include "local-execution/local_task_argument_accessor.h" -#include "local-execution/local_task_registry.dtg.h" -#include "local-execution/local_tensor_backing.dtg.h" -#include "pcg/computation_graph.h" -#include "task-spec/per_device_op_state.h" -#include "task-spec/symbolic/symbolic_layer_training_tensor_group_signature_with_shapes.dtg.h" - -namespace FlexFlow { - -LocalDeviceStatesBacking make_local_device_states_backing_for_computation_graph( - LocalTaskRegistry const &, - std::unordered_map< - layer_guid_t, - SymbolicLayerTrainingTensorGroupSignatureWithShapes> const &, - RuntimeArgConfig const &runtime_arg_config, - LocalTensorBacking const &, - Allocator &); - 
-std::optional - get_per_device_op_state_if_exists(LocalDeviceStatesBacking const &, - layer_guid_t const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index 53026f81fd..44844a67f1 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,36 +1,32 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H +#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H +#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/tensor_slot_backing.dtg.h" +#include "kernels/accessor.h" #include "pcg/device_id_t.dtg.h" -#include "task-spec/runtime_task_invocation/runtime_arg_config.dtg.h" -#include "task-spec/task_argument_accessor/task_argument_accessor.h" +#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h" +#include "task-spec/task_argument_accessor/itask_argument_accessor.h" #include "task-spec/task_argument_accessor/task_tensor_parameter.dtg.h" #include -#include namespace FlexFlow { struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { explicit LocalTaskArgumentAccessor( Allocator const &allocator, - std::unordered_map const + std::unordered_map const &tensor_slots_backing, ProfilingSettings const &profiling_settings, device_handle_t const &ff_handle, - DeviceType kernel_device_type, - PCGOperatorAttrs const &op_attrs, + std::optional const &op_attrs, std::optional const &loss_attrs, std::optional const &per_device_op_state, FFIterationConfig const &iteration_config, std::optional const &optimizer_attrs, - size_t device_idx); + device_id_t device_idx); 
LocalTaskArgumentAccessor(LocalTaskArgumentAccessor const &) = delete; LocalTaskArgumentAccessor(LocalTaskArgumentAccessor &&) = delete; - ConcreteArgSpec const &get_concrete_arg(arg_slot_id_t) const override; - GenericTensorAccessor get_tensor(TaskTensorParameter slot, Permissions priv) const override; @@ -49,13 +45,13 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { private: Allocator allocator; - std::unordered_map + std::unordered_map tensor_slots_backing; ProfilingSettings profiling_settings; device_handle_t ff_handle; DeviceType kernel_device_type; - PCGOperatorAttrs op_attrs; + std::optional op_attrs; std::optional loss_attrs; std::optional per_device_op_state; FFIterationConfig iteration_config; diff --git a/lib/local-execution/include/local-execution/local_task_registry.dtg.toml b/lib/local-execution/include/local-execution/local_task_registry.dtg.toml deleted file mode 100644 index 056fe39ca7..0000000000 --- a/lib/local-execution/include/local-execution/local_task_registry.dtg.toml +++ /dev/null @@ -1,21 +0,0 @@ -namespace = "FlexFlow" -name = "LocalTaskRegistry" -type = "struct" -features = [ - "eq", - "fmt", - "hash" -] - -includes = [ - "task-spec/task_impl_function.dtg.h", -] - -src_includes = [ - "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h", -] - -[[fields]] -name = "task_mapping" -type = "std::unordered_map<::FlexFlow::task_id_t, ::FlexFlow::TaskImplFunction>" diff --git a/lib/local-execution/include/local-execution/local_task_registry.h b/lib/local-execution/include/local-execution/local_task_registry.h index 6adacab0a9..99c20c7d65 100644 --- a/lib/local-execution/include/local-execution/local_task_registry.h +++ b/lib/local-execution/include/local-execution/local_task_registry.h @@ -1,30 +1,34 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H -#define _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H +#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_TASK_REGISTRY_H +#define 
_FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_TASK_REGISTRY_H -#include "local-execution/local_task_registry.dtg.h" -#include "pcg/layer_attrs.dtg.h" +#include "op-attrs/computation_graph_op_attrs.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" #include "task-spec/device_specific_per_device_op_state.dtg.h" -#include "task-spec/ops/op_task_type.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "utils/units/milliseconds_t.h" +#include namespace FlexFlow { -LocalTaskRegistry construct_local_task_registry_for_layers( - std::unordered_set const &); +std::optional + get_init_task_impl_for_op_attrs(ComputationGraphOpAttrs const &); +std::optional + get_fwd_task_impl_for_op_attrs(ComputationGraphOpAttrs const &); +std::optional + get_bwd_task_impl_for_op_attrs(ComputationGraphOpAttrs const &); std::optional - call_init_task_impl(LocalTaskRegistry const &local_task_registry, - task_id_t task_id, - TaskArgumentAccessor const &arg_accessor); - + call_init_task_impl(ComputationGraphOpAttrs const &, + TaskArgumentAccessor const &); std::optional - call_fwb_task_impl(LocalTaskRegistry const &local_task_registry, - task_id_t task_id, - TaskArgumentAccessor const &arg_accessor); - -void call_generic_task_impl(LocalTaskRegistry const &local_task_registry, - task_id_t task_id, - TaskArgumentAccessor const &arg_accessor); + call_fwd_task_impl(ComputationGraphOpAttrs const &, + TaskArgumentAccessor const &); +std::optional + call_bwd_task_impl(ComputationGraphOpAttrs const &, + TaskArgumentAccessor const &); +void call_update_task_impl(OptimizerAttrs const &, + TaskArgumentAccessor const &); +void call_loss_task_impl(TaskArgumentAccessor const &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/operator_task_set.dtg.toml b/lib/local-execution/include/local-execution/operator_task_set.dtg.toml deleted file mode 100644 index b074d981d1..0000000000 --- a/lib/local-execution/include/local-execution/operator_task_set.dtg.toml +++ 
/dev/null @@ -1,25 +0,0 @@ -namespace = "FlexFlow" -name = "OperatorTaskSet" -type = "struct" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "local-execution/task_id_with_noop_default_t.dtg.h" -] - -[[fields]] -name = "init_task" -type = "::FlexFlow::task_id_with_noop_default_t" - -[[fields]] -name = "fwd_task" -type = "::FlexFlow::task_id_with_noop_default_t" - -[[fields]] -name = "bwd_task" -type = "::FlexFlow::task_id_with_noop_default_t" diff --git a/lib/local-execution/include/local-execution/operator_task_set.h b/lib/local-execution/include/local-execution/operator_task_set.h deleted file mode 100644 index b94ed9ac47..0000000000 --- a/lib/local-execution/include/local-execution/operator_task_set.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPERATOR_TASK_SET_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPERATOR_TASK_SET_H - -#include "local-execution/operator_task_set.dtg.h" -#include "op-attrs/computation_graph_op_attrs.dtg.h" -#include "task-spec/ops/op_task_type.dtg.h" -#include "utils/bidict/bidict.h" - -namespace FlexFlow { - -bidict - get_map_from_task_type_to_task(OperatorTaskSet const &); -std::unordered_set - get_all_tasks_in_task_set(OperatorTaskSet const &); - -task_id_with_noop_default_t - get_task_for_task_type(OperatorTaskSet const &op_task_set, - OpTaskType task_type); - -OperatorTaskSet - get_task_set_for_operator(ComputationGraphOpAttrs const &op_attrs); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/per_device_op_state_initialization.h b/lib/local-execution/include/local-execution/per_device_op_state_initialization.h deleted file mode 100644 index 31f8958a1c..0000000000 --- a/lib/local-execution/include/local-execution/per_device_op_state_initialization.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_PER_DEVICE_OP_STATE_INITIALIZATION_H -#define 
_FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_PER_DEVICE_OP_STATE_INITIALIZATION_H - -#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h" -namespace FlexFlow { - -DynamicOpenDataflowGraph perform_per_device_op_state_initialization( - DynamicOpenDataflowGraph const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/task_execution.h b/lib/local-execution/include/local-execution/task_execution.h index 215f1dbc08..61a57dbfa0 100644 --- a/lib/local-execution/include/local-execution/task_execution.h +++ b/lib/local-execution/include/local-execution/task_execution.h @@ -2,32 +2,32 @@ #define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_TASK_EXECUTION_H #include "kernels/profiling_settings.dtg.h" -#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" #include "task-spec/per_device_op_state.dtg.h" #include "task-spec/task_argument_accessor/task_argument_accessor.h" +#include "utils/units/milliseconds_t.h" namespace FlexFlow { TaskArgumentAccessor make_task_argument_accessor_for_invocation( DynamicNodeInvocation const &invocation, + Allocator &allocator, ProfilingSettings const &profiling_settings, - DeviceType kernel_device_type, - PCGOperatorAttrs op_attrs, - std::optional const &loss_attrs, + device_handle_t const &ff_handle, std::optional const &per_device_op_state, - FFIterationConfig iteration_config, - std::optional const &optimizer_attrs); + FFIterationConfig const &iteration_config, + std::optional const &optimizer_attrs, + device_id_t device_idx); -void execute_dynamic_node_invocation( +std::optional execute_dynamic_node_invocation( DynamicNodeInvocation const &invocation, + Allocator &allocator, ProfilingSettings const &profiling_settings, - DeviceType kernel_device_type, - PCGOperatorAttrs op_attrs, - std::optional const &loss_attrs, + device_handle_t const &ff_handle, std::optional const &per_device_op_state, - 
FFIterationConfig iteration_config, - std::optional const &optimizer_attrs); + FFIterationConfig const &iteration_config, + std::optional const &optimizer_attrs, + device_id_t device_idx); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/task_id_with_noop_default_t.h b/lib/local-execution/include/local-execution/task_id_with_noop_default_t.h deleted file mode 100644 index 72e151bcc8..0000000000 --- a/lib/local-execution/include/local-execution/task_id_with_noop_default_t.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_TASK_ID_WITH_NOOP_DEFAULT_T_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_TASK_ID_WITH_NOOP_DEFAULT_T_H - -#include "local-execution/task_id_with_noop_default_t.dtg.h" - -namespace FlexFlow { - -task_id_with_noop_default_t make_default_noop_task(); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/tensor_allocation.h b/lib/local-execution/include/local-execution/tensor_allocation.h index 67acb3de70..76fb3bbee6 100644 --- a/lib/local-execution/include/local-execution/tensor_allocation.h +++ b/lib/local-execution/include/local-execution/tensor_allocation.h @@ -9,12 +9,14 @@ namespace FlexFlow { bool no_tensors_are_allocated(DynamicOpenDataflowGraph const &); bool all_tensors_are_allocated(DynamicOpenDataflowGraph const &); +bool tensors_are_ready_for_allocation(DynamicOpenDataflowGraph const &g); + DynamicValueAttrs perform_tensor_allocation_for_value(DynamicValueAttrs const &, Allocator &); DynamicOpenDataflowGraph perform_tensor_allocation( DynamicOpenDataflowGraph const &, - std::unordered_map const + std::unordered_map const &preallocated, Allocator &); diff --git a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc new file mode 100644 index 0000000000..e251fafe5f 
--- /dev/null +++ b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc @@ -0,0 +1,243 @@ +#include "local-execution/computation_graph_instance/computation_graph_instance.h" +#include "local-execution/device_state_initialization.h" +#include "local-execution/task_execution.h" +#include "local-execution/tensor_allocation.h" +#include "pcg/optimizer_attrs.h" +#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" +#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h" +#include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h" +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "task-spec/dynamic_graph/loss_insertion.h" +#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.h" +#include "task-spec/dynamic_graph/pass_expansion.h" +#include "task-spec/dynamic_graph/update_insertion.h" +#include "task-spec/per_device_op_state.h" +#include "task-spec/task_argument_accessor/task_argument_accessor.h" +#include "utils/containers/transform.h" +#include "utils/containers/unordered_map_from_pairs.h" +#include "utils/graph/digraph/algorithms/get_topological_ordering.h" +#include "utils/optional.h" +#include + +namespace FlexFlow { + +ComputationGraphInstance::ComputationGraphInstance( + std::vector const &execution_order, + Allocator &allocator, + OptimizerAttrs const &optimizer_attrs, + std::optional logit_grad_tensor) + : execution_order(execution_order), allocator(allocator), + optimizer_attrs(optimizer_attrs), logit_grad_tensor(logit_grad_tensor) {} + +std::vector const & + ComputationGraphInstance::get_execution_order() const { + return this->execution_order; +} +Allocator &ComputationGraphInstance::get_allocator() const { + return this->allocator; +} +OptimizerAttrs const &ComputationGraphInstance::get_optimizer_attrs() const { + return this->optimizer_attrs; +} +void 
ComputationGraphInstance::update_optimizer_attrs_for_next_iter() { + this->optimizer_attrs = + get_optimizer_attrs_for_next_iter(this->optimizer_attrs); +} +std::optional + ComputationGraphInstance::get_loss_tensor_accessor() const { + return this->logit_grad_tensor; +} + +static GenericTensorAccessorW + get_loss_tensor_accessor(DynamicOpenDataflowGraph const &dg, + DynamicValueAttrs const &value) { + std::optional accessor = + assert_unwrap(find_output_value_attrs(dg, value.tensor_guid, value.role)) + .accessor; + return assert_unwrap(accessor).require_write(); +} + +ComputationGraphInstance create_computation_graph_instance( + ComputationGraph const &cg, + OptimizerAttrs const &optimizer_attrs, + std::optional const &loss_attrs, + std::optional label_tensor, + std::optional logit_tensor, + std::unordered_map const + &input_tensors, + Allocator &allocator, + ProfilingSettings const &profiling_settings, + device_handle_t const &device_handle, + FFIterationConfig const &iteration_config, + device_id_t device_idx) { + DynamicOpenDataflowGraph dg = make_dynamic_open_dataflow_graph_from_cg(cg); + dg = perform_pass_expansion(dg); + + std::unordered_map inputs = + input_tensors; + std::optional logit_grad_value; + if (loss_attrs.has_value()) { + auto [loss_inserted_dg, label_v, logit_grad_v] = perform_loss_insertion( + dg, + assert_unwrap(loss_attrs), + dynamic_tensor_guid_t{assert_unwrap(logit_tensor)}); + dg = loss_inserted_dg; + logit_grad_value = logit_grad_v; + inputs.insert(std::pair{label_v, assert_unwrap(label_tensor)}); + } + + dg = perform_update_insertion(dg, optimizer_attrs); + dg = perform_tensor_allocation(dg, inputs, allocator); + + std::optional logit_grad_tensor = + transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) { + return get_loss_tensor_accessor(dg, lgv); + }); + + dg = perform_device_state_initialization(dg, + allocator, + profiling_settings, + device_handle, + iteration_config, + optimizer_attrs, + device_idx); + + // Compute the 
topological ordering of the graph + auto [kwarg_graph, node_map] = + labelled_open_kwarg_dataflow_graph_from_dynamic_open_dataflow_graph(dg); + std::vector node_topo_order = get_topological_ordering(kwarg_graph); + std::vector invocation_topo_order = transform( + node_topo_order, [&](Node node) { return node_map.at_l(node); }); + + return ComputationGraphInstance{ + invocation_topo_order, allocator, optimizer_attrs, logit_grad_tensor}; +} + +static std::unordered_map> + execute_dynamic_node_invocation_set( + std::vector const &invocations, + Allocator &allocator, + OptimizerAttrs const &optimizer_attrs, + ProfilingSettings const &profiling_settings, + device_handle_t const &ff_handle, + FFIterationConfig iteration_config, + device_id_t device_idx) { + return unordered_map_from_pairs( + transform(invocations, [&](DynamicNodeInvocation const &invocation) { + std::optional timing = execute_dynamic_node_invocation( + /*invocation=*/invocation, + /*allocator=*/allocator, + /*profiling_settings=*/profiling_settings, + /*ff_handle=*/ff_handle, + /*per_device_op_state=*/ + transform(invocation.node_attrs.per_device_op_state, + [&](DeviceSpecificPerDeviceOpState const &op_state) { + return get_device_state_from_device_specific( + op_state, device_idx); + }), + /*iteration_config=*/iteration_config, + /*optimizer_attrs=*/optimizer_attrs, + /*device_idx=*/device_idx); + return std::pair{invocation.node_attrs.layer_guid, timing}; + })); +} + +std::unordered_map> + perform_all_passes_for_computation_graph_instance( + ComputationGraphInstance &instance, + ProfilingSettings const &profiling_settings, + device_handle_t const &ff_handle, + FFIterationConfig iteration_config, + device_id_t device_idx) { + std::vector execution_order = + instance.get_execution_order(); + std::unordered_map> + result = execute_dynamic_node_invocation_set( + /*invocations=*/execution_order, + /*allocator=*/instance.get_allocator(), + /*optimizer_attrs=*/instance.get_optimizer_attrs(), + 
/*profiling_settings=*/profiling_settings, + /*ff_handle=*/ff_handle, + /*iteration_config=*/iteration_config, + /*device_idx=*/device_idx); + instance.update_optimizer_attrs_for_next_iter(); + return result; +} + +std::unordered_map> + perform_forward_pass_for_computation_graph_instance( + ComputationGraphInstance const &instance, + ProfilingSettings const &profiling_settings, + device_handle_t const &ff_handle, + FFIterationConfig iteration_config, + device_id_t device_idx) { + std::vector execution_order = + filter(instance.get_execution_order(), + [](DynamicNodeInvocation const &invocation) { + DynamicTaskType task_type = + assert_unwrap(invocation.node_attrs.task_type); + return task_type == DynamicTaskType::FWD; + }); + + return execute_dynamic_node_invocation_set( + /*invocations=*/execution_order, + /*allocator=*/instance.get_allocator(), + /*optimizer_attrs=*/instance.get_optimizer_attrs(), + /*profiling_settings=*/profiling_settings, + /*ff_handle=*/ff_handle, + /*iteration_config=*/iteration_config, + /*device_idx=*/device_idx); +} + +std::unordered_map> + perform_backward_pass_for_computation_graph_instance( + ComputationGraphInstance const &instance, + ProfilingSettings const &profiling_settings, + device_handle_t const &ff_handle, + FFIterationConfig iteration_config, + device_id_t device_idx) { + std::vector execution_order = + filter(instance.get_execution_order(), + [](DynamicNodeInvocation const &invocation) { + DynamicTaskType task_type = + assert_unwrap(invocation.node_attrs.task_type); + return task_type == DynamicTaskType::BWD; + }); + + return execute_dynamic_node_invocation_set( + /*invocations=*/execution_order, + /*allocator=*/instance.get_allocator(), + /*optimizer_attrs=*/instance.get_optimizer_attrs(), + /*profiling_settings=*/profiling_settings, + /*ff_handle=*/ff_handle, + /*iteration_config=*/iteration_config, + /*device_idx=*/device_idx); +} + +void perform_update_pass_for_computation_graph_instance( + ComputationGraphInstance 
&instance, + ProfilingSettings const &profiling_settings, + device_handle_t const &ff_handle, + FFIterationConfig iteration_config, + device_id_t device_idx) { + std::vector execution_order = + filter(instance.get_execution_order(), + [](DynamicNodeInvocation const &invocation) { + DynamicTaskType task_type = + assert_unwrap(invocation.node_attrs.task_type); + return task_type == DynamicTaskType::UPD; + }); + + execute_dynamic_node_invocation_set( + /*invocations=*/execution_order, + /*allocator=*/instance.get_allocator(), + /*optimizer_attrs=*/instance.get_optimizer_attrs(), + /*profiling_settings=*/profiling_settings, + /*ff_handle=*/ff_handle, + /*iteration_config=*/iteration_config, + /*device_idx=*/device_idx); + instance.update_optimizer_attrs_for_next_iter(); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/computation_graph_instance/initialized_computation_graph_instance.cc b/lib/local-execution/src/local-execution/computation_graph_instance/initialized_computation_graph_instance.cc deleted file mode 100644 index a9f7018bb2..0000000000 --- a/lib/local-execution/src/local-execution/computation_graph_instance/initialized_computation_graph_instance.cc +++ /dev/null @@ -1,19 +0,0 @@ -#include "local-execution/computation_graph_instance/initialized_computation_graph_instance.h" - -namespace FlexFlow { - -std::unordered_map> - perform_forward_pass_for_computation_graph_instance( - InitializedComputationGraphInstance const &instance) { - - NOT_IMPLEMENTED(); -} - -std::unordered_map> - perform_backward_pass_for_computation_graph_instance( - InitializedComputationGraphInstance const &instance) { - - NOT_IMPLEMENTED(); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc b/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc index fc181d26b0..79e2dcafb2 100644 --- 
a/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc +++ b/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc @@ -1,3 +1,5 @@ +#if 0 // FIXME (Elliott): fix cost estimator + #include "local-execution/cost_estimator/local_cost_estimator.h" #include "compiler/machine_mapping/machine_view.dtg.h" #include "kernels/create_local_allocator_for_device_type.h" @@ -151,3 +153,5 @@ CostEstimator } } // namespace FlexFlow + +#endif diff --git a/lib/local-execution/src/local-execution/cost_estimator/tracked_allocator.cc b/lib/local-execution/src/local-execution/cost_estimator/tracked_allocator.cc index 3ac7352e59..2930ba0c86 100644 --- a/lib/local-execution/src/local-execution/cost_estimator/tracked_allocator.cc +++ b/lib/local-execution/src/local-execution/cost_estimator/tracked_allocator.cc @@ -1,3 +1,5 @@ +#if 0 // FIXME (Elliott): fix cost estimator + #include "local-execution/tracked_allocator.h" #include "kernels/device.h" @@ -33,3 +35,5 @@ Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { } } // namespace FlexFlow + +#endif diff --git a/lib/local-execution/src/local-execution/device_state_initialization.cc b/lib/local-execution/src/local-execution/device_state_initialization.cc new file mode 100644 index 0000000000..b5462b4b78 --- /dev/null +++ b/lib/local-execution/src/local-execution/device_state_initialization.cc @@ -0,0 +1,84 @@ +#include "local-execution/device_state_initialization.h" +#include "local-execution/local_task_registry.h" +#include "local-execution/task_execution.h" +#include "op-attrs/computation_graph_op_attrs.dtg.h" +#include "op-attrs/computation_graph_op_attrs.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" +#include "utils/containers/all_are_true.h" +#include "utils/containers/transform.h" +#include "utils/optional.h" +#include + +namespace FlexFlow { + +bool no_nodes_are_initialized(DynamicOpenDataflowGraph const &g) { + return all_are_true( + 
transform(get_dynamic_nodes(g), [](DynamicNodeAttrs const &n) -> bool { + return !n.per_device_op_state.has_value(); + })); +} + +DynamicNodeInvocation + initialize_node(DynamicNodeInvocation const &i, + Allocator &allocator, + ProfilingSettings const &profiling_settings, + device_handle_t const &device_handle, + FFIterationConfig const &iteration_config, + OptimizerAttrs const &optimizer_attrs, + device_id_t device_idx) { + if (!i.node_attrs.op_attrs.has_value() || + !i.node_attrs.op_attrs.value().is_pcg_op()) { + return i; + } + + // Get op + ComputationGraphOpAttrs op_attrs = + assert_unwrap(compgraph_op_attrs_from_pcg_op_attrs( + assert_unwrap(i.node_attrs.op_attrs).require_pcg_op())); + + // Prepare arguments + TaskArgumentAccessor arg_accessor = + make_task_argument_accessor_for_invocation( + /*invocation=*/i, + /*allocator=*/allocator, + /*profiling_settings=*/profiling_settings, + /*ff_handle=*/device_handle, + /*per_device_op_state=*/std::nullopt, + /*iteration_config=*/iteration_config, + /*optimizer_attrs=*/optimizer_attrs, + /*device_idx=*/device_idx); + + // Run task init + std::optional per_device_op_state = + call_init_task_impl(op_attrs, arg_accessor); + + DynamicNodeInvocation result = i; + result.node_attrs.per_device_op_state = per_device_op_state; + return result; +} + +DynamicOpenDataflowGraph perform_device_state_initialization( + DynamicOpenDataflowGraph const &dg, + Allocator &allocator, + ProfilingSettings const &profiling_settings, + device_handle_t const &device_handle, + FFIterationConfig const &iteration_config, + OptimizerAttrs const &optimizer_attrs, + device_id_t device_idx) { + + ASSERT(no_nodes_are_initialized(dg)); + DynamicOpenDataflowGraph result = transform_dynamic_invocation_set( + dg, [&](DynamicNodeInvocation const &invocation) { + return initialize_node(invocation, + allocator, + profiling_settings, + device_handle, + iteration_config, + optimizer_attrs, + device_idx); + }); + + return result; +} + +} // namespace FlexFlow 
diff --git a/lib/local-execution/src/local-execution/execute_task_for_layer.cc b/lib/local-execution/src/local-execution/execute_task_for_layer.cc deleted file mode 100644 index 5a7ea74e52..0000000000 --- a/lib/local-execution/src/local-execution/execute_task_for_layer.cc +++ /dev/null @@ -1,274 +0,0 @@ -#include "local-execution/execute_task_for_layer.h" -#include "local-execution/atomic_task_binding.dtg.h" -#include "local-execution/local_atomic_tensor_backing.h" -#include "local-execution/local_ready_to_launch_task.dtg.h" -#include "local-execution/local_task_registry.h" -#include "local-execution/local_tensor_backing.h" -#include "task-spec/fwb_op_task_type.h" -#include "task-spec/runtime_task_invocation/runtime_task_invocation.dtg.h" -#include "task-spec/symbolic/training_symbolic_computation_graph.h" -#include "utils/containers/flatmap.h" - -namespace FlexFlow { - -LocalReadyToLaunchTask prepare_runtime_task_invocation( - RuntimeTaskInvocation const &runtime_task_invocation, - LocalTensorBacking const &local_tensor_backing, - LocalAtomicTensorBacking const &local_atomic_tensor_backing, - Allocator &allocator, - RuntimeArgConfig const &runtime_arg_config) { - - AtomicTaskInvocation atomic_task_invocation = - lower_local_runtime_task_invocation_to_atomic_task_invocation( - local_tensor_backing, runtime_task_invocation, runtime_arg_config); - - TaskArgumentAccessor task_arg_accessor = - get_task_arg_accessor_for_atomic_task_invocation( - local_atomic_tensor_backing, atomic_task_invocation, allocator); - - return LocalReadyToLaunchTask{ - atomic_task_invocation.task_id, - task_arg_accessor, - }; -} - -std::optional execute_init_for_layer( - symbolic_layer_guid_t symbolic_layer_guid, - TrainingSymbolicComputationGraph const &g, - LocalTensorBacking const &tensor_backing, - LocalAtomicTensorBacking const &atomic_tensor_backing, - Allocator &allocator, - LocalTaskRegistry const &task_registry, - RuntimeArgConfig const &runtime_arg_config) { - - 
SymbolicCgOpAttrsAndTrainingSignatureWithShapes attrs_and_signature = - get_attrs_and_signature_for_layer(g, symbolic_layer_guid); - - RuntimeTaskInvocation runtime_task_invocation = ({ - std::optional maybe_runtime_task_invocation = - get_init_runtime_task_invocation_for_layer(symbolic_layer_guid, - attrs_and_signature); - if (!maybe_runtime_task_invocation.has_value()) { - return std::nullopt; - } - maybe_runtime_task_invocation.value(); - }); - - LocalReadyToLaunchTask prepared_task = - prepare_runtime_task_invocation(runtime_task_invocation, - tensor_backing, - atomic_tensor_backing, - allocator, - runtime_arg_config); - - std::optional per_device_op_state = - call_init_task_impl(task_registry, - prepared_task.task_id, - prepared_task.task_arg_accessor); - - return per_device_op_state; -} - -static std::optional execute_fwb_for_layer( - symbolic_layer_guid_t symbolic_layer_guid, - SymbolicCgOpAttrsAndTrainingSignatureWithShapes const &attrs_and_signature, - LocalTensorBacking const &local_tensor_backing, - LocalAtomicTensorBacking const &local_atomic_tensor_backing, - Allocator &allocator, - LocalTaskRegistry const &local_task_registry, - RuntimeArgConfig const &runtime_arg_config, - FwbOpTaskType task_type) { - - OpTaskType op_task_type = - assert_unwrap(op_task_type_from_fwb_op_task_type(task_type)); - - RuntimeTaskInvocation runtime_task_invocation = ({ - std::optional maybe_runtime_task_invocation = - get_runtime_task_invocation_for_layer_and_type( - symbolic_layer_guid, attrs_and_signature, op_task_type); - if (!maybe_runtime_task_invocation.has_value()) { - return std::nullopt; - } - maybe_runtime_task_invocation.value(); - }); - - task_id_t task_id = runtime_task_invocation.task_id; - - RuntimeTaskBinding runtime_task_binding = runtime_task_invocation.binding; - - AtomicTaskBinding atomic_task_binding = - lower_local_runtime_task_binding_to_atomic_task_binding( - local_tensor_backing, runtime_task_binding, runtime_arg_config); - - TaskArgumentAccessor 
task_arg_accessor = - get_task_arg_accessor_for_atomic_task_binding( - local_atomic_tensor_backing, atomic_task_binding, allocator); - - std::optional execution_time = - call_fwb_task_impl(local_task_registry, task_id, task_arg_accessor); - - return execution_time; -} - -std::optional execute_forward_for_layer( - symbolic_layer_guid_t layer, - SymbolicCgOpAttrsAndTrainingSignatureWithShapes const &attrs_and_signature, - LocalTensorBacking const &tensor_backing, - LocalAtomicTensorBacking const &atomic_tensor_backing, - Allocator &allocator, - LocalTaskRegistry const &task_registry, - RuntimeArgConfig const &runtime_arg_config) { - - return execute_fwb_for_layer(layer, - attrs_and_signature, - tensor_backing, - atomic_tensor_backing, - allocator, - task_registry, - runtime_arg_config, - FwbOpTaskType::FWD); -} - -std::optional execute_backward_for_layer( - symbolic_layer_guid_t layer, - SymbolicCgOpAttrsAndTrainingSignatureWithShapes const &attrs_and_signature, - LocalTensorBacking const &tensor_backing, - LocalAtomicTensorBacking const &atomic_tensor_backing, - Allocator &allocator, - LocalTaskRegistry const &task_registry, - RuntimeArgConfig const &runtime_arg_config) { - - return execute_fwb_for_layer(layer, - attrs_and_signature, - tensor_backing, - atomic_tensor_backing, - allocator, - task_registry, - runtime_arg_config, - FwbOpTaskType::BWD); -} - -void execute_compute_loss(LossAttrs const &loss_attrs, - symbolic_forward_tensor_guid_t logit_fwd_tensor, - symbolic_gradient_tensor_guid_t logit_grad_tensor, - symbolic_loss_tensor_guid_t loss_tensor, - LocalTensorBacking const &tensor_backing, - LocalAtomicTensorBacking const &atomic_tensor_backing, - Allocator &allocator, - LocalTaskRegistry const &task_registry, - RuntimeArgConfig const &runtime_arg_config) { - - RuntimeTaskInvocation invocation = get_compute_loss_runtime_task_invocation( - loss_attrs, logit_fwd_tensor, logit_grad_tensor, loss_tensor); - - LocalReadyToLaunchTask prepared_task = - 
prepare_runtime_task_invocation(invocation, - tensor_backing, - atomic_tensor_backing, - allocator, - runtime_arg_config); - - call_generic_task_impl( - task_registry, prepared_task.task_id, prepared_task.task_arg_accessor); -} - -void execute_update_for_layer( - symbolic_layer_guid_t symbolic_layer_guid, - TrainingSymbolicComputationGraph const &graph, - LocalTensorBacking const &tensor_backing, - LocalAtomicTensorBacking const &atomic_tensor_backing, - OptimizerAttrs const &optimizer_attrs, - Allocator &allocator, - LocalTaskRegistry const &task_registry, - RuntimeArgConfig const &runtime_arg_config) { - - SymbolicTrainingLayerAttrsPlusContext attrs_plus_context = - get_symbolic_training_layer_attrs_plus_context(graph, - symbolic_layer_guid); - - RuntimeTaskInvocation invocation = ({ - std::optional maybe_invocation = - get_update_runtime_task_invocation_for_layer(attrs_plus_context, - optimizer_attrs); - if (!maybe_invocation.has_value()) { - return; - } - maybe_invocation.value(); - }); - - LocalReadyToLaunchTask prepared_task = - prepare_runtime_task_invocation(invocation, - tensor_backing, - atomic_tensor_backing, - allocator, - runtime_arg_config); - - call_generic_task_impl( - task_registry, prepared_task.task_id, prepared_task.task_arg_accessor); -} - -std::unordered_map> - execute_forward_pass( - TrainingSymbolicComputationGraphFromCgConversion const &training_cg, - LocalTensorBacking const &local_tensor_backing, - LocalAtomicTensorBacking const &local_atomic_tensor_backing, - Allocator &allocator, - LocalTaskRegistry const &local_task_registry, - RuntimeArgConfig const &runtime_arg_config) { - std::unordered_map> - per_layer_elapsed_time; - - for (symbolic_layer_guid_t symbolic_layer_guid : - symbolic_cg_topological_ordering( - training_cg.training_symbolic_computation_graph)) { - - std::optional elapsed_time = execute_forward_for_layer( - symbolic_layer_guid, - training_cg.training_symbolic_computation_graph, - local_tensor_backing, - 
local_atomic_tensor_backing, - allocator, - local_task_registry, - runtime_arg_config); - - layer_guid_t layer_guid = - training_cg.layer_mapping.at_r(symbolic_layer_guid); - per_layer_elapsed_time.insert({layer_guid, elapsed_time}); - } - - return per_layer_elapsed_time; -} - -std::unordered_map> - execute_backward_pass( - TrainingSymbolicComputationGraphFromCgConversion const &training_cg, - LocalTensorBacking const &local_tensor_backing, - LocalAtomicTensorBacking const &local_atomic_tensor_backing, - Allocator &allocator, - LocalTaskRegistry const &local_task_registry, - RuntimeArgConfig const &runtime_arg_config) { - std::unordered_map> - per_layer_elapsed_time; - - for (symbolic_layer_guid_t symbolic_layer_guid : - reversed(symbolic_cg_topological_ordering( - training_cg.training_symbolic_computation_graph))) { - - std::optional elapsed_time = execute_backward_for_layer( - symbolic_layer_guid, - training_cg.training_symbolic_computation_graph, - local_tensor_backing, - local_atomic_tensor_backing, - allocator, - local_task_registry, - runtime_arg_config); - - layer_guid_t layer_guid = - training_cg.layer_mapping.at_r(symbolic_layer_guid); - per_layer_elapsed_time.insert({layer_guid, elapsed_time}); - } - - return per_layer_elapsed_time; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_atomic_tensor_backing.cc b/lib/local-execution/src/local-execution/local_atomic_tensor_backing.cc deleted file mode 100644 index c43fd6bdf3..0000000000 --- a/lib/local-execution/src/local-execution/local_atomic_tensor_backing.cc +++ /dev/null @@ -1,35 +0,0 @@ -#include "local-execution/local_atomic_tensor_backing.h" -#include "local-execution/local_task_argument_accessor.h" -#include "utils/containers/map_values.h" - -namespace FlexFlow { - -std::unordered_map - construct_tensor_slots_backing_for_binding( - LocalAtomicTensorBacking const &tensor_backing, - AtomicTaskBinding const &binding) { - return map_values(binding.tensor_bindings, - 
[&](atomic_training_tensor_guid_t t) -> TensorSlotBacking { - return TensorSlotBacking{ - tensor_backing.accessor_from_atomic_tensor_map.at(t), - }; - }); -} - -TaskArgumentAccessor get_task_arg_accessor_for_atomic_task_invocation( - LocalAtomicTensorBacking const &local_tensor_backing, - AtomicTaskBinding const &atomic_task_binding, - Allocator &allocator) { - - std::unordered_map - tensor_slots_backing = construct_tensor_slots_backing_for_binding( - local_tensor_backing, atomic_task_binding); - - std::unordered_map arg_slots_backing = - atomic_task_binding.arg_bindings; - - return TaskArgumentAccessor::create( - allocator, tensor_slots_backing, arg_slots_backing, 0); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_concrete_task_graph.cc b/lib/local-execution/src/local-execution/local_concrete_task_graph.cc deleted file mode 100644 index 9806758f06..0000000000 --- a/lib/local-execution/src/local-execution/local_concrete_task_graph.cc +++ /dev/null @@ -1,12 +0,0 @@ -#include "local-execution/local_concrete_task_graph.h" - -namespace FlexFlow { - -std::vector - local_concrete_task_graph_topological_ordering( - LocalConcreteTaskGraph const &) { - - NOT_IMPLEMENTED(); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_device_states_backing.cc b/lib/local-execution/src/local-execution/local_device_states_backing.cc deleted file mode 100644 index 1dc34b120d..0000000000 --- a/lib/local-execution/src/local-execution/local_device_states_backing.cc +++ /dev/null @@ -1,48 +0,0 @@ -#include "local-execution/local_device_states_backing.h" -#include "local-execution/local_task_registry.h" -#include "local-execution/local_tensor_backing.h" -#include "task-spec/task_signature_impl.h" -#include "utils/containers/generate_map.h" -#include "utils/containers/keys.h" -#include "utils/overload.h" - -namespace FlexFlow { - -// LocalDeviceStatesBacking -// 
make_local_device_states_backing_for_computation_graph( -// LocalTaskRegistry const &task_registry, -// std::unordered_map const &layers, -// std::unordered_map const -// &op_attrs, RuntimeArgConfig const &runtime_arg_config, LocalTensorBacking -// const &local_tensor_backing, Allocator &allocator) { -// -// std::unordered_map> -// per_device_op_states = generate_map( -// keys(layers), -// [&](layer_guid_t const &layer_guid) -> -// std::optional { -// return create_per_device_op_state( -// task_registry, -// local_tensor_backing, -// runtime_arg_config, -// allocator, -// op_attrs, -// layers.at(layer_guid)); -// }); -// -// return LocalDeviceStatesBacking{ -// per_device_op_states, -// }; -// } - -// std::optional -// get_per_device_op_state_if_exists( -// LocalArgsBacking const &local_args_backing, -// layer_guid_t const &layer_guid) { -// -// return local_args_backing.per_device_op_states.at(layer_guid); -// } - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_task_argument_accessor.cc b/lib/local-execution/src/local-execution/local_task_argument_accessor.cc index c9bdb84fbf..8a4df61d17 100644 --- a/lib/local-execution/src/local-execution/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local-execution/local_task_argument_accessor.cc @@ -1,45 +1,46 @@ #include "local-execution/local_task_argument_accessor.h" +#include "kernels/accessor.h" +#include "pcg/device_id.h" #include "pcg/device_id_t.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/transform.h" -#include "utils/hash/pair.h" -#include "utils/overload.h" +#include "utils/exception.h" +#include "utils/optional.h" namespace FlexFlow { LocalTaskArgumentAccessor::LocalTaskArgumentAccessor( Allocator const &allocator, - std::unordered_map const + std::unordered_map const &tensor_slots_backing, ProfilingSettings const &profiling_settings, device_handle_t const &ff_handle, - DeviceType kernel_device_type, - PCGOperatorAttrs const &op_attrs, 
+ std::optional const &op_attrs, std::optional const &loss_attrs, std::optional const &per_device_op_state, FFIterationConfig const &iteration_config, std::optional const &optimizer_attrs, - size_t device_idx) + device_id_t device_idx) : allocator(allocator), tensor_slots_backing(tensor_slots_backing), profiling_settings(profiling_settings), ff_handle(ff_handle), - kernel_device_type(kernel_device_type), op_attrs(op_attrs), - loss_attrs(loss_attrs), per_device_op_state(per_device_op_state), + op_attrs(op_attrs), loss_attrs(loss_attrs), + per_device_op_state(per_device_op_state), iteration_config(iteration_config), optimizer_attrs(optimizer_attrs), - device_idx(make_device_id_t_from_idx(nonnegative_int{device_idx}, - kernel_device_type)) {} + device_idx(device_idx) {} -GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( - TensorSlotName slot, - Permissions priv, - TrainingTensorType tensor_type) const { - GenericTensorAccessorW tensor_backing = - this->tensor_slots_backing.at(slot_tensor_type).require_single(); +GenericTensorAccessor + LocalTaskArgumentAccessor::get_tensor(TaskTensorParameter slot, + Permissions priv) const { + DynamicTensorAccessor tensor_backing = this->tensor_slots_backing.at(slot); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = - read_only_accessor_from_write_accessor(tensor_backing); - return readonly_tensor_backing; + if (tensor_backing.is_read()) { + return tensor_backing.require_read(); + } else { + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor( + tensor_backing.require_write()); + return readonly_tensor_backing; + } } else if (priv == Permissions::RW || priv == Permissions::WO) { - return tensor_backing; + return tensor_backing.require_write(); } else { PANIC(fmt::format("Unhandled privilege mode {}", priv)); } @@ -54,11 +55,11 @@ device_handle_t LocalTaskArgumentAccessor::get_ff_handle() const { } DeviceType 
LocalTaskArgumentAccessor::get_kernel_device_type() const { - return this->kernel_device_type; + return get_device_type(this->device_idx); } PCGOperatorAttrs LocalTaskArgumentAccessor::get_op_attrs() const { - return this->op_attrs; + return assert_unwrap(this->op_attrs); } LossAttrs LocalTaskArgumentAccessor::get_loss_attrs() const { @@ -81,7 +82,7 @@ Allocator LocalTaskArgumentAccessor::get_allocator() const { return this->allocator; } -size_t LocalTaskArgumentAccessor::get_device_idx() const { +device_id_t LocalTaskArgumentAccessor::get_device_idx() const { return this->device_idx; } diff --git a/lib/local-execution/src/local-execution/local_task_registry.cc b/lib/local-execution/src/local-execution/local_task_registry.cc index fb6936425d..abf6595cf4 100644 --- a/lib/local-execution/src/local-execution/local_task_registry.cc +++ b/lib/local-execution/src/local-execution/local_task_registry.cc @@ -1,49 +1,166 @@ #include "local-execution/local_task_registry.h" -#include "local-execution/operator_task_set.h" -#include "pcg/computation_graph.h" -#include "task-spec/task_signature_impl.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/filtrans.h" -#include "utils/containers/flatmap.h" -#include "utils/containers/generate_map.h" -#include "utils/containers/map_values.h" -#include "utils/containers/try_at.h" -#include "utils/containers/values.h" +#include "op-attrs/computation_graph_op_attrs.dtg.h" +#include "task-spec/loss_functions.h" +#include "task-spec/ops/impl/attention.h" +#include "task-spec/ops/impl/batch_matmul.h" +#include "task-spec/ops/impl/batch_norm.h" +#include "task-spec/ops/impl/broadcast.h" +#include "task-spec/ops/impl/cast.h" +#include "task-spec/ops/impl/concat.h" +#include "task-spec/ops/impl/conv_2d.h" +#include "task-spec/ops/impl/dropout.h" +#include "task-spec/ops/impl/element_binary.h" +#include "task-spec/ops/impl/element_unary.h" +#include "task-spec/ops/impl/embedding.h" +#include "task-spec/ops/impl/flat.h" 
+#include "task-spec/ops/impl/gather.h" +#include "task-spec/ops/impl/layer_norm.h" +#include "task-spec/ops/impl/linear.h" +#include "task-spec/ops/impl/pool_2d.h" +#include "task-spec/ops/impl/reduce.h" +#include "task-spec/ops/impl/reshape.h" +#include "task-spec/ops/impl/reverse.h" +#include "task-spec/ops/impl/softmax.h" +#include "task-spec/ops/impl/split.h" +#include "task-spec/ops/impl/topk.h" +#include "task-spec/ops/impl/transpose.h" +#include "task-spec/optimizer.h" +#include "task-spec/task_impl_function.dtg.h" +#include "utils/exception.h" +#include "utils/optional.h" +#include "utils/overload.h" +#include namespace FlexFlow { -LocalTaskRegistry construct_local_task_registry_for_layers( - std::unordered_set const &op_attrs) { - - std::unordered_set task_ids = flatmap( - op_attrs, - [](ComputationGraphOpAttrs const &op_attrs) - -> std::unordered_set { return get_task_ids(op_attrs); }); +std::optional + get_init_task_impl_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) { + + return op_attrs.visit>(overload{ + [](BatchMatmulAttrs const &) { return std::nullopt; }, + [](BatchNormAttrs const &) { return get_batch_norm_init_task_impl(); }, + [](BroadcastAttrs const &) { return std::nullopt; }, + [](CastAttrs const &) { return std::nullopt; }, + [](ConcatAttrs const &) { return std::nullopt; }, + [](Conv2DAttrs const &) { return get_conv_2d_init_task_impl(); }, + [](DropoutAttrs const &) { return get_dropout_init_task_impl(); }, + [](ElementBinaryAttrs const &) { + return get_element_binary_init_task_impl(); + }, + [](ElementUnaryAttrs const &) { + return get_element_unary_init_task_impl(); + }, + [](EmbeddingAttrs const &) { return std::nullopt; }, + [](FlatAttrs const &) { return std::nullopt; }, + [](GatherAttrs const &) { return get_gather_init_task_impl(); }, + [](InputAttrs const &) { return std::nullopt; }, + [](LayerNormAttrs const &) { return get_layer_norm_init_task_impl(); }, + [](LinearAttrs const &) { return get_linear_init_task_impl(); }, 
+ [](MultiHeadAttentionAttrs const &) { + return get_attention_init_task_impl(); + }, + [](NoopAttrs const &) { return std::nullopt; }, + [](Pool2DAttrs const &) { return get_pool_2d_init_task_impl(); }, + [](ReduceAttrs const &) { return get_reduce_init_task_impl(); }, + [](ReshapeAttrs const &) { return std::nullopt; }, + [](ReverseAttrs const &) { return std::nullopt; }, + [](SoftmaxAttrs const &) { return get_softmax_init_task_impl(); }, + [](SplitAttrs const &) { return std::nullopt; }, + [](TopKAttrs const &) { return std::nullopt; }, + [](TransposeAttrs const &) { return std::nullopt; }, + [](WeightAttrs const &) { return std::nullopt; }, + }); +} - std::unordered_map task_mapping = - generate_map(task_ids, get_task_signature_and_impl_for_task_id); +std::optional + get_fwd_task_impl_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) { + + return op_attrs.visit>(overload{ + [](BatchMatmulAttrs const &) { return get_batch_matmul_fwd_task_impl(); }, + [](BatchNormAttrs const &) { return get_batch_norm_fwd_task_impl(); }, + [](BroadcastAttrs const &) { return get_broadcast_fwd_task_impl(); }, + [](CastAttrs const &) { return get_cast_fwd_task_impl(); }, + [](ConcatAttrs const &) { return get_concat_fwd_task_impl(); }, + [](Conv2DAttrs const &) { return get_conv_2d_fwd_task_impl(); }, + [](DropoutAttrs const &) { return get_dropout_fwd_task_impl(); }, + [](ElementBinaryAttrs const &) { + return get_element_binary_fwd_task_impl(); + }, + [](ElementUnaryAttrs const &) { + return get_element_unary_fwd_task_impl(); + }, + [](EmbeddingAttrs const &) { return get_embedding_fwd_task_impl(); }, + [](FlatAttrs const &) { return get_flat_fwd_task_impl(); }, + [](GatherAttrs const &) { return get_gather_fwd_task_impl(); }, + [](InputAttrs const &) { return std::nullopt; }, + [](LayerNormAttrs const &) { return get_layer_norm_fwd_task_impl(); }, + [](LinearAttrs const &) { return get_linear_fwd_task_impl(); }, + [](MultiHeadAttentionAttrs const &) { + return 
get_attention_fwd_task_impl(); + }, + [](NoopAttrs const &) { return std::nullopt; }, + [](Pool2DAttrs const &) { return get_pool_2d_fwd_task_impl(); }, + [](ReduceAttrs const &) { return get_reduce_fwd_task_impl(); }, + [](ReshapeAttrs const &) { return get_reshape_fwd_task_impl(); }, + [](ReverseAttrs const &) { return get_reverse_fwd_task_impl(); }, + [](SoftmaxAttrs const &) { return get_softmax_fwd_task_impl(); }, + [](SplitAttrs const &) { return get_split_fwd_task_impl(); }, + [](TopKAttrs const &) { return get_topk_fwd_task_impl(); }, + [](TransposeAttrs const &) { return get_transpose_fwd_task_impl(); }, + [](WeightAttrs const &) { return std::nullopt; }, + }); +} - return LocalTaskRegistry{ - /*task_mapping=*/task_mapping, - }; +std::optional + get_bwd_task_impl_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) { + + return op_attrs.visit>(overload{ + [](BatchMatmulAttrs const &) { return get_batch_matmul_bwd_task_impl(); }, + [](BatchNormAttrs const &) { return get_batch_norm_bwd_task_impl(); }, + [](BroadcastAttrs const &) { return get_broadcast_bwd_task_impl(); }, + [](CastAttrs const &) { return get_cast_bwd_task_impl(); }, + [](ConcatAttrs const &) { return get_concat_bwd_task_impl(); }, + [](Conv2DAttrs const &) { return get_conv_2d_bwd_task_impl(); }, + [](DropoutAttrs const &) { return get_dropout_bwd_task_impl(); }, + [](ElementBinaryAttrs const &) { + return get_element_binary_bwd_task_impl(); + }, + [](ElementUnaryAttrs const &) { + return get_element_unary_bwd_task_impl(); + }, + [](EmbeddingAttrs const &) { return get_embedding_bwd_task_impl(); }, + [](FlatAttrs const &) { return get_flat_bwd_task_impl(); }, + [](GatherAttrs const &) { return get_gather_bwd_task_impl(); }, + [](InputAttrs const &) { return std::nullopt; }, + [](LayerNormAttrs const &) { return get_layer_norm_bwd_task_impl(); }, + [](LinearAttrs const &) { return get_linear_bwd_task_impl(); }, + [](MultiHeadAttentionAttrs const &) { + return get_attention_bwd_task_impl(); 
+ }, + [](NoopAttrs const &) { return std::nullopt; }, + [](Pool2DAttrs const &) { return get_pool_2d_bwd_task_impl(); }, + [](ReduceAttrs const &) { return get_reduce_bwd_task_impl(); }, + [](ReshapeAttrs const &) { return get_reshape_bwd_task_impl(); }, + [](ReverseAttrs const &) { return get_reverse_bwd_task_impl(); }, + [](SoftmaxAttrs const &) { return get_softmax_bwd_task_impl(); }, + [](SplitAttrs const &) { return get_split_bwd_task_impl(); }, + [](TopKAttrs const &) { return get_topk_bwd_task_impl(); }, + [](TransposeAttrs const &) { return get_transpose_bwd_task_impl(); }, + [](WeightAttrs const &) { return std::nullopt; }, + }); } std::optional - call_init_task_impl(LocalTaskRegistry const &local_task_registry, - task_id_with_noop_default_t registered_task, + call_init_task_impl(ComputationGraphOpAttrs const &op_attrs, TaskArgumentAccessor const &arg_accessor) { - - if (registered_task.is_noop_task()) { + std::optional task_impl_fn = + get_init_task_impl_for_op_attrs(op_attrs); + if (!task_impl_fn.has_value()) { return std::nullopt; } - task_id_t task_id = registered_task.require_real_task(); - - TaskSignatureAndImpl task_sig_impl = - local_task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; + assert_unwrap(task_impl_fn).get().function_ptr; std::optional device_state = fn(arg_accessor); @@ -51,24 +168,46 @@ std::optional } std::optional - call_fwb_task_impl(LocalTaskRegistry const &task_registry, - task_id_t const &task_id, + call_fwd_task_impl(ComputationGraphOpAttrs const &op_attrs, TaskArgumentAccessor const &acc) { - TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); + std::optional task_impl_fn = + get_fwd_task_impl_for_op_attrs(op_attrs); + if (!task_impl_fn.has_value()) { + return std::nullopt; + } auto fn = - task_sig_impl.impl_function.get().function_ptr; + assert_unwrap(task_impl_fn).get().function_ptr; return fn(acc); } -void call_generic_task_impl(LocalTaskRegistry 
const &task_registry, - task_id_t const &task_id, - TaskArgumentAccessor const &acc) { - TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); +std::optional + call_bwd_task_impl(ComputationGraphOpAttrs const &op_attrs, + TaskArgumentAccessor const &acc) { + std::optional task_impl_fn = + get_bwd_task_impl_for_op_attrs(op_attrs); + if (!task_impl_fn.has_value()) { + return std::nullopt; + } auto fn = - task_sig_impl.impl_function.get().function_ptr; + assert_unwrap(task_impl_fn).get().function_ptr; + + return fn(acc); +} + +void call_update_task_impl(OptimizerAttrs const &optimizer_attrs, + TaskArgumentAccessor const &acc) { + TaskImplFunction task_impl_fn = get_update_task_impl(optimizer_attrs); + auto fn = task_impl_fn.get().function_ptr; - fn(acc); + return fn(acc); +} + +void call_loss_task_impl(TaskArgumentAccessor const &acc) { + TaskImplFunction task_impl_fn = get_loss_bwd_task_impl(); + auto fn = task_impl_fn.get().function_ptr; + + return fn(acc); } } // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/operator_task_set.cc b/lib/local-execution/src/local-execution/operator_task_set.cc deleted file mode 100644 index a1b1d0817b..0000000000 --- a/lib/local-execution/src/local-execution/operator_task_set.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include "local-execution/operator_task_set.h" -#include "local-execution/task_id_with_noop_default_t.h" -#include "task-spec/task_signature_impl.h" -#include "utils/bidict/algorithms/right_entries.h" -#include "utils/containers/values.h" - -namespace FlexFlow { - -bidict - get_map_from_task_type_to_task(OperatorTaskSet const &op_task_set) { - return { - {OpTaskType::INIT, op_task_set.init_task}, - {OpTaskType::FWD, op_task_set.fwd_task}, - {OpTaskType::BWD, op_task_set.bwd_task}, - }; -} - -std::unordered_set - get_all_tasks_in_task_set(OperatorTaskSet const &op_task_set) { - return right_entries(get_map_from_task_type_to_task(op_task_set)); -} - -task_id_with_noop_default_t - 
get_task_for_task_type(OperatorTaskSet const &op_task_set, - OpTaskType task_type) { - return get_map_from_task_type_to_task(op_task_set).at_l(task_type); -} - -OperatorTaskSet - get_task_set_for_operator(ComputationGraphOpAttrs const &attrs) { - task_id_with_noop_default_t init_task = make_default_noop_task(); - task_id_with_noop_default_t fwd_task = make_default_noop_task(); - task_id_with_noop_default_t bwd_task = make_default_noop_task(); - - std::vector task_ids = get_task_ids(attrs); - - for (task_id_t const &task_id : task_ids) { - TaskSignatureAndImpl task_signature_and_impl = - get_task_signature_and_impl_for_task_id(task_id); - - OpTaskSignature task_signature = task_signature_and_impl.task_signature; - - switch (task_signature.type) { - case OpTaskType::INIT: - ASSERT(is_invocation_valid(task_signature, - get_init_op_task_invocation(attrs))); - init_task = task_id_with_noop_default_t{task_id}; - break; - case OpTaskType::FWD: - ASSERT(is_invocation_valid(task_signature, - get_forward_op_task_invocation(attrs))); - fwd_task = task_id_with_noop_default_t{task_id}; - break; - case OpTaskType::BWD: - ASSERT(is_invocation_valid(task_signature, - get_backward_op_task_invocation(attrs))); - bwd_task = task_id_with_noop_default_t{task_id}; - break; - default: - PANIC("Unhandled OpTaskType", fmt::to_string(task_signature.type)); - } - } - - return OperatorTaskSet{ - /*init_task=*/init_task, - /*fwd_task=*/fwd_task, - /*bwd_task=*/bwd_task, - }; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/task_execution.cc b/lib/local-execution/src/local-execution/task_execution.cc index 09276aa218..c96c834d4a 100644 --- a/lib/local-execution/src/local-execution/task_execution.cc +++ b/lib/local-execution/src/local-execution/task_execution.cc @@ -1,24 +1,133 @@ #include "local-execution/task_execution.h" #include "local-execution/local_task_argument_accessor.h" +#include "local-execution/local_task_registry.h" +#include 
"op-attrs/computation_graph_op_attrs.h" +#include "pcg/optimizer_attrs.h" +#include "pcg/optimizer_slot_name.dtg.h" +#include "task-spec/dynamic_graph/dynamic_tensor_slot.dtg.h" +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "task-spec/dynamic_graph/training_operation_attrs.dtg.h" +#include "task-spec/task_argument_accessor/task_tensor_parameter.h" +#include "utils/containers/binary_merge_disjoint_maps.h" +#include "utils/containers/map_keys_and_values.h" +#include "utils/exception.h" +#include "utils/optional.h" +#include "utils/overload.h" +#include namespace FlexFlow { +TaskTensorParameter make_task_tensor_parameter_from_dynamic_slot( + DynamicTensorSlot const &slot, + std::optional const &optimizer_attrs) { + return assert_unwrap(slot.slot_tensor_role) + .visit(overload{ + [&](FwbTensorType const &fwb_tensor) { + switch (fwb_tensor) { + case FwbTensorType::FORWARD: + return make_task_tensor_parameter_fwd(slot.slot_name); + case FwbTensorType::GRADIENT: + return make_task_tensor_parameter_grad(slot.slot_name); + default: + PANIC("Unhandled FwbTensorType", fmt::to_string(fwb_tensor)); + } + }, + [&](DynamicOptimizerTensorRole const &optimizer_tensor) { + return make_task_tensor_parameter_opt( + slot.slot_name, optimizer_tensor.optimizer_slot_name); + }, + [&](DynamicLossTensorRole const &loss_tensor) { + return make_task_tensor_parameter_loss(); + }, + }); +} + TaskArgumentAccessor make_task_argument_accessor_for_invocation( DynamicNodeInvocation const &invocation, Allocator &allocator, ProfilingSettings const &profiling_settings, - DeviceType kernel_device_type, - PCGOperatorAttrs op_attrs, - std::optional const &loss_attrs, + device_handle_t const &ff_handle, std::optional const &per_device_op_state, - FFIterationConfig iteration_config, - std::optional const &optimizer_attrs) { - std::unordered_map < + FFIterationConfig const &iteration_config, + std::optional const &optimizer_attrs, + device_id_t device_idx) { + auto make_param = 
[&](DynamicTensorSlot const &slot) { + return make_task_tensor_parameter_from_dynamic_slot(slot, optimizer_attrs); + }; + auto get_accessor = [](DynamicValueAttrs const &value) { + return assert_unwrap(value.accessor); + }; + std::unordered_map + tensor_slots_backing = binary_merge_disjoint_maps( + map_keys_and_values(invocation.inputs, make_param, get_accessor), + map_keys_and_values(invocation.outputs, make_param, get_accessor)); + + return TaskArgumentAccessor::create( + /*allocator=*/allocator, + /*tensor_slots_backing=*/tensor_slots_backing, + /*profiling_settings=*/profiling_settings, + /*ff_handle=*/ff_handle, + /*op_attrs=*/ + and_then(invocation.node_attrs.op_attrs, + [](TrainingOperationAttrs const &op_attrs) { + return op_attrs.try_require_pcg_op(); + }), + /*loss_attrs=*/ + and_then(invocation.node_attrs.op_attrs, + [](TrainingOperationAttrs const &op_attrs) { + return op_attrs.try_require_loss(); + }), + /*per_device_op_state=*/per_device_op_state, + /*iteration_config=*/iteration_config, + /*optimizer_attrs=*/optimizer_attrs, + /*device_idx=*/device_idx); +} - return TaskArgumentAccessor::create( +std::optional execute_dynamic_node_invocation( + DynamicNodeInvocation const &invocation, + Allocator &allocator, + ProfilingSettings const &profiling_settings, + device_handle_t const &ff_handle, + std::optional const &per_device_op_state, + FFIterationConfig const &iteration_config, + std::optional const &optimizer_attrs, + device_id_t device_idx) { + TaskArgumentAccessor arg_accessor = + make_task_argument_accessor_for_invocation( + /*invocation=*/invocation, /*allocator=*/allocator, - /*tensor_slots_backing=*/ - ); + /*profiling_settings=*/profiling_settings, + /*ff_handle=*/ff_handle, + /*per_device_op_state=*/per_device_op_state, + /*iteration_config=*/iteration_config, + /*optimizer_attrs=*/optimizer_attrs, + /*device_idx=*/device_idx); + + DynamicTaskType task_type = assert_unwrap(invocation.node_attrs.task_type); + std::optional result; + switch 
(task_type) { + case DynamicTaskType::FWD: { + ComputationGraphOpAttrs op_attrs = + assert_unwrap(compgraph_op_attrs_from_pcg_op_attrs( + assert_unwrap(invocation.node_attrs.op_attrs).require_pcg_op())); + result = call_fwd_task_impl(op_attrs, arg_accessor); + } break; + case DynamicTaskType::BWD: { + ComputationGraphOpAttrs op_attrs = + assert_unwrap(compgraph_op_attrs_from_pcg_op_attrs( + assert_unwrap(invocation.node_attrs.op_attrs).require_pcg_op())); + result = call_bwd_task_impl(op_attrs, arg_accessor); + } break; + case DynamicTaskType::UPD: + call_update_task_impl(assert_unwrap(optimizer_attrs), arg_accessor); + break; + case DynamicTaskType::LOSS: + call_loss_task_impl(arg_accessor); + break; + default: + PANIC("Unhandled DynamicTaskType", task_type); + } + return result; } } // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/task_id_with_noop_default_t.cc b/lib/local-execution/src/local-execution/task_id_with_noop_default_t.cc deleted file mode 100644 index 15b2fe786b..0000000000 --- a/lib/local-execution/src/local-execution/task_id_with_noop_default_t.cc +++ /dev/null @@ -1,9 +0,0 @@ -#include "local-execution/task_id_with_noop_default_t.h" - -namespace FlexFlow { - -task_id_with_noop_default_t make_noop_registered_task() { - return task_id_with_noop_default_t{std::monostate{}}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/tensor_allocation.cc b/lib/local-execution/src/local-execution/tensor_allocation.cc index 16d6712616..bb2a1ba2a4 100644 --- a/lib/local-execution/src/local-execution/tensor_allocation.cc +++ b/lib/local-execution/src/local-execution/tensor_allocation.cc @@ -1,6 +1,7 @@ #include "local-execution/tensor_allocation.h" #include "op-attrs/parallel_tensor_shape.h" #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" +#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h" #include "utils/bidict/generate_bidict.h" #include "utils/containers/all_are_true.h" 
#include "utils/containers/contains_key.h" @@ -24,6 +25,13 @@ bool all_tensors_are_allocated(DynamicOpenDataflowGraph const &g) { })); } +bool tensors_are_ready_for_allocation(DynamicOpenDataflowGraph const &g) { + return all_are_true( + transform(get_dynamic_values(g), [](DynamicValueAttrs const &v) -> bool { + return v.parallel_tensor_shape.has_value(); + })); +} + DynamicValueAttrs perform_tensor_allocation_for_value(DynamicValueAttrs const &value, Allocator &allocator) { @@ -35,16 +43,18 @@ DynamicValueAttrs GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); DynamicValueAttrs result = value; - result.accessor = accessor; + result.accessor = DynamicTensorAccessor{accessor}; return result; } DynamicOpenDataflowGraph perform_tensor_allocation( DynamicOpenDataflowGraph const &g, - std::unordered_map const + std::unordered_map const &preallocated, Allocator &allocator) { + ASSERT(no_tensors_are_allocated(g)); + ASSERT(tensors_are_ready_for_allocation(g)); for (DynamicValueAttrs const &v : keys(preallocated)) { ASSERT(v.accessor == std::nullopt); } @@ -64,7 +74,7 @@ DynamicOpenDataflowGraph perform_tensor_allocation( } }); - return transform_dynamic_invocation_set( + DynamicOpenDataflowGraph result = transform_dynamic_invocation_set( g, [&](DynamicNodeInvocation const &i) -> DynamicNodeInvocation { return DynamicNodeInvocation{ /*inputs=*/map_values( @@ -80,6 +90,10 @@ DynamicOpenDataflowGraph perform_tensor_allocation( }), }; }); + + ASSERT(all_tensors_are_allocated(result)); + + return result; } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/internal/test_utils.cc b/lib/local-execution/test/src/internal/test_utils.cc deleted file mode 100644 index 629640b6ae..0000000000 --- a/lib/local-execution/test/src/internal/test_utils.cc +++ /dev/null @@ -1,19 +0,0 @@ -#include "internal/test_utils.h" -#include "pcg/tensor_guid_t.dtg.h" - -namespace FlexFlow { - -PerDeviceFFHandle get_mock_per_device_ff_handle() { - return {nullptr, 
nullptr, nullptr, 0, false}; -} - -size_t MockTensorGuidSource::next_available_mock_tensor_guid = 0; - -MockTensorGuidSource::MockTensorGuidSource() {} - -tensor_guid_t MockTensorGuidSource::new_mock_tensor_guid() { - size_t next_guid = MockTensorGuidSource::next_available_mock_tensor_guid++; - return tensor_guid_t{DataflowOutput{Node{0}, nonnegative_int{next_guid}}}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/test/src/internal/test_utils.h b/lib/local-execution/test/src/internal/test_utils.h deleted file mode 100644 index 056e92687c..0000000000 --- a/lib/local-execution/test/src/internal/test_utils.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TEST_UTILS -#define _FLEXFLOW_LOCAL_EXECUTION_TEST_UTILS - -#include "kernels/ff_handle.h" -#include "pcg/tensor_guid_t.dtg.h" - -namespace FlexFlow { - -struct MockTensorGuidSource { -public: - MockTensorGuidSource(); - - tensor_guid_t new_mock_tensor_guid(); - -private: - static size_t next_available_mock_tensor_guid; -}; - -PerDeviceFFHandle get_mock_per_device_ff_handle(); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/test/src/local-execution/local_cost_estimator.cc b/lib/local-execution/test/src/local-execution/local_cost_estimator.cc index 1e0891e1a3..788817d3ed 100644 --- a/lib/local-execution/test/src/local-execution/local_cost_estimator.cc +++ b/lib/local-execution/test/src/local-execution/local_cost_estimator.cc @@ -1,4 +1,5 @@ -#include "local-execution/local_cost_estimator.h" +#if 0 // FIXME (Elliott): fix cost estimator +#include "local-execution/cost_estimator/local_cost_estimator.h" #include "compiler/machine_mapping/machine_view.h" #include "internal/test_utils.h" #include "kernels/device_handle_t.h" @@ -140,3 +141,4 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } } +#endif diff --git a/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc b/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc index 
ff90abcde7..2f2dbbd503 100644 --- a/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc +++ b/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc @@ -1,8 +1,16 @@ #include "local-execution/local_task_argument_accessor.h" +#include "kernels/device_handle_t.h" #include "kernels/local_cpu_allocator.h" -#include "task-spec/task_signature_impl.h" +#include "kernels/profiling_settings.dtg.h" +#include "op-attrs/ops/input_attrs.dtg.h" +#include "pcg/device_id_t.h" +#include "task-spec/task_argument_accessor/task_tensor_parameter.h" +#include "task-spec/task_impl_function.dtg.h" #include "utils/fmt/variant.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/positive_int/positive_int.h" #include +#include using namespace ::FlexFlow; @@ -27,183 +35,89 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW input_grad = allocator.allocate_tensor(input_tensor_shape); - std::vector variadic_tensors = {input, input}; - std::vector variadic_tensors_grad = {input_grad, - input_grad}; - enum Slots { INPUT, VARIADIC_TENSORS, }; - std::unordered_map + std::unordered_map tensor_slots_backing = { { - training_tensor_slot_id_t{TensorSlotName::LHS_INPUT, - TrainingTensorType::FORWARD}, - TensorSlotBacking{input}, - }, - { - training_tensor_slot_id_t{TensorSlotName::LHS_INPUT, - TrainingTensorType::GRADIENT}, - TensorSlotBacking{input_grad}, - }, - { - training_tensor_slot_id_t{TensorSlotName::INPUT, - TrainingTensorType::FORWARD}, - TensorSlotBacking{variadic_tensors}, + make_task_tensor_parameter_fwd(TensorSlotName::LHS_INPUT), + DynamicTensorAccessor{input}, }, { - training_tensor_slot_id_t{TensorSlotName::INPUT, - TrainingTensorType::GRADIENT}, - TensorSlotBacking{variadic_tensors_grad}, + make_task_tensor_parameter_grad(TensorSlotName::LHS_INPUT), + DynamicTensorAccessor{input_grad}, }, }; + device_id_t device_idx = + make_device_id_t_from_idx(nonnegative_int{0}, DeviceType::CPU); + LocalTaskArgumentAccessor acc = 
LocalTaskArgumentAccessor{ /*allocator=*/allocator, /*tensor_slots_backing=*/tensor_slots_backing, - /*arg_slots_backing=*/{}, - /*device_idx=*/0, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*ff_handle=*/cpu_make_device_handle_t(), + /*op_attrs=*/PCGOperatorAttrs{InputAttrs{input_tensor_shape}}, + /*loss_attrs=*/std::nullopt, + /*per_device_op_state=*/std::nullopt, + /*iteration_config=*/FFIterationConfig{1_p}, + /*optimizer_attrs=*/std::nullopt, + /*device_idx=*/device_idx, }; SUBCASE("get_tensor") { - SUBCASE("get_tensor(TensorSlotName, Permissions::RO, " - "TrainingTensorType::FORWARD)") { + SUBCASE("get_tensor for read-only forward tensor") { GenericTensorAccessor correct = GenericTensorAccessor{ read_only_accessor_from_write_accessor(input)}; - GenericTensorAccessor result = - acc.get_tensor(TensorSlotName::LHS_INPUT, - Permissions::RO, - TrainingTensorType::FORWARD); + GenericTensorAccessor result = acc.get_tensor( + make_task_tensor_parameter_fwd(TensorSlotName::LHS_INPUT), + Permissions::RO); CHECK(correct == result); } - SUBCASE("get_tensor(TensorSlotName, Permissions::RO, " - "TrainingTensorType::GRADIENT)") { + SUBCASE("get_tensor for read-only gradient tensor") { GenericTensorAccessor correct = GenericTensorAccessor{ read_only_accessor_from_write_accessor(input_grad)}; - GenericTensorAccessor result = - acc.get_tensor(TensorSlotName::LHS_INPUT, - Permissions::RO, - TrainingTensorType::GRADIENT); + GenericTensorAccessor result = acc.get_tensor( + make_task_tensor_parameter_grad(TensorSlotName::LHS_INPUT), + Permissions::RO); CHECK(correct == result); } - SUBCASE("get_tensor(TensorSlotName, Permissions::WO, " - "TrainingTensorType::FORWARD)") { + SUBCASE("get_tensor for write-only forward tensor") { GenericTensorAccessor correct = GenericTensorAccessor{input}; - GenericTensorAccessor result = - acc.get_tensor(TensorSlotName::LHS_INPUT, - Permissions::WO, - TrainingTensorType::FORWARD); + GenericTensorAccessor result = acc.get_tensor( + 
make_task_tensor_parameter_fwd(TensorSlotName::LHS_INPUT), + Permissions::WO); CHECK(correct == result); } - SUBCASE("get_tensor(TensorSlotName, Permissions::WO, " - "TrainingTensorType::GRADIENT)") { + SUBCASE("get_tensor for write-only gradient tensor") { GenericTensorAccessor correct = GenericTensorAccessor{input_grad}; - GenericTensorAccessor result = - acc.get_tensor(TensorSlotName::LHS_INPUT, - Permissions::WO, - TrainingTensorType::GRADIENT); + GenericTensorAccessor result = acc.get_tensor( + make_task_tensor_parameter_grad(TensorSlotName::LHS_INPUT), + Permissions::WO); CHECK(correct == result); } - SUBCASE("get_tensor(TensorSlotName, Permissions::RW, " - "TrainingTensorType::FORWARD)") { + SUBCASE("get_tensor for read-write forward tensor") { GenericTensorAccessor correct = GenericTensorAccessor{input}; - GenericTensorAccessor result = - acc.get_tensor(TensorSlotName::LHS_INPUT, - Permissions::RW, - TrainingTensorType::FORWARD); + GenericTensorAccessor result = acc.get_tensor( + make_task_tensor_parameter_fwd(TensorSlotName::LHS_INPUT), + Permissions::RW); CHECK(correct == result); } - SUBCASE("get_tensor(TensorSlotName, Permissions::RW, " - "TrainingTensorType::GRADIENT)") { + SUBCASE("get_tensor for read-write gradient tensor") { GenericTensorAccessor correct = GenericTensorAccessor{input_grad}; - GenericTensorAccessor result = - acc.get_tensor(TensorSlotName::LHS_INPUT, - Permissions::RW, - TrainingTensorType::GRADIENT); + GenericTensorAccessor result = acc.get_tensor( + make_task_tensor_parameter_grad(TensorSlotName::LHS_INPUT), + Permissions::RW); CHECK(correct == result); } } - - SUBCASE("get_variadic_tensor") { - SUBCASE("get_variadic_tensor(TensorSlotName, Permissions::RO, " - "TrainingTensorType::FORWARD)") { - VariadicGenericTensorAccessor correct = - VariadicGenericTensorAccessor{std::vector{ - read_only_accessor_from_write_accessor(variadic_tensors.at(0)), - read_only_accessor_from_write_accessor( - variadic_tensors.at(1))}}; - 
VariadicGenericTensorAccessor result = - acc.get_variadic_tensor(TensorSlotName::INPUT, - Permissions::RO, - TrainingTensorType::FORWARD); - CHECK(result == correct); - } - - SUBCASE("get_variadic_tensor(TensorSlotName, Permissions::RO, " - "TrainingTensorType::GRADIENT)") { - VariadicGenericTensorAccessor correct = - VariadicGenericTensorAccessor{std::vector{ - read_only_accessor_from_write_accessor( - variadic_tensors_grad.at(0)), - read_only_accessor_from_write_accessor( - variadic_tensors_grad.at(1))}}; - VariadicGenericTensorAccessor result = - acc.get_variadic_tensor(TensorSlotName::INPUT, - Permissions::RO, - TrainingTensorType::GRADIENT); - CHECK(result == correct); - } - - SUBCASE("get_variadic_tensor(TensorSlotName, Permissions::WO, " - "TrainingTensorType::FORWARD)") { - VariadicGenericTensorAccessor correct = - VariadicGenericTensorAccessor{variadic_tensors}; - VariadicGenericTensorAccessor result = - acc.get_variadic_tensor(TensorSlotName::INPUT, - Permissions::WO, - TrainingTensorType::FORWARD); - CHECK(result == correct); - } - - SUBCASE("get_variadic_tensor(TensorSlotName, Permissions::WO, " - "TrainingTensorType::GRADIENT)") { - VariadicGenericTensorAccessor correct = - VariadicGenericTensorAccessor{variadic_tensors_grad}; - VariadicGenericTensorAccessor result = - acc.get_variadic_tensor(TensorSlotName::INPUT, - Permissions::WO, - TrainingTensorType::GRADIENT); - CHECK(result == correct); - } - - SUBCASE("get_variadic_tensor(TensorSlotName, Permissions::WO, " - "TrainingTensorType::FORWARD)") { - VariadicGenericTensorAccessor correct = - VariadicGenericTensorAccessor{variadic_tensors}; - VariadicGenericTensorAccessor result = - acc.get_variadic_tensor(TensorSlotName::INPUT, - Permissions::RW, - TrainingTensorType::FORWARD); - CHECK(result == correct); - } - - SUBCASE("get_variadic_tensor(TensorSlotName, Permissions::WO, " - "TrainingTensorType::GRADIENT)") { - VariadicGenericTensorAccessor correct = - 
VariadicGenericTensorAccessor{variadic_tensors_grad}; - VariadicGenericTensorAccessor result = - acc.get_variadic_tensor(TensorSlotName::INPUT, - Permissions::RW, - TrainingTensorType::GRADIENT); - CHECK(result == correct); - } - } } } diff --git a/lib/local-execution/test/src/local-execution/local_task_registry.cc b/lib/local-execution/test/src/local-execution/local_task_registry.cc deleted file mode 100644 index 5dc66c8ebc..0000000000 --- a/lib/local-execution/test/src/local-execution/local_task_registry.cc +++ /dev/null @@ -1,282 +0,0 @@ -#include "local-execution/local_task_registry.h" -#include "kernels/local_cuda_allocator.h" -#include "local-execution/local_cost_estimator.h" -#include "local-execution/local_task_registry.dtg.h" -#include "local-execution/operator_task_set.h" -#include "local-execution/registered_task.h" -#include "pcg/computation_graph_builder.h" -#include "pcg/layer_guid_t.dtg.h" -#include "task-spec/task_signature_impl.h" -#include "utils/fmt/optional.h" -#include "utils/fmt/unordered_map.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("LocalTaskRegistry") { - layer_guid_t layer_guid = layer_guid_t{Node{0}}; - positive_int embed_dim = 32_p; - positive_int num_heads = 10_p; - ComputationGraphOpAttrs attrs = - ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/embed_dim, - /*num_heads=*/num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - }}; - - OperatorTaskSet mha_task_set = get_task_set_for_operator(attrs); - { - OperatorTaskSet expected_mha_task_set = OperatorTaskSet{ - /*init_task=*/registered_task_t{task_id_t::ATTENTION_INIT_TASK_ID}, - /*fwd_task=*/registered_task_t{task_id_t::ATTENTION_FWD_TASK_ID}, - /*bwd_task=*/registered_task_t{task_id_t::ATTENTION_BWD_TASK_ID}, - }; - REQUIRE(mha_task_set == expected_mha_task_set); - } - - std::unordered_map mha_task_mapping = { - 
{task_id_t::ATTENTION_INIT_TASK_ID, - get_task_signature_and_impl_for_task_id( - task_id_t::ATTENTION_INIT_TASK_ID)}, - {task_id_t::ATTENTION_FWD_TASK_ID, - get_task_signature_and_impl_for_task_id( - task_id_t::ATTENTION_FWD_TASK_ID)}, - {task_id_t::ATTENTION_BWD_TASK_ID, - get_task_signature_and_impl_for_task_id( - task_id_t::ATTENTION_BWD_TASK_ID)}, - }; - - SUBCASE("register single layer") { - LocalTaskRegistry task_registry = - construct_local_task_registry_for_layers( - {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); - - LocalTaskRegistry correct_task_registry = [&] { - std::unordered_map task_sets = { - { - layer_guid, - mha_task_set, - }, - }; - - return LocalTaskRegistry{ - /*task_sets=*/{ - {layer_guid, mha_task_set}, - }, - /*task_mapping=*/mha_task_mapping, - }; - }(); - - CHECK(task_registry == correct_task_registry); - } - - SUBCASE("multiple layers same task") { - layer_guid_t other_layer_guid = layer_guid_t{Node{1}}; - LocalTaskRegistry task_registry = - construct_local_task_registry_for_layers({ - {layer_guid, LayerAttrs{attrs, std::nullopt}}, - {other_layer_guid, LayerAttrs{attrs, std::nullopt}}, - }); - - SUBCASE("layer to task ids") { - std::unordered_map correct = { - {layer_guid, mha_task_set}, - {other_layer_guid, mha_task_set}, - }; - CHECK(task_registry.task_sets == correct); - } - - SUBCASE("task to signature+impl mapping") { - std::unordered_map correct = - mha_task_mapping; - - CHECK(task_registry.task_mapping == correct); - } - } - - SUBCASE("different attrs, still same task fn mapping") { - layer_guid_t layer_1 = layer_guid_t{Node{1}}; - positive_int embed_dim = 100_p; - layer_guid_t layer_2 = layer_guid_t{Node{2}}; - ComputationGraphOpAttrs other_attrs = - ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/embed_dim, - /*num_heads=*/num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - }}; - LocalTaskRegistry task_registry = - 
construct_local_task_registry_for_layers({ - {layer_guid, LayerAttrs{attrs, std::nullopt}}, - {layer_1, LayerAttrs{attrs, std::nullopt}}, - {layer_2, LayerAttrs{other_attrs, std::nullopt}}, - }); - - std::unordered_map correct_task_mapping = - mha_task_mapping; - - CHECK(task_registry.task_mapping == correct_task_mapping); - } - - SUBCASE("equality") { - SUBCASE("different attrs is still equal") { - positive_int embed_dim = 100_p; - ComputationGraphOpAttrs other_attrs = - ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/embed_dim, - /*num_heads=*/num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - }}; - - LocalTaskRegistry task_registry = - construct_local_task_registry_for_layers( - {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); - LocalTaskRegistry other_task_registry = - construct_local_task_registry_for_layers( - {{layer_guid, LayerAttrs{other_attrs, std::nullopt}}}); - - CHECK(task_registry == other_task_registry); - } - - SUBCASE("different layer_guid is not equal") { - LocalTaskRegistry task_registry = - construct_local_task_registry_for_layers( - {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); - layer_guid_t other_layer_guid = layer_guid_t{Node{1}}; - LocalTaskRegistry other_task_registry = - construct_local_task_registry_for_layers( - {{other_layer_guid, LayerAttrs{attrs, std::nullopt}}}); - - CHECK(task_registry != other_task_registry); - } - } - - SUBCASE("try_get_registered_task") { - SUBCASE("Task exists") { - LocalTaskRegistry task_registry = - construct_local_task_registry_for_layers({ - {layer_guid, LayerAttrs{attrs, std::nullopt}}, - }); - - SUBCASE("Init") { - std::optional result = try_get_registered_task( - task_registry, layer_guid, OpTaskType::INIT); - std::optional correct = registered_task_t{ - task_id_t::ATTENTION_INIT_TASK_ID, - }; - - CHECK(result == correct); - } - - SUBCASE("Fwd") { - std::optional result = 
try_get_registered_task( - task_registry, layer_guid, OpTaskType::FWD); - std::optional correct = registered_task_t{ - task_id_t::ATTENTION_FWD_TASK_ID, - }; - - CHECK(result == correct); - } - - SUBCASE("Bwd") { - std::optional result = try_get_registered_task( - task_registry, layer_guid, OpTaskType::BWD); - std::optional correct = registered_task_t{ - task_id_t::ATTENTION_BWD_TASK_ID, - }; - - CHECK(result == correct); - } - } - - SUBCASE("Partial task does not exist") { - ComputationGraphOpAttrs bmm_attrs = ComputationGraphOpAttrs{ - BatchMatmulAttrs{ - /*a_seq_length_dim=*/10_p, - /*b_seq_length_dim=*/20_p, - }, - }; - - LocalTaskRegistry task_registry = - construct_local_task_registry_for_layers({ - {layer_guid, LayerAttrs{bmm_attrs, std::nullopt}}, - }); - - SUBCASE("Init") { - std::optional result = try_get_registered_task( - task_registry, layer_guid, OpTaskType::INIT); - std::optional correct = - make_noop_registered_task(); - - CHECK(result == correct); - } - - SUBCASE("Fwd") { - std::optional result = try_get_registered_task( - task_registry, layer_guid, OpTaskType::FWD); - std::optional correct = registered_task_t{ - task_id_t::BATCHMATMUL_FWD_TASK_ID, - }; - - CHECK(result == correct); - } - - SUBCASE("Bwd") { - std::optional result = try_get_registered_task( - task_registry, layer_guid, OpTaskType::BWD); - std::optional correct = registered_task_t{ - task_id_t::BATCHMATMUL_BWD_TASK_ID, - }; - - CHECK(result == correct); - } - } - - SUBCASE("Empty tasks") { - LocalTaskRegistry task_registry = LocalTaskRegistry{ - /*task_sets=*/{}, - /*task_mapping=*/{}, - }; - - SUBCASE("Init") { - std::optional result = try_get_registered_task( - task_registry, layer_guid, OpTaskType::INIT); - std::optional correct = std::nullopt; - - CHECK(result == correct); - } - - SUBCASE("Fwd") { - std::optional result = try_get_registered_task( - task_registry, layer_guid, OpTaskType::FWD); - std::optional correct = std::nullopt; - - CHECK(result == correct); - } - - 
SUBCASE("Bwd") { - std::optional result = try_get_registered_task( - task_registry, layer_guid, OpTaskType::BWD); - std::optional correct = std::nullopt; - - CHECK(result == correct); - } - } - } - } -} diff --git a/lib/local-execution/test/src/local-execution/local_training_backing.cc b/lib/local-execution/test/src/local-execution/local_training_backing.cc deleted file mode 100644 index 393cfab9dc..0000000000 --- a/lib/local-execution/test/src/local-execution/local_training_backing.cc +++ /dev/null @@ -1,137 +0,0 @@ -#include "local-execution/local_training_backing.h" -#include "internal/test_utils.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" -#include "pcg/computation_graph.h" -#include "pcg/computation_graph_builder.h" -#include "pcg/optimizer_attrs.dtg.h" -#include "task-spec/forward_tensor_source.h" -#include "task-spec/gradient_tensor_source.h" -#include "task-spec/optimizer_tensor_source.h" -#include "task-spec/runtime_task_invocation/runtime_arg_config.h" -#include "task-spec/training_computation_graph.h" -#include "utils/containers/get_only.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("execute_update") { - // initialize runtime configs - ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - - Allocator allocator = create_local_cuda_memory_allocator(); - - // construct computation graph - ComputationGraph computation_graph = make_empty_computation_graph(); - - positive_int batch_size = 10_p; - positive_int data_dim = 16_p; - positive_int output_dim = 32_p; - - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - - TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; - - 
LayerAddedResult inputs_layer = - add_input_layer(computation_graph, input_tensor_shape); - - LayerAddedResult weights_layer = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}}, - "weights"}, - {}, - {}); - - LayerAddedResult linear_operator = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, - /*use_bias=*/false, - DataType::FLOAT, - Activation::RELU, - std::nullopt}}, - "linear"}, - inputs_layer.outputs, - weights_layer.outputs); - tensor_guid_t logit_tensor = get_only(linear_operator.outputs); - - RuntimeArgConfig runtime_arg_config = gpu_make_runtime_arg_config( - managed_handle.raw_handle(), - EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}); - - ForwardTensorSource forward_tensor_source; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - LossTensorSource loss_tensor_source; - - auto make_training_backing = [&](OptimizerAttrs const &optimizer_attrs) { - TrainingComputationGraph training_computation_graph = - generate_training_computation_graph(computation_graph, - optimizer_attrs, - logit_tensor, - forward_tensor_source, - gradient_tensor_source, - optimizer_tensor_source, - loss_tensor_source); - - return make_local_training_backing_for_computation_graph( - /*allocator=*/allocator, - /*preallocated_tensors=*/{}, - /*training_computation_graph=*/training_computation_graph, - /*runtime_arg_config=*/runtime_arg_config, - /*optimizer_attrs=*/optimizer_attrs); - }; - - SUBCASE("SGDOptimizerAttrs") { - SUBCASE("momentum=0") { - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.0f, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; - - execute_update(make_training_backing(optimizer_attrs), - linear_operator.layer, - optimizer_attrs, - allocator); - } - - SUBCASE("momentum=0.9") { - OptimizerAttrs 
optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.9, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; - - execute_update(make_training_backing(optimizer_attrs), - linear_operator.layer, - optimizer_attrs, - allocator); - } - } - - SUBCASE("AdamOptimizerAttrs") { - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, - /*beta1=*/0.9, - /*beta2=*/0.999, - /*weight_decay=*/0.001, - /*alpha_t=*/0.001, - /*beta_t=*/0.9, - /*beta2_t=*/0.999, - /*epsilon=*/1e-8}}; - execute_update(make_training_backing(optimizer_attrs), - linear_operator.layer, - optimizer_attrs, - allocator); - } - } -} diff --git a/lib/local-execution/test/src/local-execution/loss_functions.cc b/lib/local-execution/test/src/local-execution/loss_functions.cc index 939bcec43d..b885778b46 100644 --- a/lib/local-execution/test/src/local-execution/loss_functions.cc +++ b/lib/local-execution/test/src/local-execution/loss_functions.cc @@ -1,19 +1,15 @@ -#include "internal/test_utils.h" +#include "kernels/device_handle_t.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" -#include "local-execution/local_training_backing.h" +#include "local-execution/computation_graph_instance/computation_graph_instance.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" +#include "pcg/device_id_t.h" #include "pcg/optimizer_attrs.dtg.h" -#include "task-spec/forward_tensor_source.h" -#include "task-spec/gradient_tensor_source.h" -#include "task-spec/loss_tensor_source.h" -#include "task-spec/optimizer_tensor_source.h" -#include "task-spec/runtime_task_invocation/runtime_arg_config.h" -#include "task-spec/training_computation_graph.h" -#include "utils/containers/get_only.h" +#include "utils/containers/require_only_key.h" +#include "utils/optional.h" #include using namespace ::FlexFlow; @@ -43,6 
+39,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LayerAddedResult inputs_layer = add_input_layer(computation_graph, input_tensor_shape); + tensor_guid_t inputs_tensor = + require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); LayerAddedResult weights_layer = add_layer( computation_graph, @@ -51,6 +49,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t weights_tensor = + require_only_key(weights_layer.outputs, TensorSlotName::OUTPUT); LayerAddedResult linear_operator = add_layer( computation_graph, @@ -60,14 +60,20 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Activation::RELU, std::nullopt}}, std::nullopt}, - inputs_layer.outputs, - weights_layer.outputs); - tensor_guid_t logit_tensor = get_only(linear_operator.outputs); - - RuntimeArgConfig runtime_arg_config = gpu_make_runtime_arg_config( - managed_handle.raw_handle(), - EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}); + { + { + TensorSlotName::INPUT, + inputs_tensor, + }, + }, + { + { + TensorSlotName::WEIGHT, + weights_tensor, + }, + }); + tensor_guid_t logit_tensor = + require_only_key(linear_operator.outputs, TensorSlotName::OUTPUT); OptimizerAttrs optimizer_attrs = OptimizerAttrs{ SGDOptimizerAttrs{ @@ -78,78 +84,75 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }, }; - ForwardTensorSource forward_tensor_source; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - LossTensorSource loss_tensor_source; - - TrainingComputationGraph training_computation_graph = - generate_training_computation_graph(computation_graph, - optimizer_attrs, - logit_tensor, - forward_tensor_source, - gradient_tensor_source, - optimizer_tensor_source, - loss_tensor_source); - - auto make_training_backing = [&](TensorShape const &label_tensor_shape) { - GenericTensorAccessorW label_tensor_accessor = - allocator.allocate_tensor(label_tensor_shape); - - return make_local_training_backing_for_computation_graph( - /*allocator=*/allocator, - 
/*preallocated_tensors=*/ - { - { - training_tensor_guid_t{ - training_computation_graph.label_tensor}, - label_tensor_accessor, - }, - }, - /*training_computation_graph=*/training_computation_graph, - /*runtime_arg_config=*/runtime_arg_config, - /*optimizer_attrs=*/optimizer_attrs); + device_id_t device_idx = + make_device_id_t_from_idx(nonnegative_int{0}, DeviceType::GPU); + device_handle_t ff_handle = + gpu_make_device_handle_t(managed_handle.raw_handle()); + + std::unordered_map input_tensors; + + auto compute_loss = [&](LossAttrs const &loss_attrs, + GenericTensorAccessorR label_tensor) { + ComputationGraphInstance computation_graph_instance = + create_computation_graph_instance( + /*cg=*/computation_graph, + /*optimizer=*/optimizer_attrs, + /*loss=*/loss_attrs, + /*label_tensor=*/label_tensor, + /*logit_tensor=*/logit_tensor, + /*input_tensors=*/input_tensors, + /*allocator=*/allocator, + /*profiling_settings=*/ProfilingSettings{0, 1}, + /*device_handle=*/ff_handle, + /*iteration_config=*/FFIterationConfig{1_p}, + /*device_idx=*/device_idx); + + perform_all_passes_for_computation_graph_instance( + /*instance=*/computation_graph_instance, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*ff_handle=*/ff_handle, + /*iteration_config=*/FFIterationConfig{1_p}, + /*device_idx=*/device_idx); + assert_unwrap(computation_graph_instance.get_loss_tensor_accessor()); }; SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { TensorShape label_tensor_shape = TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT}; - - LocalTrainingBacking local_training_backing = - make_training_backing(label_tensor_shape); + GenericTensorAccessorW label_tensor = + allocator.allocate_tensor(label_tensor_shape); LossAttrs loss_attrs = LossAttrs{ SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}; - compute_loss(local_training_backing, loss_attrs, allocator); + compute_loss(loss_attrs, label_tensor); } SUBCASE("NonconfigurableLossAttrs") { TensorShape 
label_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - - LocalTrainingBacking local_training_backing = - make_training_backing(label_tensor_shape); + GenericTensorAccessorW label_tensor = + allocator.allocate_tensor(label_tensor_shape); SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { LossAttrs loss_attrs = LossAttrs{ NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - compute_loss(local_training_backing, loss_attrs, allocator); + compute_loss(loss_attrs, label_tensor); } SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}; - compute_loss(local_training_backing, loss_attrs, allocator); + compute_loss(loss_attrs, label_tensor); } SUBCASE("LossFunction::IDENTITY") { LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; - compute_loss(local_training_backing, loss_attrs, allocator); + compute_loss(loss_attrs, label_tensor); } } } diff --git a/lib/local-execution/test/src/local-execution/tensor_allocation.cc b/lib/local-execution/test/src/local-execution/tensor_allocation.cc deleted file mode 100644 index e2c2869700..0000000000 --- a/lib/local-execution/test/src/local-execution/tensor_allocation.cc +++ /dev/null @@ -1,10 +0,0 @@ -#include "local-execution/tensor_allocation.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("perform_tensor_allocation") { - CHECK_MESSAGE(false, "TODO: perform_tensor_allocation"); - } -} diff --git a/lib/local-execution/test/src/local-execution/test_e2e.cc b/lib/local-execution/test/src/local-execution/test_e2e.cc index bc70195eef..a74d165a31 100644 --- a/lib/local-execution/test/src/local-execution/test_e2e.cc +++ b/lib/local-execution/test/src/local-execution/test_e2e.cc @@ -1,26 +1,22 @@ -#include "internal/test_utils.h" #include "kernels/compare_tensor_accessors.h" #include 
"kernels/copy_tensor_accessor.h" +#include "kernels/device_handle_t.h" #include "kernels/format_accessor_contents.h" #include "kernels/local_cpu_allocator.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "kernels/tensor_accessor_reductions.h" -#include "local-execution/local_training_backing.h" -#include "local-execution/model_training_instance.h" +#include "local-execution/computation_graph_instance/computation_graph_instance.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" +#include "pcg/device_id_t.h" +#include "pcg/device_type.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "task-spec/forward_tensor_source.h" -#include "task-spec/gradient_tensor_source.h" -#include "task-spec/loss_tensor_source.h" -#include "task-spec/optimizer_tensor_source.h" -#include "task-spec/runtime_task_invocation/runtime_arg_config.h" -#include "task-spec/training_computation_graph.h" +#include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h" #include "test/utils/doctest/check_kv.h" -#include "utils/containers/get_only.h" +#include "utils/containers/require_only_key.h" #include using namespace ::FlexFlow; @@ -54,6 +50,11 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + TensorShape label_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + GenericTensorAccessorW label_tensor = + allocator.allocate_tensor(label_tensor_shape); + TensorShape weight_shape_1 = TensorShape{ TensorDims{FFOrdered{hidden_dim, data_dim}}, DataType::FLOAT}; TensorShape weight_shape_2 = TensorShape{ @@ -61,6 +62,8 @@ TEST_SUITE(FF_TEST_SUITE) { LayerAddedResult inputs_layer = add_input_layer_with_grad(computation_graph, input_tensor_shape); + tensor_guid_t t_input = + 
require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); LayerAddedResult weights_layer_1 = add_layer( computation_graph, @@ -69,6 +72,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t t_weights_1 = + require_only_key(weights_layer_1.outputs, TensorSlotName::OUTPUT); LayerAddedResult weights_layer_2 = add_layer( computation_graph, @@ -77,6 +82,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t t_weights_2 = + require_only_key(weights_layer_2.outputs, TensorSlotName::OUTPUT); LayerAddedResult linear_operator_1 = add_layer( computation_graph, @@ -86,8 +93,20 @@ TEST_SUITE(FF_TEST_SUITE) { Activation::RELU, std::nullopt}}, std::nullopt}, - inputs_layer.outputs, - weights_layer_1.outputs); + { + { + TensorSlotName::INPUT, + t_input, + }, + }, + { + { + TensorSlotName::WEIGHT, + t_weights_1, + }, + }); + tensor_guid_t t_linear_1 = + require_only_key(linear_operator_1.outputs, TensorSlotName::OUTPUT); LayerAddedResult linear_operator_2 = add_layer( computation_graph, @@ -97,16 +116,22 @@ TEST_SUITE(FF_TEST_SUITE) { Activation::RELU, std::nullopt}}, std::nullopt}, - linear_operator_1.outputs, - weights_layer_2.outputs); - - tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs); - - RuntimeArgConfig runtime_arg_config = cpu_make_runtime_arg_config( - EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}); + { + { + TensorSlotName::INPUT, + t_linear_1, + }, + }, + { + { + TensorSlotName::WEIGHT, + t_weights_2, + }, + }); + tensor_guid_t t_linear_2 = + require_only_key(linear_operator_2.outputs, TensorSlotName::OUTPUT); - // initialize training backing + // instantiate computation graph LossAttrs loss_attrs = LossAttrs{ NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; OptimizerAttrs optimizer_attrs = @@ -114,42 +139,40 @@ TEST_SUITE(FF_TEST_SUITE) { /*momentum=*/0.9, /*nesterov=*/false, /*weight_decay=*/0.001}}; - - ForwardTensorSource forward_tensor_source; - 
GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - LossTensorSource loss_tensor_source; - - TrainingComputationGraph training_computation_graph = - generate_training_computation_graph(computation_graph, - optimizer_attrs, - logit_tensor, - forward_tensor_source, - gradient_tensor_source, - optimizer_tensor_source, - loss_tensor_source); - - LocalTrainingBacking local_training_backing = - make_local_training_backing_for_computation_graph( + device_handle_t ff_handle = cpu_make_device_handle_t(); + device_id_t device_idx = + make_device_id_t_from_idx(nonnegative_int{0}, DeviceType::CPU); + + std::unordered_map input_tensors; + + ComputationGraphInstance computation_graph_instance = + create_computation_graph_instance( + /*cg=*/computation_graph, + /*optimizer=*/optimizer_attrs, + /*loss=*/loss_attrs, + /*label_tensor=*/label_tensor, + /*logit_tensor=*/t_linear_2, + /*input_tensors=*/input_tensors, /*allocator=*/allocator, - /*preallocated_tensors=*/{}, - /*training_computation_graph=*/training_computation_graph, - /*runtime_arg_config=*/runtime_arg_config, - /*optimizer_attrs=*/optimizer_attrs); + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*device_handle=*/ff_handle, + /*iteration_config=*/FFIterationConfig{1_p}, + /*device_idx=*/device_idx); // begin training loop - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - allocator, local_training_backing, loss_attrs, optimizer_attrs}; - int num_epochs = 5; std::vector loss_values; for (int i = 0; i < num_epochs; i++) { - model_training_instance.forward(); - model_training_instance.backward(); - model_training_instance.update(); + perform_all_passes_for_computation_graph_instance( + /*instance=*/computation_graph_instance, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*ff_handle=*/ff_handle, + /*iteration_config=*/FFIterationConfig{1_p}, + /*device_idx=*/device_idx); loss_values.push_back(copy_tensor_accessor_r( - 
model_training_instance.get_loss_tensor_accessor(), allocator)); + computation_graph_instance.get_loss_tensor_accessor().value(), + allocator)); } // Assert that each sample in the batch has a lower loss in last epoch than @@ -191,6 +214,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorShape input_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + TensorShape label_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + GenericTensorAccessorW label_tensor = + allocator.allocate_tensor(label_tensor_shape); + TensorShape weight_shape_1 = TensorShape{ TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape weight_shape_2 = TensorShape{ @@ -198,6 +226,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LayerAddedResult inputs_layer = add_input_layer_with_grad(computation_graph, input_tensor_shape); + tensor_guid_t t_input = + require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT); LayerAddedResult weights_layer_1 = add_layer( computation_graph, @@ -206,6 +236,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t t_weights_1 = + require_only_key(weights_layer_1.outputs, TensorSlotName::OUTPUT); LayerAddedResult weights_layer_2 = add_layer( computation_graph, @@ -214,6 +246,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t t_weights_2 = + require_only_key(weights_layer_2.outputs, TensorSlotName::OUTPUT); LayerAddedResult linear_operator_1 = add_layer( computation_graph, @@ -223,8 +257,20 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Activation::RELU, std::nullopt}}, std::nullopt}, - inputs_layer.outputs, - weights_layer_1.outputs); + { + { + TensorSlotName::INPUT, + t_input, + }, + }, + { + { + TensorSlotName::WEIGHT, + t_weights_1, + }, + }); + tensor_guid_t t_linear_1 = + require_only_key(linear_operator_1.outputs, TensorSlotName::OUTPUT); LayerAddedResult linear_operator_2 = add_layer( computation_graph, @@ -234,17 +280,22 @@ 
TEST_SUITE(FF_CUDA_TEST_SUITE) { Activation::RELU, std::nullopt}}, std::nullopt}, - linear_operator_1.outputs, - weights_layer_2.outputs); - - tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs); - - RuntimeArgConfig runtime_arg_config = gpu_make_runtime_arg_config( - managed_handle.raw_handle(), - EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}); + { + { + TensorSlotName::INPUT, + t_linear_1, + }, + }, + { + { + TensorSlotName::WEIGHT, + t_weights_2, + }, + }); + tensor_guid_t t_linear_2 = + require_only_key(linear_operator_2.outputs, TensorSlotName::OUTPUT); - // initialize training backing + // instantiate computation graph LossAttrs loss_attrs = LossAttrs{ NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; OptimizerAttrs optimizer_attrs = OptimizerAttrs{ @@ -255,51 +306,43 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*weight_decay=*/0.001, }, }; - - ForwardTensorSource forward_tensor_source; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - LossTensorSource loss_tensor_source; - - TrainingComputationGraph training_computation_graph = - generate_training_computation_graph(computation_graph, - optimizer_attrs, - logit_tensor, - forward_tensor_source, - gradient_tensor_source, - optimizer_tensor_source, - loss_tensor_source); - - LocalTrainingBacking local_training_backing = - make_local_training_backing_for_computation_graph( + device_id_t device_idx = + make_device_id_t_from_idx(nonnegative_int{0}, DeviceType::GPU); + device_handle_t ff_handle = + gpu_make_device_handle_t(managed_handle.raw_handle()); + + std::unordered_map input_tensors; + + ComputationGraphInstance computation_graph_instance = + create_computation_graph_instance( + /*cg=*/computation_graph, + /*optimizer=*/optimizer_attrs, + /*loss=*/loss_attrs, + /*label_tensor=*/label_tensor, + /*logit_tensor=*/t_linear_2, + /*input_tensors=*/input_tensors, /*allocator=*/allocator, - 
/*preallocated_tensors=*/ - { - { - training_tensor_guid_t{ - training_computation_graph.label_tensor}, - label_tensor_backing, - }, - }, - /*training_computation_graph=*/training_computation_graph, - /*runtime_arg_config=*/runtime_arg_config, - /*optimizer_attrs=*/optimizer_attrs); + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*device_handle=*/ff_handle, + /*iteration_config=*/FFIterationConfig{1_p}, + /*device_idx=*/device_idx); // begin training loop - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - allocator, local_training_backing, loss_attrs, optimizer_attrs}; - Allocator cpu_allocator = create_local_cpu_memory_allocator(); int num_epochs = 5; std::vector loss_values; for (int i = 0; i < num_epochs; i++) { - model_training_instance.forward(); - model_training_instance.backward(); - model_training_instance.update(); + perform_all_passes_for_computation_graph_instance( + /*instance=*/computation_graph_instance, + /*profiling_settings=*/ProfilingSettings{0, 0}, + /*ff_handle=*/ff_handle, + /*iteration_config=*/FFIterationConfig{1_p}, + /*device_idx=*/device_idx); loss_values.push_back(copy_tensor_accessor_r( - model_training_instance.get_loss_tensor_accessor(), cpu_allocator)); + computation_graph_instance.get_loss_tensor_accessor().value(), + cpu_allocator)); } // Assert that each sample in the batch has a lower loss in last epoch than diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml index a429a958f7..c6e6673f33 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml @@ -10,12 +10,17 @@ features = [ includes = [ "pcg/layer_guid_t.dtg.h", "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h", + "task-spec/dynamic_graph/dynamic_loss_layer_guid_t.dtg.h", ] [[values]] -name = "layer_guid" type = 
"::FlexFlow::layer_guid_t" +key = "layer_guid" [[values]] -name = "pcg_layer_guid" type = "::FlexFlow::parallel_layer_guid_t" +key = "pcg_layer_guid" + +[[values]] +type = "::FlexFlow::dynamic_loss_layer_guid_t" +key = "loss_layer_guid" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.h b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.h new file mode 100644 index 0000000000..8a42911c44 --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_LAYER_GUID_T_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_LAYER_GUID_T_H + +#include "pcg/layer_guid_t.dtg.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.h" + +namespace FlexFlow { + +dynamic_layer_guid_t mk_dynamic_layer_guid_for_cg_layer(layer_guid_t); +dynamic_layer_guid_t mk_dynamic_layer_guid_for_pcg_layer(parallel_layer_guid_t); +dynamic_layer_guid_t mk_dynamic_layer_guid_for_loss(); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_loss_layer_guid_t.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_loss_layer_guid_t.dtg.toml new file mode 100644 index 0000000000..b8de8c62c2 --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_loss_layer_guid_t.dtg.toml @@ -0,0 +1,13 @@ +namespace = "FlexFlow" +name = "dynamic_loss_layer_guid_t" +type = "struct" +features = [ + "eq", + "ord", + "hash", + "json", + "fmt", + "rapidcheck", +] + +fields = [] diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_loss_tensor_guid_t.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_loss_tensor_guid_t.dtg.toml new file mode 100644 index 0000000000..35bf3924aa --- /dev/null +++ 
b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_loss_tensor_guid_t.dtg.toml @@ -0,0 +1,13 @@ +namespace = "FlexFlow" +name = "dynamic_loss_tensor_guid_t" +type = "struct" +features = [ + "eq", + "ord", + "hash", + "json", + "fmt", + "rapidcheck", +] + +fields = [] diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_loss_tensor_role.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_loss_tensor_role.dtg.toml new file mode 100644 index 0000000000..dde7bda5f0 --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_loss_tensor_role.dtg.toml @@ -0,0 +1,13 @@ +namespace = "FlexFlow" +name = "DynamicLossTensorRole" +type = "struct" +features = [ + "eq", + "ord", + "hash", + "json", + "fmt", + "rapidcheck", +] + +fields = [] diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_node_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_node_attrs.dtg.toml index 4d58e5f04f..128e305dc6 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_node_attrs.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_node_attrs.dtg.toml @@ -12,8 +12,8 @@ includes = [ "task-spec/dynamic_graph/dynamic_task_type.dtg.h", "pcg/machine_space_coordinate.dtg.h", "pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h", - "op-attrs/pcg_operator_attrs.dtg.h", "task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.h", + "task-spec/dynamic_graph/training_operation_attrs.dtg.h", "task-spec/device_specific_per_device_op_state.dtg.h", ] @@ -35,7 +35,7 @@ type = "std::optional<::FlexFlow::MappedOperatorTaskGroup>" [[fields]] name = "op_attrs" -type = "std::optional<::FlexFlow::PCGOperatorAttrs>" +type = "std::optional<::FlexFlow::TrainingOperationAttrs>" [[fields]] name = "layer_guid" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.toml index 
ba16732364..cac13465a0 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.toml @@ -1,8 +1,9 @@ namespace = "FlexFlow" name = "DynamicOpenDataflowGraph" type = "struct" -features = [ +features = [ "eq", + "fmt", ] includes = [ @@ -10,6 +11,10 @@ includes = [ "", ] +src_includes = [ + "utils/fmt/unordered_set.h", +] + [[fields]] name = "invocations" type = "std::unordered_set<::FlexFlow::DynamicNodeInvocation>" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_open_dataflow_graph.h b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_open_dataflow_graph.h index a3bbba592f..ddd97a258a 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_open_dataflow_graph.h +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_open_dataflow_graph.h @@ -3,6 +3,7 @@ #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h" +#include "utils/graph/labelled_open_kwarg_dataflow_graph/labelled_open_kwarg_dataflow_graph.h" namespace FlexFlow { @@ -31,6 +32,11 @@ std::unordered_multiset std::unordered_set get_dynamic_invocation_set(DynamicOpenDataflowGraph const &); +std::optional + find_output_value_attrs(DynamicOpenDataflowGraph const &, + dynamic_tensor_guid_t, + std::optional const &); + DynamicOpenDataflowGraph transform_dynamic_invocation_set( DynamicOpenDataflowGraph const &, std::function const @@ -44,6 +50,14 @@ DynamicOpenDataflowGraph flatmap_dynamic_invocation_set( DynamicOpenDataflowGraph dynamic_open_dataflow_graph_from_invocation_set( std::unordered_set const &); +std::pair, + bidict> + labelled_open_kwarg_dataflow_graph_from_dynamic_open_dataflow_graph( + DynamicOpenDataflowGraph const &); + bool dynamic_open_dataflow_graphs_are_isomorphic( DynamicOpenDataflowGraph const &, DynamicOpenDataflowGraph const &); diff --git 
a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_task_type.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_task_type.dtg.toml index 2885d7d0d3..1585d55192 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_task_type.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_task_type.dtg.toml @@ -16,3 +16,6 @@ name = "BWD" [[values]] name = "UPD" + +[[values]] +name = "LOSS" diff --git a/lib/local-execution/include/local-execution/tensor_slot_backing.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.toml similarity index 50% rename from lib/local-execution/include/local-execution/tensor_slot_backing.dtg.toml rename to lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.toml index 4d8c817461..85f8f299a4 100644 --- a/lib/local-execution/include/local-execution/tensor_slot_backing.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.toml @@ -1,24 +1,20 @@ namespace = "FlexFlow" -name = "TensorSlotBacking" +name = "DynamicTensorAccessor" type = "variant" features = [ "eq", "fmt", + "hash", ] includes = [ "kernels/accessor.h", - "", -] - -src_includes = [ - "utils/fmt/vector.h", ] [[values]] -type = "::FlexFlow::GenericTensorAccessorW" -key = "single" +type = "::FlexFlow::GenericTensorAccessorR" +key = "read" [[values]] -type = "std::vector<::FlexFlow::GenericTensorAccessorW>" -key = "variadic" +type = "::FlexFlow::GenericTensorAccessorW" +key = "write" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml index b0c571871b..75e9099104 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml @@ -10,12 +10,17 @@ features = [ includes = [ "pcg/tensor_guid_t.dtg.h", 
"pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h", + "task-spec/dynamic_graph/dynamic_loss_tensor_guid_t.dtg.h", ] [[values]] -name = "tensor_guid" type = "::FlexFlow::tensor_guid_t" +key = "cg_tensor" [[values]] -name = "pcg_tensor_guid" type = "::FlexFlow::parallel_tensor_guid_t" +key = "pcg_tensor" + +[[values]] +type = "::FlexFlow::dynamic_loss_tensor_guid_t" +key = "loss_tensor" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.h b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.h new file mode 100644 index 0000000000..3497eec0ae --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_TENSOR_GUID_T_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_DYNAMIC_TENSOR_GUID_T_H + +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h" + +namespace FlexFlow { + +dynamic_tensor_guid_t mk_dynamic_tensor_guid_for_tensor_guid(tensor_guid_t); +dynamic_tensor_guid_t + mk_dynamic_tensor_guid_for_parallel_tensor_guid(parallel_tensor_guid_t); +dynamic_tensor_guid_t mk_dynamic_tensor_guid_for_loss(); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_role.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_role.dtg.toml index 91d05dbc2d..d640542323 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_role.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_role.dtg.toml @@ -11,6 +11,7 @@ features = [ includes = [ "task-spec/fwb_tensor_type.dtg.h", + "task-spec/dynamic_graph/dynamic_loss_tensor_role.dtg.h", "task-spec/dynamic_graph/dynamic_optimizer_tensor_role.dtg.h", ] @@ -21,3 +22,7 @@ key = "fwb_tensor" [[values]] type = 
"::FlexFlow::DynamicOptimizerTensorRole" key = "optimizer_tensor" + +[[values]] +type = "::FlexFlow::DynamicLossTensorRole" +key = "loss_tensor" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_role.h b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_role.h index 374230bd0d..a9db66c980 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_role.h +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_role.h @@ -11,6 +11,7 @@ DynamicTensorRole dynamic_tensor_role_from_fwb_tensor_type(FwbTensorType); DynamicTensorRole mk_dynamic_tensor_role_fwd(); DynamicTensorRole mk_dynamic_tensor_role_bwd(); DynamicTensorRole mk_dynamic_tensor_role_opt(OptimizerSlotName); +DynamicTensorRole mk_dynamic_tensor_role_loss(); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml index 6638f16e62..89b94b1017 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml @@ -11,9 +11,8 @@ includes = [ "", "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h", "op-attrs/parallel_tensor_shape.dtg.h", - "kernels/accessor.h", - "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h", "op-attrs/parallel_tensor_space_coordinate.dtg.h", + "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h", "task-spec/dynamic_graph/dynamic_tensor_role.dtg.h", ] @@ -35,7 +34,7 @@ type = "std::optional<::FlexFlow::ParallelTensorSpaceCoordinate>" [[fields]] name = "accessor" -type = "std::optional<::FlexFlow::GenericTensorAccessorW>" +type = "std::optional<::FlexFlow::DynamicTensorAccessor>" [[fields]] name = "role" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h b/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h new file mode 100644 index 
0000000000..c7cef3f06f --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_LOSS_INSERTION_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_LOSS_INSERTION_H + +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h" +#include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h" +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "task-spec/dynamic_graph/loss_insertion_result.dtg.h" + +namespace FlexFlow { + +LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg, + LossAttrs const &loss_attrs, + dynamic_tensor_guid_t logit_tensor); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion_result.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion_result.dtg.toml new file mode 100644 index 0000000000..4c2c316d1d --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion_result.dtg.toml @@ -0,0 +1,24 @@ +namespace = "FlexFlow" +name = "LossInsertionResult" +type = "struct" +features = [ + "eq", + "fmt", +] + +includes = [ + "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h", + "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h", +] + +[[fields]] +name = "dataflow_graph" +type = "::FlexFlow::DynamicOpenDataflowGraph" + +[[fields]] +name = "label_value" +type = "::FlexFlow::DynamicValueAttrs" + +[[fields]] +name = "logit_grad_value" +type = "::FlexFlow::DynamicValueAttrs" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml new file mode 100644 index 0000000000..66c475b3a9 --- /dev/null +++ b/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml @@ -0,0 +1,21 @@ +namespace = 
"FlexFlow" +name = "TrainingOperationAttrs" +type = "variant" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "op-attrs/ops/loss_functions/loss_attrs.dtg.h", + "op-attrs/pcg_operator_attrs.dtg.h", +] + +[[values]] +type = "::FlexFlow::PCGOperatorAttrs" +key = "pcg_op" + +[[values]] +type = "::FlexFlow::LossAttrs" +key = "loss" diff --git a/lib/task-spec/include/task-spec/task_argument_accessor/itask_argument_accessor.h b/lib/task-spec/include/task-spec/task_argument_accessor/itask_argument_accessor.h index 8a8d741d90..a7d1af4022 100644 --- a/lib/task-spec/include/task-spec/task_argument_accessor/itask_argument_accessor.h +++ b/lib/task-spec/include/task-spec/task_argument_accessor/itask_argument_accessor.h @@ -24,8 +24,6 @@ struct ITaskArgumentAccessor { virtual ~ITaskArgumentAccessor() = default; - virtual ConcreteArgSpec const &get_concrete_arg(arg_slot_id_t) const = 0; - virtual GenericTensorAccessor get_tensor(TaskTensorParameter, Permissions priv) const = 0; diff --git a/lib/task-spec/src/task-spec/dynamic_graph/dynamic_layer_guid_t.cc b/lib/task-spec/src/task-spec/dynamic_graph/dynamic_layer_guid_t.cc new file mode 100644 index 0000000000..1b985076e2 --- /dev/null +++ b/lib/task-spec/src/task-spec/dynamic_graph/dynamic_layer_guid_t.cc @@ -0,0 +1,17 @@ +#include "task-spec/dynamic_graph/dynamic_layer_guid_t.h" +#include "task-spec/dynamic_graph/dynamic_loss_layer_guid_t.dtg.h" + +namespace FlexFlow { + +dynamic_layer_guid_t mk_dynamic_layer_guid_for_cg_layer(layer_guid_t l) { + return dynamic_layer_guid_t{l}; +} +dynamic_layer_guid_t + mk_dynamic_layer_guid_for_pcg_layer(parallel_layer_guid_t l) { + return dynamic_layer_guid_t{l}; +} +dynamic_layer_guid_t mk_dynamic_layer_guid_for_loss() { + return dynamic_layer_guid_t{dynamic_loss_layer_guid_t{}}; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc b/lib/task-spec/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc index 
8568b56b11..bf9fe1d3a0 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/dynamic_open_dataflow_graph.cc @@ -84,6 +84,20 @@ std::unordered_set return g.invocations; } +std::optional + find_output_value_attrs(DynamicOpenDataflowGraph const &dg, + dynamic_tensor_guid_t tensor_guid, + std::optional const &role) { + for (DynamicNodeInvocation const &invocation : dg.invocations) { + for (auto const &[slot, output] : invocation.outputs) { + if (output.tensor_guid == tensor_guid && output.role == role) { + return output; + } + } + } + return std::nullopt; +} + DynamicOpenDataflowGraph transform_dynamic_invocation_set( DynamicOpenDataflowGraph const &g, std::function const @@ -120,10 +134,11 @@ DynamicOpenDataflowGraph dynamic_open_dataflow_graph_from_invocation_set( }; } -LabelledOpenKwargDataflowGraph +std::pair, + bidict> labelled_open_kwarg_dataflow_graph_from_dynamic_open_dataflow_graph( DynamicOpenDataflowGraph const &g) { @@ -177,6 +192,7 @@ LabelledOpenKwargDataflowGraph node_map; std::unordered_set to_add = g.invocations; auto add_invocation_to_graph = @@ -189,6 +205,7 @@ LabelledOpenKwargDataflowGraph lhs_dataflow_graph = labelled_open_kwarg_dataflow_graph_from_dynamic_open_dataflow_graph( - lhs); + lhs) + .first; LabelledOpenKwargDataflowGraphView rhs_dataflow_graph = labelled_open_kwarg_dataflow_graph_from_dynamic_open_dataflow_graph( - rhs); + rhs) + .first; return find_isomorphism_between_labelled_open_kwarg_dataflow_graphs( lhs_dataflow_graph, rhs_dataflow_graph) diff --git a/lib/task-spec/src/task-spec/dynamic_graph/dynamic_tensor_guid_t.cc b/lib/task-spec/src/task-spec/dynamic_graph/dynamic_tensor_guid_t.cc new file mode 100644 index 0000000000..b335575fb2 --- /dev/null +++ b/lib/task-spec/src/task-spec/dynamic_graph/dynamic_tensor_guid_t.cc @@ -0,0 +1,17 @@ +#include "task-spec/dynamic_graph/dynamic_tensor_guid_t.h" +#include 
"task-spec/dynamic_graph/dynamic_loss_tensor_guid_t.dtg.h" + +namespace FlexFlow { + +dynamic_tensor_guid_t mk_dynamic_tensor_guid_for_tensor_guid(tensor_guid_t t) { + return dynamic_tensor_guid_t{t}; +} +dynamic_tensor_guid_t + mk_dynamic_tensor_guid_for_parallel_tensor_guid(parallel_tensor_guid_t t) { + return dynamic_tensor_guid_t{t}; +} +dynamic_tensor_guid_t mk_dynamic_tensor_guid_for_loss() { + return dynamic_tensor_guid_t{dynamic_loss_tensor_guid_t{}}; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/dynamic_graph/dynamic_tensor_role.cc b/lib/task-spec/src/task-spec/dynamic_graph/dynamic_tensor_role.cc index 235436cdac..17b97a4b37 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/dynamic_tensor_role.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/dynamic_tensor_role.cc @@ -19,4 +19,8 @@ DynamicTensorRole mk_dynamic_tensor_role_opt(OptimizerSlotName s) { return DynamicTensorRole{DynamicOptimizerTensorRole{s}}; } +DynamicTensorRole mk_dynamic_tensor_role_loss() { + return DynamicTensorRole{DynamicLossTensorRole{}}; +} + } // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/dynamic_graph/dynamic_value_attrs.cc b/lib/task-spec/src/task-spec/dynamic_graph/dynamic_value_attrs.cc index 418f496450..282279edbe 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/dynamic_value_attrs.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/dynamic_value_attrs.cc @@ -2,10 +2,15 @@ namespace FlexFlow { -DynamicValueAttrs decide_dynamic_value_attrs_role(DynamicValueAttrs const &, - DynamicTensorRole) { +DynamicValueAttrs + decide_dynamic_value_attrs_role(DynamicValueAttrs const &attrs, + DynamicTensorRole role) { + ASSERT(attrs.role == std::nullopt); - NOT_IMPLEMENTED(); + DynamicValueAttrs result = attrs; + result.role = role; + + return result; } } // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc b/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc new file mode 100644 index 
0000000000..4270119612 --- /dev/null +++ b/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc @@ -0,0 +1,65 @@ +#include "task-spec/dynamic_graph/loss_insertion.h" +#include "pcg/optimizer_attrs.h" +#include "task-spec/dynamic_graph/dynamic_layer_guid_t.h" +#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" +#include "task-spec/dynamic_graph/dynamic_tensor_guid_t.h" +#include "task-spec/dynamic_graph/dynamic_tensor_role.h" +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" +#include "task-spec/dynamic_graph/training_operation_attrs.dtg.h" +#include "utils/optional.h" +#include + +namespace FlexFlow { + +LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg, + LossAttrs const &loss_attrs, + dynamic_tensor_guid_t logit_tensor) { + DynamicValueAttrs logit_value = assert_unwrap( + find_output_value_attrs(dg, logit_tensor, mk_dynamic_tensor_role_fwd())); + + DynamicValueAttrs label_value{ + /*tensor_guid=*/mk_dynamic_tensor_guid_for_loss(), + /*parallel_tensor_shape=*/logit_value.parallel_tensor_shape, + /*shard_coord=*/logit_value.shard_coord, + /*accessor=*/std::nullopt, + /*role=*/mk_dynamic_tensor_role_loss(), + }; + DynamicValueAttrs logit_grad_value{ + /*tensor_guid=*/logit_value.tensor_guid, + /*parallel_tensor_shape=*/logit_value.parallel_tensor_shape, + /*shard_coord=*/logit_value.shard_coord, + /*accessor=*/std::nullopt, + /*role=*/mk_dynamic_tensor_role_bwd(), + }; + DynamicNodeInvocation loss_invocation{ + /*inputs=*/{ + {DynamicTensorSlot{/*slot_name=*/TensorSlotName::INPUT, + /*slot_tensor_role=*/label_value.role}, + label_value}, + {DynamicTensorSlot{/*slot_name=*/TensorSlotName::LOGIT, + /*slot_tensor_role=*/logit_value.role}, + logit_value}, + }, + /*node_attrs=*/ + DynamicNodeAttrs{ + /*task_type=*/DynamicTaskType::LOSS, + /*device_coord=*/std::nullopt, + /*mapping=*/std::nullopt, + /*op_attrs=*/TrainingOperationAttrs{loss_attrs}, 
+ /*layer_guid=*/mk_dynamic_layer_guid_for_loss(), + /*per_device_op_state=*/std::nullopt, + }, + /*outputs=*/ + { + {DynamicTensorSlot{/*slot_name=*/TensorSlotName::LOGIT, + /*slot_tensor_role=*/logit_grad_value.role}, + logit_grad_value}, + }, + }; + DynamicOpenDataflowGraph result = dg; + result.invocations.insert(loss_invocation); + return LossInsertionResult{result, label_value, logit_grad_value}; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.cc index 415151f43a..204597386e 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.cc @@ -1,9 +1,12 @@ #include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.h" +#include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/pcg_operator_attrs.h" #include "pcg/computation_graph.h" +#include "pcg/tensor_attrs.dtg.h" #include "task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.h" #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" #include "task-spec/dynamic_graph/dynamic_tensor_role.h" +#include "task-spec/dynamic_graph/training_operation_attrs.dtg.h" #include "utils/containers/generate_map.h" #include #include @@ -20,7 +23,9 @@ DynamicOpenDataflowGraph /*task_type=*/std::nullopt, /*device_coord=*/std::nullopt, /*mapping=*/std::nullopt, - /*op_attrs=*/pcg_op_attrs_from_compgraph_op_attrs(attrs.op_attrs), + /*op_attrs=*/ + TrainingOperationAttrs{ + pcg_op_attrs_from_compgraph_op_attrs(attrs.op_attrs)}, /*pcg_layer_guid=*/dynamic_layer_guid_t{layer}, /*per_device_op_state=*/std::nullopt, }; @@ -28,7 +33,8 @@ DynamicOpenDataflowGraph std::unordered_map result_inputs = transform( get_incoming_tensors(cg, layer), - [](TensorSlotName const &slot_name, tensor_guid_t const &tensor) { + 
[&](TensorSlotName const &slot_name, tensor_guid_t const &tensor) { + TensorAttrs attrs = get_tensor_attrs(cg, tensor); return std::pair{ DynamicTensorSlot{ /*slot_name=*/slot_name, @@ -36,7 +42,7 @@ DynamicOpenDataflowGraph }, DynamicValueAttrs{ /*tensor_guid=*/dynamic_tensor_guid_t{tensor}, - /*parallel_tensor_shape=*/std::nullopt, + /*parallel_tensor_shape=*/lift_to_parallel(attrs.shape), /*shard_coord=*/std::nullopt, /*accessor=*/std::nullopt, /*role=*/std::nullopt, @@ -46,7 +52,8 @@ DynamicOpenDataflowGraph std::unordered_map result_outputs = transform( get_outgoing_tensors(cg, layer), - [](TensorSlotName const &slot_name, tensor_guid_t const &tensor) { + [&](TensorSlotName const &slot_name, tensor_guid_t const &tensor) { + TensorAttrs attrs = get_tensor_attrs(cg, tensor); return std::pair{ DynamicTensorSlot{ /*slot_name=*/slot_name, @@ -54,7 +61,7 @@ DynamicOpenDataflowGraph }, DynamicValueAttrs{ /*tensor_guid=*/dynamic_tensor_guid_t{tensor}, - /*parallel_tensor_shape=*/std::nullopt, + /*parallel_tensor_shape=*/lift_to_parallel(attrs.shape), /*shard_coord=*/std::nullopt, /*accessor=*/std::nullopt, /*role=*/std::nullopt, @@ -62,7 +69,7 @@ DynamicOpenDataflowGraph }; }); - result.invocations.emplace(result_outputs, result_attrs, result_outputs); + result.invocations.emplace(result_inputs, result_attrs, result_outputs); } return result; diff --git a/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc b/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc index 2a1ae071fa..0cee06368f 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/pass_expansion.cc @@ -31,7 +31,7 @@ bool graph_is_fully_pass_expanded(DynamicOpenDataflowGraph const &g) { DynamicTensorSlot pass_expand_slot(DynamicTensorSlot const &s, FwbTensorType tensor_type) { - ASSERT(s.slot_tensor_role == std::nullopt); + ASSERT(!slot_is_pass_expanded(s)); DynamicTensorSlot result = s; result.slot_tensor_role = diff --git 
a/lib/task-spec/src/task-spec/dynamic_graph/update_insertion.cc b/lib/task-spec/src/task-spec/dynamic_graph/update_insertion.cc index 66e7115a83..58a32db6c1 100644 --- a/lib/task-spec/src/task-spec/dynamic_graph/update_insertion.cc +++ b/lib/task-spec/src/task-spec/dynamic_graph/update_insertion.cc @@ -1,9 +1,10 @@ #include "task-spec/dynamic_graph/update_insertion.h" +#include "op-attrs/pcg_operator_attrs.dtg.h" #include "pcg/optimizer_attrs.h" #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" #include "task-spec/dynamic_graph/dynamic_tensor_role.h" -#include "task-spec/dynamic_graph/dynamic_tensor_slot.h" -#include "task-spec/dynamic_graph/dynamic_value_attrs.h" +#include "task-spec/dynamic_graph/dynamic_tensor_slot.dtg.h" +#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h" #include "task-spec/optimizer.h" #include "utils/containers/get_only.h" #include "utils/containers/map_from_pairs.h" @@ -13,7 +14,8 @@ namespace FlexFlow { static std::pair get_weight_output(DynamicNodeInvocation const &i) { - ASSERT(i.node_attrs.op_attrs.value().is_weight()); + ASSERT(i.node_attrs.op_attrs.value().is_pcg_op()); + ASSERT(i.node_attrs.op_attrs.value().require_pcg_op().is_weight()); ASSERT(i.inputs.size() == 0); auto [slot, value_attrs] = get_only(i.outputs); @@ -24,6 +26,23 @@ static std::pair }; } +static DynamicTensorSlot tensor_slot_with_role(DynamicTensorSlot const &slot, + DynamicTensorRole role) { + DynamicTensorSlot result = slot; + result.slot_tensor_role = role; + + return result; +} + +static DynamicValueAttrs + dynamic_value_attrs_with_role(DynamicValueAttrs const &attrs, + DynamicTensorRole role) { + DynamicValueAttrs result = attrs; + result.role = role; + + return result; +} + static DynamicNodeInvocation get_update_invocation_for_invocation( DynamicNodeInvocation const &i, OptimizerAttrs const &optimizer_attrs) { @@ -38,13 +57,13 @@ static DynamicNodeInvocation get_update_invocation_for_invocation( auto create_binding_for_role = 
[&](DynamicTensorRole const &role) -> std::pair { - DynamicTensorSlot binding_slot = decide_tensor_slot_role(slot, role); - DynamicValueAttrs value_attrs = decide_dynamic_value_attrs_role( + DynamicTensorSlot binding_slot = tensor_slot_with_role(slot, role); + DynamicValueAttrs binding_attrs = dynamic_value_attrs_with_role( value_attrs, mk_dynamic_tensor_role_fwd()); return std::pair{ binding_slot, - value_attrs, + binding_attrs, }; }; @@ -70,7 +89,8 @@ std::unordered_set OptimizerAttrs const &optimizer_attrs) { if (invocation.node_attrs.task_type.value() == DynamicTaskType::FWD && - invocation.node_attrs.op_attrs.value().is_weight()) { + invocation.node_attrs.op_attrs.value().is_pcg_op() && + invocation.node_attrs.op_attrs.value().require_pcg_op().is_weight()) { return std::unordered_set{ invocation, get_update_invocation_for_invocation(invocation, optimizer_attrs), diff --git a/lib/task-spec/src/task-spec/ops/impl/linear.cc b/lib/task-spec/src/task-spec/ops/impl/linear.cc index e90cbd2544..5985db64f7 100644 --- a/lib/task-spec/src/task-spec/ops/impl/linear.cc +++ b/lib/task-spec/src/task-spec/ops/impl/linear.cc @@ -47,8 +47,8 @@ static std::optional LinearAttrs attrs = acc.get_op_attrs().require_linear(); DeviceType kernel_device_type = acc.get_kernel_device_type(); ProfilingSettings profiling = acc.get_profiling_settings(); - LinearPerDeviceState per_device_state = - acc.get_per_device_op_state().require_linear().value(); + std::optional per_device_state = + acc.get_per_device_op_state().require_linear(); std::optional bias = std::nullopt; if (attrs.use_bias) { @@ -84,8 +84,8 @@ static std::optional LinearAttrs attrs = acc.get_op_attrs().require_linear(); DeviceType kernel_device_type = acc.get_kernel_device_type(); ProfilingSettings profiling = acc.get_profiling_settings(); - LinearPerDeviceState per_device_state = - acc.get_per_device_op_state().require_linear().value(); + std::optional per_device_state = + acc.get_per_device_op_state().require_linear(); 
std::optional bias_grad = std::nullopt; if (attrs.use_bias) { diff --git a/lib/task-spec/src/task-spec/optimizer.cc b/lib/task-spec/src/task-spec/optimizer.cc index 447f6095d2..bd21020085 100644 --- a/lib/task-spec/src/task-spec/optimizer.cc +++ b/lib/task-spec/src/task-spec/optimizer.cc @@ -10,8 +10,8 @@ namespace FlexFlow { static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { SGDOptimizerAttrs attrs = acc.get_optimizer_attrs().require_sgd_optimizer(); auto weight_grad = - acc.get_tensor_grad(TensorSlotName::WEIGHT); - auto weight = acc.get_tensor(TensorSlotName::WEIGHT); + acc.get_tensor_grad(TensorSlotName::OUTPUT); + auto weight = acc.get_tensor(TensorSlotName::OUTPUT); ProfilingSettings profiling = acc.get_profiling_settings(); DeviceType kernel_device_type = acc.get_kernel_device_type(); @@ -26,7 +26,7 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { std::optional sgd_v = std::nullopt; if (attrs.momentum > 0.0f) { - sgd_v = acc.get_optimizer_tensor(TensorSlotName::WEIGHT, + sgd_v = acc.get_optimizer_tensor(TensorSlotName::OUTPUT, OptimizerSlotName::SGD_V); ASSERT(sgd_v.value().shape == weight.shape); } @@ -54,8 +54,8 @@ TaskImplFunction get_sgd_update_task_impl() { static void adam_update_task_impl(TaskArgumentAccessor const &acc) { AdamOptimizerAttrs attrs = acc.get_optimizer_attrs().require_adam_optimizer(); auto weight_grad = - acc.get_tensor_grad(TensorSlotName::WEIGHT); - auto weight = acc.get_tensor(TensorSlotName::WEIGHT); + acc.get_tensor_grad(TensorSlotName::OUTPUT); + auto weight = acc.get_tensor(TensorSlotName::OUTPUT); auto v_tensor = acc.get_optimizer_tensor( TensorSlotName::WEIGHT, OptimizerSlotName::ADAM_V); auto m_tensor = acc.get_optimizer_tensor( diff --git a/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc b/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc index 998d73e9ff..20e0d00c57 100644 --- a/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc +++ 
b/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc @@ -46,7 +46,7 @@ task_id_with_noop_default_t return lift_task_id_t(task_id_t::ELEMENTBINARY_INIT_TASK_ID); }, [](ElementUnaryAttrs const &) { - return lift_task_id_t(task_id_t::ELEMENTBINARY_INIT_TASK_ID); + return lift_task_id_t(task_id_t::ELEMENTUNARY_INIT_TASK_ID); }, [](EmbeddingAttrs const &) { return default_noop_task(); }, [](FlatAttrs const &) { return default_noop_task(); }, @@ -111,7 +111,7 @@ task_id_with_noop_default_t return lift_task_id_t(task_id_t::ELEMENTBINARY_FWD_TASK_ID); }, [](ElementUnaryAttrs const &) { - return lift_task_id_t(task_id_t::ELEMENTBINARY_FWD_TASK_ID); + return lift_task_id_t(task_id_t::ELEMENTUNARY_FWD_TASK_ID); }, [](EmbeddingAttrs const &) { return lift_task_id_t(task_id_t::EMBED_FWD_TASK_ID); @@ -190,7 +190,7 @@ task_id_with_noop_default_t return lift_task_id_t(task_id_t::ELEMENTBINARY_BWD_TASK_ID); }, [](ElementUnaryAttrs const &) { - return lift_task_id_t(task_id_t::ELEMENTBINARY_BWD_TASK_ID); + return lift_task_id_t(task_id_t::ELEMENTUNARY_BWD_TASK_ID); }, [](EmbeddingAttrs const &) { return lift_task_id_t(task_id_t::EMBED_BWD_TASK_ID); diff --git a/lib/utils/include/utils/containers/map_keys_and_values.h b/lib/utils/include/utils/containers/map_keys_and_values.h new file mode 100644 index 0000000000..651ffb2aeb --- /dev/null +++ b/lib/utils/include/utils/containers/map_keys_and_values.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MAP_KEYS_AND_VALUES_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MAP_KEYS_AND_VALUES_H + +#include "utils/containers/keys.h" +#include +#include + +namespace FlexFlow { + +template , + typename V2 = std::invoke_result_t> +std::unordered_map map_keys_and_values( + std::unordered_map const &m, FK const &fk, FV const &fv) { + + std::unordered_map result; + for (auto const &kv : m) { + result.insert({fk(kv.first), fv(kv.second)}); + } + + ASSERT(keys(m).size() == keys(result).size(), + "keys passed to 
map_keys_and_values must be transformed into distinct keys"); + + return result; + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/optional.h b/lib/utils/include/utils/optional.h index 5fb33e79c9..81b81fbb45 100644 --- a/lib/utils/include/utils/optional.h +++ b/lib/utils/include/utils/optional.h @@ -7,6 +7,15 @@ namespace FlexFlow { +template > +U and_then(std::optional const &o, F &&f) { + if (o.has_value()) { + return f(o.value()); + } else { + return std::nullopt; + } +} + template T or_else(std::optional const &o, F &&f) { if (o.has_value()) { diff --git a/lib/utils/src/utils/containers/map_keys_and_values.cc b/lib/utils/src/utils/containers/map_keys_and_values.cc new file mode 100644 index 0000000000..b3b306988e --- /dev/null +++ b/lib/utils/src/utils/containers/map_keys_and_values.cc @@ -0,0 +1,16 @@ +#include "utils/containers/map_keys_and_values.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using K = value_type<0>; +using V = value_type<1>; +using K2 = value_type<2>; +using V2 = value_type<3>; +using FK = std::function; +using FV = std::function; + +template std::unordered_map map_keys_and_values( + std::unordered_map const &, FK const &, FV const &); + +} // namespace FlexFlow diff --git a/lib/utils/test/src/utils/containers/map_keys_and_values.cc b/lib/utils/test/src/utils/containers/map_keys_and_values.cc new file mode 100644 index 0000000000..d50ed82bfc --- /dev/null +++ b/lib/utils/test/src/utils/containers/map_keys_and_values.cc @@ -0,0 +1,26 @@ +#include "utils/containers/map_keys_and_values.h" +#include "test/utils/doctest/fmt/unordered_map.h" +#include + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("map_keys_and_values") { + SUBCASE("Distinct keys after transformation") { + std::unordered_map m = {{1, "one"}, {2, "three"}}; + auto fk = [](int x) { return x * x; }; + auto fv = [](std::string const &s) { return s.size(); }; + std::unordered_map result = map_keys_and_values(m, fk, fv); 
+ std::unordered_map correct = {{1, 3}, {4, 5}}; + CHECK(correct == result); + } + + SUBCASE("Non-distinct keys after transformation") { + std::unordered_map m = { + {1, "one"}, {2, "two"}, {-1, "minus one"}}; + auto fk = [](int x) { return std::abs(x); }; + auto fv = [](std::string const &s) { return s.size(); }; + CHECK_THROWS(map_keys_and_values(m, fk, fv)); + } + } +}