 import re
 from typing import Any, Protocol

+from absl import flags
 from absl import logging
 import jax
 from recml.core.utils import types
@@ -162,12 +163,23 @@ class TFDatasetFactory(types.Factory[tf.data.Dataset]):
       Defaults to False.
     seed: An optional seed to use for deterministic shuffling / preprocessing.
       Defaults to None.
-    tf_data_service_address: An optional URI of a tf.data service to offload
-      preprocessing onto during training. The URI should be in the format
-      "protocol://address", e.g. "grpc://tf-data-service:5050". If `None` no
-      data service will be applied.
+    enable_tf_data_service: Whether to apply tf.data service for this dataset.
+      If True, the flag `tf_data_service_address` must be set.
     tf_data_service_policy: Sharding policy to use for tf.data service when it
       is enabled.
+    tf_data_service_job_name: Job name to use for the tf.data service. If
+      None, a default shared job name will be used.
+    offload_preprocessing_to_tf_data_service: Whether to offload the
+      preprocessing transformations to tf.data service workers. If True,
+      `enable_tf_data_service` must also be True and the preprocessing
+      transformations run on the tf.data service workers; otherwise they run
+      on the host CPU. Must be False when tf.data service is not enabled.
+      Defaults to False.
+    tf_data_service_replicate_on_split: Whether to replicate the file dataset
+      on split when distributing data to tf.data service workers. This is
+      useful when multiple datasets are processed together under the `DYNAMIC`
+      sharding policy: a dataset with this option enabled is effectively
+      processed under the `OFF` policy.
     feature_spec: A mapping of feature keys to `FixedLenFeature`,
       `VarLenFeature`, `SparseFeature`, or `RaggedFeature` values. This will be
       used to parse the TF examples, or as context_features spec to parse TF
@@ -206,12 +218,13 @@ class TFDatasetFactory(types.Factory[tf.data.Dataset]):
       dataset. Defaults to `ShardingInfo(num_processes=jax.process_count(),
       process_index=jax.process_index())`. This is similar to `InputContext` in
       tensorflow.
+    cache_reading: Whether to cache the reading of the dataset. This is useful
+      for debugging and testing. Defaults to False.
     debug: An optional boolean indicating whether to debug input boundedness. If
       `True`, the dataset will consist of a single batch that's cached and
-      infinitely repeated
+      infinitely repeated.
   """

-  cache_reading: bool = False
   input_path: str | Sequence[str] = ""
   tfds_source: str | Sequence[str] = ""
   file_format: FileFormat = FileFormat.RECORDIO
@@ -231,10 +244,13 @@ class TFDatasetFactory(types.Factory[tf.data.Dataset]):
   readahead: str | None = None
   group_uris_by_dir: bool = False
   seed: int | None = None
-  tf_data_service_address: str | None = None
+  enable_tf_data_service: bool = False
+  tf_data_service_job_name: str | None = None
   tf_data_service_policy: tf.data.experimental.service.ShardingPolicy = (
       tf.data.experimental.service.ShardingPolicy.OFF
   )
+  offload_preprocessing_to_tf_data_service: bool = False
+  tf_data_service_replicate_on_split: bool = False
   feature_spec: Mapping[str, IO_Feature] | None = None
   sequence_feature_spec: Mapping[str, IO_Feature] | None = None
   tf_transform_output: TFTransformOutput | None = None
@@ -246,14 +262,26 @@ class TFDatasetFactory(types.Factory[tf.data.Dataset]):
   sharding_info: DatasetShardingInfo = dataclasses.field(
       default_factory=DatasetShardingInfo
   )
+  cache_reading: bool = False
   debug: bool = False

   def __post_init__(self):
-    if self.tf_data_service_address is not None:
+    if self.enable_tf_data_service:
+      if flags.FLAGS.tf_data_service_address is None:
+        raise ValueError(
+            "Flag `tf_data_service_address` must be set when"
+            " `enable_tf_data_service` is True."
+        )
       if self.seed is not None:
         raise ValueError("`seed` must be None for data service.")
       if self.sharding:
         raise ValueError("`sharding` must be set to False for data service.")
+    else:
+      if self.offload_preprocessing_to_tf_data_service:
+        raise ValueError(
+            "`offload_preprocessing_to_tf_data_service` must be False when"
+            " `enable_tf_data_service` is False."
+        )

   @functools.cached_property
   def tfds_metadata(self) -> TFDSMetadata | None:
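With this change the service address comes from the `tf_data_service_address` absl flag instead of a dataclass field. A minimal sketch of how a caller might configure the factory, assuming the flag is defined elsewhere in the binary and flags are parsed before construction (the flag definition below is shown only for illustration):

```python
from absl import flags

# Assumed to be defined elsewhere in the binary; repeated here for clarity.
flags.DEFINE_string(
    "tf_data_service_address",
    None,
    "URI of the tf.data service, e.g. grpc://tf-data-service:5050.",
)

# `__post_init__` raises a ValueError if the flag is unset, if `seed` is
# set, or if `sharding` is True while the service is enabled.
factory = TFDatasetFactory(
    global_batch_size=1024,
    enable_tf_data_service=True,
    offload_preprocessing_to_tf_data_service=True,
    sharding=False,
)
```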
@@ -464,6 +492,9 @@ def _file_group_reader(file_group: str) -> tf.data.Dataset:
     # Create a dataset of file / file group uris.
     dataset = tf.data.Dataset.from_tensor_slices(uris)

+    if self.tf_data_service_replicate_on_split:
+      dataset = tf.data.apply_rewrite(dataset, rewrite="replicate_on_split")
+
     # Repeat the dataset. We might need to repeat it here in case the issue in
     # internal screenshot link:6jAKKoEMT3afXRe is encountered, even when we
     # have enough shards for the input data.
@@ -478,7 +509,7 @@ def _file_group_reader(file_group: str) -> tf.data.Dataset:
     )

     # Generate a tf.Example dataset by cycling through all uris in parallel.
-    return dataset.interleave(
+    dataset = dataset.interleave(
         map_func=reader,
         cycle_length=self.cycle_length,
         block_length=self.block_length,
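For reference, `interleave` cycles through up to `cycle_length` concurrently open readers, pulling `block_length` consecutive elements from each before moving on. A toy standalone illustration with a hypothetical in-memory reader:

```python
import tensorflow as tf

uris = tf.data.Dataset.from_tensor_slices(["a", "b", "c"])

def reader(uri: tf.Tensor) -> tf.data.Dataset:
  # Stand-in for a per-file reader; yields two records per "file".
  return tf.data.Dataset.from_tensor_slices(
      [tf.strings.join([uri, ":0"]), tf.strings.join([uri, ":1"])]
  )

ds = uris.interleave(map_func=reader, cycle_length=2, block_length=1)
print(list(ds.as_numpy_iterator()))
# [b'a:0', b'b:0', b'a:1', b'b:1', b'c:0', b'c:1']
```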
@@ -490,6 +521,12 @@ def _file_group_reader(file_group: str) -> tf.data.Dataset:
         deterministic=self.deterministic,
     )

+    # Cache the reading of examples from files.
+    if self.cache_reading:
+      dataset = dataset.cache()
+
+    return dataset
+
   def _parse_dataset(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
     """Batches and parses an examples dataset."""
     # Batch the dataset to the global or per replica batch size.
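Caching right after the interleave means later epochs replay the raw examples from memory instead of re-reading files, which is the debugging/testing behavior `cache_reading` documents. A toy standalone illustration of `tf.data.Dataset.cache` semantics:

```python
import tensorflow as tf

reads = []

def expensive_read(x):
  # Stand-in for reading a record from disk; records each invocation.
  reads.append(int(x))
  return x

ds = tf.data.Dataset.range(3).map(
    lambda x: tf.py_function(expensive_read, [x], tf.int64)
)
ds = ds.cache()  # the first pass materializes elements; later passes replay

list(ds.as_numpy_iterator())  # reads == [0, 1, 2]
list(ds.as_numpy_iterator())  # reads is unchanged: served from the cache
```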
@@ -533,45 +570,51 @@ def _maybe_apply_tf_data_service(
       self, dataset: tf.data.Dataset
   ) -> tf.data.Dataset:
     """Applies the tf.data service to the dataset."""
-    if self.tf_data_service_address is None:
+    if not self.enable_tf_data_service:
       return dataset

+    tf_data_service_address = flags.FLAGS.tf_data_service_address
+
     per_proc_batch_size = self.sharding_info.per_process_batch_size(
         self.global_batch_size
     )
     logging.info(
         "Applying tf.data service with address %s and per replica batch"
         " size %s",
-        self.tf_data_service_address,
+        tf_data_service_address,
         per_proc_batch_size,
     )
     return dataset.apply(
         tf.data.experimental.service.distribute(
             processing_mode=self.tf_data_service_policy,
-            service=self.tf_data_service_address,
-            job_name=f"bs_{per_proc_batch_size}",
+            service=tf_data_service_address,
+            job_name=self.tf_data_service_job_name
+            or "tf_data_service_shared_job_name",
         )
     )

   def make(self) -> tf.data.Dataset:
     """Creates a `tf.data.Dataset` instance with all dataset ops applied."""
     # Create an examples dataset.
-    if self.cache_reading:
-      dataset = self._create_dataset().cache()
-    else:
-      dataset = self._create_dataset()
+    dataset = self._create_dataset()
     # Shuffle and repeat the dataset.
     dataset = self._maybe_shuffle_and_repeat(dataset)
     # Batch and parse the examples dataset.
     dataset = self._parse_dataset(dataset)
     # Apply filters to the batched dataset.
     dataset = self._maybe_filter_dataset(dataset)
-    # Apply data service.
-    dataset = self._maybe_apply_tf_data_service(dataset)
+    # Apply tf.data service before preprocessing.
+    if not self.offload_preprocessing_to_tf_data_service:
+      dataset = self._maybe_apply_tf_data_service(dataset)
+
     # Apply transformations on the dataset.
     for fn in self.map_fns:
       dataset = dataset.map(fn, num_parallel_calls=self.num_parallel_threads)

+    # Apply tf.data service after preprocessing.
+    if self.offload_preprocessing_to_tf_data_service:
+      dataset = self._maybe_apply_tf_data_service(dataset)
+
     if self.debug:
       dataset = dataset.take(1).cache().repeat()

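The two branches in `make` place the service `distribute` transformation either before or after the `map_fns`. A rough standalone sketch of the difference, using the public `tf.data.experimental.service.distribute` API with a hypothetical address and preprocessing function:

```python
import tensorflow as tf

ADDRESS = "grpc://tf-data-service:5050"  # hypothetical service address

def distribute(ds: tf.data.Dataset) -> tf.data.Dataset:
  return ds.apply(
      tf.data.experimental.service.distribute(
          processing_mode=tf.data.experimental.service.ShardingPolicy.OFF,
          service=ADDRESS,
          job_name="tf_data_service_shared_job_name",
      )
  )

def preprocess(batch):
  return batch  # stand-in for the factory's map_fns

ds = tf.data.Dataset.range(64).batch(8)

# offload_preprocessing_to_tf_data_service=False: workers read and batch;
# preprocessing runs on the host CPU after elements arrive.
host_side = distribute(ds).map(preprocess)

# offload_preprocessing_to_tf_data_service=True: preprocessing is part of
# the graph shipped to and executed by the tf.data service workers.
worker_side = distribute(ds.map(preprocess))
```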
@@ -778,8 +821,7 @@ def _vectorized_filter(features: FeaturesDictType) -> FeaturesDictType:
       if isinstance(features[name], tf.SparseTensor):
         outputs[name] = tf.sparse_boolean_mask(features[name], mask)
       elif isinstance(features[name], tf.RaggedTensor):
-        # TODO(b/307323524): Support this when we start using Ragged tensors.
-        raise ValueError("Filtering ragged tensors is not supported.")
+        outputs[name] = tf.ragged.boolean_mask(features[name], mask)
       else:
         outputs[name] = tf.boolean_mask(features[name], mask)
     return outputs
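For reference, `tf.ragged.boolean_mask` with a rank-1 mask drops whole rows, mirroring what `tf.boolean_mask` does for the dense features above. A small standalone example:

```python
import tensorflow as tf

values = tf.ragged.constant([[1, 2], [3], [4, 5, 6]])
mask = tf.constant([True, False, True])  # keep examples 0 and 2

print(tf.ragged.boolean_mask(values, mask))
# <tf.RaggedTensor [[1, 2], [4, 5, 6]]>
```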