
Commit f24fd6e

Author: The tunix Authors

Remove explicit sharding after applying LoRA.

PiperOrigin-RevId: 825163138

1 parent b0e19d4 commit f24fd6e

File tree

6 files changed: +43 -54 lines changed

6 files changed

+43
-54
lines changed
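Every call site touched below drops the same explicit sharding block that used to run right after LoRA was applied. For reference while reading the diffs, a minimal standalone sketch of the deleted pattern follows; the single-axis mesh and the toy nnx.Linear standing in for the LoRA model are illustrative assumptions, since the real call sites operate on the model returned by qwix.apply_lora_to_model.

import jax
from flax import nnx

# Illustrative stand-ins (assumptions): a 1-D mesh over all local devices and
# a small annotated module in place of the qwix LoRA model.
mesh = jax.make_mesh((jax.device_count(),), ('fsdp',))
lora_model = nnx.Linear(
    16,
    16,
    kernel_init=nnx.with_partitioning(
        nnx.initializers.lecun_normal(), ('fsdp', None)
    ),
    bias_init=nnx.with_partitioning(nnx.initializers.zeros_init(), (None,)),
    rngs=nnx.Rngs(0),
)

# The explicit post-LoRA sharding block that this commit removes at each call
# site shown in the diffs below:
with mesh:
  state = nnx.state(lora_model)
  pspecs = nnx.get_partition_spec(state)
  sharded_state = jax.lax.with_sharding_constraint(state, pspecs)
  nnx.update(lora_model, sharded_state)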

examples/dpo_demo_gemma3.ipynb

Lines changed: 11 additions & 17 deletions

@@ -293,12 +293,6 @@
 " base_model, lora_provider, **model_input\n",
 " )\n",
 "\n",
-" with mesh:\n",
-" state = nnx.state(lora_model)\n",
-" pspecs = nnx.get_partition_spec(state)\n",
-" sharded_state = jax.lax.with_sharding_constraint(state, pspecs)\n",
-" nnx.update(lora_model, sharded_state)\n",
-"\n",
 " return lora_model"
 ]
 },
@@ -332,9 +326,9 @@
 },
 "outputs": [],
 "source": [
-"TEMPLATE = \"\"\"\u003cstart_of_turn\u003euser\n",
-"{question}\u003cend_of_turn\u003e\n",
-"\u003cstart_of_turn\u003emodel\"\"\"\n",
+"TEMPLATE = \"\"\"<start_of_turn>user\n",
+"{question}<end_of_turn>\n",
+"<start_of_turn>model\"\"\"\n",
 "\n",
 "\n",
 "def generate(\n",
@@ -426,10 +420,10 @@
 " except:\n",
 " print(\"SKIPPED accuracy check\")\n",
 "\n",
-" if corr_ctr_per_question \u003e 0:\n",
+" if corr_ctr_per_question > 0:\n",
 " break\n",
 "\n",
-" if corr_ctr_per_question \u003e 0:\n",
+" if corr_ctr_per_question > 0:\n",
 " corr += 1\n",
 " if corr_lst and make_lst:\n",
 " response_lst.append((question, answer, multiple_call_response))\n",
@@ -439,7 +433,7 @@
 "\n",
 " total += 1\n",
 " if total % 10 == 0:\n",
-" print(f\"===\u003e {corr=}, {total=}, {corr / total * 100=}\")\n",
+" print(f\"===> {corr=}, {total=}, {corr / total * 100=}\")\n",
 "\n",
 " to_return = (\n",
 " corr,\n",
@@ -459,13 +453,13 @@
 },
 "outputs": [],
 "source": [
-"def extract_hash_answer(text: str) -\u003e str | None:\n",
+"def extract_hash_answer(text: str) -> str | None:\n",
 " if \"####\" not in text:\n",
 " return None\n",
 " return text.split(\"####\")[1].strip()\n",
 "\n",
 "\n",
-"def get_dataset(data_dir, split=\"train\") -\u003e grain.MapDataset:\n",
+"def get_dataset(data_dir, split=\"train\") -> grain.MapDataset:\n",
 " # Download data\n",
 " if not os.path.exists(data_dir):\n",
 " os.makedirs(data_dir)\n",
@@ -548,7 +542,7 @@
 },
 "outputs": [],
 "source": [
-"def get_dataset() -\u003e grain.MapDataset:\n",
+"def get_dataset() -> grain.MapDataset:\n",
 " dpo_dataset = load_dataset(\n",
 " \"argilla/distilabel-intel-orca-dpo-pairs\", split=\"train\"\n",
 " )\n",
@@ -565,7 +559,7 @@
 " samples_to_add = total_samples_needed - num_gsm8k_train_samples\n",
 " print(f\"Number of additional random samples needed: {samples_to_add}\")\n",
 "\n",
-" if samples_to_add \u003e 0:\n",
+" if samples_to_add > 0:\n",
 " # Randomly select additional samples from the original dataset\n",
 " # Ensure we don't sample more than the total available in the original dataset\n",
 " random_samples = dpo_dataset.shuffle(seed=42).select(\n",
@@ -745,7 +739,7 @@
 },
 "outputs": [],
 "source": [
-"# The first couple of training step might take up to 5 minutes to finish. Please be patient. If you experience long training steps, e.g. \u003e10 minutes per step, please open a bug. Really appreciated!\n",
+"# The first couple of training step might take up to 5 minutes to finish. Please be patient. If you experience long training steps, e.g. >10 minutes per step, please open a bug. Really appreciated!\n",
 "\n",
 "if mesh is None:\n",
 " dpo_trainer.train(train_dataset)\n",
examples/qlora_demo.ipynb

Lines changed: 8 additions & 14 deletions
Large diffs are not rendered by default.

scripts/grpo_demo_llama3_qwen2.py

Lines changed: 0 additions & 6 deletions

@@ -422,12 +422,6 @@ def get_lora_model(base_model, model_mesh=None):
       base_model, lora_provider, **model_input
   )
 
-  with model_mesh:
-    state = nnx.state(lora_model)
-    pspecs = nnx.get_partition_spec(state)
-    sharded_state = jax.lax.with_sharding_constraint(state, pspecs)
-    nnx.update(lora_model, sharded_state)
-
   return lora_model
 
 
scripts/grpo_demo_sglang_jax_rollout.py

Lines changed: 0 additions & 6 deletions

@@ -380,12 +380,6 @@ def get_lora_model(base_model, mesh):
   #     base_model, lora_provider, **model_input
   # )
   lora_model = base_model
-  with mesh:
-    state = nnx.state(lora_model)
-    pspecs = nnx.get_partition_spec(state)
-    sharded_state = jax.lax.with_sharding_constraint(state, pspecs)
-    nnx.update(lora_model, sharded_state)
-
   return lora_model
 
 
tunix/cli/utils/model.py

Lines changed: 0 additions & 6 deletions

@@ -253,12 +253,6 @@ def apply_lora_to_model(base_model, mesh, lora_config):
       base_model, lora_provider, **model_input
   )
 
-  with mesh:
-    state = nnx.state(lora_model)
-    pspecs = nnx.get_partition_spec(state)
-    sharded_state = jax.lax.with_sharding_constraint(state, pspecs)
-    nnx.update(lora_model, sharded_state)
-
   return lora_model
 
 
tunix/tests/test_common.py

Lines changed: 24 additions & 5 deletions

@@ -30,6 +30,8 @@
 import os
 import shutil
 import gc
+from tunix.rl import utils
+from tunix.rl import reshard
 
 if hasattr(flax_config, 'flax_always_shard_variable'):
   flax_config.update('flax_always_shard_variable', False)
@@ -158,12 +160,29 @@ def get_lora_model(
   lora_model = qwix.apply_lora_to_model(
       model, lora_provider, **dummy_model_input
   )
-  if mesh is not None:
+
+  # Reshard the model if the mesh is specified and the lora model mesh is not
+  # the same as the input mesh.
+  lora_model_mesh = utils.get_pytree_mesh_info(nnx.state(lora_model))
+  if (
+      lora_model_mesh is not None
+      and mesh is not None
+      and lora_model_mesh != mesh
+  ):
     with mesh:
-      state = nnx.state(lora_model)
-      pspecs = nnx.get_partition_spec(state)
-      sharded_state = jax.lax.with_sharding_constraint(state, pspecs)
-      nnx.update(lora_model, sharded_state)
+      graph_def, state = nnx.split(lora_model)
+      default_memory_kind = jax.devices()[0].default_memory().kind
+      dst_shardings = jax.tree_util.tree_map(
+          lambda x: jax.sharding.NamedSharding(
+              mesh,
+              x,
+              memory_kind=default_memory_kind,
+          ),
+          nnx.get_partition_spec(state),
+      )
+      lora_model = nnx.merge(
+          graph_def, reshard.reshard_pytree(state, dst_shardings)
+      )
   return lora_model
 
 
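The test helper above now reshards only when the LoRA model already carries a mesh that differs from the requested one, and it pins the destination memory kind when building the target shardings. A condensed standalone sketch of that destination-sharding step follows; the toy array, the 1-D mesh, and the use of jax.device_put as a stand-in for the tunix reshard.reshard_pytree helper are assumptions for illustration, not part of the commit.

import jax
import jax.numpy as jnp
from jax.sharding import NamedSharding, PartitionSpec as P

# Illustrative setup (assumption): a 1-D mesh over all local devices.
mesh = jax.make_mesh((jax.device_count(),), ('fsdp',))

# Destination sharding that also pins the memory kind, mirroring the
# tree_map over nnx.get_partition_spec(state) in the updated test helper.
default_memory_kind = jax.devices()[0].default_memory().kind
dst_sharding = NamedSharding(mesh, P('fsdp'), memory_kind=default_memory_kind)

# jax.device_put stands in here for reshard.reshard_pytree; both place the
# data onto the destination sharding.
x = jnp.zeros((jax.device_count() * 2, 4))
x_resharded = jax.device_put(x, dst_sharding)
print(x_resharded.sharding)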