Commit faf4d72

Merge branch 'google:main' into feat-add-orpo-support

2 parents 16f7f90 + dbe1227 · commit faf4d72

27 files changed: +2362 −607 lines
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
```yaml
# Copyright 2025 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This workflow builds the tunix python package and runs TPU regression tests.

name: Tunix Nightly Regression Tests

on:
  workflow_dispatch:
  schedule:
    # Run the job every day at 2am
    - cron: '0 2 * * *'

concurrency:
  # Dedup scheduled runs but nothing else
  group: >
    ${{
      github.event_name == 'schedule' && format('{0}-schedule', github.workflow) ||
      github.run_id
    }}
  cancel-in-progress: false

permissions:
  contents: read

jobs:
  build_tunix_package:
    name: Build tunix package
    uses: ./.github/workflows/build_package.yml

  tunix_tpu_nightly_regression:
    needs: build_tunix_package
    uses: ./.github/workflows/tpu-nightly-regression.yml
    secrets:
      HF_TOKEN: ${{ secrets.HF_TOKEN }}

  notify_failure:
    name: Notify failed build  # creates an issue or updates the last open issue for a failed build
    needs: [build_tunix_package, tunix_tpu_nightly_regression]
    if: ${{ always() }}
    runs-on: ubuntu-latest
    permissions:
      issues: write
    steps:
      - name: Check whether one of the jobs failed
        if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
        uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
```
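The `notify_failure` gate above combines three checks. A toy Python re-implementation of the same predicate (an illustration only, not code from this repo): file an issue only when some needed job failed, the run is not a pull request, and it was not started manually.

```python
# Toy sketch of the `if:` expression on the notify step (illustration only).
def should_file_issue(
    job_results: list[str], event_name: str, is_pull_request: bool
) -> bool:
  return (
      "failure" in job_results  # contains(needs.*.result, 'failure')
      and not is_pull_request  # github.event.pull_request == null
      and event_name != "workflow_dispatch"  # not a manual run
  )


assert should_file_issue(["success", "failure"], "schedule", False)
assert not should_file_issue(["failure"], "workflow_dispatch", False)
assert not should_file_issue(["success", "success"], "schedule", False)
```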
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
```yaml
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This reusable workflow installs tunix dependencies and runs the nightly TPU regression scripts.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Tunix Nightly Regression

on:
  workflow_call:
    secrets:
      HF_TOKEN:
        required: true
        description: 'HuggingFace token for model downloads'

concurrency:
  # Dedup pull requests (canceling previous runs of the same workflow for the same PR) and scheduled runs, but nothing else
  group: ${{ github.event_name == 'pull_request' && format('{0}-pr-{1}', github.workflow, github.event.pull_request.number) || github.event_name == 'schedule' && format('{0}-schedule', github.workflow) || github.run_id }}
  cancel-in-progress: true

env:
  HF_HOME: ~/.cache/huggingface
  HF_HUB_ENABLE_HF_TRANSFER: "1"

jobs:
  run_prod:
    runs-on: [linux-x86-ct5lp-224-8tpu]
    environment: testing
    container:
      image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:jax0.7.1_rev1
      options: --privileged
    env:
      CLOUD_TPU_ACCELERATOR: v5e-8
      JAX_PLATFORMS: tpu
    steps:

      # Cache Hugging Face hub
      - name: Cache HF hub
        uses: actions/cache@v4
        with:
          path: ~/.cache/huggingface
          key: hf-${{ runner.os }}-${{ hashFiles('pyproject.toml', 'requirements*.txt', 'constraints*.txt') }}
          restore-keys: |
            hf-${{ runner.os }}-

      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Install tunix dependencies
        run: |
          pip install -e .[prod]
          pip install pytest pytest-xdist

      - name: Verify TPU availability
        run: |
          python -c "
          import jax
          print(f'JAX version: {jax.__version__}')
          print(f'JAX devices: {jax.devices()}')

          # Check if we have TPU devices specifically
          devices = jax.devices()
          has_tpu = len(devices) > 0 and all(device.platform == 'tpu' for device in devices)
          print(f'TPU available: {has_tpu}')

          if not has_tpu:
            print('ERROR: No TPU devices found! Expected TPU devices but got:', [device.platform for device in devices])
            exit(1)
          else:
            print(f'SUCCESS: Found {len(devices)} TPU device(s)')
          "

      - name: Run regression scripts
        id: regression_tests
        run: |
          FAILED=0
          echo "Running tunix/oss/examples/deepscaler/math_eval_nb.py..."
          python tunix/oss/examples/deepscaler/math_eval_nb.py || FAILED=1

          echo "Running tunix/oss/scripts/grpo_demo_llama3_qwen2.py..."
          python tunix/oss/scripts/grpo_demo_llama3_qwen2.py || FAILED=1

          if [ "$FAILED" -ne 0 ]; then
            echo "One or more scripts failed!"
            exit 1
          fi
```
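The final step uses a run-everything-then-fail pattern: both scripts run even if the first one fails, and the step exits nonzero at the end if anything failed. A rough Python equivalent of that shell logic (a sketch, not code from this repo):

```python
# Sketch of the "Run regression scripts" step: run every script, remember
# any failure, and fail once at the end so later scripts still execute.
import subprocess
import sys

SCRIPTS = [
    "tunix/oss/examples/deepscaler/math_eval_nb.py",
    "tunix/oss/scripts/grpo_demo_llama3_qwen2.py",
]

failed = False
for script in SCRIPTS:
  print(f"Running {script}...")
  if subprocess.run([sys.executable, script]).returncode != 0:
    failed = True

if failed:
  print("One or more scripts failed!")
  sys.exit(1)
```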

.github/workflows/tpu-tests.yml

Lines changed: 2 additions & 1 deletion
```diff
@@ -61,7 +61,8 @@ jobs:
 
       - name: Install tunix dependencies
         run: |
-          pip install -e .[prod]
+          pip install --upgrade pip
+          pip install -e .[prod] --force-reinstall
           pip install pytest pytest-xdist
 
       - name: Verify TPU availability
```

examples/deepscaler/math_eval_nb.py

Lines changed: 26 additions & 23 deletions
```diff
@@ -1,14 +1,16 @@
 # %%
 from pprint import pprint
-from datasets import Dataset
+import datasets as datasets_lib
 import grain
 import pandas as pd
 import os
 import fsspec
 
-from transformers import AutoTokenizer
+import transformers
 from tunix.generate import mappings
 
+Dataset = datasets_lib.Dataset
+AutoTokenizer = transformers.AutoTokenizer
 
 try:
   from GOOGLE_INTERNAL_PACKAGE_PATH.pyglib import gfile
@@ -38,15 +40,15 @@
 from tunix.generate import sampler as sampler_lib
 from tunix.utils import math_utils
 # %%
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 import jax
 from tqdm.auto import tqdm
 import re
 
 # Only used for Math500
 def extract_answer_robust(passage: str) -> str:
   if not passage:
-    return None
+    return ""
 
   # Pattern 1: Look for \boxed{...} with proper matching braces
   # This handles nested braces like \boxed{\frac{1}{2}}
@@ -107,7 +109,7 @@ def extract_answer_robust(passage: str) -> str:
         break
     return answer.strip().rstrip(".,;:)")
 
-  return None
+  return ""
 # %%
 
 # only used for AIME-2024
```
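Context for the `\boxed{...}` comments above: a plain regex cannot capture nested braces such as `\boxed{\frac{1}{2}}`, so the extraction needs a brace counter. A minimal sketch of that idea (`extract_boxed` is a hypothetical helper, not the file's actual implementation); it returns `""` on failure, matching the return convention this diff adopts:

```python
def extract_boxed(passage: str) -> str:
  """Return the contents of the last \\boxed{...}, or "" if none is found."""
  marker = r"\boxed{"
  start = passage.rfind(marker)
  if start == -1:
    return ""
  i = start + len(marker)
  begin, depth = i, 1
  # Walk forward, tracking brace depth so nested arguments stay intact.
  while i < len(passage) and depth > 0:
    if passage[i] == "{":
      depth += 1
    elif passage[i] == "}":
      depth -= 1
    i += 1
  if depth != 0:
    return ""  # unbalanced braces: bail out rather than guess
  return passage[begin : i - 1].strip()


assert extract_boxed(r"so the answer is \boxed{\frac{1}{2}}") == r"\frac{1}{2}"
assert extract_boxed("no boxed answer here") == ""
```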
```diff
@@ -160,10 +162,6 @@ def evaluate_correctness(response: Any, ground_truths: Any) -> bool:
   return False
 # %%
 
-from transformers import AutoTokenizer
-from pprint import pprint
-import grain
-
 class Qwen25MathEvaluator:
 
   def __init__(
@@ -228,20 +226,20 @@ def load_model(self):
     )
 
     if self.sampler_type == "vanilla":
-      self.sampler = sampler_lib.Sampler(
+      self.sampler_vanilla = sampler_lib.Sampler(
           transformer=self.model,
           tokenizer=self.tokenizer,
           cache_config=cache_config,
       )
     elif self.sampler_type == "sglang-jax":
-      from tunix.generate import sglang_jax_sampler  # pylint: disable=g-import-not-at-top
+      from tunix.google.stubs import sglang_jax_sampler_stub as sglang_jax_sampler  # pylint: disable=g-import-not-at-top
 
       mapping_config = mappings.MappingConfig.build(
           mapping_obj=None,
           model=self.model,
           backend="sglang_jax",
       )
-      self.sampler = sglang_jax_sampler.SglangJaxSampler(
+      self.sampler_sglang = sglang_jax_sampler.SglangJaxSampler(
           tokenizer=self.tokenizer,
           config=sglang_jax_sampler.SglangJaxConfig(
               mesh=self.mesh,
@@ -328,8 +326,12 @@ def generate(
       temperature: float = 0.6,
       top_k: int = 50,
       top_p: float = 0.95,
-      seed: int = None,
+      seed: int | None = None,
   ) -> str:
+    if self.tokenizer is None:
+      raise RuntimeError(
+          "Model components not loaded. Call load_model() first."
+      )
     max_length = max(len(self.tokenizer.encode(p)) for p in prompts)
     cache_size = self.max_prompt_length + self.max_generation_steps + 100
     safe_gen_length = min(
@@ -346,7 +348,7 @@
 
     # Generate
     if self.sampler_type == "vanilla":
-      out_data = self.sampler(
+      out_data = self.sampler_vanilla(
           input_strings=prompts,
           max_generation_steps=safe_gen_length,
           temperature=temperature,
@@ -357,7 +359,7 @@
           seed=jax.random.PRNGKey(seed) if seed is not None else None,
       )
     elif self.sampler_type == "sglang-jax":
-      out_data = self.sampler(
+      out_data = self.sampler_sglang(
           input_strings=prompts,
           max_generation_steps=safe_gen_length,
           max_prompt_length=self.max_prompt_length,
```
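The `seed: int | None = None` change pairs with the existing `seed=jax.random.PRNGKey(seed) if seed is not None else None` call: callers pass a plain int, and a JAX key is derived only when a seed is given. A minimal standalone sketch of that pattern (illustration, not repo code):

```python
# Sketch: convert an optional int seed into an optional JAX PRNG key.
import jax


def make_key(seed: int | None):
  return jax.random.PRNGKey(seed) if seed is not None else None


assert make_key(None) is None
key = make_key(0)
print(jax.random.uniform(key, (2,)))  # same output every run for seed=0
```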
```diff
@@ -370,22 +372,22 @@
       )
     else:
       raise ValueError(f"Unsupported sampler type: {self.sampler_type}")
-    return out_data.text
+    return out_data.text[0]
 
   def evaluate(
       self,
       batch_size: int = 8,
-      num_batches: int = None,
+      num_batches: int | None = None,
       temperature: float = 0.6,
-      top_k: int = 50,
-      top_p: float = 0.95,
+      top_k: Optional[int] = 50,
+      top_p: Optional[float] = 0.95,
       num_passes: int = 1,
       debug_first_n: int = 3,  # NEW: Debug first N examples
   ) -> Dict[str, Any]:
     print("=" * 60)
     print("Starting Evaluation")
     print("=" * 60)
-    print(f"Configuration:")
+    print("Configuration:")
     print(f"  Batch size: {batch_size}")
     print(f"  Num batches: {num_batches or 'all'}")
     print(f"  Temperature: {temperature}")
@@ -467,7 +469,8 @@ def evaluate(
       print(f"Ground truth: {answer}")
       print("=" * 60 + "\n")
       print(f"Prompt (first 300 chars): {prompt[:]}")
-      print(f"Prompt length: {len(self.tokenizer.encode(prompt))} tokens")
+      if self.tokenizer is not None and hasattr(self.tokenizer, "encode"):
+        print(f"Prompt length: {len(self.tokenizer.encode(prompt))} tokens")
       print("=" * 60 + "\n")
       for i, (response, ans, cor) in enumerate(
           zip(responses, extracted_answers, answer_correct)
@@ -553,7 +556,7 @@ def evaluate(
 print("\nStarting evaluation...")
 results = evaluator.evaluate(
     batch_size=8,
-    # num_batches=3,
+    num_batches=None,
     temperature=0.6,
     top_k=50,
     top_p=0.95,
@@ -592,7 +595,7 @@ def evaluate(
 
 results = evaluator.evaluate(
     batch_size=1,
-    # num_batches=3,
+    num_batches=None,
     temperature=0.6,
     top_k=None,
     top_p=0.95,
```
