feat: update gift eval experiment (#259)

AzulGarza · web-flow · commit 5f67f466be34 · 2025-11-11T20:41:52.000-06:00
diff --git a/docs/examples/gift-eval.ipynb b/docs/examples/gift-eval.ipynb
diff --git a/experiments/gift-eval/README.md b/experiments/gift-eval/README.md
@@ -3,23 +3,24 @@
 This section documents the evaluation of a foundation model ensemble built using the [TimeCopilot](https://timecopilot.dev) library on the [GIFT-Eval](https://huggingface.co/spaces/Salesforce/GIFT-Eval) benchmark.
 
 !!! success ""
-    With less than $30 in compute cost, TimeCopilot achieved first place in probabilistic accuracy (CRPS) among non-leaking models on this large-scale benchmark, which spans 24 datasets, 144k+ time series, and 177M data points.
+    With less than $30 in compute cost, TimeCopilot achieved first place in probabilistic accuracy (CRPS) among open-source solutions on this large-scale benchmark, which spans 24 datasets, 144k+ time series, and 177M data points.
 
 
 TimeCopilot is an open‑source AI agent for time series forecasting that provides a unified interface to multiple forecasting approaches, from foundation models to classical statistical, machine learning, and deep learning methods, along with built‑in ensemble capabilities for robust and explainable forecasting.
 
-<img width="1002" height="1029" alt="image" src="https://github.com/user-attachments/assets/6fa8d459-0ca3-45ce-afe5-7fac8400167f" />
+<img width="1002" height="1029" alt="image" src="https://github.com/user-attachments/assets/69724886-d37e-46e6-8a10-d82396695b49" />
+
+
 
 
 
 ## Description
 
 This ensemble leverages [**TimeCopilot's MedianEnsemble**](https://timecopilot.dev/api/models/ensembles/#timecopilot.models.ensembles.median.MedianEnsemble) feature, which combines three state-of-the-art foundation models:
 
-- [**Moirai** (Salesforce AI Research)](https://timecopilot.dev/api/models/foundation/models/#timecopilot.models.foundation.moirai.Moirai).
-- [**Sundial** (THUML @ Tsinghua University)](https://timecopilot.dev/api/models/foundation/models/#timecopilot.models.foundation.sundial.Sundial) 
-- [**Toto** (DataDog)](https://timecopilot.dev/api/models/foundation/models/#timecopilot.models.foundation.toto.Toto).
-
+- [**Chronos-2** (AWS)](https://timecopilot.dev/api/models/foundation/models/#timecopilot.models.foundation.chronos.Chronos).
+- [**TimesFM-2.5** (Google Research)](https://timecopilot.dev/api/models/foundation/models/#timecopilot.models.foundation.timesfm.TimesFM).
+- [**TiRex** (NXAI)](https://timecopilot.dev/api/models/foundation/models/#timecopilot.models.foundation.tirex.TiRex).
 
 ## Setup
 
@@ -110,4 +111,10 @@ Results are saved to `results/timecopilot/all_results.csv` in GIFT-Eval format.
 
 ## Changelog
 
-- **2025-08-05**: GIFT‑Eval recently [enhanced its evaluation dashboard](https://github.com/SalesforceAIResearch/gift-eval?tab=readme-ov-file#2025-08-05) with a new flag that identifies models likely affected by data leakage (i.e., having seen parts of the test set during training). While the test set itself hasn’t changed, this new insight helps us better interpret model performance. To keep our results focused on truly unseen data, we’ve excluded any flagged models from this experiment and added the Sundial model to the ensemble. The previous experiment details remain available [here](https://github.com/AzulGarza/timecopilot/tree/v0.0.14/experiments/gift-eval).
+### **2025-11-06**
+
+We updated the ensemble to use newer models that reflect the most recent progress in the field: Chronos-2, TimesFM-2.5, and TiRex. These replace Moirai, Sundial, and Toto.
+
+### **2025-08-05**
+
+GIFT‑Eval recently [enhanced its evaluation dashboard](https://github.com/SalesforceAIResearch/gift-eval?tab=readme-ov-file#2025-08-05) with a new flag that identifies models likely affected by data leakage (i.e., having seen parts of the test set during training). While the test set itself hasn’t changed, this new insight helps us better interpret model performance. To keep our results focused on truly unseen data, we’ve excluded any flagged models from this experiment and added the Sundial model to the ensemble. The previous experiment details remain available [here](https://github.com/AzulGarza/timecopilot/tree/v0.0.14/experiments/gift-eval).
diff --git a/experiments/gift-eval/pyproject.toml b/experiments/gift-eval/pyproject.toml
@@ -2,12 +2,13 @@
 dependencies = [
   "modal>=1.0.5",
   "s3fs>=2023.12.1",
-  "timecopilot>=0.0.13",
+  "timecopilot>=0.0.21",
   "transformers<4.54",
+  "transformers==4.40.1 ; python_full_version < '3.12'",
   "typer>=0.16.0",
 ]
 description = "TimeCopilot experiments for GIFT-Eval"
 name = "timecopilot-gift-eval"
 readme = "README.md"
 requires-python = ">=3.11"
-version = "0.1.0"
+version = "0.2.0"
diff --git a/experiments/gift-eval/src/download_results.py b/experiments/gift-eval/src/download_results.py
@@ -18,8 +18,11 @@ def download_results():
             f"s3://{bucket}/results/timecopilot/{dataset_name}/{term}/all_results.csv"
         )
         logging.info(f"Downloading {csv_path}")
-        df = pd.read_csv(csv_path, storage_options={"anon": False})
-        dfs.append(df)
+        try:
+            df = pd.read_csv(csv_path, storage_options={"anon": False})
+            dfs.append(df)
+        except Exception as e:
+            logging.error(f"Error downloading {csv_path}: {e}")
 
     df = pd.concat(dfs, ignore_index=True)
     output_dir = Path("results/timecopilot")
diff --git a/experiments/gift-eval/src/run_modal.py b/experiments/gift-eval/src/run_modal.py
@@ -30,8 +30,8 @@
 @app.function(
     image=image,
     volumes=volume,
-    # 3 hours timeout
-    timeout=60 * 60 * 3,
+    # 6 hours timeout
+    timeout=60 * 60 * 6,
     gpu="A10G",
     # as my local
     cpu=8,
diff --git a/experiments/gift-eval/src/run_timecopilot.py b/experiments/gift-eval/src/run_timecopilot.py
@@ -6,9 +6,9 @@
 from timecopilot.gift_eval.eval import GIFTEval
 from timecopilot.gift_eval.gluonts_predictor import GluonTSPredictor
 from timecopilot.models.ensembles.median import MedianEnsemble
-from timecopilot.models.foundation.moirai import Moirai
-from timecopilot.models.foundation.sundial import Sundial
-from timecopilot.models.foundation.toto import Toto
+from timecopilot.models.foundation.chronos import Chronos
+from timecopilot.models.foundation.timesfm import TimesFM
+from timecopilot.models.foundation.tirex import TiRex
 
 logging.basicConfig(level=logging.INFO)
 
@@ -40,13 +40,15 @@ def run_timecopilot(
     predictor = GluonTSPredictor(
         forecaster=MedianEnsemble(
             models=[
-                Moirai(
-                    repo_id="Salesforce/moirai-1.1-R-large",
+                Chronos(
+                    repo_id="amazon/chronos-2",
                     batch_size=batch_size,
                 ),
-                Sundial(batch_size=batch_size),
-                Toto(
-                    context_length=1_024,
+                TimesFM(
+                    repo_id="google/timesfm-2.5-200m-pytorch",
+                    batch_size=batch_size,
+                ),
+                TiRex(
                     batch_size=batch_size,
                 ),
             ],
diff --git a/experiments/gift-eval/uv.lock b/experiments/gift-eval/uv.lock