From e1c6bba16a166a773286f0a0123272ac5089726c Mon Sep 17 00:00:00 2001
From: scott
Date: Sat, 28 Feb 2026 13:54:16 -0500
Subject: [PATCH] fix: unify qwen tts cache dir for tokenizer loading

---
 backend/backends/pytorch_backend.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/backend/backends/pytorch_backend.py b/backend/backends/pytorch_backend.py
index d0cba11a..449fc846 100644
--- a/backend/backends/pytorch_backend.py
+++ b/backend/backends/pytorch_backend.py
@@ -178,6 +178,12 @@ def _load_model_sync(self, model_size: str):
                 status="downloading",
             )
 
+        # Use a single cache root for both HF Hub + Transformers to avoid split-cache
+        # cases where speech_tokenizer files are downloaded to one location while
+        # AutoFeatureExtractor resolves from another.
+        from huggingface_hub import constants as hf_constants
+        tts_cache_dir = hf_constants.HF_HUB_CACHE
+
         # Load the model (tqdm is patched, but filters out non-download progress)
         try:
             # Don't pass device_map on CPU: accelerate's meta-tensor mechanism
@@ -186,14 +192,16 @@ def _load_model_sync(self, model_size: str):
             if self.device == "cpu":
                 self.model = Qwen3TTSModel.from_pretrained(
                     model_path,
-                    torch_dtype=torch.float32,
+                    cache_dir=tts_cache_dir,
+                    dtype=torch.float32,
                     low_cpu_mem_usage=False,
                 )
             else:
                 self.model = Qwen3TTSModel.from_pretrained(
                     model_path,
+                    cache_dir=tts_cache_dir,
                     device_map=self.device,
-                    torch_dtype=torch.bfloat16,
+                    dtype=torch.bfloat16,
                 )
         finally:
             # Exit the patch context