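Fix window attention: rename shift_window to window_attention, pass it through to the model, use a fixed block-mask BLOCK_SIZE of 128, and torch.compile the model in predict.py when enabled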

wanghaoyucn · wanghaoyucn · commit 4f950d89193a · 2025-09-23T11:54:39.000-07:00
diff --git a/eqnet/models/phasenet.py b/eqnet/models/phasenet.py
@@ -298,6 +298,7 @@ def __init__(
         event_time_loss_weight=1.0,
         polarity_loss_weight=1.0,
         prompt_loss_weight=1.0,
+        window_attention=False,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -341,6 +342,7 @@ def __init__(
                 add_polarity=add_polarity,
                 add_event=add_event,
                 add_prompt=add_prompt,
+                window_attention=window_attention,
                 **kwargs,
             )
         else:
@@ -455,12 +457,12 @@ def forward(self, batched_inputs: Tensor) -> Dict[str, Tensor]:
 def build_model(
     backbone="unet",
     log_scale=True,
-    shift_window=False,
+    window_attention=False,
     *args,
     **kwargs,
 ) -> PhaseNet:
     return PhaseNet(
         backbone=backbone,
         log_scale=log_scale,
-        shift_window=shift_window,
+        window_attention=window_attention,
     )
diff --git a/eqnet/models/phasenet_plus.py b/eqnet/models/phasenet_plus.py
@@ -10,7 +10,7 @@ def build_model(
     event_center_loss_weight=1.0,
     event_time_loss_weight=1.0,
     polarity_loss_weight=1.0,
-    shift_window=False,
+    window_attention=False,
     *args,
     **kwargs,
 ) -> PhaseNet:
@@ -23,5 +23,5 @@ def build_model(
         event_center_loss_weight=event_center_loss_weight,
         event_time_loss_weight=event_time_loss_weight,
         polarity_loss_weight=polarity_loss_weight,
-        shift_window=shift_window,
+        window_attention=window_attention,
     )
diff --git a/eqnet/models/x_unet.py b/eqnet/models/x_unet.py
@@ -343,7 +343,7 @@ def time_window_mod(b, h, q_idx, kv_idx):
 
         return prefix_mask | suffix_mask | mid_mask
         
-    block_mask = create_block_mask(time_window_mod, B, H, q_len, kv_len, device=device, BLOCK_SIZE=window_size, _compile=False)
+    block_mask = create_block_mask(time_window_mod, B, H, q_len, kv_len, device=device, BLOCK_SIZE=128, _compile=False)
     return block_mask
 
 @lru_cache
diff --git a/predict.py b/predict.py
@@ -443,12 +443,15 @@ def main(args):
         #     checkpoint = torch.load(glob(os.path.join(artifact_dir, "*.pth"))[0], map_location="cpu")
         #     model.load_state_dict(checkpoint["model"], strict=True)
 
+    model.load_state_dict(checkpoint["model"], strict=True)
+    if args.window_attention:
+        model = torch.compile(model)
     model_without_ddp = model
     if args.distributed:
         torch.distributed.barrier()
         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
         model_without_ddp = model.module
-    model_without_ddp.load_state_dict(checkpoint["model"], strict=True)
+    #model_without_ddp.load_state_dict(checkpoint["model"], strict=True)
 
     if args.model == "phasenet_das":
         pred_phasenet_das(args, model, data_loader, pick_path, figure_path)
@@ -494,7 +497,7 @@ def get_args_parser(add_help=True):
     parser.add_argument("--result_path", type=str, default="results", help="path to result directory")
     parser.add_argument("--plot_figure", action="store_true", help="If plot figure for test")
     parser.add_argument("--min_prob", default=0.3, type=float, help="minimum probability for picking")
-    parser.add_argument("--shift_window", action="store_true", help="If use shift window for transformer")
+    parser.add_argument("--window-attention", action="store_true", help="If use shift window for transformer")
 
     ## Seismic
     parser.add_argument("--add_polarity", action="store_true", help="If use polarity information")