-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathqueue.json
More file actions
3747 lines (3747 loc) · 197 KB
/
queue.json
File metadata and controls
3747 lines (3747 loc) · 197 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
{
"bridge": [
{
"arxiv_id": "2601.22705",
"title": "CONCUR: High-Throughput Agentic Batch Inference of LLM via Congestion-Based Concurrency Control",
"abstract": "Batch inference for agentic workloads stresses the GPU key-value (KV) cache in a sustained and cumulative manner, often causing severe throughput degradation well before memory capacity is exhausted. We identify this phenomenon as middle-phase thrashing, a previously under-characterized pathology in which cache efficiency collapses as long-lived agents accumulate state over time. We argue that mitigating this pathology requires moving beyond reactive, request-level cache management to proactive, agent-level admission control. Drawing inspiration from congestion control in distributed systems, we view the KV cache as a shared resource whose efficient utilization depends on feedback-driven regulation. Based on this insight, we present CONCUR, a lightweight control layer that regulates agent admission to bound aggregate cache pressure while preserving execution continuity. CONCUR adapts a cache-aware control algorithm to dynamically adjust the number of active agents using runtime cache signals. Across large models and real-world agent workloads, CONCUR prevents middle-phase thrashing and improves batch inference throughput by up to 4.09x on Qwen3-32B and 1.9x on DeepSeek-V3, while remaining compatible with existing LLM serving systems.",
"authors": [
"Qiaoling Chen",
"Zhisheng Ye",
"Tian Tang",
"Peng Sun",
"Boyu Tian",
"Guoteng Wang",
"Shenggui Li",
"Yonggang Wen",
"Zhenhua Han",
"Tianwei Zhang"
],
"published_at": "2026-01-30T08:27:20",
"categories": [
"cs.DC"
],
"url": "http://arxiv.org/abs/2601.22705v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "systems",
"domain_bucket": "llm",
"paper_type": "systems",
"narrow_domain_flag": false,
"sim_public": 0.5403847694396973,
"sim_memory": 0.5972424745559692,
"sim_negative": 0.12155820429325104,
"broad_relevance": 0.5403847694396973,
"momentum": 0.0,
"teachability": 0.75,
"novelty_score": 0.40668225288391113,
"evidence_score": 0.53,
"direct_memory_relevance": 0.5972424745559692,
"systems_leverage": 0.7,
"deployment_proximity": 0.7,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.2,
"transferability_score": 0.72,
"clarity": 0.7,
"reproducibility": 0.5,
"public_interest_score": 0.5731177687644959,
"memory_score": 0.7281727423667907,
"quality_score": 0.525,
"bridge_score": 0.5731177687644959,
"max_axis_score": 0.7281727423667907,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "90d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Public AI",
"Memory/Storage Core",
"Bridge",
"Systems",
"Inference"
],
"status": "Cover now",
"why_now": "Agentic inference is stressing serving stacks in ways standard request-level schedulers do not handle well, and this paper names a concrete new failure mode around sustained KV-cache pressure. The congestion-control framing gives operators a practical systems idea that could matter quickly as long-context and multi-step agent workloads scale.",
"why_not_higher": "The contribution is focused on inference control-plane behavior rather than a broader shift in model architecture or training, so its audience is strongest among serving and systems practitioners. The abstract shows strong throughput wins, but it leaves open how robust the method is across diverse schedulers, latency targets, and production constraints.",
"downgrade_reasons": [
"agentic batch inference is still a narrower slice than general LLM serving",
"throughput gains are clearer than latency, fairness, or deployment tradeoffs",
"evidence in the abstract is limited to a small set of models and workloads"
],
"what_would_raise_priority": "Independent results across more serving systems, hardware setups, and latency-sensitive workloads would make this a stronger must-cover paper.",
"one_sentence_episode_hook": "What if LLM serving needs something like TCP congestion control, not just better kernels, to stop KV cache thrashing before GPUs look full?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2512.00719",
"title": "SIMPLE: Disaggregating Sampling from GPU Inference into a Decision Plane for Faster Distributed LLM Serving",
"abstract": "As large language models (LLMs) scale out with tensor parallelism (TP) and pipeline parallelism (PP) and production stacks have aggressively optimized the data plane (attention/GEMM and KV cache), sampling, the decision plane that turns logits into tokens, becomes a new bottleneck. This creates a structural holdout: sampling neither expands with TP nor balances across PP stages, so its share of iteration time grows as GPUs get faster and it caps pipeline frequency at the last stage. We present SIMPLE, a stage-agnostic, sequence-parallel, overlappable decision plane that disaggregates sampling into a CPU-side service and shrinks its runtime footprint back to a minor, hidden role. SIMPLE combines: (1) sequence-parallel sampling, which shards work along the batch dimension and removes vocabulary-axis collectives; (2) a CPU-based algorithm with column-wise penalties and truncation-first filtering to realize single-pass, linear-time kernels; and (3) speculative hot-vocab sampling (SHVS), which samples on a small hot set with rejection-correctness and uses a simple sizing model to choose the hot-vocab size that maximizes throughput. In evaluation, SIMPLE improves end-to-end throughput by up to 96% and reduces P95 latency by 20-65%. Crucially, SIMPLE requires no user-side code changes and composes with existing data-plane optimizations, unlocking scaling benefits that compound with future GPU generations.",
"authors": [
"Bohan Zhao",
"Zane Cao",
"Yongchao He"
],
"published_at": "2025-11-30T04:15:34",
"categories": [
"cs.DC"
],
"url": "http://arxiv.org/abs/2512.00719v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "systems",
"domain_bucket": "llm",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.5261570811271667,
"sim_memory": 0.5420535802841187,
"sim_negative": 0.2318219542503357,
"broad_relevance": 0.5261570811271667,
"momentum": 0.0,
"teachability": 0.75,
"novelty_score": 0.5010901987552643,
"evidence_score": 0.5,
"direct_memory_relevance": 0.5420535802841187,
"systems_leverage": 0.85,
"deployment_proximity": 0.7,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.72,
"clarity": 0.7,
"reproducibility": 0.5,
"public_interest_score": 0.5530106541514397,
"memory_score": 0.5386160740852356,
"quality_score": 0.525,
"bridge_score": 0.5386160740852356,
"max_axis_score": 0.5530106541514397,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Bridge",
"Systems",
"Inference",
"Memory/Storage Adjacent"
],
"status": "Cover now",
"why_now": "Distributed LLM serving has squeezed major gains out of attention, GEMM, and KV-cache plumbing, so control-path bottlenecks like sampling are becoming the next real limiter. A paper that reassigns that work off GPU and shows large end-to-end throughput and tail-latency gains is timely for anyone operating inference stacks.",
"why_not_higher": "The memory/storage angle is real but secondary: this is mainly a serving-systems and decision-plane paper, not a direct KV-cache, paging, or offload mechanism paper. Its broad appeal also depends on whether the evaluation spans enough real models and deployment settings beyond the abstract's headline gains.",
"downgrade_reasons": [
"Memory connection is adjacent rather than core",
"Abstract does not yet prove breadth across many serving stacks",
"Could land as an optimization paper if implementation details dominate"
],
"what_would_raise_priority": "Clear evidence that SIMPLE works across multiple production-grade LLM serving frameworks, model sizes, and heterogeneous hardware would raise it further.",
"one_sentence_episode_hook": "What if the next big LLM serving speedup comes not from faster attention or smaller KV caches, but from kicking token sampling off the GPU entirely?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2602.00328",
"title": "Harvest: Opportunistic Peer-to-Peer GPU Caching for LLM Inference",
"abstract": "Large Language Model (LLM) inference is increasingly constrained by GPU memory capacity rather than compute throughput, driven by growing model sizes and the linear growth of the key-value (KV) cache during autoregressive decoding. Existing approaches mitigate memory pressure by offloading model state and KV tensors to host memory, but incur substantial latency due to limited PCIe bandwidth. We present Harvest, an opportunistic GPU cache management framework that exploits high-bandwidth peer-to-peer GPU interconnects to dynamically place model weights and KV cache in unused GPU memory. Harvest treats peer GPU memory as a transient cache tier, preserving correctness while reducing data movement overhead under dynamic memory availability. We demonstrate significant throughput speedup of more than 2 times by using Harvest to accelerate the retrieval of two widely-used inference components: expert layer weights and KV cache entries.",
"authors": [
"Nikhil Gopal",
"Kostis Kaffes"
],
"published_at": "2026-01-30T21:29:04",
"categories": [
"cs.LG"
],
"url": "http://arxiv.org/abs/2602.00328v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.437610387802124,
"sim_memory": 0.5737738609313965,
"sim_negative": 0.20268841087818146,
"broad_relevance": 0.437610387802124,
"momentum": 0.0,
"teachability": 0.85,
"novelty_score": 0.5027258396148682,
"evidence_score": 0.45,
"direct_memory_relevance": 0.5737738609313965,
"systems_leverage": 0.25,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 1.0,
"transferability_score": 0.8799999999999999,
"clarity": 0.7,
"reproducibility": 0.5,
"public_interest_score": 0.5366919922828675,
"memory_score": 0.685132158279419,
"quality_score": 0.575,
"bridge_score": 0.5366919922828675,
"max_axis_score": 0.685132158279419,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "90d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Bridge",
"Memory/Storage Core",
"Systems",
"Hardware",
"Inference"
],
"status": "Cover now",
"why_now": "LLM serving is hitting memory-capacity and KV-cache limits right now, and this paper targets that bottleneck directly with a concrete systems mechanism. Using idle peer GPU memory as a transient cache tier is timely because multi-GPU inference stacks are already wrestling with offload bandwidth and KV movement costs.",
"why_not_higher": "The win appears tied to specific hardware conditions: spare peer GPU memory and fast P2P interconnects, which narrows immediate applicability. From the abstract alone, the evaluation breadth across models, workloads, and production-like serving setups is not yet fully clear.",
"downgrade_reasons": [
"depends on unused peer GPU memory being available",
"benefits may be topology- and interconnect-sensitive",
"abstract does not show full production-serving coverage"
],
"what_would_raise_priority": "Broader evidence across realistic serving workloads, heterogeneous cluster conditions, and multiple interconnect topologies would raise confidence and priority.",
"one_sentence_episode_hook": "What if the fastest fix for KV-cache pressure is not CPU offload at all, but turning neighboring GPUs' idle memory into a live opportunistic cache?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2512.24449",
"title": "PackKV: Reducing KV Cache Memory Footprint through LLM-Aware Lossy Compression",
"abstract": "Transformer-based large language models (LLMs) have demonstrated remarkable potential across a wide range of practical applications. However, long-context inference remains a significant challenge due to the substantial memory requirements of the key-value (KV) cache, which can scale to several gigabytes as sequence length and batch size increase. In this paper, we present \\textbf{PackKV}, a generic and efficient KV cache management framework optimized for long-context generation. %, which synergistically supports both latency-critical and throughput-critical inference scenarios. PackKV introduces novel lossy compression techniques specifically tailored to the characteristics of KV cache data, featuring a careful co-design of compression algorithms and system architecture. Our approach is compatible with the dynamically growing nature of the KV cache while preserving high computational efficiency. Experimental results show that, under the same and minimum accuracy drop as state-of-the-art quantization methods, PackKV achieves, on average, \\textbf{153.2}\\% higher memory reduction rate for the K cache and \\textbf{179.6}\\% for the V cache. Furthermore, PackKV delivers extremely high execution throughput, effectively eliminating decompression overhead and accelerating the matrix-vector multiplication operation. Specifically, PackKV achieves an average throughput improvement of \\textbf{75.7}\\% for K and \\textbf{171.7}\\% for V across A100 and RTX Pro 6000 GPUs, compared to cuBLAS matrix-vector multiplication kernels, while demanding less GPU memory bandwidth. Code available on https://github.com/BoJiang03/PackKV",
"authors": [
"Bo Jiang",
"Taolue Yang",
"Youyuan Liu",
"Xubin He",
"Sheng Di",
"Sian Jin"
],
"published_at": "2025-12-30T20:05:32",
"categories": [
"cs.DC",
"cs.AI"
],
"url": "http://arxiv.org/abs/2512.24449v2",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "systems",
"domain_bucket": "llm",
"paper_type": "application",
"narrow_domain_flag": false,
"sim_public": 0.4577004909515381,
"sim_memory": 0.7419873476028442,
"sim_negative": 0.24648335576057434,
"broad_relevance": 0.4577004909515381,
"momentum": 0.0,
"teachability": 0.75,
"novelty_score": 0.3073432445526123,
"evidence_score": 0.4,
"direct_memory_relevance": 0.7419873476028442,
"systems_leverage": 0.5499999999999999,
"deployment_proximity": 0.7,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.4,
"transferability_score": 0.9199999999999999,
"clarity": 0.7,
"reproducibility": 0.3,
"public_interest_score": 0.4684116339683533,
"memory_score": 0.7625962042808532,
"quality_score": 0.505,
"bridge_score": 0.4684116339683533,
"max_axis_score": 0.7625962042808532,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Memory/Storage Core",
"Systems",
"Hardware",
"Inference"
],
"status": "Cover now",
"why_now": "Long-context serving is still bottlenecked by KV cache footprint and bandwidth, so a method that compresses KV while also improving throughput is directly relevant to current deployment pain. This lands in an active wave of inference-systems work where concrete memory savings on modern GPUs matter immediately.",
"why_not_higher": "The topic is highly relevant, but KV-cache compression is already a crowded lane, so novelty needs to clear a high bar. The abstract emphasizes benchmark gains over prior quantization baselines, but it is still unclear how broadly the results hold across models, sequence regimes, and production-serving stacks.",
"downgrade_reasons": [
"Crowded KV-cache optimization area",
"Evidence in the abstract is benchmark-heavy",
"Broad-audience appeal is lower than memory-lens appeal"
],
"what_would_raise_priority": "Independent evidence across multiple frontier LLMs and real serving workloads would raise it further.",
"one_sentence_episode_hook": "If KV cache is what makes long-context inference so expensive, PackKV asks a sharp systems question: can you throw away just enough cache information to save memory and bandwidth without throwing away model quality?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2603.01175",
"title": "HAVEN: High-Bandwidth Flash Augmented Vector Engine for Large-Scale Approximate Nearest-Neighbor Search Acceleration",
"abstract": "Retrieval-Augmented Generation (RAG) relies on large-scale Approximate Nearest Neighbor Search (ANNS) to retrieve semantically relevant context for large language models. Among ANNS methods, IVF-PQ offers an attractive balance between memory efficiency and search accuracy. However, achieving high recall requires reranking which fetches full-precision vectors for reranking, and the billion-scale vector databases need to reside in CPU DRAM or SSD due to the limited capacity of GPU HBM. This off-GPU data movement introduces substantial latency and throughput degradation. We propose HAVEN, a GPU architecture augmented with High-Bandwidth Flash (HBF) which is a recently introduced die-stacked 3D NAND technology engineered to deliver terabyte-scale capacity and hundreds of GB/s read bandwidth. By integrating HBF and near-storage search unit as an on-package complement to HBM, HAVEN enables the full-precision vector database to reside entirely on-device, eliminating PCIe and DDR bottlenecks during reranking. Through detailed modeling of re-architected 3D NAND subarrays, power-constrained HBF bandwidth, and end-to-end IVF-PQ pipelines, we demonstrate that HAVEN improves reranking throughput by up to 20x and latency up to 40x across billion-scale datasets compared to GPU-DRAM and GPU-SSD systems. Our results show that HBF-augmented GPUs enable high-recall retrieval at throughput previously achievable only without reranking, offering a promising direction for memory-centric AI accelerators.",
"authors": [
"Po-Kai Hsu",
"Weihong Xu",
"Qunyou Liu",
"Tajana Rosing",
"Shimeng Yu"
],
"published_at": "2026-03-01T16:34:18",
"categories": [
"cs.AR",
"cs.ET"
],
"url": "http://arxiv.org/abs/2603.01175v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "hardware",
"domain_bucket": "llm",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.32415521144866943,
"sim_memory": 0.5237234830856323,
"sim_negative": 0.17485398054122925,
"broad_relevance": 0.32415521144866943,
"momentum": 0.0,
"teachability": 0.7,
"novelty_score": 0.6018909811973572,
"evidence_score": 0.34,
"direct_memory_relevance": 0.5237234830856323,
"systems_leverage": 0.4,
"deployment_proximity": 0.5,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 1.0,
"transferability_score": 0.64,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.4475302106142044,
"memory_score": 0.7421170449256896,
"quality_score": 0.545,
"bridge_score": 0.4475302106142044,
"max_axis_score": 0.7421170449256896,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "90d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Memory/Storage Core",
"Bridge",
"Systems",
"Hardware",
"Inference"
],
"status": "Cover now",
"why_now": "RAG and vector retrieval remain live serving bottlenecks, and this paper attacks the exact bandwidth and capacity wall that appears when reranking billion-scale indexes. It is a direct memory-centric architecture proposal with clear relevance to how future retrieval systems could be built for LLM inference stacks.",
"why_not_higher": "The paper is still a specialized ANN accelerator story rather than a change to mainstream model training or serving across the whole AI stack. The evidence is based on detailed architectural modeling rather than deployed hardware, which limits immediate confidence.",
"downgrade_reasons": [
"Narrowly centered on IVF-PQ reranking for ANN search",
"Modeled architecture rather than shipping system evidence",
"Broad-audience appeal is lower than for general LLM serving papers"
],
"what_would_raise_priority": "A stronger cross-system comparison showing real end-to-end gains on production RAG workloads or broader applicability beyond IVF-PQ reranking would raise it.",
"one_sentence_episode_hook": "What if the real bottleneck in RAG is not the model but the trip full-precision vectors take through the memory hierarchy, and fixing it requires putting flash on the GPU package itself?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.22001",
"title": "Heterogeneous Computing: The Key to Powering the Future of AI Agent Inference",
"abstract": "AI agent inference is driving an inference heavy datacenter future and exposes bottlenecks beyond compute - especially memory capacity, memory bandwidth and high-speed interconnect. We introduce two metrics - Operational Intensity (OI) and Capacity Footprint (CF) - that jointly explain regimes the classic roofline analysis misses, including the memory capacity wall. Across agentic workflows (chat, coding, web use, computer use) and base model choices (GQA/MLA, MoE, quantization), OI/CF can shift dramatically, with long context KV cache making decode highly memory bound. These observations motivate disaggregated serving and system level heterogeneity: specialized prefill and decode accelerators, broader scale up networking, and decoupled compute-memory enabled by optical I/O. We further hypothesize agent-hardware co design, multiple inference accelerators within one system, and high bandwidth, large capacity memory disaggregation as foundations for adaptation to evolving OI/CF. Together, these directions chart a path to sustain efficiency and capability for large scale agentic AI inference.",
"authors": [
"Yiren Zhao",
"Junyi Liu"
],
"published_at": "2026-01-29T17:11:46",
"categories": [
"cs.AI",
"cs.AR",
"cs.DC"
],
"url": "http://arxiv.org/abs/2601.22001v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "other",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.4255807399749756,
"sim_memory": 0.6131550073623657,
"sim_negative": 0.19539843499660492,
"broad_relevance": 0.4255807399749756,
"momentum": 0.0,
"teachability": 0.75,
"novelty_score": 0.45346856117248535,
"evidence_score": 0.32,
"direct_memory_relevance": 0.6131550073623657,
"systems_leverage": 0.4,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.25,
"bandwidth_capacity": 0.8,
"transferability_score": 0.7,
"clarity": 0.7,
"reproducibility": 0.5,
"public_interest_score": 0.5256945061683655,
"memory_score": 0.7189465022087096,
"quality_score": 0.525,
"bridge_score": 0.5256945061683655,
"max_axis_score": 0.7189465022087096,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "90d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Public AI",
"Memory/Storage Core",
"Bridge",
"Systems",
"Hardware",
"Inference"
],
"status": "Monitor",
"why_now": "Agentic inference is exposing memory-capacity, bandwidth, and interconnect limits that standard compute-centric framing misses, so a paper that centers those bottlenecks is timely. Its OI/CF lens also matches the industry's shift toward long-context serving, disaggregation, and heterogeneous inference stacks.",
"why_not_higher": "From the abstract, this looks partly like a systems framing and agenda-setting paper rather than a fully validated deployment study with hard comparative evidence. Several key claims are still stated as hypotheses about future architectures rather than demonstrated wins on production-grade end-to-end systems.",
"downgrade_reasons": [
"Evidence appears stronger on diagnosis than on validated system improvements",
"Some architecture recommendations seem forward-looking or speculative",
"May overlap with existing discussion around disaggregated serving and KV-cache-driven memory bottlenecks"
],
"what_would_raise_priority": "A stronger priority case would come from clear measurements on realistic agent workloads showing OI/CF predicts system choices better than existing serving heuristics and leads to demonstrated efficiency gains.",
"one_sentence_episode_hook": "If AI agents are turning inference into a memory-and-interconnect problem, this paper argues the next breakthrough is not one better chip but heterogeneous systems built around the capacity wall.",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2510.27257",
"title": "Synergistic Tensor and Pipeline Parallelism",
"abstract": "In the machine learning system, the hybrid model parallelism combining tensor parallelism (TP) and pipeline parallelism (PP) has become the dominant solution for distributed training of Large Language Models~(LLMs) and Multimodal LLMs (MLLMs). However, TP introduces significant collective communication overheads, while PP suffers from synchronization inefficiencies such as pipeline bubbles. Existing works primarily address these challenges from isolated perspectives, focusing either on overlapping TP communication or on flexible PP scheduling to mitigate pipeline bubbles. In this paper, we propose a new synergistic tensor and pipeline parallelism schedule that simultaneously reduces both types of bubbles. Our proposed schedule decouples the forward and backward passes in PP into fine-grained computation units, which are then braided to form a composite computation sequence. This compositional structure enables near-complete elimination of TP-related bubbles. Building upon this structure, we further design the PP schedule to minimize PP bubbles. Experimental results demonstrate that our approach improves training throughput by up to 12% for LLMs and 16% for MLLMs compared to existing scheduling methods. Our source code is avaiable at https://github.com/MICLAB-BUPT/STP.",
"authors": [
"Mengshi Qi",
"Jiaxuan Peng",
"Jie Zhang",
"Juan Zhu",
"Yong Li",
"Huadong Ma"
],
"published_at": "2025-10-31T07:53:40",
"categories": [
"cs.DC"
],
"url": "http://arxiv.org/abs/2510.27257v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "systems",
"domain_bucket": "llm",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.6211696863174438,
"sim_memory": 0.6670984625816345,
"sim_negative": 0.28806158900260925,
"broad_relevance": 0.6211696863174438,
"momentum": 0.0,
"teachability": 0.85,
"novelty_score": 0.5031062960624695,
"evidence_score": 0.48000000000000004,
"direct_memory_relevance": 0.6670984625816345,
"systems_leverage": 0.7,
"deployment_proximity": 0.7,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.6699999999999999,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.5518168503046036,
"memory_score": 0.5191295387744903,
"quality_score": 0.545,
"bridge_score": 0.5191295387744903,
"max_axis_score": 0.5518168503046036,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Bridge",
"Systems",
"Training",
"Memory/Storage Adjacent"
],
"status": "Monitor",
"why_now": "Large-model training is still constrained by communication and synchronization overheads, so a schedule that jointly attacks tensor-parallel and pipeline bubbles is relevant to current frontier training stacks. The paper is timely because hybrid TP+PP remains standard for scaling both LLMs and multimodal models.",
"why_not_higher": "This is a training-systems optimization paper, not a broadly legible shift for most AI practitioners, and the abstract emphasizes throughput gains without enough hardware and workload detail to judge how robust they are across stacks. Its memory angle is real but indirect: the core contribution is parallel scheduling and communication overlap rather than cache, offload, paging, or storage mechanisms.",
"downgrade_reasons": [
"Broad appeal is limited outside distributed training practitioners",
"Memory/storage connection is adjacent rather than core",
"Abstract lacks enough detail on hardware realism and generality of the reported gains",
"Risks reading as an incremental scheduler improvement rather than a major conceptual shift"
],
"what_would_raise_priority": "Clear evidence that the schedule transfers across widely used training frameworks, model scales, and interconnect regimes with consistent end-to-end gains would raise it.",
"one_sentence_episode_hook": "What if the next training speedup for giant models comes not from new hardware, but from braiding tensor and pipeline parallelism so the bubbles in each cancel the other?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.11577",
"title": "Computation-Bandwidth-Memory Trade-offs: A Unified Paradigm for AI Infrastructure",
"abstract": "Large-scale artificial intelligence models are transforming industries and redefining human machine collaboration. However, continued scaling exposes critical limitations in hardware, including constraints on computation, bandwidth, and memory. These dimensions are tightly interconnected, so improvements in one often create bottlenecks in others, making isolated optimizations less effective. Balancing them to maximize system efficiency remains a central challenge in scalable AI design. To address this challenge, we introduce {Computation-Bandwidth-Memory Trade-offs}, termed the {AI Trinity}, a unified paradigm that positions {computation}, {bandwidth}, and {memory} as coequal pillars for next-generation AI infrastructure. AI Trinity enables dynamic allocation of resources across these pillars, alleviating single-resource bottlenecks and adapting to diverse scenarios to optimize system performance. Within this framework, AI Trinity identifies three fundamental trade-offs: (1) {More Computation$\\rightarrow$Less Bandwidth}, wherein computational resources are exploited to reduce data transmission under limited bandwidth conditions, (2) {More Bandwidth$\\rightarrow$Less Memory}, which exploits abundant communication capacity to populate or refresh memory when local storage resources are constrained, and (3) {More Memory$\\rightarrow$Less Computation}, whereby storage capacity is utilized to mitigate redundant computation when computational costs are prohibitive. We illustrate the effectiveness of AI Trinity through representative system designs spanning edge-cloud communication, large-scale distributed training, and model inference. The innovations embodied in AI Trinity advance a new paradigm for scalable AI infrastructure, providing both a conceptual foundation and practical guidance for a broad range of application scenarios.",
"authors": [
"Yuankai Fan",
"Qizhen Weng",
"Xuelong Li"
],
"published_at": "2025-12-30T17:35:14",
"categories": [
"cs.DC"
],
"url": "http://arxiv.org/abs/2601.11577v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "systems",
"domain_bucket": "robotics",
"paper_type": "systems",
"narrow_domain_flag": false,
"sim_public": 0.4794940948486328,
"sim_memory": 0.5329809188842773,
"sim_negative": 0.17475129663944244,
"broad_relevance": 0.4794940948486328,
"momentum": 0.0,
"teachability": 0.75,
"novelty_score": 0.5189214944839478,
"evidence_score": 0.28,
"direct_memory_relevance": 0.5329809188842773,
"systems_leverage": 0.4,
"deployment_proximity": 0.7,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.4,
"transferability_score": 0.6799999999999999,
"clarity": 0.7,
"reproducibility": 0.5,
"public_interest_score": 0.491686452627182,
"memory_score": 0.5848942756652832,
"quality_score": 0.525,
"bridge_score": 0.491686452627182,
"max_axis_score": 0.5848942756652832,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Public AI",
"Memory/Storage Core",
"Bridge",
"Systems",
"Hardware",
"Training",
"Inference"
],
"status": "Monitor",
"why_now": "As frontier models keep running into serving and training bottlenecks, a paper that frames compute, bandwidth, and memory as a single design space is timely for both infrastructure builders and a broad AI audience. It also maps well onto current pressure around KV cache growth, interconnect limits, and system-level efficiency.",
"why_not_higher": "This reads more like a unifying perspective paper than a decisive new systems result with strong empirical proof on realistic end-to-end deployments. The abstract promises representative examples, but not enough concrete workload, hardware, or benchmark evidence to rank it as an immediate must-cover episode.",
"downgrade_reasons": [
"Conceptual framing outweighs demonstrated system gains",
"Unclear empirical depth on realistic workloads and hardware",
"May overlap with existing AI systems bottleneck discussions without a sharp new mechanism"
],
"what_would_raise_priority": "Clear quantitative results on modern LLM training or inference stacks showing when each trade-off wins in practice would raise it substantially.",
"one_sentence_episode_hook": "What if the real bottleneck in AI is not compute, memory, or bandwidth alone, but our failure to treat them as one coupled system?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2603.28239",
"title": "A Switch-Centric In-Network Architecture for Accelerating LLM Inference in Shared-Memory Network",
"abstract": "In-network computing techniques, exemplified by NVLink Sharp (NVLS), offer a promising approach to addressing the communication bottlenecks in LLM inference by offloading collective operations, such as All-Reduce, to switches. However, the accelerator-centric architecture of NVLS suffers from two fundamental limitations: 1) it relies on GPU load instructions to trigger reduction operations, which means that the data reduced in the switch must be additionally transferred back to the initiating GPU rather than being broadcast directly, thereby introducing unnecessary communication overhead; 2) due to its architectural constraints, NVLS cannot offload operators that are not decomposable into memory-semantic instructions, such as the in-network quantization (INQ) proposed in this work. As a result, All-Reduce in NVLS must operate at FP16/BF16 precision, leading to substantial bandwidth waste. To address these limitations, we propose SCIN, the first switch-centric in-network architecture for shared-memory networks of AI accelerators, enabling both low-latency and high-bandwidth All-Reduce. Specifically, we introduce an in-switch accelerator (ISA) capable of initiating memory-semantic operations for in-network processing, together with a co-designed communication fabric that incurs negligible protocol overhead. By eliminating redundant data movement, SCIN delivers lower All-Reduce latency than NVLS. Moreover, by integrating a quantization module into the ISA, SCIN enables INQ for All-Reduce, reducing its precision to 8 bits and nearly doubling bandwidth with negligible accuracy loss. We also present a prototype of SCIN on a multi-FPGA system to demonstrate its feasibility and effectiveness. Experimental results show that our design accelerates All-Reduce by up to 8.7x for small messages and 3.8x for large messages, leading up to 1.74x faster TTFT and 1.34x faster TPOT on LLaMA-2 models.",
"authors": [
"Aojie Jiang",
"Kang Zhu",
"Zhiheng Zhang",
"Zhengxu Su",
"Juntao Liu",
"Yuan Du",
"Li Du"
],
"published_at": "2026-03-30T09:59:11",
"categories": [
"cs.AR"
],
"url": "http://arxiv.org/abs/2603.28239v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "hardware",
"domain_bucket": "llm",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.46526843309402466,
"sim_memory": 0.521072506904602,
"sim_negative": 0.18257898092269897,
"broad_relevance": 0.46526843309402466,
"momentum": 0.0,
"teachability": 0.7,
"novelty_score": 0.5278318524360657,
"evidence_score": 0.48000000000000004,
"direct_memory_relevance": 0.521072506904602,
"systems_leverage": 0.4,
"deployment_proximity": 0.5,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.6000000000000001,
"transferability_score": 0.63,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.4787553077936172,
"memory_score": 0.6613217520713807,
"quality_score": 0.545,
"bridge_score": 0.4787553077936172,
"max_axis_score": 0.6613217520713807,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "30d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Memory/Storage Core",
"Bridge",
"Systems",
"Hardware",
"Inference"
],
"status": "Monitor",
"why_now": "LLM serving is increasingly bottlenecked by collective communication, and this paper targets that pain directly with a switch-centric design plus in-network quantization. It matters now because inference latency and bandwidth efficiency are becoming first-order deployment constraints for larger shared-memory accelerator systems.",
"why_not_higher": "The idea is important, but the paper is still heavily architecture- and prototype-driven rather than something most AI teams can act on soon. Its impact is strongest for specialized high-end serving stacks, not the broad median practitioner.",
"downgrade_reasons": [
"specialized hardware dependency",
"prototype evidence rather than production deployment",
"limited immediate applicability for most AI builders",
"episode risk: could skew into benchmark-heavy architecture detail"
],
"what_would_raise_priority": "A stronger priority case would come from evidence on production-like GPU clusters or adoption paths showing the design transfers beyond a custom multi-FPGA prototype.",
"one_sentence_episode_hook": "What if the fastest way to serve LLMs is to stop treating the network switch as plumbing and turn it into an active participant in all-reduce and quantization?",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2510.26730",
"title": "ExpertFlow: Adaptive Expert Scheduling and Memory Coordination for Efficient MoE Inference",
"abstract": "The expansion of large language models is increasingly limited by the constrained memory capacity of modern GPUs. To mitigate this, Mixture-of-Experts (MoE) architectures activate only a small portion of parameters during inference, significantly lowering both memory demand and computational overhead. However, conventional MoE inference approaches, which select active experts independently at each layer, often introduce considerable latency because of frequent parameter transfers between host and GPU memory. In addition, current cross-layer prediction strategies, which are typically based on fixed steps, lack adaptability across different hardware platforms and workloads, thereby reducing their robustness and effectiveness. To address these challenges, we present ExpertFlow, a runtime system for MoE inference that combines adaptive expert prefetching and cache-aware routing. ExpertFlow continuously adjusts its prediction horizon for expert activation by leveraging runtime statistics such as transfer bandwidth, parameter dimensionality, and model feedback signals. Furthermore, it incorporates a hybrid cross-layer prediction scheme that fuses pregating information with intermediate computational states to anticipate future expert needs. By adaptively refining prefetching decisions and aligning them with actual usage behavior, ExpertFlow effectively decreases cache misses and removes latency caused by expert swap-ins. Our evaluation demonstrates that ExpertFlow reduces model stall time to less than 0.1% of the baseline, highlighting its capability to optimize MoE inference under stringent memory constraints.",
"authors": [
"Zixu Shen",
"Kexin Chu",
"Yifan Zhang",
"Dawei Xiang",
"Runxin Wu",
"Wei Zhang"
],
"published_at": "2025-10-30T17:29:27",
"categories": [
"cs.DC",
"cs.AI",
"cs.PF"
],
"url": "http://arxiv.org/abs/2510.26730v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "systems",
"domain_bucket": "llm",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.5477154850959778,
"sim_memory": 0.5541058778762817,
"sim_negative": 0.27488723397254944,
"broad_relevance": 0.5477154850959778,
"momentum": 0.0,
"teachability": 0.75,
"novelty_score": 0.4982467293739319,
"evidence_score": 0.34,
"direct_memory_relevance": 0.5541058778762817,
"systems_leverage": 0.5499999999999999,
"deployment_proximity": 0.7,
"memory_adjacent_future_value": 0.5,
"bandwidth_capacity": 0.4,
"transferability_score": 0.69,
"clarity": 0.7,
"reproducibility": 0.5,
"public_interest_score": 0.4790516549348831,
"memory_score": 0.7082317633628845,
"quality_score": 0.525,
"bridge_score": 0.4790516549348831,
"max_axis_score": 0.7082317633628845,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": false,
"social_score": 0.0,
"scoring_sources": [],
"time_window": "180d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [
"Memory/Storage Core",
"Systems",
"Inference",
"Bridge"
],
"status": "Monitor",
"why_now": "MoE deployment keeps running into GPU memory limits and expert swap latency, so adaptive prefetching and cache-aware scheduling are timely systems problems. This paper sits directly on the current serve-more-model-with-less-HBM pressure point.",
"why_not_higher": "The paper looks operationally relevant but still fairly MoE-serving specific rather than broadly changing mainstream LLM practice. The abstract also does not yet show enough workload and hardware detail to tell whether the gains hold beyond a favorable setup.",
"downgrade_reasons": [
"MoE inference is important but still narrower than general LLM serving",
"Evidence in the abstract is promising but underspecified",
"Could end up as a strong optimization result rather than a field-shifting method"
],
"what_would_raise_priority": "Clear results on widely used MoE models and heterogeneous real serving hardware, with strong latency-throughput tradeoff comparisons against current MoE runtime baselines, would raise it.",
"one_sentence_episode_hook": "If MoE models are supposed to save memory, why do they still stall on expert swaps, and can adaptive prefetching finally make sparse models behave like fast ones in production?",
"source": "digest",
"added": "",
"issue_number": null
}
],
"public": [
{
"arxiv_id": "2512.02010",
"title": "Four Over Six: More Accurate NVFP4 Quantization with Adaptive Block Scaling",
"abstract": "As large language models have grown larger, interest has grown in low-precision numerical formats such as NVFP4 as a way to improve speed and reduce memory usage. However, quantizing models to NVFP4 remains difficult as the lack of precision generally degrades model performance. In this work, we address this issue with Four Over Six (4/6), a modification to the block-scaled NVFP4 quantization algorithm that yields reduced quantization error. Unlike integer formats, floating point formats have non-uniform step sizes which create larger quantization error on larger values. 4/6 takes advantage of this by adaptively scaling some blocks to smaller FP4 values, making the distribution of representable values more uniform and reducing quantization error for near-maximal values. We show that 4/6 can be implemented efficiently on NVIDIA Blackwell GPUs, resulting in performance gains during both pre-training and inference with minimal computational overhead. In pre-training experiments with the Nemotron 3 Nano 30B-A3B model architecture, we find that 4/6 brings training loss closer to BF16 compared to models trained with current state-of-the-art NVFP4 training recipes. Our code is available at http://github.com/mit-han-lab/fouroversix.",
"authors": [
"Jack Cook",
"Junxian Guo",
"Guangxuan Xiao",
"Yujun Lin",
"Song Han"
],
"published_at": "2025-12-01T18:59:45",
"categories": [
"cs.CL",
"cs.LG"
],
"url": "http://arxiv.org/abs/2512.02010v3",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "llm",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.5141356587409973,
"sim_memory": 0.6465221047401428,
"sim_negative": 0.23632439970970154,
"broad_relevance": 0.5141356587409973,
"momentum": 0.7500000000000001,
"teachability": 0.85,
"novelty_score": 0.5399790108203888,
"evidence_score": 0.4,
"direct_memory_relevance": 0.6465221047401428,
"systems_leverage": 0.1,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.7999999999999999,
"clarity": 0.7,
"reproducibility": 0.5,
"public_interest_score": 0.6152375492453577,
"memory_score": 0.2899566314220428,
"quality_score": 0.575,
"bridge_score": 0.2899566314220428,
"max_axis_score": 0.6152375492453577,
"influencer_boost": 0.225,
"influencer_matches": [
"Song Han"
],
"pwc_trending_flag": false,
"social_score": 0.225,
"scoring_sources": [
"influencer:Song Han",
"social_momentum",
"compound_window:180d"
],
"time_window": "180d",
"compound_window_boost": 0.045,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [],
"status": "Monitor",
"why_now": "",
"why_not_higher": "",
"downgrade_reasons": [],
"what_would_raise_priority": "",
"one_sentence_episode_hook": "",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2603.26639",
"title": "Make Geometry Matter for Spatial Reasoning",
"abstract": "Empowered by large-scale training, vision-language models (VLMs) achieve strong image and video understanding, yet their ability to perform spatial reasoning in both static scenes and dynamic videos remains limited. Recent advances try to handle this limitation by injecting geometry tokens from pretrained 3D foundation models into VLMs. Nevertheless, we observe that naive token fusion followed by standard fine-tuning in this line of work often leaves such geometric cues underutilized for spatial reasoning, as VLMs tend to rely heavily on 2D visual cues. In this paper, we propose GeoSR, a framework designed to make geometry matter by encouraging VLMs to actively reason with geometry tokens. GeoSR introduces two key components: (1) Geometry-Unleashing Masking, which strategically masks portions of 2D vision tokens during training to weaken non-geometric shortcuts and force the model to consult geometry tokens for spatial reasoning; and (2) Geometry-Guided Fusion, a gated routing mechanism that adaptively amplifies geometry token contributions in regions where geometric evidence is critical. Together, these designs unleash the potential of geometry tokens for spatial reasoning tasks. Extensive experiments on both static and dynamic spatial reasoning benchmarks demonstrate that GeoSR consistently outperforms prior methods and establishes new state-of-the-art performance by effectively leveraging geometric information. The project page is available at https://suhzhang.github.io/GeoSR/.",
"authors": [
"Shihua Zhang",
"Qiuhong Shen",
"Shizun Wang",
"Tianbo Pan",
"Xinchao Wang"
],
"published_at": "2026-03-27T17:45:12",
"categories": [
"cs.CV",
"cs.AI"
],
"url": "http://arxiv.org/abs/2603.26639v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": true,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "application",
"domain_bucket": "vision",
"paper_type": "benchmark",
"narrow_domain_flag": false,
"sim_public": 0.3662530779838562,
"sim_memory": 0.2911671996116638,
"sim_negative": 0.4095418453216553,
"broad_relevance": 0.3662530779838562,
"momentum": 1.0,
"teachability": 0.7,
"novelty_score": 0.5752736330032349,
"evidence_score": 0.4,
"direct_memory_relevance": 0.2911671996116638,
"systems_leverage": 0.1,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.25,
"bandwidth_capacity": 0.0,
"transferability_score": 0.6,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.5961669683456421,
"memory_score": 0.20835015988349911,
"quality_score": 0.545,
"bridge_score": 0.20835015988349911,
"max_axis_score": 0.5961669683456421,
"influencer_boost": 0.0,
"influencer_matches": [],
"pwc_trending_flag": true,
"social_score": 0.1,
"scoring_sources": [
"pwc_trending",
"social_momentum",
"pwc_trending"
],
"time_window": "30d",
"compound_window_boost": 0.0,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [],
"status": "Monitor",
"why_now": "",
"why_not_higher": "",
"downgrade_reasons": [],
"what_would_raise_priority": "",
"one_sentence_episode_hook": "",
"source": "digest",
"added": "",
"issue_number": null
},
{
"arxiv_id": "2601.00844",
"title": "Value-guided action planning with JEPA world models",
"abstract": "Building deep learning models that can reason about their environment requires capturing its underlying dynamics. Joint-Embedded Predictive Architectures (JEPA) provide a promising framework to model such dynamics by learning representations and predictors through a self-supervised prediction objective. However, their ability to support effective action planning remains limited. We propose an approach to enhance planning with JEPA world models by shaping their representation space so that the negative goal-conditioned value function for a reaching cost in a given environment is approximated by a distance (or quasi-distance) between state embeddings. We introduce a practical method to enforce this constraint during training and show that it leads to significantly improved planning performance compared to standard JEPA models on simple control tasks.",
"authors": [
"Matthieu Destrade",
"Oumayma Bounou",
"Quentin Le Lidec",
"Jean Ponce",
"Yann LeCun"
],
"published_at": "2025-12-28T20:17:49",
"categories": [
"cs.LG",
"cs.AI",
"cs.RO"
],
"url": "http://arxiv.org/abs/2601.00844v1",
"code_url": "",
"github_submission_flag": false,
"hf_trending_flag": false,
"citation_count": 0,
"influential_citation_count": 0,
"scope_bucket": "foundation",
"domain_bucket": "robotics",
"paper_type": "empirical",
"narrow_domain_flag": false,
"sim_public": 0.3781801462173462,
"sim_memory": 0.3515470027923584,
"sim_negative": 0.3398587107658386,
"broad_relevance": 0.3781801462173462,
"momentum": 0.7833333333333333,
"teachability": 0.85,
"novelty_score": 0.4997263550758362,
"evidence_score": 0.4,
"direct_memory_relevance": 0.3515470027923584,
"systems_leverage": 0.1,
"deployment_proximity": 0.2,
"memory_adjacent_future_value": 0.0,
"bandwidth_capacity": 0.0,
"transferability_score": 0.6,
"clarity": 0.7999999999999999,
"reproducibility": 0.5,
"public_interest_score": 0.575079663793246,
"memory_score": 0.2014641008377075,
"quality_score": 0.545,
"bridge_score": 0.2014641008377075,
"max_axis_score": 0.575079663793246,
"influencer_boost": 0.2375,
"influencer_matches": [
"Yann LeCun"
],
"pwc_trending_flag": false,
"social_score": 0.2375,
"scoring_sources": [
"influencer:Yann LeCun",
"social_momentum",
"compound_window:180d"
],
"time_window": "180d",
"compound_window_boost": 0.045,
"first_seen_date": "",
"fatigue_penalty": 0.0,
"negative_profile_penalty": 0.0,
"badges": [],
"status": "Monitor",
"why_now": "",
"why_not_higher": "",
"downgrade_reasons": [],
"what_would_raise_priority": "",
"one_sentence_episode_hook": "",
"source": "digest",
"added": "",