From 2f340cd63ad33abc5e95ae10eeea279e74a4d4a0 Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Mon, 17 Nov 2025 10:00:34 +0100 Subject: [PATCH 001/247] Revert "UBUNTU: [Packaging] Install compressed vmlinuz.efi on arm64" BugLink: https://bugs.launchpad.net/bugs/2131154 This reverts commit 14a07812867bbe143f1da8208f9e37fdaed794fc as this would require newer kexec tools in Noble. This was found with backported nvidia kernels but also affects the HWE kernel. Signed-off-by: Stefan Bader Signed-off-by: Jacob Martin --- debian.master/rules.d/arm64.mk | 4 ++-- debian.nvidia-6.17/rules.d/arm64.mk | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/debian.master/rules.d/arm64.mk b/debian.master/rules.d/arm64.mk index ad6fbd06ab703..702e011fe5631 100644 --- a/debian.master/rules.d/arm64.mk +++ b/debian.master/rules.d/arm64.mk @@ -1,8 +1,8 @@ build_arch = arm64 defconfig = defconfig flavours = generic generic-64k -build_image = vmlinuz.efi -kernel_file = arch/$(build_arch)/boot/vmlinuz.efi +build_image = Image.gz +kernel_file = arch/$(build_arch)/boot/Image.gz install_file = vmlinuz no_dumpfile = true uefi_signed = true diff --git a/debian.nvidia-6.17/rules.d/arm64.mk b/debian.nvidia-6.17/rules.d/arm64.mk index 6a59fae676a5b..b1d68d2e7108e 100644 --- a/debian.nvidia-6.17/rules.d/arm64.mk +++ b/debian.nvidia-6.17/rules.d/arm64.mk @@ -1,8 +1,8 @@ build_arch = arm64 defconfig = defconfig flavours = nvidia nvidia-64k -build_image = vmlinuz.efi -kernel_file = arch/$(build_arch)/boot/vmlinuz.efi +build_image = Image.gz +kernel_file = arch/$(build_arch)/boot/Image.gz install_file = vmlinuz no_dumpfile = true uefi_signed = true From 9e18a4b8369e8186deeed240dcb824f4fba4cfb6 Mon Sep 17 00:00:00 2001 From: Jacob Martin Date: Mon, 17 Nov 2025 16:17:33 -0600 Subject: [PATCH 002/247] UBUNTU: Ubuntu-nvidia-6.17-6.17.0-1004.4 Signed-off-by: Jacob Martin --- debian.nvidia-6.17/changelog | 111 +++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 5 
deletions(-) diff --git a/debian.nvidia-6.17/changelog b/debian.nvidia-6.17/changelog index c03962f98acd6..5f93b16954c88 100644 --- a/debian.nvidia-6.17/changelog +++ b/debian.nvidia-6.17/changelog @@ -1,10 +1,111 @@ -linux-nvidia-6.17 (6.17.0-1004.4) UNRELEASED; urgency=medium +linux-nvidia-6.17 (6.17.0-1004.4) noble; urgency=medium - CHANGELOG: Do not edit directly. Autogenerated at release. - CHANGELOG: Use the printchanges target to see the current changes. - CHANGELOG: Use the insertchanges target to create the final log. + * noble/linux-nvidia-6.17: 6.17.0-1003.3 -proposed tracker (LP: #2131581) - -- Jacob Martin Fri, 14 Nov 2025 21:37:58 -0600 + * kexec reports it cannot determine the file type of arm64 kernel images + (LP: #2131154) + - Revert "UBUNTU: [Packaging] Install compressed vmlinuz.efi on arm64" + + * Race condition in perf build causes build failure due to missing + unistd_64.h header on arm64 (LP: #2131702) + - perf tools: Fix arm64 libjvmti build by generating unistd_64.h + + * Packaging resync (LP: #1786013) + - [Packaging] debian.nvidia-6.17/dkms-versions -- update from kernel- + versions (main/d2025.11.04) + + * Binaries perf and bpftool missing from linux-tools-6.14.0-*nvidia{,-64k} + packages (LP: #2127953) + - [Packaging] Add do_tools_noble_hwe to include perf and bpftool in + SRCPKGNAME-tools-$(abi_release) + - [Packaging] nvidia-6.17: enable do_tools_noble_hwe + + * KVM initialization hitting exception at boot time with kernel 6.17 + (LP: #2130289) + - KVM: arm64: Guard PMSCR_EL1 initialization with SPE presence check + + * r8127: fix kernel panic when dump all registers (LP: #2130445) + - NVIDIA: SAUCE: r8127: fix a kernel panic when dump all registers + - NVIDIA: SAUCE: r8127: add support for RTL8127 cable diagnostic test + + * QSPI Transfer failed with timeout: 0 (LP: #2126589) + - spi: tegra210-quad: Fix timeout handling + - spi: tegra210-quad: Refactor error handling into helper functions + - spi: tegra210-quad: Check hardware 
status on timeout + + * Backport arm64: cpufeature: Add Olympus MIDR to BBML2 allow list + (LP: #2131047) + - arm64: cpufeature: Add Olympus MIDR to BBML2 allow list + + * Set CONFIG_IOMMU_DEFAULT_PASSTHROUGH as default for Nvidia CPUs + (LP: #2129776) + - NVIDIA: SAUCE: iommu/arm-smmu-v3: Set DGX Spark iGPU default domain type + to DMA + - [Config] nvidia-6.17: Update annotations to set + CONFIG_IOMMU_DEFAULT_PASSTHROUGH + + * mt7925: Introduce CSA support in non-MLO mode (LP: #2129209) + - NVIDIA: SAUCE: wifi: mt76: mt7925: introduce CSA support in non-MLO mode + + * IOMMU: Support contiguous bit in translation tables (LP: #2112600) + - NVIDIA: SAUCE: iommu/io-pgtable-arm: backport contiguous bit support + + [ Ubuntu: 6.17.0-7.7 ] + + * questing/linux: 6.17.0-7.7 -proposed tracker (LP: #2128695) + * Fix incorrect bug number for CONFIG_KERNEL_ZSTD (LP: #2127676) + - [Config] Fix bug note for CONFIG_KERNEL_ZSTD + * support Panter Lake CPU performance preferences (LP: #2127187) + - thermal: intel: int340x: Add support for power slider + - thermal: intel: int340x: Enable power slider interface for Panther Lake + - thermal: intel: int340x: Add module parameter for balanced Slider + - thermal: intel: int340x: Add module parameter to change slider offset + - thermal: intel: int340x: Power Slider: Validate slider_balance range + * [SRU][Q/P/N:hwe-6.14] mt7925: Add MBSS support (LP: #2119479) + - wifi: mt76: mt7925: add MBSSID support + * Plucky preinstalled server fails to boot on rb3gen2 (LP: #2106681) // + Questing preinstalled server fails to boot on sa8775p boards + (LP: #2121347) + - [Config] move more qcom interconnect/pinctrl/gcc options to builtin + * Packaging resync (LP: #1786013) + - [Packaging] update Ubuntu.md + * r8169 can not wake on LAN via SFP moudule (LP: #2123901) + - r8169: set EEE speed down ratio to 1 + * System hangs when running the memory stress test (LP: #2103680) + - mm: page_alloc: avoid kswapd thrashing due to NUMA restrictions + * Questing 
update: v6.17.2 upstream stable release (LP: #2128209) + - drm/amdgpu: Enable MES lr_compute_wa by default + - USB: serial: option: add SIMCom 8230C compositions + - Bluetooth: btusb: Add USB ID 2001:332a for D-Link AX9U rev. A1 + - wifi: rtlwifi: rtl8192cu: Don't claim USB ID 07b8:8188 + - wifi: rtl8xxxu: Don't claim USB ID 07b8:8188 + - rust: drm: fix `srctree/` links + - rust: block: fix `srctree/` links + - rust: pci: fix incorrect platform reference in PCI driver probe doc + comment + - rust: pci: fix incorrect platform reference in PCI driver unbind doc + comment + - serial: qcom-geni: Fix blocked task + - nvmem: layouts: fix automatic module loading + - drivers/misc/amd-sbi/Kconfig: select REGMAP_I2C + - binder: fix double-free in dbitmap + - serial: stm32: allow selecting console when the driver is module + - [Config] stm32: do not select console when driver is module + - staging: axis-fifo: fix maximum TX packet length check + - staging: axis-fifo: fix TX handling on copy_from_user() failure + - staging: axis-fifo: flush RX FIFO on read errors + - driver core: faux: Set power.no_pm for faux devices + - driver core/PM: Set power.no_callbacks along with power.no_pm + - Revert "crypto: testmgr - desupport SHA-1 for FIPS 140" + - crypto: zstd - Fix compression bug caused by truncation + - crypto: rng - Ensure set_ent is always present + - net/9p: fix double req put in p9_fd_cancelled + - KVM: x86: Don't (re)check L1 intercepts when completing userspace I/O + - f2fs: fix to do sanity check on node footer for non inode dnode + - ring buffer: Propagate __rb_map_vma return value to caller + - Linux 6.17.2 + + -- Jacob Martin Mon, 17 Nov 2025 16:17:33 -0600 linux-nvidia-6.17 (6.17.0-1002.2) noble; urgency=medium From c31115ca05a5ff9589928538123697a503bc1ca6 Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Wed, 5 Nov 2025 17:08:37 +0530 Subject: [PATCH 003/247] NVIDIA: SAUCE: cpufreq: CPPC: Add generic helpers for sysfs show/store BugLink: 
https://bugs.launchpad.net/bugs/2131705 Add generic show/store helper functions for u64 sysfs attributes: - cppc_cpufreq_sysfs_show_u64() - cppc_cpufreq_sysfs_store_u64() Refactor auto_act_window and energy_performance_preference_val attributes to use these helpers, eliminating code duplication. No functional changes. Signed-off-by: Sumit Gupta (backported from https://lore.kernel.org/all/20251105113844.4086250-1-sumitg@nvidia.com/) Signed-off-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/cpufreq/cppc_cpufreq.c | 57 ++++++++++++---------------------- 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index dd8efe4fb967f..01658170af493 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -838,72 +838,53 @@ static ssize_t store_auto_select(struct cpufreq_policy *policy, return count; } -static ssize_t show_auto_act_window(struct cpufreq_policy *policy, char *buf) +static ssize_t cppc_cpufreq_sysfs_show_u64(unsigned int cpu, int (*get_func)(int, u64 *), char *buf) { u64 val; - int ret; - - ret = cppc_get_auto_act_window(policy->cpu, &val); + int ret = get_func(cpu, &val); - /* show "" when this register is not supported by cpc */ if (ret == -EOPNOTSUPP) return sysfs_emit(buf, "\n"); - if (ret) return ret; return sysfs_emit(buf, "%llu\n", val); } -static ssize_t store_auto_act_window(struct cpufreq_policy *policy, - const char *buf, size_t count) +static ssize_t cppc_cpufreq_sysfs_store_u64(unsigned int cpu, int (*set_func)(int, u64), + const char *buf, size_t count) { - u64 usec; + u64 val; int ret; - ret = kstrtou64(buf, 0, &usec); + ret = kstrtou64(buf, 0, &val); if (ret) return ret; - ret = cppc_set_auto_act_window(policy->cpu, 
usec); - if (ret) - return ret; + ret = set_func((int)cpu, val); - return count; + return ret ? ret : count; } -static ssize_t show_energy_performance_preference_val(struct cpufreq_policy *policy, char *buf) +static ssize_t show_auto_act_window(struct cpufreq_policy *policy, char *buf) { - u64 val; - int ret; - - ret = cppc_get_epp_perf(policy->cpu, &val); - - /* show "" when this register is not supported by cpc */ - if (ret == -EOPNOTSUPP) - return sysfs_emit(buf, "\n"); + return cppc_cpufreq_sysfs_show_u64(policy->cpu, cppc_get_auto_act_window, buf); +} - if (ret) - return ret; +static ssize_t store_auto_act_window(struct cpufreq_policy *policy, const char *buf, size_t count) +{ + return cppc_cpufreq_sysfs_store_u64(policy->cpu, cppc_set_auto_act_window, buf, count); +} - return sysfs_emit(buf, "%llu\n", val); +static ssize_t show_energy_performance_preference_val(struct cpufreq_policy *policy, char *buf) +{ + return cppc_cpufreq_sysfs_show_u64(policy->cpu, cppc_get_epp_perf, buf); } static ssize_t store_energy_performance_preference_val(struct cpufreq_policy *policy, const char *buf, size_t count) { - u64 val; - int ret; - - ret = kstrtou64(buf, 0, &val); - if (ret) - return ret; - - ret = cppc_set_epp(policy->cpu, val); - if (ret) - return ret; - - return count; + return cppc_cpufreq_sysfs_store_u64(policy->cpu, cppc_set_epp, buf, count); } cpufreq_freq_attr_ro(freqdomain_cpus); From d3d7c5f5279c47a94f30fe62f38e6a3c264cfb44 Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Wed, 5 Nov 2025 17:08:38 +0530 Subject: [PATCH 004/247] NVIDIA: SAUCE: ACPI: CPPC: Add cppc_get_perf() API to read performance controls BugLink: https://bugs.launchpad.net/bugs/2131705 Add cppc_get_perf() function to read values of performance control registers including desired_perf, min_perf, max_perf, and energy_perf. This provides a read interface to complement the existing cppc_set_perf() write interface for performance control registers. 
Signed-off-by: Sumit Gupta (backported from https://lore.kernel.org/all/20251105113844.4086250-1-sumitg@nvidia.com/) Signed-off-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/acpi/cppc_acpi.c | 73 ++++++++++++++++++++++++++++++++++++++++ include/acpi/cppc_acpi.h | 5 +++ 2 files changed, 78 insertions(+) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 002c3dde283ff..371cee30aae6d 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1731,6 +1731,79 @@ int cppc_set_enable(int cpu, bool enable) return cppc_set_reg_val(cpu, ENABLE, enable); } EXPORT_SYMBOL_GPL(cppc_set_enable); +/** + * cppc_get_perf - Get a CPU's performance controls. + * @cpu: CPU for which to get performance controls. + * @perf_ctrls: ptr to cppc_perf_ctrls. See cppc_acpi.h + * + * Return: 0 for success with perf_ctrls, -ERRNO otherwise. 
+ */ +int cppc_get_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +{ + struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); + struct cpc_register_resource *desired_perf_reg, *min_perf_reg, *max_perf_reg, + *energy_perf_reg; + u64 desired_perf = 0, min = 0, max = 0, energy_perf = 0; + int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); + struct cppc_pcc_data *pcc_ss_data = NULL; + int ret = 0, regs_in_pcc = 0; + + if (!cpc_desc) { + pr_debug("No CPC descriptor for CPU:%d\n", cpu); + return -ENODEV; + } + + if (!perf_ctrls) { + pr_debug("Invalid perf_ctrls pointer\n"); + return -EINVAL; + } + + desired_perf_reg = &cpc_desc->cpc_regs[DESIRED_PERF]; + min_perf_reg = &cpc_desc->cpc_regs[MIN_PERF]; + max_perf_reg = &cpc_desc->cpc_regs[MAX_PERF]; + energy_perf_reg = &cpc_desc->cpc_regs[ENERGY_PERF]; + + /* Are any of the regs PCC ?*/ + if (CPC_IN_PCC(desired_perf_reg) || CPC_IN_PCC(min_perf_reg) || + CPC_IN_PCC(max_perf_reg) || CPC_IN_PCC(energy_perf_reg)) { + if (pcc_ss_id < 0) { + pr_debug("Invalid pcc_ss_id for CPU:%d\n", cpu); + return -ENODEV; + } + pcc_ss_data = pcc_data[pcc_ss_id]; + regs_in_pcc = 1; + down_write(&pcc_ss_data->pcc_lock); + /* Ring doorbell once to update PCC subspace */ + if (send_pcc_cmd(pcc_ss_id, CMD_READ) < 0) { + pr_debug("Failed to send PCC command for CPU:%d, ret:%d\n", cpu, ret); + ret = -EIO; + goto out_err; + } + } + + /* Read optional elements if present */ + if (CPC_SUPPORTED(max_perf_reg)) + cpc_read(cpu, max_perf_reg, &max); + perf_ctrls->max_perf = max; + + if (CPC_SUPPORTED(min_perf_reg)) + cpc_read(cpu, min_perf_reg, &min); + perf_ctrls->min_perf = min; + + if (CPC_SUPPORTED(desired_perf_reg)) + cpc_read(cpu, desired_perf_reg, &desired_perf); + perf_ctrls->desired_perf = desired_perf; + + if (CPC_SUPPORTED(energy_perf_reg)) + cpc_read(cpu, energy_perf_reg, &energy_perf); + perf_ctrls->energy_perf = energy_perf; + +out_err: + if (regs_in_pcc) + up_write(&pcc_ss_data->pcc_lock); + return ret; +} 
+EXPORT_SYMBOL_GPL(cppc_get_perf); /** * cppc_set_perf - Set a CPU's performance controls. diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 20f3d62e7a16a..213bd389ec571 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -151,6 +151,7 @@ extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf); extern int cppc_get_highest_perf(int cpunum, u64 *highest_perf); extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); +extern int cppc_get_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_set_enable(int cpu, bool enable); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); @@ -192,6 +193,10 @@ static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ { return -EOPNOTSUPP; } +static inline int cppc_get_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +{ + return -EOPNOTSUPP; +} static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) { return -EOPNOTSUPP; From ff886080662dcc0b34b52e6c4538c8053acf0357 Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Wed, 5 Nov 2025 17:08:39 +0530 Subject: [PATCH 005/247] NVIDIA: SAUCE: ACPI: CPPC: extend APIs to support auto_sel and epp BugLink: https://bugs.launchpad.net/bugs/2131705 - Add auto_sel read support in cppc_get_perf_caps(). - Add write of both auto_sel and energy_perf in cppc_set_epp_perf(). - Remove redundant energy_perf field from 'struct cppc_perf_caps' as the same is available in 'struct cppc_perf_ctrls' which is used. Signed-off-by: Sumit Gupta (backported from https://lore.kernel.org/all/20251105113844.4086250-1-sumitg@nvidia.com/) Signed-off-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/acpi/cppc_acpi.c | 42 ++++++++++++++++++++++++++++++++-------- include/acpi/cppc_acpi.h | 1 - 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 371cee30aae6d..720bf8977619e 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1344,8 +1344,8 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum); struct cpc_register_resource *highest_reg, *lowest_reg, *lowest_non_linear_reg, *nominal_reg, *guaranteed_reg, - *low_freq_reg = NULL, *nom_freq_reg = NULL; - u64 high, low, guaranteed, nom, min_nonlinear, low_f = 0, nom_f = 0; + *low_freq_reg = NULL, *nom_freq_reg = NULL, *auto_sel_reg = NULL; + u64 high, low, guaranteed, nom, min_nonlinear, low_f = 0, nom_f = 0, auto_sel = 0; int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpunum); struct cppc_pcc_data *pcc_ss_data = NULL; int ret = 0, regs_in_pcc = 0; @@ -1362,11 +1362,12 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) low_freq_reg = &cpc_desc->cpc_regs[LOWEST_FREQ]; nom_freq_reg = &cpc_desc->cpc_regs[NOMINAL_FREQ]; guaranteed_reg = &cpc_desc->cpc_regs[GUARANTEED_PERF]; + auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; /* Are any of the regs PCC ?*/ if (CPC_IN_PCC(highest_reg) || CPC_IN_PCC(lowest_reg) || CPC_IN_PCC(lowest_non_linear_reg) || CPC_IN_PCC(nominal_reg) || - CPC_IN_PCC(low_freq_reg) || CPC_IN_PCC(nom_freq_reg)) { + CPC_IN_PCC(low_freq_reg) || CPC_IN_PCC(nom_freq_reg) || CPC_IN_PCC(auto_sel_reg)) { if (pcc_ss_id < 0) { pr_debug("Invalid pcc_ss_id\n"); return -ENODEV; @@ -1414,6 +1415,9 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) perf_caps->lowest_freq = low_f; perf_caps->nominal_freq = nom_f; + if 
(CPC_SUPPORTED(auto_sel_reg)) + cpc_read(cpunum, auto_sel_reg, &auto_sel); + perf_caps->auto_sel = (bool)auto_sel; out_err: if (regs_in_pcc) @@ -1555,6 +1559,8 @@ int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) struct cpc_register_resource *auto_sel_reg; struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); struct cppc_pcc_data *pcc_ss_data = NULL; + bool autosel_support_in_ffh_or_sysmem; + bool epp_support_in_ffh_or_sysmem; int ret; if (!cpc_desc) { @@ -1565,6 +1571,11 @@ int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) auto_sel_reg = &cpc_desc->cpc_regs[AUTO_SEL_ENABLE]; epp_set_reg = &cpc_desc->cpc_regs[ENERGY_PERF]; + epp_support_in_ffh_or_sysmem = CPC_SUPPORTED(epp_set_reg) && + (CPC_IN_FFH(epp_set_reg) || CPC_IN_SYSTEM_MEMORY(epp_set_reg)); + autosel_support_in_ffh_or_sysmem = CPC_SUPPORTED(auto_sel_reg) && + (CPC_IN_FFH(auto_sel_reg) || CPC_IN_SYSTEM_MEMORY(auto_sel_reg)); + if (CPC_IN_PCC(epp_set_reg) || CPC_IN_PCC(auto_sel_reg)) { if (pcc_ss_id < 0) { pr_debug("Invalid pcc_ss_id for CPU:%d\n", cpu); @@ -1589,14 +1600,29 @@ int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable) /* after writing CPC, transfer the ownership of PCC to platform */ ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); up_write(&pcc_ss_data->pcc_lock); - } else if (osc_cpc_flexible_adr_space_confirmed && - CPC_SUPPORTED(epp_set_reg) && CPC_IN_FFH(epp_set_reg)) { - ret = cpc_write(cpu, epp_set_reg, perf_ctrls->energy_perf); + } else if (osc_cpc_flexible_adr_space_confirmed) { + if (!epp_support_in_ffh_or_sysmem && !autosel_support_in_ffh_or_sysmem) { + ret = -EOPNOTSUPP; + } else { + if (autosel_support_in_ffh_or_sysmem) { + ret = cpc_write(cpu, auto_sel_reg, enable); + if (ret) + return ret; + } + + if (epp_support_in_ffh_or_sysmem) { + ret = cpc_write(cpu, epp_set_reg, perf_ctrls->energy_perf); + if (ret) + return ret; + } + } } else { - ret = -ENOTSUPP; - pr_debug("_CPC in PCC and _CPC in FFH are not 
supported\n"); + ret = -EOPNOTSUPP; } + if (ret == -EOPNOTSUPP) + pr_debug("_CPC in PCC and _CPC in FFH are not supported\n"); + return ret; } EXPORT_SYMBOL_GPL(cppc_set_epp_perf); diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 213bd389ec571..3babc6d6e70a1 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -119,7 +119,6 @@ struct cppc_perf_caps { u32 lowest_nonlinear_perf; u32 lowest_freq; u32 nominal_freq; - u32 energy_perf; bool auto_sel; }; From 2ee6cf08f1b28f9dfc545f0adf5a113d85e5e8ac Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Wed, 5 Nov 2025 17:08:40 +0530 Subject: [PATCH 006/247] NVIDIA: SAUCE: ACPI: CPPC: add APIs and sysfs interface for min/max_perf BugLink: https://bugs.launchpad.net/bugs/2131705 CPPC allows platforms to specify minimum and maximum performance limits that constrain the operating range for CPU performance scaling when Autonomous Selection is enabled. These limits can be dynamically adjusted to implement power management policies or workload-specific optimizations. Add cppc_get_min_perf() and cppc_set_min_perf() functions to read and write the MIN_PERF register, allowing dynamic adjustment of the minimum performance floor. Add cppc_get_max_perf() and cppc_set_max_perf() functions to read and write the MAX_PERF register, enabling dynamic ceiling control for maximum performance. Expose these capabilities through cpufreq sysfs attributes that accept frequency values in kHz (which are converted to/from performance values internally): - /sys/.../cpufreq/policy*/min_perf: Read/write min perf as freq (kHz) - /sys/.../cpufreq/policy*/max_perf: Read/write max perf as freq (kHz) The frequency-based interface provides a user-friendly abstraction which is similar to other cpufreq sysfs interfaces, while the driver handles conversion to hardware performance values. 
Also update EPP constants for better clarity: - Rename CPPC_ENERGY_PERF_MAX to CPPC_EPP_ENERGY_EFFICIENCY_PREF - Add CPPC_EPP_PERFORMANCE_PREF for the performance-oriented setting Signed-off-by: Sumit Gupta (backported from https://lore.kernel.org/all/20251105113844.4086250-1-sumitg@nvidia.com/) Signed-off-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/acpi/cppc_acpi.c | 55 ++++++++++- drivers/cpufreq/cppc_cpufreq.c | 166 +++++++++++++++++++++++++++++++++ include/acpi/cppc_acpi.h | 23 ++++- 3 files changed, 242 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 720bf8977619e..d89226558ff3b 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1634,7 +1634,7 @@ EXPORT_SYMBOL_GPL(cppc_set_epp_perf); */ int cppc_set_epp(int cpu, u64 epp_val) { - if (epp_val > CPPC_ENERGY_PERF_MAX) + if (epp_val > CPPC_EPP_ENERGY_EFFICIENCY_PREF) return -EINVAL; return cppc_set_reg_val(cpu, ENERGY_PERF, epp_val); @@ -1757,6 +1757,59 @@ int cppc_set_enable(int cpu, bool enable) return cppc_set_reg_val(cpu, ENABLE, enable); } EXPORT_SYMBOL_GPL(cppc_set_enable); + +/** + * cppc_get_min_perf - Get the min performance register value. + * @cpu: CPU from which to get min performance. + * @min_perf: Return address. + * + * Return: 0 for success, -EIO on register access failure, -EOPNOTSUPP if not supported. + */ +int cppc_get_min_perf(int cpu, u64 *min_perf) +{ + return cppc_get_reg_val(cpu, MIN_PERF, min_perf); +} +EXPORT_SYMBOL_GPL(cppc_get_min_perf); + +/** + * cppc_set_min_perf() - Write the min performance register. + * @cpu: CPU on which to write register. + * @min_perf: Value to write to the MIN_PERF register. + * + * Return: 0 for success, -EIO otherwise. 
+ */ +int cppc_set_min_perf(int cpu, u64 min_perf) +{ + return cppc_set_reg_val(cpu, MIN_PERF, min_perf); +} +EXPORT_SYMBOL_GPL(cppc_set_min_perf); + +/** + * cppc_get_max_perf - Get the max performance register value. + * @cpu: CPU from which to get max performance. + * @max_perf: Return address. + * + * Return: 0 for success, -EIO on register access failure, -EOPNOTSUPP if not supported. + */ +int cppc_get_max_perf(int cpu, u64 *max_perf) +{ + return cppc_get_reg_val(cpu, MAX_PERF, max_perf); +} +EXPORT_SYMBOL_GPL(cppc_get_max_perf); + +/** + * cppc_set_max_perf() - Write the max performance register. + * @cpu: CPU on which to write register. + * @max_perf: Value to write to the MAX_PERF register. + * + * Return: 0 for success, -EIO otherwise. + */ +int cppc_set_max_perf(int cpu, u64 max_perf) +{ + return cppc_set_reg_val(cpu, MAX_PERF, max_perf); +} +EXPORT_SYMBOL_GPL(cppc_set_max_perf); + /** * cppc_get_perf - Get a CPU's performance controls. * @cpu: CPU for which to get performance controls. 
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 01658170af493..2699f43eac21a 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -23,10 +23,12 @@ #include #include +#include #include static struct cpufreq_driver cppc_cpufreq_driver; +static DEFINE_MUTEX(cppc_cpufreq_update_autosel_config_lock); #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE static enum { @@ -584,6 +586,68 @@ static void cppc_cpufreq_put_cpu_data(struct cpufreq_policy *policy) policy->driver_data = NULL; } +/** + * cppc_cpufreq_set_mperf_limit - Generic function to set min/max performance limit + * @policy: cpufreq policy + * @val: performance value to set + * @update_reg: whether to update hardware register + * @update_policy: whether to update policy constraints + * @is_min: true for min_perf, false for max_perf + */ +static int cppc_cpufreq_set_mperf_limit(struct cpufreq_policy *policy, u64 val, + bool update_reg, bool update_policy, bool is_min) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + struct cppc_perf_caps *caps = &cpu_data->perf_caps; + unsigned int cpu = policy->cpu; + struct freq_qos_request *req; + unsigned int freq; + u32 perf; + int ret; + + perf = clamp(val, caps->lowest_perf, caps->highest_perf); + freq = cppc_perf_to_khz(caps, perf); + + pr_debug("cpu%d, %s_perf:%llu, update_reg:%d, update_policy:%d\n", cpu, + is_min ? "min" : "max", (u64)perf, update_reg, update_policy); + + guard(mutex)(&cppc_cpufreq_update_autosel_config_lock); + + if (update_reg) { + ret = is_min ? cppc_set_min_perf(cpu, perf) : cppc_set_max_perf(cpu, perf); + if (ret) { + if (ret != -EOPNOTSUPP) + pr_warn("Failed to set %s_perf (%llu) on CPU%d (%d)\n", + is_min ? "min" : "max", (u64)perf, cpu, ret); + return ret; + } + + if (is_min) + cpu_data->perf_ctrls.min_perf = perf; + else + cpu_data->perf_ctrls.max_perf = perf; + } + + if (update_policy) { + req = is_min ? 
policy->min_freq_req : policy->max_freq_req; + + ret = freq_qos_update_request(req, freq); + if (ret < 0) { + pr_warn("Failed to update %s_freq constraint for CPU%d: %d\n", + is_min ? "min" : "max", cpu, ret); + return ret; + } + } + + return 0; +} + +#define cppc_cpufreq_set_min_perf(policy, val, update_reg, update_policy) \ + cppc_cpufreq_set_mperf_limit(policy, val, update_reg, update_policy, true) + +#define cppc_cpufreq_set_max_perf(policy, val, update_reg, update_policy) \ + cppc_cpufreq_set_mperf_limit(policy, val, update_reg, update_policy, false) + static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; @@ -887,16 +951,118 @@ static ssize_t store_energy_performance_preference_val(struct cpufreq_policy *po return cppc_cpufreq_sysfs_store_u64(policy->cpu, cppc_set_epp, buf, count); } +/** + * show_min_perf - Show minimum performance as frequency (kHz) + * + * Reads the MIN_PERF register and converts the performance value to + * frequency (kHz) for user-space consumption. + */ +static ssize_t show_min_perf(struct cpufreq_policy *policy, char *buf) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + u64 perf; + int ret; + + ret = cppc_get_min_perf(policy->cpu, &perf); + if (ret == -EOPNOTSUPP) + return sysfs_emit(buf, "\n"); + if (ret) + return ret; + + /* Convert performance to frequency (kHz) for user */ + return sysfs_emit(buf, "%u\n", cppc_perf_to_khz(&cpu_data->perf_caps, perf)); +} + +/** + * store_min_perf - Set minimum performance from frequency (kHz) + * + * Converts the user-provided frequency (kHz) to a performance value + * and writes it to the MIN_PERF register. 
+ */ +static ssize_t store_min_perf(struct cpufreq_policy *policy, const char *buf, size_t count) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + unsigned int freq_khz; + u64 perf; + int ret; + + ret = kstrtouint(buf, 0, &freq_khz); + if (ret) + return ret; + + /* Convert frequency (kHz) to performance value */ + perf = cppc_khz_to_perf(&cpu_data->perf_caps, freq_khz); + + ret = cppc_cpufreq_set_min_perf(policy, perf, true, cpu_data->perf_caps.auto_sel); + if (ret) + return ret; + + return count; +} + +/** + * show_max_perf - Show maximum performance as frequency (kHz) + * + * Reads the MAX_PERF register and converts the performance value to + * frequency (kHz) for user-space consumption. + */ +static ssize_t show_max_perf(struct cpufreq_policy *policy, char *buf) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + u64 perf; + int ret; + + ret = cppc_get_max_perf(policy->cpu, &perf); + if (ret == -EOPNOTSUPP) + return sysfs_emit(buf, "\n"); + if (ret) + return ret; + + /* Convert performance to frequency (kHz) for user */ + return sysfs_emit(buf, "%u\n", cppc_perf_to_khz(&cpu_data->perf_caps, perf)); +} + +/** + * store_max_perf - Set maximum performance from frequency (kHz) + * + * Converts the user-provided frequency (kHz) to a performance value + * and writes it to the MAX_PERF register. 
+ */ +static ssize_t store_max_perf(struct cpufreq_policy *policy, const char *buf, size_t count) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + unsigned int freq_khz; + u64 perf; + int ret; + + ret = kstrtouint(buf, 0, &freq_khz); + if (ret) + return ret; + + /* Convert frequency (kHz) to performance value */ + perf = cppc_khz_to_perf(&cpu_data->perf_caps, freq_khz); + + ret = cppc_cpufreq_set_max_perf(policy, perf, true, cpu_data->perf_caps.auto_sel); + if (ret) + return ret; + + return count; +} + cpufreq_freq_attr_ro(freqdomain_cpus); cpufreq_freq_attr_rw(auto_select); cpufreq_freq_attr_rw(auto_act_window); cpufreq_freq_attr_rw(energy_performance_preference_val); +cpufreq_freq_attr_rw(min_perf); +cpufreq_freq_attr_rw(max_perf); static struct freq_attr *cppc_cpufreq_attr[] = { &freqdomain_cpus, &auto_select, &auto_act_window, &energy_performance_preference_val, + &min_perf, + &max_perf, NULL, }; diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 3babc6d6e70a1..fc7614eb9dcb6 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -39,7 +39,8 @@ /* CPPC_AUTO_ACT_WINDOW_MAX_SIG is 127, so 128 and 129 will decay to 127 when writing */ #define CPPC_AUTO_ACT_WINDOW_SIG_CARRY_THRESH 129 -#define CPPC_ENERGY_PERF_MAX (0xFF) +#define CPPC_EPP_PERFORMANCE_PREF 0x00 +#define CPPC_EPP_ENERGY_EFFICIENCY_PREF 0xFF /* Each register has the folowing format. 
*/ struct cpc_reg { @@ -172,6 +173,10 @@ extern int cppc_get_auto_act_window(int cpu, u64 *auto_act_window); extern int cppc_set_auto_act_window(int cpu, u64 auto_act_window); extern int cppc_get_auto_sel(int cpu, bool *enable); extern int cppc_set_auto_sel(int cpu, bool enable); +extern int cppc_get_min_perf(int cpu, u64 *min_perf); +extern int cppc_set_min_perf(int cpu, u64 min_perf); +extern int cppc_get_max_perf(int cpu, u64 *max_perf); +extern int cppc_set_max_perf(int cpu, u64 max_perf); extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf); extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator); extern int amd_detect_prefcore(bool *detected); @@ -264,6 +269,22 @@ static inline int cppc_set_auto_sel(int cpu, bool enable) { return -EOPNOTSUPP; } +static inline int cppc_get_min_perf(int cpu, u64 *min_perf) +{ + return -EOPNOTSUPP; +} +static inline int cppc_set_min_perf(int cpu, u64 min_perf) +{ + return -EOPNOTSUPP; +} +static inline int cppc_get_max_perf(int cpu, u64 *max_perf) +{ + return -EOPNOTSUPP; +} +static inline int cppc_set_max_perf(int cpu, u64 max_perf) +{ + return -EOPNOTSUPP; +} static inline int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) { return -ENODEV; From ac08eb1e4a45113f9d382cb7e33896f5efe6a830 Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Wed, 5 Nov 2025 17:08:41 +0530 Subject: [PATCH 007/247] NVIDIA: SAUCE: ACPI: CPPC: add APIs and sysfs interface for perf_limited register BugLink: https://bugs.launchpad.net/bugs/2131705 Add sysfs interface to read/write the Performance Limited register. The Performance Limited register indicates to the OS that an unpredictable event (like thermal throttling) has limited processor performance. This register is sticky and remains set until reset or OS clears it by writing 0. 
The interface is exposed as: /sys/devices/system/cpu/cpuX/cpufreq/perf_limited Signed-off-by: Sumit Gupta (backported from https://lore.kernel.org/all/20251105113844.4086250-1-sumitg@nvidia.com/) Signed-off-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/acpi/cppc_acpi.c | 26 ++++++++++++++++++++++++++ drivers/cpufreq/cppc_cpufreq.c | 12 ++++++++++++ include/acpi/cppc_acpi.h | 10 ++++++++++ 3 files changed, 48 insertions(+) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index d89226558ff3b..b6b1bf0bdd212 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1810,6 +1810,32 @@ int cppc_set_max_perf(int cpu, u64 max_perf) } EXPORT_SYMBOL_GPL(cppc_set_max_perf); +/** + * cppc_get_perf_limited - Get the Performance Limited register value. + * @cpu: CPU from which to get Performance Limited register. + * @perf_limited: Pointer to store the Performance Limited value. + * + * Return: 0 for success, -EIO on register access failure, -EOPNOTSUPP if not supported. + */ +int cppc_get_perf_limited(int cpu, u64 *perf_limited) +{ + return cppc_get_reg_val(cpu, PERF_LIMITED, perf_limited); +} +EXPORT_SYMBOL_GPL(cppc_get_perf_limited); + +/** + * cppc_set_perf_limited() - Write the Performance Limited register. + * @cpu: CPU on which to write register. + * @perf_limited: Value to write to the perf_limited register. + * + * Return: 0 for success, -EIO on register access failure, -EOPNOTSUPP if not supported. + */ +int cppc_set_perf_limited(int cpu, u64 perf_limited) +{ + return cppc_set_reg_val(cpu, PERF_LIMITED, perf_limited); +} +EXPORT_SYMBOL_GPL(cppc_set_perf_limited); + /** * cppc_get_perf - Get a CPU's performance controls. * @cpu: CPU for which to get performance controls. 
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 2699f43eac21a..d87f0b0273325 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -1049,12 +1049,23 @@ static ssize_t store_max_perf(struct cpufreq_policy *policy, const char *buf, si return count; } +static ssize_t show_perf_limited(struct cpufreq_policy *policy, char *buf) +{ + return cppc_cpufreq_sysfs_show_u64(policy->cpu, cppc_get_perf_limited, buf); +} + +static ssize_t store_perf_limited(struct cpufreq_policy *policy, const char *buf, size_t count) +{ + return cppc_cpufreq_sysfs_store_u64(policy->cpu, cppc_set_perf_limited, buf, count); +} + cpufreq_freq_attr_ro(freqdomain_cpus); cpufreq_freq_attr_rw(auto_select); cpufreq_freq_attr_rw(auto_act_window); cpufreq_freq_attr_rw(energy_performance_preference_val); cpufreq_freq_attr_rw(min_perf); cpufreq_freq_attr_rw(max_perf); +cpufreq_freq_attr_rw(perf_limited); static struct freq_attr *cppc_cpufreq_attr[] = { &freqdomain_cpus, @@ -1063,6 +1074,7 @@ static struct freq_attr *cppc_cpufreq_attr[] = { &energy_performance_preference_val, &min_perf, &max_perf, + &perf_limited, NULL, }; diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index fc7614eb9dcb6..9fc28fb1890be 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -177,6 +177,8 @@ extern int cppc_get_min_perf(int cpu, u64 *min_perf); extern int cppc_set_min_perf(int cpu, u64 min_perf); extern int cppc_get_max_perf(int cpu, u64 *max_perf); extern int cppc_set_max_perf(int cpu, u64 max_perf); +extern int cppc_get_perf_limited(int cpu, u64 *perf_limited); +extern int cppc_set_perf_limited(int cpu, u64 perf_limited); extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf); extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator); extern int amd_detect_prefcore(bool *detected); @@ -285,6 +287,14 @@ static inline int cppc_set_max_perf(int cpu, u64 max_perf) { return -EOPNOTSUPP; } +static 
inline int cppc_get_perf_limited(int cpu, u64 *perf_limited) +{ + return -EOPNOTSUPP; +} +static inline int cppc_set_perf_limited(int cpu, u64 perf_limited) +{ + return -EOPNOTSUPP; +} static inline int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) { return -ENODEV; From 882adb86c0c4e954332ebd0c839c8b8a093ff9e4 Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Wed, 5 Nov 2025 17:08:42 +0530 Subject: [PATCH 008/247] NVIDIA: SAUCE: cpufreq: CPPC: Add sysfs for min/max_perf and perf_limited BugLink: https://bugs.launchpad.net/bugs/2131705 Add sysfs interfaces for Minimum Performance, Maximum Performance and Performance Limited Register in the cppc_cpufreq driver. Reviewed-by: Randy Dunlap Signed-off-by: Sumit Gupta (backported from https://lore.kernel.org/all/20251105113844.4086250-1-sumitg@nvidia.com/) Signed-off-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- .../ABI/testing/sysfs-devices-system-cpu | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 8aed6d94c4cd0..6f1f70696000a 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -327,6 +327,52 @@ Description: Energy performance preference This file is only present if the cppc-cpufreq driver is in use. +What: /sys/devices/system/cpu/cpuX/cpufreq/min_perf +Date: December 2025 +Contact: linux-pm@vger.kernel.org +Description: Minimum Performance Frequency + + Read/write a frequency value in kHz from/to this file. This + file conveys the minimum performance level (as frequency) at + which the platform may run. 
The frequency value is internally + converted to a performance value and must correspond to a + performance level in the range [Lowest Performance, Highest + Performance], inclusive. The minimum must be less than or equal + to the maximum performance. The performance range can be checked + from nodes: + /sys/devices/system/cpu/cpuX/acpi_cppc/highest_perf + /sys/devices/system/cpu/cpuX/acpi_cppc/lowest_perf + + This file is only present if the cppc-cpufreq driver is in use. + +What: /sys/devices/system/cpu/cpuX/cpufreq/max_perf +Date: December 2025 +Contact: linux-pm@vger.kernel.org +Description: Maximum Performance Frequency + + Read/write a frequency value in kHz from/to this file. This + file conveys the maximum performance level (as frequency) at + which the platform may run. The frequency value is internally + converted to a performance value and must correspond to a + performance level in the range [Lowest Performance, Highest + Performance], inclusive. The performance range can be checked + from nodes: + /sys/devices/system/cpu/cpuX/acpi_cppc/highest_perf + /sys/devices/system/cpu/cpuX/acpi_cppc/lowest_perf + + This file is only present if the cppc-cpufreq driver is in use. + +What: /sys/devices/system/cpu/cpuX/cpufreq/perf_limited +Date: December 2025 +Contact: linux-pm@vger.kernel.org +Description: Performance Limited + + Read/write a 32 bits value from/to this file. This file indicates + to OSPM that an unpredictable event has limited processor + performance, and the delivered performance may be less than + desired/minimum performance. + + This file is only present if the cppc-cpufreq driver is in use. 
What: /sys/devices/system/cpu/cpu*/cache/index3/cache_disable_{0,1} Date: August 2008 From deb6227d25bf7c6c2cc4462a237bc9b12a2db912 Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Wed, 5 Nov 2025 17:08:43 +0530 Subject: [PATCH 009/247] NVIDIA: SAUCE: cpufreq: CPPC: update policy min/max when toggling auto_select BugLink: https://bugs.launchpad.net/bugs/2131705 When CPPC autonomous selection (auto_select) is enabled or disabled, the policy min/max frequency limits should be updated appropriately to reflect the new operating mode. Currently, toggling auto_select only changes the hardware register but doesn't update the cpufreq policy constraints, which can lead to inconsistent behavior between the hardware state and the policy limits visible to userspace and other kernel components. When auto_select is enabled, preserve the current min/max performance values to maintain user-configured limits. When disabled, the hardware operates in a default mode where the OS directly controls performance, so update the policy limits accordingly. Signed-off-by: Sumit Gupta (backported from https://lore.kernel.org/all/20251105113844.4086250-1-sumitg@nvidia.com/) Signed-off-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/cpufreq/cppc_cpufreq.c | 67 ++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index d87f0b0273325..14facc7bbc952 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -648,6 +648,26 @@ static int cppc_cpufreq_set_mperf_limit(struct cpufreq_policy *policy, u64 val, #define cppc_cpufreq_set_max_perf(policy, val, update_reg, update_policy) \ cppc_cpufreq_set_mperf_limit(policy, val, update_reg, update_policy, false) +static int cppc_cpufreq_update_autosel_val(struct cpufreq_policy *policy, bool auto_sel) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + unsigned int cpu = policy->cpu; + int ret; + + pr_debug("cpu%d, auto_sel curr:%u, new:%d\n", cpu, cpu_data->perf_caps.auto_sel, auto_sel); + + guard(mutex)(&cppc_cpufreq_update_autosel_config_lock); + + ret = cppc_set_auto_sel(cpu, auto_sel); + if (ret) { + pr_warn("Failed to set auto_sel=%d for CPU%d (%d)\n", auto_sel, cpu, ret); + return ret; + } + cpu_data->perf_caps.auto_sel = auto_sel; + + return 0; +} + static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; @@ -885,8 +905,49 @@ static ssize_t show_auto_select(struct cpufreq_policy *policy, char *buf) return sysfs_emit(buf, "%d\n", val); } -static ssize_t store_auto_select(struct cpufreq_policy *policy, - const char *buf, size_t count) +/** + * cppc_cpufreq_update_auto_select - Update autonomous selection config for policy->cpu + * @policy: cpufreq policy + * @enable: enable/disable autonomous selection + */ +static int cppc_cpufreq_update_auto_select(struct cpufreq_policy *policy, bool enable) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + struct cppc_perf_caps 
*caps = &cpu_data->perf_caps; + u64 min_perf = caps->lowest_nonlinear_perf; + u64 max_perf = caps->nominal_perf; + int ret; + + if (enable) { + if (cpu_data->perf_ctrls.min_perf) + min_perf = cpu_data->perf_ctrls.min_perf; + if (cpu_data->perf_ctrls.max_perf) + max_perf = cpu_data->perf_ctrls.max_perf; + } + + /* + * Set min/max performance registers and update policy constraints. + * When enabling: update both registers and policy. + * When disabling: update policy only. + * Continue even if min/max are not supported, as EPP and autosel + * might still be supported. + */ + ret = cppc_cpufreq_set_min_perf(policy, min_perf, enable, true); + if (ret && ret != -EOPNOTSUPP) + return ret; + + ret = cppc_cpufreq_set_max_perf(policy, max_perf, enable, true); + if (ret && ret != -EOPNOTSUPP) + return ret; + + ret = cppc_cpufreq_update_autosel_val(policy, enable); + if (ret) + return ret; + + return 0; +} + +static ssize_t store_auto_select(struct cpufreq_policy *policy, const char *buf, size_t count) { bool val; int ret; @@ -895,7 +956,7 @@ static ssize_t store_auto_select(struct cpufreq_policy *policy, if (ret) return ret; - ret = cppc_set_auto_sel(policy->cpu, val); + ret = cppc_cpufreq_update_auto_select(policy, val); if (ret) return ret; From d67f315b1e5b400a87b3176d7ceb3548819fcd7a Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Wed, 5 Nov 2025 17:08:44 +0530 Subject: [PATCH 010/247] NVIDIA: SAUCE: cpufreq: CPPC: add autonomous mode boot parameter support BugLink: https://bugs.launchpad.net/bugs/2131705 Add kernel boot parameter 'cppc_cpufreq.auto_sel_mode' to enable CPPC autonomous performance selection at system startup. When autonomous mode is enabled, the hardware automatically adjusts CPU performance based on workload demands using Energy Performance Preference (EPP) hints. This parameter allows to configure the autonomous mode on all CPUs without requiring runtime sysfs manipulation if the 'auto_sel' register is present. 
When auto_sel_mode=1: - All CPUs are configured for autonomous operation during module init - EPP is set to performance preference (0x0) by default - Min/max performance bounds use defaults - CPU frequency scaling is handled by hardware instead of OS governor For Documentation/: Reviewed-by: Randy Dunlap Signed-off-by: Sumit Gupta (backported from https://lore.kernel.org/all/20251105113844.4086250-1-sumitg@nvidia.com/) Signed-off-by: Nirmoy Das Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Acked-by: nvmochs Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- .../admin-guide/kernel-parameters.txt | 12 ++ drivers/cpufreq/cppc_cpufreq.c | 197 +++++++++++++++--- 2 files changed, 182 insertions(+), 27 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index aeea7e9288376..330f8560c6c14 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -911,6 +911,18 @@ Format: ,,,[,] + cppc_cpufreq.auto_sel_mode= + [CPU_FREQ] Enable ACPI CPPC autonomous performance selection. + When enabled, hardware automatically adjusts CPU frequency + on all CPUs based on workload demands. In Autonomous mode, + Energy Performance Preference(EPP) hints guide hardware + toward performance(0x0) or energy efficiency (0xff). + Requires ACPI CPPC autonomous selection register support. 
+ Format: + Default: 0 (disabled) + 0: use cpufreq governors + 1: enable if supoorted by hardware + cpuidle.off=1 [CPU_IDLE] disable the cpuidle sub-system diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 14facc7bbc952..08d3fe50d3f79 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -28,8 +28,12 @@ #include static struct cpufreq_driver cppc_cpufreq_driver; + static DEFINE_MUTEX(cppc_cpufreq_update_autosel_config_lock); +/* Autonomous Selection */ +static bool auto_sel_mode; + #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE static enum { FIE_UNSET = -1, @@ -274,8 +278,13 @@ static int cppc_cpufreq_set_target(struct cpufreq_policy *policy, freqs.old = policy->cur; freqs.new = target_freq; + /* + * In autonomous selection mode, hardware handles frequency scaling directly + * based on workload and EPP hints. So, skip the OS frequency set requests. + */ cpufreq_freq_transition_begin(policy, &freqs); - ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); + if (!cpu_data->perf_caps.auto_sel) + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); cpufreq_freq_transition_end(policy, &freqs, ret != 0); if (ret) @@ -567,6 +576,12 @@ static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) goto free_mask; } + ret = cppc_get_perf(cpu, &cpu_data->perf_ctrls); + if (ret) { + pr_debug("Err reading CPU%d perf ctrls: ret:%d\n", cpu, ret); + goto free_mask; + } + return cpu_data; free_mask: @@ -668,11 +683,81 @@ static int cppc_cpufreq_update_autosel_val(struct cpufreq_policy *policy, bool a return 0; } +static int cppc_cpufreq_update_epp_val(struct cpufreq_policy *policy, u32 epp) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + unsigned int cpu = policy->cpu; + int ret; + + pr_debug("cpu%d, epp curr:%u, new:%u\n", cpu, cpu_data->perf_ctrls.energy_perf, epp); + + guard(mutex)(&cppc_cpufreq_update_autosel_config_lock); + + ret = cppc_set_epp(cpu, epp); + if (ret) { + pr_warn("failed to set energy_perf for 
cpu:%d (%d)\n", cpu, ret); + return ret; + } + cpu_data->perf_ctrls.energy_perf = epp; + + return 0; +} + +/** + * cppc_cpufreq_update_autosel_config - Update Autonomous selection configuration + * @policy: cpufreq policy for the CPU + * @min_perf: minimum performance value to set + * @max_perf: maximum performance value to set + * @auto_sel: autonomous selection mode enable/disable (also controls min/max perf reg updates) + * @epp_val: energy performance preference value + * @update_epp: whether to update EPP register + * @update_policy: whether to update policy constraints + * + * Return: 0 on success, negative error code on failure + */ +static int cppc_cpufreq_update_autosel_config(struct cpufreq_policy *policy, + u64 min_perf, u64 max_perf, bool auto_sel, + u32 epp_val, bool update_epp, bool update_policy) +{ + const unsigned int cpu = policy->cpu; + int ret; + + /* + * Set min/max performance registers and update policy constraints. + * When enabling: update both registers and policy. + * When disabling: update policy only. + * Continue even if min/max are not supported, as EPP and autosel + * might still be supported. 
+ */ + ret = cppc_cpufreq_set_min_perf(policy, min_perf, auto_sel, update_policy); + if (ret && ret != -EOPNOTSUPP) + return ret; + + ret = cppc_cpufreq_set_max_perf(policy, max_perf, auto_sel, update_policy); + if (ret && ret != -EOPNOTSUPP) + return ret; + + if (update_epp) { + ret = cppc_cpufreq_update_epp_val(policy, epp_val); + if (ret) + return ret; + } + + ret = cppc_cpufreq_update_autosel_val(policy, auto_sel); + if (ret) + return ret; + + pr_debug("Updated autonomous config [%llu-%llu] for CPU%d\n", min_perf, max_perf, cpu); + + return 0; +} + static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; struct cppc_cpudata *cpu_data; struct cppc_perf_caps *caps; + u64 min_perf, max_perf; int ret; cpu_data = cppc_cpufreq_get_cpu_data(cpu); @@ -736,11 +821,31 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->cur = cppc_perf_to_khz(caps, caps->highest_perf); cpu_data->perf_ctrls.desired_perf = caps->highest_perf; - ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); - if (ret) { - pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", - caps->highest_perf, cpu, ret); - goto out; + if (cpu_data->perf_caps.auto_sel) { + ret = cppc_set_enable(cpu, true); + if (ret) { + pr_err("Failed to enable CPPC on cpu%d (%d)\n", cpu, ret); + goto out; + } + + min_perf = cpu_data->perf_ctrls.min_perf ? + cpu_data->perf_ctrls.min_perf : caps->lowest_nonlinear_perf; + max_perf = cpu_data->perf_ctrls.max_perf ? + cpu_data->perf_ctrls.max_perf : caps->nominal_perf; + + ret = cppc_cpufreq_update_autosel_config(policy, min_perf, max_perf, true, + CPPC_EPP_PERFORMANCE_PREF, true, false); + if (ret) { + cppc_set_enable(cpu, false); + goto out; + } + } else { + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); + if (ret) { + pr_debug("Err setting perf value:%d on CPU:%d. 
ret:%d\n", + caps->highest_perf, cpu, ret); + goto out; + } } cppc_cpufreq_cpu_fie_init(policy); @@ -916,7 +1021,6 @@ static int cppc_cpufreq_update_auto_select(struct cpufreq_policy *policy, bool e struct cppc_perf_caps *caps = &cpu_data->perf_caps; u64 min_perf = caps->lowest_nonlinear_perf; u64 max_perf = caps->nominal_perf; - int ret; if (enable) { if (cpu_data->perf_ctrls.min_perf) @@ -925,26 +1029,8 @@ static int cppc_cpufreq_update_auto_select(struct cpufreq_policy *policy, bool e max_perf = cpu_data->perf_ctrls.max_perf; } - /* - * Set min/max performance registers and update policy constraints. - * When enabling: update both registers and policy. - * When disabling: update policy only. - * Continue even if min/max are not supported, as EPP and autosel - * might still be supported. - */ - ret = cppc_cpufreq_set_min_perf(policy, min_perf, enable, true); - if (ret && ret != -EOPNOTSUPP) - return ret; - - ret = cppc_cpufreq_set_max_perf(policy, max_perf, enable, true); - if (ret && ret != -EOPNOTSUPP) - return ret; - - ret = cppc_cpufreq_update_autosel_val(policy, enable); - if (ret) - return ret; - - return 0; + return cppc_cpufreq_update_autosel_config(policy, min_perf, max_perf, enable, + 0, false, true); } static ssize_t store_auto_select(struct cpufreq_policy *policy, const char *buf, size_t count) @@ -1152,13 +1238,61 @@ static struct cpufreq_driver cppc_cpufreq_driver = { .name = "cppc_cpufreq", }; +static int cppc_cpufreq_set_epp_autosel_allcpus(bool auto_sel, u64 epp) +{ + int cpu, ret; + + for_each_present_cpu(cpu) { + ret = cppc_set_epp(cpu, epp); + if (ret) { + pr_warn("Failed to set EPP on CPU%d (%d)\n", cpu, ret); + goto disable_all; + } + + ret = cppc_set_auto_sel(cpu, auto_sel); + if (ret) { + pr_warn("Failed to set auto_sel on CPU%d (%d)\n", cpu, ret); + goto disable_all; + } + } + + return 0; + +disable_all: + pr_warn("Disabling auto_sel for all CPUs\n"); + for_each_present_cpu(cpu) + cppc_set_auto_sel(cpu, false); + + return -EIO; +} + 
static int __init cppc_cpufreq_init(void) { + bool auto_sel; int ret; if (!acpi_cpc_valid()) return -ENODEV; + if (auto_sel_mode) { + /* + * Check if autonomous selection is supported by testing CPU 0. + * If supported, enable autonomous mode on all CPUs. + */ + ret = cppc_get_auto_sel(0, &auto_sel); + if (!ret) { + pr_info("Enabling auto_sel_mode (autonomous selection mode)\n"); + ret = cppc_cpufreq_set_epp_autosel_allcpus(true, CPPC_EPP_PERFORMANCE_PREF); + if (ret) { + pr_warn("Disabling auto_sel_mode, fallback to standard\n"); + auto_sel_mode = false; + } + } else { + pr_warn("Disabling auto_sel_mode as not supported by hardware\n"); + auto_sel_mode = false; + } + } + cppc_freq_invariance_init(); populate_efficiency_class(); @@ -1171,10 +1305,19 @@ static int __init cppc_cpufreq_init(void) static void __exit cppc_cpufreq_exit(void) { + int cpu; + + for_each_present_cpu(cpu) + cppc_set_auto_sel(cpu, false); + auto_sel_mode = false; + cpufreq_unregister_driver(&cppc_cpufreq_driver); cppc_freq_invariance_exit(); } +module_param(auto_sel_mode, bool, 0000); +MODULE_PARM_DESC(auto_sel_mode, "Enable Autonomous Performance Level Selection"); + module_exit(cppc_cpufreq_exit); MODULE_AUTHOR("Ashwin Chaugule"); MODULE_DESCRIPTION("CPUFreq driver based on the ACPI CPPC v5.0+ spec"); From 8c5ed9dd43fc9df31c4023a501d290a9cb498a62 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Thu, 20 Nov 2025 08:10:15 -0800 Subject: [PATCH 011/247] NVIDIA: SAUCE: iommu/arm-smmu-v3: Add two more DGX Spark iGPU IDs for existing iommu quirk BugLink: https://bugs.launchpad.net/bugs/2132033 Add two more device IDs for the existing Spark iommu quirk. 
Link: https://bugs.launchpad.net/ubuntu/+source/linux-nvidia-6.14/+bug/2132033 Signed-off-by: Nirmoy Das Acked-by: Jamie Nguyen Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6a25da00cfc37..3046f496a8422 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3671,7 +3671,9 @@ static int arm_smmu_def_domain_type(struct device *dev) if (IS_HISI_PTT_DEVICE(pdev)) return IOMMU_DOMAIN_IDENTITY; - if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && pdev->device == 0x2E12) + if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && + (pdev->device == 0x2E12 || pdev->device == 0x2E2A || + pdev->device == 0x2E2B)) return IOMMU_DOMAIN_DMA; } From be4761bf7beaee1fc00180aa8a503c4cb241f162 Mon Sep 17 00:00:00 2001 From: Nathan Chen Date: Sat, 15 Nov 2025 03:55:16 +0000 Subject: [PATCH 012/247] NVIDIA: SAUCE: vfio/nvgrace-egm: Prevent double-unregister of pfn_address_space BugLink: https://bugs.launchpad.net/bugs/2131582 Check for the registration state of egm_region's pfn_address_space, preventing double-unregisters that can occur when using an external management app such as Libvirt to launch VMs with vEGM enabled. This addresses a null pointer dereference in __rb_erase_color when using shutting down a Libvirt VM with vEGM enabled. When launching a qemu + Libvirt VM with vEGM, the EGM regions on the host are opened and mmaped, populating the member at egm_region.pfn_address_space.node.__rb_parent_color. During VM shutdown, a close is issued and the region is unregistered. However, the extra conditional this commit introduces is required because after open_count goes to 0 and the address space is unregistered, another open follows and the open_count goes back to 1. 
But there's no mmap so the address space is never registered. However, the unregister function gets called again on the following close, calling interval_tree_remove() which causes the __rb_erase_color null pointer dereference to occur when the pfn_address_space.node.__rb_parent_color member is a stale pointer from the previous registration: [217296.439595] Call trace: [217296.442180] __rb_erase_color+0xc4/0x2a8 (P) [217296.446632] interval_tree_remove+0x184/0x2e8 [217296.451171] unregister_pfn_address_space+0x4c/0xc0 [217296.456255] nvgrace_egm_release+0x98/0xd8 [nvgrace_egm] [217296.461780] __fput+0xe4/0x328 [217296.464990] fput_close_sync+0x4c/0x138 [217296.468995] __arm64_sys_close+0x44/0xa0 [217296.473089] invoke_syscall.constprop.0+0x7c/0xf8 [217296.477991] do_el0_svc+0x4c/0x100 [217296.481553] el0_svc+0x48/0x200 [217296.484848] el0t_64_sync_handler+0xc0/0x108 [217296.489300] el0t_64_sync+0x1b8/0x1c0 [217296.493132] Code: f9400674 eb03029f 54fffbe1 f9400a74 (f9400280) Signed-off-by: Nathan Chen Acked-by: Carol L Soto Acked-by: Matthew R. 
Ochs Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/vfio/pci/nvgrace-gpu/egm.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c index 6cdcec03d03f6..25b0ecc76c098 100644 --- a/drivers/vfio/pci/nvgrace-gpu/egm.c +++ b/drivers/vfio/pci/nvgrace-gpu/egm.c @@ -33,6 +33,7 @@ struct egm_region { DECLARE_HASHTABLE(htbl, 0x10); #ifdef CONFIG_MEMORY_FAILURE struct pfn_address_space pfn_address_space; + bool pfn_space_registered; #endif }; @@ -140,7 +141,10 @@ static int nvgrace_egm_release(struct inode *inode, struct file *file) if (atomic_dec_and_test(®ion->open_count)) { #ifdef CONFIG_MEMORY_FAILURE - unregister_pfn_address_space(®ion->pfn_address_space); + if (region->pfn_space_registered) { + unregister_pfn_address_space(®ion->pfn_address_space); + region->pfn_space_registered = false; + } #endif file->private_data = NULL; } @@ -169,7 +173,10 @@ static int nvgrace_egm_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &nvgrace_egm_mmap_ops; ret = nvgrace_egm_register_pfn_range(region, vma); + if (ret == 0) + region->pfn_space_registered = true; #endif + return ret; } @@ -458,6 +465,9 @@ int register_egm_node(struct pci_dev *pdev) region->egmpxm = egmpxm; hash_init(region->htbl); +#ifdef CONFIG_MEMORY_FAILURE + region->pfn_space_registered = false; +#endif INIT_LIST_HEAD(®ion->gpus); atomic_set(®ion->open_count, 0); From b726be19cd7639a280e597ac6e0cca8a5c9f039c Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 30 Sep 2025 00:26:01 +0000 Subject: [PATCH 013/247] perf/arm_cspmu: Add callback to reset filter config BugLink: https://bugs.launchpad.net/bugs/2131267 Implementer may need to reset a filter config when stopping a counter, thus adding a callback for this. 
Reviewed-by: Ilkka Koskinen Reviewed-by: Suzuki K Poulose Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon (cherry picked from commit a2573bc7908da8e6eb63dc4e449b7c1724e3849b linux-next) Signed-off-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/perf/arm_cspmu/arm_cspmu.c | 4 ++++ drivers/perf/arm_cspmu/arm_cspmu.h | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index efa9b229e7012..82d7ed6202f18 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -815,6 +815,10 @@ static void arm_cspmu_stop(struct perf_event *event, int pmu_flags) return; arm_cspmu_disable_counter(cspmu, hwc->idx); + + if (cspmu->impl.ops.reset_ev_filter) + cspmu->impl.ops.reset_ev_filter(cspmu, event); + arm_cspmu_event_update(event); hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h index 19684b76bd969..23bfc4a58064b 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.h +++ b/drivers/perf/arm_cspmu/arm_cspmu.h @@ -152,11 +152,13 @@ struct arm_cspmu_impl_ops { bool (*is_cycle_counter_event)(const struct perf_event *event); /* Decode event type/id from configs */ u32 (*event_type)(const struct perf_event *event); - /* Set event filters */ + /* Set/reset event filters */ void (*set_cc_filter)(struct arm_cspmu *cspmu, const struct perf_event *event); void (*set_ev_filter)(struct arm_cspmu *cspmu, const struct perf_event *event); + void (*reset_ev_filter)(struct arm_cspmu *cspmu, + const struct perf_event *event); /* Implementation specific event validation */ int (*validate_event)(struct arm_cspmu *cspmu, struct perf_event *event); From 3f7335866a43af7135fc28469c88b23b60edda7b Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 30 Sep 2025 00:26:02 +0000 Subject: [PATCH 
014/247] perf/arm_cspmu: Add pmpidr support BugLink: https://bugs.launchpad.net/bugs/2131267 The PMIIDR value is composed by the values in PMPIDR registers. We can use PMPIDR registers as alternative for device identification for systems that do not implement PMIIDR. Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon (cherry picked from commit 04330be8dc7fddf36f4adb1271932788ad47e7ad linux-next) Signed-off-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/perf/arm_cspmu/arm_cspmu.c | 44 +++++++++++++++++++++++++-- drivers/perf/arm_cspmu/arm_cspmu.h | 35 +++++++++++++++++++-- drivers/perf/arm_cspmu/nvidia_cspmu.c | 2 +- 3 files changed, 75 insertions(+), 6 deletions(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 82d7ed6202f18..33ad2cab5c160 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -322,14 +322,14 @@ static struct arm_cspmu_impl_match impl_match[] = { { .module_name = "nvidia_cspmu", .pmiidr_val = ARM_CSPMU_IMPL_ID_NVIDIA, - .pmiidr_mask = ARM_CSPMU_PMIIDR_IMPLEMENTER, + .pmiidr_mask = PMIIDR_IMPLEMENTER, .module = NULL, .impl_init_ops = NULL, }, { .module_name = "ampere_cspmu", .pmiidr_val = ARM_CSPMU_IMPL_ID_AMPERE, - .pmiidr_mask = ARM_CSPMU_PMIIDR_IMPLEMENTER, + .pmiidr_mask = PMIIDR_IMPLEMENTER, .module = NULL, .impl_init_ops = NULL, }, @@ -351,6 +351,44 @@ static struct arm_cspmu_impl_match *arm_cspmu_impl_match_get(u32 pmiidr) return NULL; } +static u32 arm_cspmu_get_pmiidr(struct arm_cspmu *cspmu) +{ + u32 pmiidr, pmpidr; + + pmiidr = readl(cspmu->base0 + PMIIDR); + + if (pmiidr != 0) + return pmiidr; + + /* Construct PMIIDR value from PMPIDRs. 
*/ + + pmpidr = readl(cspmu->base0 + PMPIDR0); + pmiidr |= FIELD_PREP(PMIIDR_PRODUCTID_PART_0, + FIELD_GET(PMPIDR0_PART_0, pmpidr)); + + pmpidr = readl(cspmu->base0 + PMPIDR1); + pmiidr |= FIELD_PREP(PMIIDR_PRODUCTID_PART_1, + FIELD_GET(PMPIDR1_PART_1, pmpidr)); + pmiidr |= FIELD_PREP(PMIIDR_IMPLEMENTER_DES_0, + FIELD_GET(PMPIDR1_DES_0, pmpidr)); + + pmpidr = readl(cspmu->base0 + PMPIDR2); + pmiidr |= FIELD_PREP(PMIIDR_VARIANT, + FIELD_GET(PMPIDR2_REVISION, pmpidr)); + pmiidr |= FIELD_PREP(PMIIDR_IMPLEMENTER_DES_1, + FIELD_GET(PMPIDR2_DES_1, pmpidr)); + + pmpidr = readl(cspmu->base0 + PMPIDR3); + pmiidr |= FIELD_PREP(PMIIDR_REVISION, + FIELD_GET(PMPIDR3_REVAND, pmpidr)); + + pmpidr = readl(cspmu->base0 + PMPIDR4); + pmiidr |= FIELD_PREP(PMIIDR_IMPLEMENTER_DES_2, + FIELD_GET(PMPIDR4_DES_2, pmpidr)); + + return pmiidr; +} + #define DEFAULT_IMPL_OP(name) .name = arm_cspmu_##name static int arm_cspmu_init_impl_ops(struct arm_cspmu *cspmu) @@ -361,7 +399,7 @@ static int arm_cspmu_init_impl_ops(struct arm_cspmu *cspmu) /* Start with a default PMU implementation */ cspmu->impl.module = THIS_MODULE; - cspmu->impl.pmiidr = readl(cspmu->base0 + PMIIDR); + cspmu->impl.pmiidr = arm_cspmu_get_pmiidr(cspmu); cspmu->impl.ops = (struct arm_cspmu_impl_ops) { DEFAULT_IMPL_OP(get_event_attrs), DEFAULT_IMPL_OP(get_format_attrs), diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h index 23bfc4a58064b..cd65a58dbd884 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.h +++ b/drivers/perf/arm_cspmu/arm_cspmu.h @@ -86,6 +86,11 @@ #define PMCFGR 0xE00 #define PMCR 0xE04 #define PMIIDR 0xE08 +#define PMPIDR0 0xFE0 +#define PMPIDR1 0xFE4 +#define PMPIDR2 0xFE8 +#define PMPIDR3 0xFEC +#define PMPIDR4 0xFD0 /* PMCFGR register field */ #define PMCFGR_NCG GENMASK(31, 28) @@ -115,8 +120,34 @@ #define PMCR_E BIT(0) /* PMIIDR register field */ -#define ARM_CSPMU_PMIIDR_IMPLEMENTER GENMASK(11, 0) -#define ARM_CSPMU_PMIIDR_PRODUCTID GENMASK(31, 20) +#define 
PMIIDR_IMPLEMENTER GENMASK(11, 0) +#define PMIIDR_IMPLEMENTER_DES_0 GENMASK(3, 0) +#define PMIIDR_IMPLEMENTER_DES_1 GENMASK(6, 4) +#define PMIIDR_IMPLEMENTER_DES_2 GENMASK(11, 8) +#define PMIIDR_REVISION GENMASK(15, 12) +#define PMIIDR_VARIANT GENMASK(19, 16) +#define PMIIDR_PRODUCTID GENMASK(31, 20) +#define PMIIDR_PRODUCTID_PART_0 GENMASK(27, 20) +#define PMIIDR_PRODUCTID_PART_1 GENMASK(31, 28) + +/* PMPIDR0 register field */ +#define PMPIDR0_PART_0 GENMASK(7, 0) + +/* PMPIDR1 register field */ +#define PMPIDR1_DES_0 GENMASK(7, 4) +#define PMPIDR1_PART_1 GENMASK(3, 0) + +/* PMPIDR2 register field */ +#define PMPIDR2_REVISION GENMASK(7, 4) +#define PMPIDR2_DES_1 GENMASK(2, 0) + +/* PMPIDR3 register field */ +#define PMPIDR3_REVAND GENMASK(7, 4) +#define PMPIDR3_CMOD GENMASK(3, 0) + +/* PMPIDR4 register field */ +#define PMPIDR4_SIZE GENMASK(7, 4) +#define PMPIDR4_DES_2 GENMASK(3, 0) /* JEDEC-assigned JEP106 identification code */ #define ARM_CSPMU_IMPL_ID_NVIDIA 0x36B diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index dc6d4e3e2a1ba..b6cec351a1422 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -322,7 +322,7 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) if (!ctx) return -ENOMEM; - prodid = FIELD_GET(ARM_CSPMU_PMIIDR_PRODUCTID, cspmu->impl.pmiidr); + prodid = FIELD_GET(PMIIDR_PRODUCTID, cspmu->impl.pmiidr); /* Find matching PMU. */ for (; match->prodid; match++) { From 40d9b952fdfacb71d96df4986c268df38327ffa7 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 30 Sep 2025 00:26:03 +0000 Subject: [PATCH 015/247] perf/arm_cspmu: nvidia: Add revision id matching BugLink: https://bugs.launchpad.net/bugs/2131267 Distinguish NVIDIA devices by revision and variant bits in PMIIDR register in addition to product id. 
Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon (cherry picked from commit 82dfd72bfb0362a3900179595032b65be11582da linux-next) Signed-off-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/perf/arm_cspmu/nvidia_cspmu.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index b6cec351a1422..ac91dc46501d1 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -23,7 +23,7 @@ #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) -#define NV_PRODID_MASK GENMASK(31, 0) +#define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) #define NV_FORMAT_NAME_GENERIC 0 @@ -220,7 +220,7 @@ struct nv_cspmu_match { static const struct nv_cspmu_match nv_cspmu_match[] = { { - .prodid = 0x103, + .prodid = 0x10300000, .prodid_mask = NV_PRODID_MASK, .filter_mask = NV_PCIE_FILTER_ID_MASK, .filter_default_val = NV_PCIE_FILTER_ID_MASK, @@ -230,7 +230,7 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .format_attr = pcie_pmu_format_attrs }, { - .prodid = 0x104, + .prodid = 0x10400000, .prodid_mask = NV_PRODID_MASK, .filter_mask = NV_NVL_C2C_FILTER_ID_MASK, .filter_default_val = NV_NVL_C2C_FILTER_ID_MASK, @@ -240,7 +240,7 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .format_attr = nvlink_c2c_pmu_format_attrs }, { - .prodid = 0x105, + .prodid = 0x10500000, .prodid_mask = NV_PRODID_MASK, .filter_mask = NV_NVL_C2C_FILTER_ID_MASK, .filter_default_val = NV_NVL_C2C_FILTER_ID_MASK, @@ -250,7 +250,7 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .format_attr = nvlink_c2c_pmu_format_attrs }, { - .prodid = 0x106, + .prodid = 0x10600000, .prodid_mask = NV_PRODID_MASK, .filter_mask = NV_CNVL_FILTER_ID_MASK, .filter_default_val = NV_CNVL_FILTER_ID_MASK, @@ -260,7 
+260,7 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .format_attr = cnvlink_pmu_format_attrs }, { - .prodid = 0x2CF, + .prodid = 0x2CF00000, .prodid_mask = NV_PRODID_MASK, .filter_mask = 0x0, .filter_default_val = 0x0, @@ -312,7 +312,6 @@ static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) { - u32 prodid; struct nv_cspmu_ctx *ctx; struct device *dev = cspmu->dev; struct arm_cspmu_impl_ops *impl_ops = &cspmu->impl.ops; @@ -322,13 +321,12 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) if (!ctx) return -ENOMEM; - prodid = FIELD_GET(PMIIDR_PRODUCTID, cspmu->impl.pmiidr); - /* Find matching PMU. */ for (; match->prodid; match++) { const u32 prodid_mask = match->prodid_mask; - if ((match->prodid & prodid_mask) == (prodid & prodid_mask)) + if ((match->prodid & prodid_mask) == + (cspmu->impl.pmiidr & prodid_mask)) break; } From 8ed8e25594800a856131cca74d78d0fc979c1043 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 30 Sep 2025 00:26:04 +0000 Subject: [PATCH 016/247] perf/arm_cspmu: nvidia: Add pmevfiltr2 support BugLink: https://bugs.launchpad.net/bugs/2131267 Support NVIDIA PMU that utilizes the optional event filter2 register. Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon (cherry picked from commit decc3684c24112286c527188bb09dd6eaf720cc0 linux-next) Signed-off-by: Matthew R. 
Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/perf/arm_cspmu/nvidia_cspmu.c | 176 +++++++++++++++++++------- 1 file changed, 133 insertions(+), 43 deletions(-) diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index ac91dc46501d1..e06a06d3407b1 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -40,10 +40,21 @@ struct nv_cspmu_ctx { const char *name; - u32 filter_mask; - u32 filter_default_val; + struct attribute **event_attr; struct attribute **format_attr; + + u32 filter_mask; + u32 filter_default_val; + u32 filter2_mask; + u32 filter2_default_val; + + u32 (*get_filter)(const struct perf_event *event); + u32 (*get_filter2)(const struct perf_event *event); + + void *data; + + int (*init_data)(struct arm_cspmu *cspmu); }; static struct attribute *scf_pmu_event_attrs[] = { @@ -144,6 +155,7 @@ static struct attribute *cnvlink_pmu_format_attrs[] = { static struct attribute *generic_pmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, + ARM_CSPMU_FORMAT_FILTER2_ATTR, NULL, }; @@ -184,13 +196,36 @@ static u32 nv_cspmu_event_filter(const struct perf_event *event) return filter_val; } +static u32 nv_cspmu_event_filter2(const struct perf_event *event) +{ + const struct nv_cspmu_ctx *ctx = + to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); + + const u32 filter_val = event->attr.config2 & ctx->filter2_mask; + + if (filter_val == 0) + return ctx->filter2_default_val; + + return filter_val; +} + static void nv_cspmu_set_ev_filter(struct arm_cspmu *cspmu, const struct perf_event *event) { - u32 filter = nv_cspmu_event_filter(event); - u32 offset = PMEVFILTR + (4 * event->hw.idx); + u32 filter, offset; + const struct nv_cspmu_ctx *ctx = + to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); + offset = 4 * event->hw.idx; - writel(filter, cspmu->base0 + offset); + if (ctx->get_filter) { + filter 
= ctx->get_filter(event); + writel(filter, cspmu->base0 + PMEVFILTR + offset); + } + + if (ctx->get_filter2) { + filter = ctx->get_filter2(event); + writel(filter, cspmu->base0 + PMEVFILT2R + offset); + } } static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, @@ -210,74 +245,120 @@ enum nv_cspmu_name_fmt { struct nv_cspmu_match { u32 prodid; u32 prodid_mask; - u64 filter_mask; - u32 filter_default_val; const char *name_pattern; enum nv_cspmu_name_fmt name_fmt; - struct attribute **event_attr; - struct attribute **format_attr; + struct nv_cspmu_ctx template_ctx; + struct arm_cspmu_impl_ops ops; }; static const struct nv_cspmu_match nv_cspmu_match[] = { { .prodid = 0x10300000, .prodid_mask = NV_PRODID_MASK, - .filter_mask = NV_PCIE_FILTER_ID_MASK, - .filter_default_val = NV_PCIE_FILTER_ID_MASK, .name_pattern = "nvidia_pcie_pmu_%u", .name_fmt = NAME_FMT_SOCKET, - .event_attr = mcf_pmu_event_attrs, - .format_attr = pcie_pmu_format_attrs + .template_ctx = { + .event_attr = mcf_pmu_event_attrs, + .format_attr = pcie_pmu_format_attrs, + .filter_mask = NV_PCIE_FILTER_ID_MASK, + .filter_default_val = NV_PCIE_FILTER_ID_MASK, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = nv_cspmu_event_filter, + .get_filter2 = NULL, + .data = NULL, + .init_data = NULL + }, }, { .prodid = 0x10400000, .prodid_mask = NV_PRODID_MASK, - .filter_mask = NV_NVL_C2C_FILTER_ID_MASK, - .filter_default_val = NV_NVL_C2C_FILTER_ID_MASK, .name_pattern = "nvidia_nvlink_c2c1_pmu_%u", .name_fmt = NAME_FMT_SOCKET, - .event_attr = mcf_pmu_event_attrs, - .format_attr = nvlink_c2c_pmu_format_attrs + .template_ctx = { + .event_attr = mcf_pmu_event_attrs, + .format_attr = nvlink_c2c_pmu_format_attrs, + .filter_mask = NV_NVL_C2C_FILTER_ID_MASK, + .filter_default_val = NV_NVL_C2C_FILTER_ID_MASK, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = nv_cspmu_event_filter, + .get_filter2 = NULL, + .data = NULL, + .init_data = NULL + }, }, { .prodid = 0x10500000, .prodid_mask 
= NV_PRODID_MASK, - .filter_mask = NV_NVL_C2C_FILTER_ID_MASK, - .filter_default_val = NV_NVL_C2C_FILTER_ID_MASK, .name_pattern = "nvidia_nvlink_c2c0_pmu_%u", .name_fmt = NAME_FMT_SOCKET, - .event_attr = mcf_pmu_event_attrs, - .format_attr = nvlink_c2c_pmu_format_attrs + .template_ctx = { + .event_attr = mcf_pmu_event_attrs, + .format_attr = nvlink_c2c_pmu_format_attrs, + .filter_mask = NV_NVL_C2C_FILTER_ID_MASK, + .filter_default_val = NV_NVL_C2C_FILTER_ID_MASK, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = nv_cspmu_event_filter, + .get_filter2 = NULL, + .data = NULL, + .init_data = NULL + }, }, { .prodid = 0x10600000, .prodid_mask = NV_PRODID_MASK, - .filter_mask = NV_CNVL_FILTER_ID_MASK, - .filter_default_val = NV_CNVL_FILTER_ID_MASK, .name_pattern = "nvidia_cnvlink_pmu_%u", .name_fmt = NAME_FMT_SOCKET, - .event_attr = mcf_pmu_event_attrs, - .format_attr = cnvlink_pmu_format_attrs + .template_ctx = { + .event_attr = mcf_pmu_event_attrs, + .format_attr = cnvlink_pmu_format_attrs, + .filter_mask = NV_CNVL_FILTER_ID_MASK, + .filter_default_val = NV_CNVL_FILTER_ID_MASK, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = nv_cspmu_event_filter, + .get_filter2 = NULL, + .data = NULL, + .init_data = NULL + }, }, { .prodid = 0x2CF00000, .prodid_mask = NV_PRODID_MASK, - .filter_mask = 0x0, - .filter_default_val = 0x0, .name_pattern = "nvidia_scf_pmu_%u", .name_fmt = NAME_FMT_SOCKET, - .event_attr = scf_pmu_event_attrs, - .format_attr = scf_pmu_format_attrs + .template_ctx = { + .event_attr = scf_pmu_event_attrs, + .format_attr = scf_pmu_format_attrs, + .filter_mask = 0x0, + .filter_default_val = 0x0, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = nv_cspmu_event_filter, + .get_filter2 = NULL, + .data = NULL, + .init_data = NULL + }, }, { .prodid = 0, .prodid_mask = 0, - .filter_mask = NV_GENERIC_FILTER_ID_MASK, - .filter_default_val = NV_GENERIC_FILTER_ID_MASK, .name_pattern = "nvidia_uncore_pmu_%u", .name_fmt = 
NAME_FMT_GENERIC, - .event_attr = generic_pmu_event_attrs, - .format_attr = generic_pmu_format_attrs + .template_ctx = { + .event_attr = generic_pmu_event_attrs, + .format_attr = generic_pmu_format_attrs, + .filter_mask = NV_GENERIC_FILTER_ID_MASK, + .filter_default_val = NV_GENERIC_FILTER_ID_MASK, + .filter2_mask = NV_GENERIC_FILTER_ID_MASK, + .filter2_default_val = NV_GENERIC_FILTER_ID_MASK, + .get_filter = nv_cspmu_event_filter, + .get_filter2 = nv_cspmu_event_filter2, + .data = NULL, + .init_data = NULL + }, }, }; @@ -310,6 +391,14 @@ static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, return name; } +#define SET_OP(name, impl, match, default_op) \ + do { \ + if (match->ops.name) \ + impl->name = match->ops.name; \ + else if (default_op != NULL) \ + impl->name = default_op; \ + } while (false) + static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) { struct nv_cspmu_ctx *ctx; @@ -330,20 +419,21 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) break; } - ctx->name = nv_cspmu_format_name(cspmu, match); - ctx->filter_mask = match->filter_mask; - ctx->filter_default_val = match->filter_default_val; - ctx->event_attr = match->event_attr; - ctx->format_attr = match->format_attr; + /* Initialize the context with the matched template. */ + memcpy(ctx, &match->template_ctx, sizeof(struct nv_cspmu_ctx)); + ctx->name = nv_cspmu_format_name(cspmu, match); cspmu->impl.ctx = ctx; /* NVIDIA specific callbacks. 
*/ - impl_ops->set_cc_filter = nv_cspmu_set_cc_filter; - impl_ops->set_ev_filter = nv_cspmu_set_ev_filter; - impl_ops->get_event_attrs = nv_cspmu_get_event_attrs; - impl_ops->get_format_attrs = nv_cspmu_get_format_attrs; - impl_ops->get_name = nv_cspmu_get_name; + SET_OP(set_cc_filter, impl_ops, match, nv_cspmu_set_cc_filter); + SET_OP(set_ev_filter, impl_ops, match, nv_cspmu_set_ev_filter); + SET_OP(get_event_attrs, impl_ops, match, nv_cspmu_get_event_attrs); + SET_OP(get_format_attrs, impl_ops, match, nv_cspmu_get_format_attrs); + SET_OP(get_name, impl_ops, match, nv_cspmu_get_name); + + if (ctx->init_data) + return ctx->init_data(cspmu); return 0; } From d80483ecc6e9db81059e882bbfa9e5d67fee3d4e Mon Sep 17 00:00:00 2001 From: Kartik Rajput Date: Fri, 10 Oct 2025 15:43:30 +0530 Subject: [PATCH 017/247] gpio: tegra186: Use generic macro for port definitions BugLink: https://bugs.launchpad.net/bugs/2131269 Introduce a generic macro TEGRA_GPIO_PORT to define SoC specific ports macros. This simplifies the code and avoids unnecessary duplication. Suggested-by: Jon Hunter Signed-off-by: Kartik Rajput Reviewed-by: Jon Hunter Acked-by: Thierry Reding Signed-off-by: Bartosz Golaszewski (backported from commit f75db6f7f907c10bf4d45a6cfdae03bb1b631841 linux-next) [mochs: minor context cleanup due to Tegra256 support being absent] Signed-off-by: Matthew R. 
Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/gpio/gpio-tegra186.c | 78 +++++++++++------------------------- 1 file changed, 23 insertions(+), 55 deletions(-) diff --git a/drivers/gpio/gpio-tegra186.c b/drivers/gpio/gpio-tegra186.c index 5fd3ec3e2c53d..d5b28a4f643e4 100644 --- a/drivers/gpio/gpio-tegra186.c +++ b/drivers/gpio/gpio-tegra186.c @@ -1001,14 +1001,17 @@ static int tegra186_gpio_probe(struct platform_device *pdev) return devm_gpiochip_add_data(&pdev->dev, &gpio->gpio, gpio); } -#define TEGRA186_MAIN_GPIO_PORT(_name, _bank, _port, _pins) \ - [TEGRA186_MAIN_GPIO_PORT_##_name] = { \ - .name = #_name, \ - .bank = _bank, \ - .port = _port, \ - .pins = _pins, \ +#define TEGRA_GPIO_PORT(_prefix, _name, _bank, _port, _pins) \ + [_prefix##_GPIO_PORT_##_name] = { \ + .name = #_name, \ + .bank = _bank, \ + .port = _port, \ + .pins = _pins, \ } +#define TEGRA186_MAIN_GPIO_PORT(_name, _bank, _port, _pins) \ + TEGRA_GPIO_PORT(TEGRA186_MAIN, _name, _bank, _port, _pins) + static const struct tegra_gpio_port tegra186_main_ports[] = { TEGRA186_MAIN_GPIO_PORT( A, 2, 0, 7), TEGRA186_MAIN_GPIO_PORT( B, 3, 0, 7), @@ -1044,13 +1047,8 @@ static const struct tegra_gpio_soc tegra186_main_soc = { .has_vm_support = false, }; -#define TEGRA186_AON_GPIO_PORT(_name, _bank, _port, _pins) \ - [TEGRA186_AON_GPIO_PORT_##_name] = { \ - .name = #_name, \ - .bank = _bank, \ - .port = _port, \ - .pins = _pins, \ - } +#define TEGRA186_AON_GPIO_PORT(_name, _bank, _port, _pins) \ + TEGRA_GPIO_PORT(TEGRA186_AON, _name, _bank, _port, _pins) static const struct tegra_gpio_port tegra186_aon_ports[] = { TEGRA186_AON_GPIO_PORT( S, 0, 1, 5), @@ -1072,13 +1070,8 @@ static const struct tegra_gpio_soc tegra186_aon_soc = { .has_vm_support = false, }; -#define TEGRA194_MAIN_GPIO_PORT(_name, _bank, _port, _pins) \ - [TEGRA194_MAIN_GPIO_PORT_##_name] = { \ - .name = #_name, \ - .bank = _bank, \ - .port = _port, \ - .pins = 
_pins, \ - } +#define TEGRA194_MAIN_GPIO_PORT(_name, _bank, _port, _pins) \ + TEGRA_GPIO_PORT(TEGRA194_MAIN, _name, _bank, _port, _pins) static const struct tegra_gpio_port tegra194_main_ports[] = { TEGRA194_MAIN_GPIO_PORT( A, 1, 2, 8), @@ -1128,13 +1121,8 @@ static const struct tegra_gpio_soc tegra194_main_soc = { .has_vm_support = true, }; -#define TEGRA194_AON_GPIO_PORT(_name, _bank, _port, _pins) \ - [TEGRA194_AON_GPIO_PORT_##_name] = { \ - .name = #_name, \ - .bank = _bank, \ - .port = _port, \ - .pins = _pins, \ - } +#define TEGRA194_AON_GPIO_PORT(_name, _bank, _port, _pins) \ + TEGRA_GPIO_PORT(TEGRA194_AON, _name, _bank, _port, _pins) static const struct tegra_gpio_port tegra194_aon_ports[] = { TEGRA194_AON_GPIO_PORT(AA, 0, 3, 8), @@ -1154,13 +1142,8 @@ static const struct tegra_gpio_soc tegra194_aon_soc = { .has_vm_support = false, }; -#define TEGRA234_MAIN_GPIO_PORT(_name, _bank, _port, _pins) \ - [TEGRA234_MAIN_GPIO_PORT_##_name] = { \ - .name = #_name, \ - .bank = _bank, \ - .port = _port, \ - .pins = _pins, \ - } +#define TEGRA234_MAIN_GPIO_PORT(_name, _bank, _port, _pins) \ + TEGRA_GPIO_PORT(TEGRA234_MAIN, _name, _bank, _port, _pins) static const struct tegra_gpio_port tegra234_main_ports[] = { TEGRA234_MAIN_GPIO_PORT( A, 0, 0, 8), @@ -1199,13 +1182,8 @@ static const struct tegra_gpio_soc tegra234_main_soc = { .has_vm_support = true, }; -#define TEGRA234_AON_GPIO_PORT(_name, _bank, _port, _pins) \ - [TEGRA234_AON_GPIO_PORT_##_name] = { \ - .name = #_name, \ - .bank = _bank, \ - .port = _port, \ - .pins = _pins, \ - } +#define TEGRA234_AON_GPIO_PORT(_name, _bank, _port, _pins) \ + TEGRA_GPIO_PORT(TEGRA234_AON, _name, _bank, _port, _pins) static const struct tegra_gpio_port tegra234_aon_ports[] = { TEGRA234_AON_GPIO_PORT(AA, 0, 4, 8), @@ -1226,13 +1204,8 @@ static const struct tegra_gpio_soc tegra234_aon_soc = { .has_vm_support = false, }; -#define TEGRA241_MAIN_GPIO_PORT(_name, _bank, _port, _pins) \ - [TEGRA241_MAIN_GPIO_PORT_##_name] = { \ - .name = 
#_name, \ - .bank = _bank, \ - .port = _port, \ - .pins = _pins, \ - } +#define TEGRA241_MAIN_GPIO_PORT(_name, _bank, _port, _pins) \ + TEGRA_GPIO_PORT(TEGRA241_MAIN, _name, _bank, _port, _pins) static const struct tegra_gpio_port tegra241_main_ports[] = { TEGRA241_MAIN_GPIO_PORT(A, 0, 0, 8), @@ -1257,13 +1230,8 @@ static const struct tegra_gpio_soc tegra241_main_soc = { .has_vm_support = false, }; -#define TEGRA241_AON_GPIO_PORT(_name, _bank, _port, _pins) \ - [TEGRA241_AON_GPIO_PORT_##_name] = { \ - .name = #_name, \ - .bank = _bank, \ - .port = _port, \ - .pins = _pins, \ - } +#define TEGRA241_AON_GPIO_PORT(_name, _bank, _port, _pins) \ + TEGRA_GPIO_PORT(TEGRA241_AON, _name, _bank, _port, _pins) static const struct tegra_gpio_port tegra241_aon_ports[] = { TEGRA241_AON_GPIO_PORT(AA, 0, 0, 8), From b1d417ab5871fc689e3d494c7110e45ef07aa305 Mon Sep 17 00:00:00 2001 From: Prathamesh Shete Date: Fri, 10 Oct 2025 15:43:31 +0530 Subject: [PATCH 018/247] gpio: tegra186: Add support for Tegra410 BugLink: https://bugs.launchpad.net/bugs/2131269 Extend the existing Tegra186 GPIO controller driver with support for the GPIO controller found on Tegra410. Tegra410 supports two GPIO controllers referred to as 'COMPUTE' and 'SYSTEM'. Co-developed-by: Nathan Hartman Signed-off-by: Nathan Hartman Signed-off-by: Prathamesh Shete Signed-off-by: Kartik Rajput Acked-by: Thierry Reding Reviewed-by: Jon Hunter Signed-off-by: Bartosz Golaszewski (backported from commit 9631a10083d843b57b371d406235e2f2a3e49285 linux-next) [mochs: minor context cleanup due to Tegra256 support being absent] Signed-off-by: Matthew R. 
Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off--by: Brad Figg --- drivers/gpio/gpio-tegra186.c | 76 +++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-tegra186.c b/drivers/gpio/gpio-tegra186.c index d5b28a4f643e4..95cd22581731b 100644 --- a/drivers/gpio/gpio-tegra186.c +++ b/drivers/gpio/gpio-tegra186.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2016-2022 NVIDIA Corporation + * Copyright (c) 2016-2025 NVIDIA Corporation * * Author: Thierry Reding * Dipen Patel @@ -68,6 +68,30 @@ #define TEGRA186_GPIO_INTERRUPT_STATUS(x) (0x100 + (x) * 4) +/* Tegra410 GPIOs implemented by the COMPUTE GPIO controller */ +#define TEGRA410_COMPUTE_GPIO_PORT_A 0 +#define TEGRA410_COMPUTE_GPIO_PORT_B 1 +#define TEGRA410_COMPUTE_GPIO_PORT_C 2 +#define TEGRA410_COMPUTE_GPIO_PORT_D 3 +#define TEGRA410_COMPUTE_GPIO_PORT_E 4 + +/* Tegra410 GPIOs implemented by the SYSTEM GPIO controller */ +#define TEGRA410_SYSTEM_GPIO_PORT_A 0 +#define TEGRA410_SYSTEM_GPIO_PORT_B 1 +#define TEGRA410_SYSTEM_GPIO_PORT_C 2 +#define TEGRA410_SYSTEM_GPIO_PORT_D 3 +#define TEGRA410_SYSTEM_GPIO_PORT_E 4 +#define TEGRA410_SYSTEM_GPIO_PORT_I 5 +#define TEGRA410_SYSTEM_GPIO_PORT_J 6 +#define TEGRA410_SYSTEM_GPIO_PORT_K 7 +#define TEGRA410_SYSTEM_GPIO_PORT_L 8 +#define TEGRA410_SYSTEM_GPIO_PORT_M 9 +#define TEGRA410_SYSTEM_GPIO_PORT_N 10 +#define TEGRA410_SYSTEM_GPIO_PORT_P 11 +#define TEGRA410_SYSTEM_GPIO_PORT_Q 12 +#define TEGRA410_SYSTEM_GPIO_PORT_R 13 +#define TEGRA410_SYSTEM_GPIO_PORT_V 14 + struct tegra_gpio_port { const char *name; unsigned int bank; @@ -1247,6 +1271,54 @@ static const struct tegra_gpio_soc tegra241_aon_soc = { .has_vm_support = false, }; +#define TEGRA410_COMPUTE_GPIO_PORT(_name, _bank, _port, _pins) \ + TEGRA_GPIO_PORT(TEGRA410_COMPUTE, _name, _bank, _port, _pins) + +static const struct tegra_gpio_port tegra410_compute_ports[] = { + 
TEGRA410_COMPUTE_GPIO_PORT(A, 0, 0, 3), + TEGRA410_COMPUTE_GPIO_PORT(B, 1, 0, 8), + TEGRA410_COMPUTE_GPIO_PORT(C, 1, 1, 3), + TEGRA410_COMPUTE_GPIO_PORT(D, 2, 0, 8), + TEGRA410_COMPUTE_GPIO_PORT(E, 2, 1, 8), +}; + +static const struct tegra_gpio_soc tegra410_compute_soc = { + .num_ports = ARRAY_SIZE(tegra410_compute_ports), + .ports = tegra410_compute_ports, + .name = "tegra410-gpio-compute", + .num_irqs_per_bank = 8, + .instance = 0, +}; + +#define TEGRA410_SYSTEM_GPIO_PORT(_name, _bank, _port, _pins) \ + TEGRA_GPIO_PORT(TEGRA410_SYSTEM, _name, _bank, _port, _pins) + +static const struct tegra_gpio_port tegra410_system_ports[] = { + TEGRA410_SYSTEM_GPIO_PORT(A, 0, 0, 7), + TEGRA410_SYSTEM_GPIO_PORT(B, 0, 1, 8), + TEGRA410_SYSTEM_GPIO_PORT(C, 0, 2, 8), + TEGRA410_SYSTEM_GPIO_PORT(D, 0, 3, 8), + TEGRA410_SYSTEM_GPIO_PORT(E, 0, 4, 6), + TEGRA410_SYSTEM_GPIO_PORT(I, 1, 0, 8), + TEGRA410_SYSTEM_GPIO_PORT(J, 1, 1, 7), + TEGRA410_SYSTEM_GPIO_PORT(K, 1, 2, 7), + TEGRA410_SYSTEM_GPIO_PORT(L, 1, 3, 7), + TEGRA410_SYSTEM_GPIO_PORT(M, 2, 0, 7), + TEGRA410_SYSTEM_GPIO_PORT(N, 2, 1, 6), + TEGRA410_SYSTEM_GPIO_PORT(P, 2, 2, 8), + TEGRA410_SYSTEM_GPIO_PORT(Q, 2, 3, 3), + TEGRA410_SYSTEM_GPIO_PORT(R, 2, 4, 2), + TEGRA410_SYSTEM_GPIO_PORT(V, 1, 4, 2), +}; + +static const struct tegra_gpio_soc tegra410_system_soc = { + .num_ports = ARRAY_SIZE(tegra410_system_ports), + .ports = tegra410_system_ports, + .name = "tegra410-gpio-system", + .num_irqs_per_bank = 8, + .instance = 0, +}; + static const struct of_device_id tegra186_gpio_of_match[] = { { .compatible = "nvidia,tegra186-gpio", @@ -1279,6 +1351,8 @@ static const struct acpi_device_id tegra186_gpio_acpi_match[] = { { .id = "NVDA0408", .driver_data = (kernel_ulong_t)&tegra194_aon_soc }, { .id = "NVDA0508", .driver_data = (kernel_ulong_t)&tegra241_main_soc }, { .id = "NVDA0608", .driver_data = (kernel_ulong_t)&tegra241_aon_soc }, + { .id = "NVDA0708", .driver_data = (kernel_ulong_t)&tegra410_compute_soc }, + { .id = "NVDA0808", 
.driver_data = (kernel_ulong_t)&tegra410_system_soc }, {} }; MODULE_DEVICE_TABLE(acpi, tegra186_gpio_acpi_match); From 31f6b1dfd75317b2cb105bb1a6c6c6a1d354d9dc Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Thu, 20 Nov 2025 12:15:33 +0800 Subject: [PATCH 019/247] net: aquantia: Add missing descriptor cache invalidation on ATL2 BugLink: https://bugs.launchpad.net/bugs/2133755 ATL2 hardware was missing descriptor cache invalidation in hw_stop(), causing SMMU translation faults during device shutdown and module removal: [ 70.355743] arm-smmu-v3 arm-smmu-v3.5.auto: event 0x10 received: [ 70.361893] arm-smmu-v3 arm-smmu-v3.5.auto: 0x0002060000000010 [ 70.367948] arm-smmu-v3 arm-smmu-v3.5.auto: 0x0000020000000000 [ 70.374002] arm-smmu-v3 arm-smmu-v3.5.auto: 0x00000000ff9bc000 [ 70.380055] arm-smmu-v3 arm-smmu-v3.5.auto: 0x0000000000000000 [ 70.386109] arm-smmu-v3 arm-smmu-v3.5.auto: event: F_TRANSLATION client: 0001:06:00.0 sid: 0x20600 ssid: 0x0 iova: 0xff9bc000 ipa: 0x0 [ 70.398531] arm-smmu-v3 arm-smmu-v3.5.auto: unpriv data write s1 "Input address caused fault" stag: 0x0 Commit 7a1bb49461b1 ("net: aquantia: fix potential IOMMU fault after driver unbind") and commit ed4d81c4b3f2 ("net: aquantia: when cleaning hw cache it should be toggled") fixed cache invalidation for ATL B0, but ATL2 was left with only interrupt disabling. This allowed hardware to write to cached descriptors after DMA memory was unmapped, triggering SMMU faults. Once cache invalidation is applied to ATL2, the translation fault can't be observed anymore. Add shared aq_hw_invalidate_descriptor_cache() helper and use it in both ATL B0 and ATL2 hw_stop() implementations for consistent behavior. 
Fixes: e54dcf4bba3e ("net: atlantic: basic A2 init/deinit hw_ops") Tested-by: Carol Soto Signed-off-by: Kai-Heng Feng Reviewed-by: Simon Horman Link: https://patch.msgid.link/20251120041537.62184-1-kaihengf@nvidia.com Signed-off-by: Paolo Abeni (cherry picked from commit 7526183cfdbe352c51c285762f0e15b7c428ea06) Signed-off-by: Kai-Heng Feng Acked-by: Nirmoy Das Acked-by: Matthew R. Ochs Acked-by: Carol L Soto Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off--by: Brad Figg --- .../ethernet/aquantia/atlantic/aq_hw_utils.c | 22 +++++++++++++++++++ .../ethernet/aquantia/atlantic/aq_hw_utils.h | 1 + .../aquantia/atlantic/hw_atl/hw_atl_b0.c | 19 +--------------- .../aquantia/atlantic/hw_atl2/hw_atl2.c | 2 +- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c index 1921741f7311d..18b08277d2e1a 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c @@ -15,6 +15,7 @@ #include "aq_hw.h" #include "aq_nic.h" +#include "hw_atl/hw_atl_llh.h" void aq_hw_write_reg_bit(struct aq_hw_s *aq_hw, u32 addr, u32 msk, u32 shift, u32 val) @@ -81,6 +82,27 @@ void aq_hw_write_reg64(struct aq_hw_s *hw, u32 reg, u64 value) lo_hi_writeq(value, hw->mmio + reg); } +int aq_hw_invalidate_descriptor_cache(struct aq_hw_s *hw) +{ + int err; + u32 val; + + /* Invalidate Descriptor Cache to prevent writing to the cached + * descriptors and to the data pointer of those descriptors + */ + hw_atl_rdm_rx_dma_desc_cache_init_tgl(hw); + + err = aq_hw_err_from_flags(hw); + if (err) + goto err_exit; + + readx_poll_timeout_atomic(hw_atl_rdm_rx_dma_desc_cache_init_done_get, + hw, val, val == 1, 1000U, 10000U); + +err_exit: + return err; +} + int aq_hw_err_from_flags(struct aq_hw_s *hw) { int err = 0; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h 
index ffa6e4067c211..d89c63d88e4a4 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h @@ -35,6 +35,7 @@ u32 aq_hw_read_reg(struct aq_hw_s *hw, u32 reg); void aq_hw_write_reg(struct aq_hw_s *hw, u32 reg, u32 value); u64 aq_hw_read_reg64(struct aq_hw_s *hw, u32 reg); void aq_hw_write_reg64(struct aq_hw_s *hw, u32 reg, u64 value); +int aq_hw_invalidate_descriptor_cache(struct aq_hw_s *hw); int aq_hw_err_from_flags(struct aq_hw_s *hw); int aq_hw_num_tcs(struct aq_hw_s *hw); int aq_hw_q_per_tc(struct aq_hw_s *hw); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index 493432d036b9a..c7895bfb2ecf8 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -1198,26 +1198,9 @@ static int hw_atl_b0_hw_interrupt_moderation_set(struct aq_hw_s *self) static int hw_atl_b0_hw_stop(struct aq_hw_s *self) { - int err; - u32 val; - hw_atl_b0_hw_irq_disable(self, HW_ATL_B0_INT_MASK); - /* Invalidate Descriptor Cache to prevent writing to the cached - * descriptors and to the data pointer of those descriptors - */ - hw_atl_rdm_rx_dma_desc_cache_init_tgl(self); - - err = aq_hw_err_from_flags(self); - - if (err) - goto err_exit; - - readx_poll_timeout_atomic(hw_atl_rdm_rx_dma_desc_cache_init_done_get, - self, val, val == 1, 1000U, 10000U); - -err_exit: - return err; + return aq_hw_invalidate_descriptor_cache(self); } int hw_atl_b0_hw_ring_tx_stop(struct aq_hw_s *self, struct aq_ring_s *ring) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c index b0ed572e88c67..0ce9caae8799c 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c @@ -759,7 +759,7 @@ static int hw_atl2_hw_stop(struct aq_hw_s *self) { 
hw_atl_b0_hw_irq_disable(self, HW_ATL2_INT_MASK); - return 0; + return aq_hw_invalidate_descriptor_cache(self); } static struct aq_stats_s *hw_atl2_utils_get_hw_stats(struct aq_hw_s *self) From f5a8313ff92bbda8fad9be86d2fd61dbfcb04e1c Mon Sep 17 00:00:00 2001 From: Sourab Gupta Date: Thu, 20 Nov 2025 18:10:03 +0000 Subject: [PATCH 020/247] NVIDIA: SAUCE: Patch NVMe/NVMeoF driver to support GDS on Linux 6.17 Kernel BugLink: https://bugs.launchpad.net/bugs/2134960 With this change, the NVMe and NVMeoF driver would be enabled to support GPUDirectStorage(GDS). NVMe driver introduced a way to use the blk_rq_dma_map API to DMA map requests instead of scatter gather lists. With these changes, GDS path also adopts a similar framework where we introduce blk based APIs(nvfs_blk_rq_dma_map_iter_start and nvfs_blk_rq_dma_map_iter_next) to map a DMA request. The NVMeoF path remains the same as previous releases. Signed-off-by: Sourab Gupta Reviewed-by: Kiran Modukuri Acked-by: Jamie Nguyen Acked-by: Carol L Soto Acked-by: Matthew R. 
Ochs Acked-by: Abdur Rahman Acked-by: Jacob Martin Signed-off-by: Brad Figg --- drivers/nvme/host/Makefile | 5 +- drivers/nvme/host/nvfs-dma.c | 51 ++++++++++ drivers/nvme/host/nvfs-dma.h | 180 ++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvfs-rdma.c | 52 ++++++++++ drivers/nvme/host/nvfs-rdma.h | 88 +++++++++++++++++ drivers/nvme/host/nvfs.h | 156 +++++++++++++++++++++++++++++ drivers/nvme/host/pci.c | 80 +++++++++++++-- drivers/nvme/host/rdma.c | 22 +++++ 8 files changed, 626 insertions(+), 8 deletions(-) create mode 100644 drivers/nvme/host/nvfs-dma.c create mode 100644 drivers/nvme/host/nvfs-dma.h create mode 100644 drivers/nvme/host/nvfs-rdma.c create mode 100644 drivers/nvme/host/nvfs-rdma.h create mode 100644 drivers/nvme/host/nvfs.h diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index 6414ec968f99a..2fdd327bf6a88 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 ccflags-y += -I$(src) - +ccflags-y += -DCONFIG_NVFS obj-$(CONFIG_NVME_CORE) += nvme-core.o obj-$(CONFIG_BLK_DEV_NVME) += nvme.o obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o @@ -20,10 +20,11 @@ nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o nvme-core-$(CONFIG_NVME_HOST_AUTH) += auth.o nvme-y += pci.o - +nvme-y += nvfs-dma.o nvme-fabrics-y += fabrics.o nvme-rdma-y += rdma.o +nvme-rdma-y += nvfs-rdma.o nvme-fc-y += fc.o diff --git a/drivers/nvme/host/nvfs-dma.c b/drivers/nvme/host/nvfs-dma.c new file mode 100644 index 0000000000000..33a27c3aeca90 --- /dev/null +++ b/drivers/nvme/host/nvfs-dma.c @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#ifdef CONFIG_NVFS +#define NVFS_USE_DMA_ITER_API +#define MODULE_PREFIX nvme_v2 +#include "nvfs.h" + +struct nvfs_dma_rw_blk_iter_ops *nvfs_ops = NULL; + +atomic_t nvfs_shutdown = ATOMIC_INIT(1); + +DEFINE_PER_CPU(long, nvfs_n_ops); + +#define NVIDIA_FS_COMPAT_FT(ops) \ + (NVIDIA_FS_CHECK_FT_BLK_DMA_MAP_ITER_START(ops) && NVIDIA_FS_CHECK_FT_BLK_DMA_MAP_ITER_NEXT(ops)) + +// protected via nvfs_module_mutex +int REGISTER_FUNC(struct nvfs_dma_rw_blk_iter_ops *ops) +{ + if (NVIDIA_FS_COMPAT_FT(ops)) { + nvfs_ops = ops; + atomic_set(&nvfs_shutdown, 0); + return 0; + } else + return -EOPNOTSUPP; + +} +EXPORT_SYMBOL_GPL(REGISTER_FUNC); + +// protected via nvfs_module_mutex +void UNREGISTER_FUNC(void) +{ + (void) atomic_cmpxchg(&nvfs_shutdown, 0, 1); + do { + msleep(NVFS_HOLD_TIME_MS); + } while(nvfs_count_ops()); + nvfs_ops = NULL; +} +EXPORT_SYMBOL_GPL(UNREGISTER_FUNC); +#endif diff --git a/drivers/nvme/host/nvfs-dma.h b/drivers/nvme/host/nvfs-dma.h new file mode 100644 index 0000000000000..7876bb7a4a1b7 --- /dev/null +++ b/drivers/nvme/host/nvfs-dma.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef NVFS_DMA_H +#define NVFS_DMA_H + +/* Forward declarations for functions from pci.c that we need */ +static blk_status_t nvme_pci_setup_data_prp(struct request *req, + struct blk_dma_iter *iter); +static blk_status_t nvme_pci_setup_data_sgl(struct request *req, + struct blk_dma_iter *iter); +static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, + struct nvme_iod *iod); +static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd); + +static inline bool nvme_nvfs_unmap_sgls(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct device *dma_dev = nvmeq->dev->dev; + dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr); + unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length); + struct nvme_sgl_desc *sg_list = iod->descriptors[0]; + enum dma_data_direction dir = rq_dma_dir(req); + + if (iod->nr_descriptors) { + unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i; + + for (i = 0; i < nr_entries; i++) { + nvfs_ops->nvfs_dma_unmap_page(dma_dev, + iod->nvfs_cookie, + le64_to_cpu(sg_list[i].addr), + le32_to_cpu(sg_list[i].length), + dir); + } + } else + nvfs_ops->nvfs_dma_unmap_page(dma_dev, iod->nvfs_cookie, sqe_dma_addr, sqe_dma_len, dir); + + + + return true; +} + +static inline bool nvme_nvfs_unmap_prps(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct device *dma_dev = nvmeq->dev->dev; + enum dma_data_direction dma_dir = rq_dma_dir(req); + unsigned int i; + + /* Check if dma_vecs was allocated - if setup failed early, it might be NULL */ + if (!iod->dma_vecs) + return true; + + /* Unmap all DMA vectors - pass page pointer from dma_vecs */ + for (i = 0; i < iod->nr_dma_vecs; i++) { + nvfs_ops->nvfs_dma_unmap_page(dma_dev, + iod->nvfs_cookie, + iod->dma_vecs[i].addr, + iod->dma_vecs[i].len, + dma_dir); + } + + /* 
Free the dma_vecs mempool allocation */ + mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool); + iod->dma_vecs = NULL; + iod->nr_dma_vecs = 0; + + return true; +} + +static inline void nvme_nvfs_free_descriptors(struct request *req) +{ + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd); + int i; + + if (iod->nr_descriptors == 1) { + dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0], + dma_addr); + return; + } + + for (i = 0; i < iod->nr_descriptors; i++) { + __le64 *prp_list = iod->descriptors[i]; + dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); + + dma_pool_free(nvmeq->descriptor_pools.large, prp_list, + dma_addr); + dma_addr = next_dma_addr; + } +} + +static inline bool nvme_nvfs_unmap_data(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + bool ret; + + /* Check if this was an NVFS I/O by checking the IOD_NVFS_IO flag */ + if (!(iod->flags & IOD_NVFS_IO)) + return false; + + /* Clear the NVFS flag */ + iod->flags &= ~IOD_NVFS_IO; + + /* Call appropriate unmap function based on command type */ + if (nvme_pci_cmd_use_sgl(&iod->cmd)) + ret = nvme_nvfs_unmap_sgls(req); + else + ret = nvme_nvfs_unmap_prps(req); + + if (iod->nr_descriptors) + nvme_nvfs_free_descriptors(req); + + nvfs_put_ops(); + return ret; +} + +static inline blk_status_t nvme_nvfs_map_data(struct request *req, + bool *is_nvfs_io) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; + struct device *dma_dev = nvmeq->dev->dev; + enum nvme_use_sgl use_sgl = nvme_pci_use_sgls(dev, req); + struct blk_dma_iter iter; + blk_status_t ret = BLK_STS_RESOURCE; + + *is_nvfs_io = false; + + /* Check integrity and try to get nvfs_ops */ + if (blk_integrity_rq(req) || !nvfs_get_ops()) { + return 
ret; + } + + /* Initialize total_len for this request */ + iod->total_len = 0; + + if (!nvfs_ops->nvfs_blk_rq_dma_map_iter_start(req, dma_dev, + &iod->dma_state, &iter, &iod->nvfs_cookie)) { + nvfs_put_ops(); + ret = BLK_STS_IOERR; + return ret; + } + + /* NVFS can handle this request, set the flag */ + *is_nvfs_io = true; + iod->flags |= IOD_NVFS_IO; + + if (use_sgl == SGL_FORCED || + (use_sgl == SGL_SUPPORTED && + (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold))) + ret = nvme_pci_setup_data_sgl(req, &iter); + else + ret = nvme_pci_setup_data_prp(req, &iter); + + /* If setup failed, cleanup: unmap DMA, clear flag, release ops */ + if (ret != BLK_STS_OK) { + nvme_nvfs_unmap_data(req); + } + + return ret; +} + +#endif /* NVFS_DMA_H */ diff --git a/drivers/nvme/host/nvfs-rdma.c b/drivers/nvme/host/nvfs-rdma.c new file mode 100644 index 0000000000000..4b06e45883539 --- /dev/null +++ b/drivers/nvme/host/nvfs-rdma.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifdef CONFIG_NVFS +#define MODULE_PREFIX nvme_rdma_v1 +#include "nvfs.h" + +struct nvfs_dma_rw_ops *nvfs_ops; + +atomic_t nvfs_shutdown = ATOMIC_INIT(1); + +DEFINE_PER_CPU(long, nvfs_n_ops); + +// must have for compatability +#define NVIDIA_FS_COMPAT_FT(ops) \ + (NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) && NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops)) + +// protected via nvfs_module_mutex +int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops) +{ + if (NVIDIA_FS_COMPAT_FT(ops)) { + nvfs_ops = ops; + atomic_set(&nvfs_shutdown, 0); + return 0; + } else + return -EOPNOTSUPP; + +} +EXPORT_SYMBOL_GPL(REGISTER_FUNC); + +// protected via nvfs_module_mutex +void UNREGISTER_FUNC(void) +{ + (void) atomic_cmpxchg(&nvfs_shutdown, 0, 1); + do { + msleep(NVFS_HOLD_TIME_MS); + } while(nvfs_count_ops()); + nvfs_ops = NULL; +} +EXPORT_SYMBOL_GPL(UNREGISTER_FUNC); +#endif diff --git a/drivers/nvme/host/nvfs-rdma.h b/drivers/nvme/host/nvfs-rdma.h new file mode 100644 index 0000000000000..f9721ac9ead1e --- /dev/null +++ b/drivers/nvme/host/nvfs-rdma.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef NVFS_RDMA_H +#define NVFS_RDMA_H + +static bool nvme_rdma_nvfs_unmap_data(struct ib_device *ibdev, + struct request *rq) + +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + enum dma_data_direction dma_dir = rq_dma_dir(rq); + int count; + + if (!blk_integrity_rq(rq) && nvfs_ops != NULL) { + count = nvfs_ops->nvfs_dma_unmap_sg(ibdev->dma_device, req->data_sgl.sg_table.sgl, req->data_sgl.nents, + dma_dir); + if (count) { + nvfs_put_ops(); + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); + return true; + } + } + return false; +} + +static int nvme_rdma_nvfs_map_data(struct ib_device *ibdev, struct request *rq, bool *is_nvfs_io, int* count) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + enum dma_data_direction dma_dir = rq_dma_dir(rq); + int ret = 0; + + *is_nvfs_io = false; + *count = 0; + if (!blk_integrity_rq(rq) && nvfs_get_ops()) { + + // associates bio pages to scatterlist + *count = nvfs_ops->nvfs_blk_rq_map_sg(rq->q, rq , req->data_sgl.sg_table.sgl); + if (!*count) { + nvfs_put_ops(); + return 0; // fall to cpu path + } + + *is_nvfs_io = true; + if (unlikely((*count == NVFS_IO_ERR))) { + nvfs_put_ops(); + pr_err("%s: failed to map sg_nents=:%d\n", __func__, req->data_sgl.nents); + return -EIO; + } + req->data_sgl.nents = *count; + + *count = nvfs_ops->nvfs_dma_map_sg_attrs(ibdev->dma_device, + req->data_sgl.sg_table.sgl, + req->data_sgl.nents, + dma_dir, + DMA_ATTR_NO_WARN); + + if (unlikely((*count == NVFS_IO_ERR))) { + nvfs_put_ops(); + return -EIO; + } + + if (unlikely(*count == NVFS_CPU_REQ)) { + nvfs_put_ops(); + return -EIO; + } + + return ret; + } else { + // Fall to CPU path + return 0; + } + + return ret; +} + +#endif diff --git a/drivers/nvme/host/nvfs.h b/drivers/nvme/host/nvfs.h new file mode 100644 index 0000000000000..0eb51b94b8d2e --- /dev/null +++ b/drivers/nvme/host/nvfs.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#ifndef NVFS_H +#define NVFS_H + +#include +#include +#include +#include +#include +#include +#include + +/* Forward declarations */ +struct blk_dma_iter; +struct dma_iova_state; + +#define REGSTR2(x) x##_register_nvfs_dma_ops +#define REGSTR(x) REGSTR2(x) + +#define UNREGSTR2(x) x##_unregister_nvfs_dma_ops +#define UNREGSTR(x) UNREGSTR2(x) + +#define REGISTER_FUNC REGSTR(MODULE_PREFIX) +#define UNREGISTER_FUNC UNREGSTR(MODULE_PREFIX) + +#define NVFS_IO_ERR -1 +#define NVFS_CPU_REQ -2 + +#define NVFS_HOLD_TIME_MS 1000 + +#ifdef NVFS_USE_DMA_ITER_API +extern struct nvfs_dma_rw_blk_iter_ops *nvfs_ops; +#else +extern struct nvfs_dma_rw_ops *nvfs_ops; +#endif + +extern atomic_t nvfs_shutdown; + +DECLARE_PER_CPU(long, nvfs_n_ops); + +static inline long nvfs_count_ops(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nvfs_n_ops, i); + return sum; +} + +static inline bool nvfs_get_ops(void) +{ + if (nvfs_ops && !atomic_read(&nvfs_shutdown)) { + this_cpu_inc(nvfs_n_ops); + return true; + } + return false; +} + +static inline void nvfs_put_ops(void) +{ + this_cpu_dec(nvfs_n_ops); +} + + +struct nvfs_dma_rw_blk_iter_ops { + unsigned long long ft_bmap; // feature bitmap + + int (*nvfs_blk_rq_dma_map_iter_start) (struct request *req, + struct device *dma_dev, + struct dma_iova_state *state, + struct blk_dma_iter *iter, + void **cookie); + + int (*nvfs_blk_rq_dma_map_iter_next) (struct request *req, + struct device *dma_dev, + struct dma_iova_state *state, + 
struct blk_dma_iter *iter); + + int (*nvfs_dma_unmap_page) (struct device *device, + void* cookie, + dma_addr_t addr, + size_t size, + enum dma_data_direction dir); + + bool (*nvfs_is_gpu_page) (struct page *page); + + unsigned int (*nvfs_gpu_index) (struct page *page); + + unsigned int (*nvfs_device_priority) (struct device *dev, unsigned int gpu_index); + +}; + +struct nvfs_dma_rw_ops { + unsigned long long ft_bmap; // feature bitmap + + int (*nvfs_blk_rq_map_sg) (struct request_queue *q, + struct request *req, + struct scatterlist *sglist); + + int (*nvfs_dma_map_sg_attrs) (struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir, + unsigned long attrs); + + int (*nvfs_dma_unmap_sg) (struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir); + + bool (*nvfs_is_gpu_page) (struct page *page); + + unsigned int (*nvfs_gpu_index) (struct page *page); + + unsigned int (*nvfs_device_priority) (struct device *dev, unsigned int gpu_index); +}; + +// feature list for dma_ops, values indicate bit pos +enum ft_bits { + nvfs_ft_prep_sglist = 1ULL << 0, + nvfs_ft_map_sglist = 1ULL << 1, + nvfs_ft_is_gpu_page = 1ULL << 2, + nvfs_ft_device_priority = 1ULL << 3, + nvfs_ft_blk_dma_map_iter_start = 1ULL << 5, + nvfs_ft_blk_dma_map_iter_next = 1ULL << 6, +}; + +// check features for use in registration with vendor drivers +#define NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) ((ops)->ft_bmap & nvfs_ft_prep_sglist) +#define NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops) ((ops)->ft_bmap & nvfs_ft_map_sglist) +#define NVIDIA_FS_CHECK_FT_GPU_PAGE(ops) ((ops)->ft_bmap & nvfs_ft_is_gpu_page) +#define NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops) ((ops)->ft_bmap & nvfs_ft_device_priority) +#define NVIDIA_FS_CHECK_FT_BLK_DMA_MAP_ITER_START(ops) ((ops)->ft_bmap & nvfs_ft_blk_dma_map_iter_start) +#define NVIDIA_FS_CHECK_FT_BLK_DMA_MAP_ITER_NEXT(ops) ((ops)->ft_bmap & nvfs_ft_blk_dma_map_iter_next) + +#ifdef NVFS_USE_DMA_ITER_API +int 
REGISTER_FUNC(struct nvfs_dma_rw_blk_iter_ops *ops); +#else +int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops); +#endif + +void UNREGISTER_FUNC(void); + +#endif /* NVFS_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8ed5f1941f05c..7e17c3f57d3eb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -30,6 +30,10 @@ #include "trace.h" #include "nvme.h" +#ifdef CONFIG_NVFS +#define NVFS_USE_DMA_ITER_API +#include "nvfs.h" +#endif #define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) @@ -261,6 +265,11 @@ enum nvme_iod_flags { /* single segment dma mapping */ IOD_SINGLE_SEGMENT = 1U << 2, + +#ifdef CONFIG_NVFS + /* NVFS GPU Direct Storage I/O */ + IOD_NVFS_IO = 1U << 3, +#endif }; struct nvme_dma_vec { @@ -286,6 +295,9 @@ struct nvme_iod { dma_addr_t meta_dma; struct sg_table meta_sgt; struct nvme_sgl_desc *meta_descriptor; +#ifdef CONFIG_NVFS + void *nvfs_cookie; +#endif }; static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) @@ -711,12 +723,22 @@ static void nvme_free_sgls(struct request *req) } } +#ifdef CONFIG_NVFS +#include "nvfs-dma.h" +#endif + static void nvme_unmap_data(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct device *dma_dev = nvmeq->dev->dev; +#ifdef CONFIG_NVFS + /* Check if this was an NVFS I/O and handle unmapping */ + if (nvme_nvfs_unmap_data(req)) + return; +#endif + if (iod->flags & IOD_SINGLE_SEGMENT) { static_assert(offsetof(union nvme_data_ptr, prp1) == offsetof(union nvme_data_ptr, sgl.addr)); @@ -743,6 +765,21 @@ static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev, if (iter->len) return true; + +#ifdef CONFIG_NVFS + if (iod->flags & IOD_NVFS_IO) { + if (!nvfs_ops->nvfs_blk_rq_dma_map_iter_next(req, dma_dev, + &iod->dma_state, iter)) + return false; + + iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr; + 
iod->dma_vecs[iod->nr_dma_vecs].len = iter->len; + iod->nr_dma_vecs++; + + return true; + } +#endif + if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter)) return false; if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(dma_dev)) { @@ -763,7 +800,11 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req, unsigned int prp_len, i; __le64 *prp_list; - if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(nvmeq->dev->dev)) { + if ( +#ifdef CONFIG_NVFS + (iod->flags & IOD_NVFS_IO) || +#endif + (!dma_use_iova(&iod->dma_state) && dma_need_unmap(nvmeq->dev->dev))) { iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool, GFP_ATOMIC); if (!iod->dma_vecs) @@ -868,6 +909,11 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req, */ iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma); iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma); +#ifdef CONFIG_NVFS + /* For NVFS, don't call nvme_unmap_data - cleanup happens in nvme_nvfs_unmap_data */ + if (iod->flags & IOD_NVFS_IO) + return iter->status; +#endif if (unlikely(iter->status)) nvme_unmap_data(req); return iter->status; @@ -908,10 +954,15 @@ static blk_status_t nvme_pci_setup_data_sgl(struct request *req, /* set the transfer type as SGL */ iod->cmd.common.flags = NVME_CMD_SGL_METABUF; - if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) { - nvme_pci_sgl_set_data(&iod->cmd.common.dptr.sgl, iter); - iod->total_len += iter->len; - return BLK_STS_OK; +#ifdef CONFIG_NVFS + if (!(iod->flags & IOD_NVFS_IO)) +#endif + { + if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) { + nvme_pci_sgl_set_data(&iod->cmd.common.dptr.sgl, iter); + iod->total_len += iter->len; + return BLK_STS_OK; + } } if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list)) @@ -930,10 +981,21 @@ static blk_status_t nvme_pci_setup_data_sgl(struct request *req, } nvme_pci_sgl_set_data(&sg_list[mapped++], iter); iod->total_len += iter->len; - } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, 
&iod->dma_state, + } while ( +#ifdef CONFIG_NVFS + (iod->flags & IOD_NVFS_IO) ? + nvfs_ops->nvfs_blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, + &iod->dma_state, iter) : +#endif + blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state, iter)); nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped); +#ifdef CONFIG_NVFS + /* For NVFS, don't call nvme_unmap_data - cleanup happens in nvme_nvfs_unmap_data */ + if (iod->flags & IOD_NVFS_IO) + return iter->status; +#endif if (unlikely(iter->status)) nvme_unmap_data(req); return iter->status; @@ -987,6 +1049,12 @@ static blk_status_t nvme_map_data(struct request *req) struct blk_dma_iter iter; blk_status_t ret; +#ifdef CONFIG_NVFS + bool is_nvfs_io = false; + ret = nvme_nvfs_map_data(req, &is_nvfs_io); + if (is_nvfs_io) + return ret; +#endif /* * Try to skip the DMA iterator for single segment requests, as that * significantly improves performances for small I/O sizes. diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 190a4cfa8a5ee..c2bebd7cebec4 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -27,6 +27,9 @@ #include "nvme.h" #include "fabrics.h" +#ifdef CONFIG_NVFS +#include "nvfs.h" +#endif #define NVME_RDMA_CM_TIMEOUT_MS 3000 /* 3 second */ @@ -1212,6 +1215,9 @@ static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, return ib_post_send(queue->qp, &wr, NULL); } +#ifdef CONFIG_NVFS +#include "nvfs-rdma.h" +#endif static void nvme_rdma_dma_unmap_req(struct ib_device *ibdev, struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); @@ -1223,6 +1229,11 @@ static void nvme_rdma_dma_unmap_req(struct ib_device *ibdev, struct request *rq) NVME_INLINE_METADATA_SG_CNT); } +#ifdef CONFIG_NVFS + if (nvme_rdma_nvfs_unmap_data(ibdev, rq)) + return; +#endif + ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, rq_dma_dir(rq)); sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); @@ -1476,6 +1487,17 @@ static int 
nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq, if (ret) return -ENOMEM; +#ifdef CONFIG_NVFS + { + bool is_nvfs_io = false; + ret = nvme_rdma_nvfs_map_data(ibdev, rq, &is_nvfs_io, count); + if (is_nvfs_io) { + if (ret) + goto out_free_table; + return 0; + } + } +#endif req->data_sgl.nents = blk_rq_map_sg(rq, req->data_sgl.sg_table.sgl); *count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl, From 107fd3d0cafbda7f245277f2990b8fbe16729cfa Mon Sep 17 00:00:00 2001 From: ChunHao Lin Date: Thu, 4 Dec 2025 16:51:16 +0800 Subject: [PATCH 021/247] NVIDIA: SAUCE: r8127: Remove registers2 proc entry BugLink: https://bugs.launchpad.net/bugs/2134991 Remove registers2 proc entry as it is causing system crash on running opensource LTP test suite. Change-Id: I47846bca0401d4403fba026d4a348eef3d454f80 Signed-off-by: ChunHao Lin Acked-by: Jamie Nguyen Acked-by: Carol L Soto Acked-by: Matthew R. Ochs Acked-by: Abdur Rahman Acked-by: Jacob Martin Signed-off-by: Brad Figg --- drivers/net/ethernet/realtek/r8127/r8127_n.c | 77 -------------------- 1 file changed, 77 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8127/r8127_n.c b/drivers/net/ethernet/realtek/r8127/r8127_n.c index 2c9c262abe2ea..496fec1320d12 100755 --- a/drivers/net/ethernet/realtek/r8127/r8127_n.c +++ b/drivers/net/ethernet/realtek/r8127/r8127_n.c @@ -1216,39 +1216,6 @@ static int proc_get_registers(struct seq_file *m, void *v) return 0; } -static int proc_get_all_registers(struct seq_file *m, void *v) -{ - struct net_device *dev = m->private; - int i, n, max; - u8 byte_rd; - struct rtl8127_private *tp = netdev_priv(dev); - void __iomem *ioaddr = tp->mmio_addr; - struct pci_dev *pdev = tp->pci_dev; - - seq_puts(m, "\nDump All MAC Registers\n"); - seq_puts(m, "Offset\tValue\n------\t-----\n"); - - max = pci_resource_len(pdev, 2); - - for (n = 0; n < max;) { - seq_printf(m, "\n0x%04x:\t", n); - - rtnl_lock(); - - for (i = 0; i < 16 && n < max; i++, n++) { - byte_rd = readb(ioaddr + n); - 
seq_printf(m, "%02x ", byte_rd); - } - - rtnl_unlock(); - } - - seq_printf(m, "\nTotal length:0x%X", max); - - seq_putc(m, '\n'); - return 0; -} - static int proc_get_pcie_phy(struct seq_file *m, void *v) { struct net_device *dev = m->private; @@ -2143,49 +2110,6 @@ static int proc_get_registers(char *page, char **start, return len; } -static int proc_get_all_registers(char *page, char **start, - off_t offset, int count, - int *eof, void *data) -{ - struct net_device *dev = data; - int i, n, max; - u8 byte_rd; - struct rtl8127_private *tp = netdev_priv(dev); - void __iomem *ioaddr = tp->mmio_addr; - struct pci_dev *pdev = tp->pci_dev; - int len = 0; - - len += snprintf(page + len, count - len, - "\nDump All MAC Registers\n" - "Offset\tValue\n------\t-----\n"); - - max = pci_resource_len(pdev, 2); - - for (n = 0; n < max;) { - len += snprintf(page + len, count - len, - "\n0x%04x:\t", - n); - - rtnl_lock(); - - for (i = 0; i < 16 && n < max; i++, n++) { - byte_rd = readb(ioaddr + n); - len += snprintf(page + len, count - len, - "%02x ", - byte_rd); - } - - rtnl_unlock(); - } - - len += snprintf(page + len, count - len, "\nTotal length:0x%X", max); - - len += snprintf(page + len, count - len, "\n"); - - *eof = 1; - return len; -} - static int proc_get_pcie_phy(char *page, char **start, off_t offset, int count, int *eof, void *data) @@ -2784,7 +2708,6 @@ static const struct rtl8127_proc_file rtl8127_debug_proc_files[] = { { "driver_var", &proc_get_driver_variable }, { "tally", &proc_get_tally_counter }, { "registers", &proc_get_registers }, - { "registers2", &proc_get_all_registers }, { "pcie_phy", &proc_get_pcie_phy }, { "eth_phy", &proc_get_eth_phy }, { "ext_regs", &proc_get_extended_registers }, From 537d9eaf386db860442df5708be1d66cd02ed72f Mon Sep 17 00:00:00 2001 From: Carol L Soto Date: Thu, 11 Dec 2025 07:53:36 -0800 Subject: [PATCH 022/247] Revert "NVIDIA: SAUCE: Fixes the kernel boot issues due to xhci mem errors" BugLink: 
https://bugs.launchpad.net/bugs/2134851 This reverts commit 8b97194e86b625767bb8bc636a5f45003cddc9c5. This was added to provide extra debug during Spark bringup. MediaTek has confirmed that this should be dropped in production. Signed-off-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Nirmoy Das Acked-by: Abdur Rahman Acked-by: Jacob Martin Signed-off-by: Brad Figg --- drivers/usb/host/xhci-mem.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index f985d779e5e69..c4a6544aa1075 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1510,8 +1510,6 @@ int xhci_endpoint_init(struct xhci_hcd *xhci, ep_ctx->tx_info = cpu_to_le32(EP_MAX_ESIT_PAYLOAD_LO(max_esit_payload) | EP_AVG_TRB_LENGTH(avg_trb_len)); - ep_ctx->reserved[0] = cpu_to_le32(0x1 | (0x1 << 11)); //mtk's bpks & bm - pr_err("%s rsv %#x\n", __func__, ep_ctx->reserved[0]); return 0; } @@ -1618,12 +1616,10 @@ void xhci_endpoint_copy(struct xhci_hcd *xhci, in_ep_ctx->ep_info2 = out_ep_ctx->ep_info2; in_ep_ctx->deq = out_ep_ctx->deq; in_ep_ctx->tx_info = out_ep_ctx->tx_info; -#if 0 if (xhci->quirks & XHCI_MTK_HOST) { in_ep_ctx->reserved[0] = out_ep_ctx->reserved[0]; in_ep_ctx->reserved[1] = out_ep_ctx->reserved[1]; } -#endif } /* Copy output xhci_slot_ctx to the input xhci_slot_ctx. 
From 6cf58cb159c38684eb0b6c948ca9bbc32f7d9a42 Mon Sep 17 00:00:00 2001 From: Jacob Martin Date: Mon, 15 Dec 2025 16:54:27 -0600 Subject: [PATCH 023/247] UBUNTU: [Packaging] update Ubuntu.md BugLink: https://bugs.launchpad.net/bugs/1786013 Signed-off-by: Jacob Martin --- Ubuntu.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Ubuntu.md b/Ubuntu.md index e00284995816a..cc91244731ecb 100644 --- a/Ubuntu.md +++ b/Ubuntu.md @@ -2,7 +2,7 @@ Name: linux-nvidia-6.17 Version: 6.17.0 Series: 24.04 (noble) Description: - This is the source code for the Ubuntu linux-nvidia-6.17 kernel for the -Noble series. This source tree is used to produce the flavours: nvidia, -nvidia-64k. This kernel is configured to support the NVIDIA x86 and arm64 -platforms. + This is the source code for the Ubuntu linux kernel for the 24.04 series. This + source tree is used to produce the flavours: nvidia, nvidia-64k. + This kernel is configured to support the widest range of desktop, laptop and + server configurations. From 50bb92105e84921023651541f9a9dee16233c59e Mon Sep 17 00:00:00 2001 From: Jacob Martin Date: Mon, 15 Dec 2025 16:57:54 -0600 Subject: [PATCH 024/247] UBUNTU: Start new release Ignore: yes Signed-off-by: Jacob Martin --- debian.nvidia-6.17/changelog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/debian.nvidia-6.17/changelog b/debian.nvidia-6.17/changelog index 5f93b16954c88..bd3b3fd0d28b8 100644 --- a/debian.nvidia-6.17/changelog +++ b/debian.nvidia-6.17/changelog @@ -1,3 +1,11 @@ +linux-nvidia-6.17 (6.17.0-1006.6) UNRELEASED; urgency=medium + + CHANGELOG: Do not edit directly. Autogenerated at release. + CHANGELOG: Use the printchanges target to see the current changes. + CHANGELOG: Use the insertchanges target to create the final log. 
+ + -- Jacob Martin Mon, 15 Dec 2025 16:57:54 -0600 + linux-nvidia-6.17 (6.17.0-1004.4) noble; urgency=medium * noble/linux-nvidia-6.17: 6.17.0-1003.3 -proposed tracker (LP: #2131581) From 36eae6ddab13d44a712cb24fcef50ebb3454a0a7 Mon Sep 17 00:00:00 2001 From: Jacob Martin Date: Mon, 15 Dec 2025 17:07:44 -0600 Subject: [PATCH 025/247] UBUNTU: link-to-tracker: update tracking bug BugLink: https://bugs.launchpad.net/bugs/2136206 Properties: no-test-build Signed-off-by: Jacob Martin --- debian.nvidia-6.17/tracking-bug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian.nvidia-6.17/tracking-bug b/debian.nvidia-6.17/tracking-bug index f1e509ef8d222..b3effa0c06b00 100644 --- a/debian.nvidia-6.17/tracking-bug +++ b/debian.nvidia-6.17/tracking-bug @@ -1 +1 @@ -2131581 d2025.11.04-1 +2136206 d2025.12.15-1 From 64ee100cbfef4bca57b045f598ce7dc55925ef31 Mon Sep 17 00:00:00 2001 From: Jacob Martin Date: Mon, 15 Dec 2025 17:17:15 -0600 Subject: [PATCH 026/247] UBUNTU: [Packaging] debian.nvidia-6.17/dkms-versions -- update from kernel-versions (adhoc/d2025.12.15) BugLink: https://bugs.launchpad.net/bugs/1786013 Signed-off-by: Jacob Martin --- debian.nvidia-6.17/dkms-versions | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian.nvidia-6.17/dkms-versions b/debian.nvidia-6.17/dkms-versions index 546ff4b97ac94..e9a45983e54e3 100644 --- a/debian.nvidia-6.17/dkms-versions +++ b/debian.nvidia-6.17/dkms-versions @@ -1,3 +1,3 @@ -zfs-linux 2.3.4-1ubuntu2 modulename=zfs debpath=pool/universe/z/%package%/zfs-dkms_%version%_all.deb arch=amd64 arch=arm64 arch=ppc64el arch=s390x rprovides=spl-modules rprovides=spl-dkms rprovides=zfs-modules rprovides=zfs-dkms +zfs-linux 2.3.4-1ubuntu2 modulename=zfs debpath=pool/universe/z/%package%/zfs-dkms_%version%_all.deb arch=amd64 arch=arm64 arch=ppc64el arch=s390x arch=riscv64 rprovides=spl-modules rprovides=spl-dkms rprovides=zfs-modules rprovides=zfs-dkms v4l2loopback 0.15.0-0ubuntu2 modulename=v4l2loopback 
debpath=pool/universe/v/%package%/v4l2loopback-dkms_%version%_all.deb arch=amd64 rprovides=v4l2loopback-modules rprovides=v4l2loopback-dkms mstflint 4.26.0-1 modulename=mstflint_access debpath=pool/universe/m/%package%/mstflint-dkms_%version%_all.deb arch=amd64 arch=arm64 rprovides=mstflint-modules rprovides=mstflint-dkms From 5fce40517b9523661d04261b04ee8fdb84594a9c Mon Sep 17 00:00:00 2001 From: Jacob Martin Date: Tue, 16 Dec 2025 08:44:20 -0600 Subject: [PATCH 027/247] UBUNTU: [Packaging] update variants BugLink: https://bugs.launchpad.net/bugs/1786013 Signed-off-by: Jacob Martin --- debian.nvidia-6.17/variants | 2 ++ 1 file changed, 2 insertions(+) diff --git a/debian.nvidia-6.17/variants b/debian.nvidia-6.17/variants index 3225f003f2f4f..7d12a65a2f915 100644 --- a/debian.nvidia-6.17/variants +++ b/debian.nvidia-6.17/variants @@ -1 +1,3 @@ -6.17 +-hwe-24.04-edge +-hwe-24.04 From 6b40fa106aedcff76675772b1fee615caadf21a3 Mon Sep 17 00:00:00 2001 From: Jacob Martin Date: Wed, 17 Dec 2025 09:59:28 -0600 Subject: [PATCH 028/247] UBUNTU: Ubuntu-nvidia-6.17-6.17.0-1006.6 Signed-off-by: Jacob Martin --- debian.nvidia-6.17/changelog | 75 +++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 5 deletions(-) diff --git a/debian.nvidia-6.17/changelog b/debian.nvidia-6.17/changelog index bd3b3fd0d28b8..5475820539ce4 100644 --- a/debian.nvidia-6.17/changelog +++ b/debian.nvidia-6.17/changelog @@ -1,10 +1,75 @@ -linux-nvidia-6.17 (6.17.0-1006.6) UNRELEASED; urgency=medium +linux-nvidia-6.17 (6.17.0-1006.6) noble; urgency=medium - CHANGELOG: Do not edit directly. Autogenerated at release. - CHANGELOG: Use the printchanges target to see the current changes. - CHANGELOG: Use the insertchanges target to create the final log. 
+ * noble/linux-nvidia-6.17: 6.17.0-1006.6 -proposed tracker (LP: #2136206) - -- Jacob Martin Mon, 15 Dec 2025 16:57:54 -0600 + * Packaging resync (LP: #1786013) + - [Packaging] update Ubuntu.md + - [Packaging] debian.nvidia-6.17/dkms-versions -- update from kernel- + versions (adhoc/d2025.12.15) + - [Packaging] update variants + + * Seeing a lot of these traces xhci_endpoint_init rsv 0x801 (LP: #2134851) + - Revert "NVIDIA: SAUCE: Fixes the kernel boot issues due to xhci mem + errors" + + * r8127: fix for LTS test panic (LP: #2134991) + - NVIDIA: SAUCE: r8127: Remove registers2 proc entry + + * Update GDS/NVMe SAUCE for v6.17 (LP: #2134960) + - NVIDIA: SAUCE: Patch NVMe/NVMeoF driver to support GDS on Linux 6.17 + Kernel + + * Aquantia: seeing arm-smmu-v3 at shutdown or module removal (LP: #2133755) + - net: aquantia: Add missing descriptor cache invalidation on ATL2 + + * Backport gpio: tegra186: Add support for Tegra410 (LP: #2131269) + - gpio: tegra186: Use generic macro for port definitions + - gpio: tegra186: Add support for Tegra410 + + * Backport perf/arm_cspmu: Preparatory patches for NVIDIA T410 PMU + (LP: #2131267) + - perf/arm_cspmu: Add callback to reset filter config + - perf/arm_cspmu: Add pmpidr support + - perf/arm_cspmu: nvidia: Add revision id matching + - perf/arm_cspmu: nvidia: Add pmevfiltr2 support + + * NULL pointer dereference during vEGM Libvirt VM lifecycle (LP: #2131582) + - NVIDIA: SAUCE: vfio/nvgrace-egm: Prevent double-unregister of + pfn_address_space + + * Add two more Spark iGPU IDs for the existing iommu quirk (LP: #2132033) + - NVIDIA: SAUCE: iommu/arm-smmu-v3: Add two more DGX Spark iGPU IDs for + existing iommu quirk + + * Pull CPPC mailing list patches for Spark (LP: #2131705) + - NVIDIA: SAUCE: cpufreq: CPPC: Add generic helpers for sysfs show/store + - NVIDIA: SAUCE: ACPI: CPPC: Add cppc_get_perf() API to read performance + controls + - NVIDIA: SAUCE: ACPI: CPPC: extend APIs to support auto_sel and epp + - NVIDIA: SAUCE: 
ACPI: CPPC: add APIs and sysfs interface for min/max_perf + - NVIDIA: SAUCE: ACPI: CPPC: add APIs and sysfs interface for perf_limited + register + - NVIDIA: SAUCE: cpufreq: CPPC: Add sysfs for min/max_perf and + perf_limited + - NVIDIA: SAUCE: cpufreq: CPPC: update policy min/max when toggling + auto_select + - NVIDIA: SAUCE: cpufreq: CPPC: add autonomous mode boot parameter support + + [ Ubuntu: 6.17.0-8.8 ] + + * questing/linux: 6.17.0-8.8 -proposed tracker (LP: #2131554) + * crash when reading from /sys/kernel/tracing/rv/enabled_monitors + (LP: #2131136) + - rv: Fully convert enabled_monitors to use list_head as iterator + * i40e driver is triggering VF resets on every link state change + (LP: #2130552) + - i40e: avoid redundant VF link state updates + * kernel crash on bootup for some arm64 machines (LP: #2129770) + - KVM: arm64: Guard PMSCR_EL1 initialization with SPE presence check + * CVE-2025-40018 + - ipvs: Defer ip_vs_ftp unregister during netns cleanup + + -- Jacob Martin Wed, 17 Dec 2025 09:59:28 -0600 linux-nvidia-6.17 (6.17.0-1004.4) noble; urgency=medium From db064665f47b97978057e09c3ab8b67951cb092f Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Wed, 20 Aug 2025 16:45:33 +0800 Subject: [PATCH 029/247] perf: arm_pmuv3: Factor out PMCCNTR_EL0 use conditions BugLink: https://bugs.launchpad.net/bugs/2136812 PMCCNTR_EL0 is preferred for counting CPU_CYCLES under certain conditions. Factor out the condition check to a separate function for further extension. Add documents for better understanding. No functional changes intended. Reviewed-by: James Clark Acked-by: Mark Rutland Signed-off-by: Yicong Yang Signed-off-by: Will Deacon (cherry picked from commit f8f89e8cf3d668a40106444276d8c448c114e963) Signed-off-by: Nirmoy Das Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/perf/arm_pmuv3.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index f6d7bab5d555c..69c5cc8f56067 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -978,6 +978,32 @@ static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, return -EAGAIN; } +static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; + + if (evtype != ARMV8_PMUV3_PERFCTR_CPU_CYCLES) + return false; + + /* + * A CPU_CYCLES event with threshold counting cannot use PMCCNTR_EL0 + * since it lacks threshold support. + */ + if (armv8pmu_event_get_threshold(&event->attr)) + return false; + + /* + * PMCCNTR_EL0 is not affected by BRBE controls like BRBCR_ELx.FZP. + * So don't use it for branch events. + */ + if (has_branch_stack(event)) + return false; + + return true; +} + static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, struct perf_event *event) { @@ -986,8 +1012,7 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; /* Always prefer to place a cycle counter into the cycle counter. 
*/ - if ((evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) && - !armv8pmu_event_get_threshold(&event->attr) && !has_branch_stack(event)) { + if (armv8pmu_can_use_pmccntr(cpuc, event)) { if (!test_and_set_bit(ARMV8_PMU_CYCLE_IDX, cpuc->used_mask)) return ARMV8_PMU_CYCLE_IDX; else if (armv8pmu_event_is_64bit(event) && From 1fb7f4414c7545c284b2e23f1483b8cbaa1ecd78 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 22 Sep 2025 11:30:10 +0800 Subject: [PATCH 030/247] perf: arm_pmuv3: Don't use PMCCNTR_EL0 on SMT cores BugLink: https://bugs.launchpad.net/bugs/2136812 CPU_CYCLES is expected to count the logical CPU (PE) clock. Currently it's preferred to use PMCCNTR_EL0 for counting CPU_CYCLES, but it'll count processor clock rather than the PE clock (ARM DDI0487 L.b D13.1.3) if one of the SMT siblings is not idle on a multi-threaded implementation. So don't use it on SMT cores. Introduce topology_core_has_smt() for knowing the SMT implementation and cached it in arm_pmu::has_smt during allocation. When counting cycles on SMT CPU 2-3 and CPU 3 is idle, without this patch we'll get: [root@client1 tmp]# perf stat -e cycles -A -C 2-3 -- stress-ng -c 1 --taskset 2 --timeout 1 [...] Performance counter stats for 'CPU(s) 2-3': CPU2 2880457316 cycles CPU3 2880459810 cycles 1.254688470 seconds time elapsed With this patch the idle state of CPU3 is observed as expected: [root@client1 ~]# perf stat -e cycles -A -C 2-3 -- stress-ng -c 1 --taskset 2 --timeout 1 [...] Performance counter stats for 'CPU(s) 2-3': CPU2 2558580492 cycles CPU3 305749 cycles 1.113626410 seconds time elapsed Signed-off-by: Yicong Yang Signed-off-by: Will Deacon (cherry picked from commit c3d78c34ad009a7cce57ae5b5c93e1bd03bb31a3) Signed-off-by: Nirmoy Das Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/perf/arm_pmu.c | 6 ++++++ drivers/perf/arm_pmuv3.c | 10 ++++++++++ include/linux/arch_topology.h | 11 +++++++++++ include/linux/perf/arm_pmu.h | 1 + 4 files changed, 28 insertions(+) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 5c310e803dd78..ae437791b5f8c 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -925,6 +925,12 @@ int armpmu_register(struct arm_pmu *pmu) if (ret) return ret; + /* + * By this stage we know our supported CPUs on either DT/ACPI platforms, + * detect the SMT implementation. + */ + pmu->has_smt = topology_core_has_smt(cpumask_first(&pmu->supported_cpus)); + if (!pmu->set_event_filter) pmu->pmu.capabilities |= PERF_PMU_CAP_NO_EXCLUDE; diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 69c5cc8f56067..d1d6000517b2b 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -981,6 +981,7 @@ static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc, struct perf_event *event) { + struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; @@ -1001,6 +1002,15 @@ static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc, if (has_branch_stack(event)) return false; + /* + * The PMCCNTR_EL0 increments from the processor clock rather than + * the PE clock (ARM DDI0487 L.b D13.1.3) which means it'll continue + * counting on a WFI PE if one of its SMT sibling is not idle on a + * multi-threaded implementation. So don't use it on SMT cores. 
+ */ + if (cpu_pmu->has_smt) + return false; + return true; } diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index d72d6e5aa2002..daa1af2e8204b 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -89,6 +89,17 @@ void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); int parse_acpi_topology(void); void freq_inv_set_max_ratio(int cpu, u64 max_rate); + +/* + * Architectures like ARM64 don't have reliable architectural way to get SMT + * information and depend on the firmware (ACPI/OF) report. Non-SMT core won't + * initialize thread_id so we can use this to detect the SMT implementation. + */ +static inline bool topology_core_has_smt(int cpu) +{ + return cpu_topology[cpu].thread_id != -1; +} + #endif #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 93c9a26492fcf..2d39322c40c43 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -119,6 +119,7 @@ struct arm_pmu { /* PMUv3 only */ int pmuver; + bool has_smt; u64 reg_pmmir; u64 reg_brbidr; #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 From 9b88756b9ee5263753af97b3083c51d91973d28e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:00 -0500 Subject: [PATCH 031/247] x86,fs/resctrl: Consolidate monitor event descriptions BugLink: https://bugs.launchpad.net/bugs/2122432 There are currently only three monitor events, all associated with the RDT_RESOURCE_L3 resource. Growing support for additional events will be easier with some restructuring to have a single point in file system code where all attributes of all events are defined. Place all event descriptions into an array mon_event_all[]. Doing this has the beneficial side effect of removing the need for rdt_resource::evt_list. Add resctrl_event_id::QOS_FIRST_EVENT for a lower bound on range checks for event ids and as the starting index to scan mon_event_all[]. 
Drop the code that builds evt_list and change the two places where the list is scanned to scan mon_event_all[] instead using a new helper macro for_each_mon_event(). Architecture code now informs file system code which events are available with resctrl_enable_mon_event(). Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 09f37134464cc03baf5cb8eab2d99db27ee73217) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/x86/kernel/cpu/resctrl/core.c | 12 ++++-- fs/resctrl/internal.h | 13 ++++-- fs/resctrl/monitor.c | 63 +++++++++++++++--------------- fs/resctrl/rdtgroup.c | 11 +++--- include/linux/resctrl.h | 4 +- include/linux/resctrl_types.h | 12 ++++-- 6 files changed, 66 insertions(+), 49 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 187d527ef73b6..7fcae25874fea 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -864,12 +864,18 @@ static __init bool get_rdt_mon_resources(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + } if (!rdt_mon_features) return false; diff --git a/fs/resctrl/internal.h 
b/fs/resctrl/internal.h index 9a8cf6f11151d..7a57366d1abce 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -52,19 +52,26 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) } /** - * struct mon_evt - Entry in the event list of a resource + * struct mon_evt - Properties of a monitor event * @evtid: event id + * @rid: resource id for this event * @name: name of the event * @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list + * @enabled: true if the event is enabled */ struct mon_evt { enum resctrl_event_id evtid; + enum resctrl_res_level rid; char *name; bool configurable; - struct list_head list; + bool enabled; }; +extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; + +#define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \ + mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++) + /** * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 7326c28a7908f..d5bf3e0334b69 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -842,38 +842,39 @@ static void dom_data_exit(struct rdt_resource *r) mutex_unlock(&rdtgroup_mutex); } -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, -}; - -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, -}; - -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, -}; - /* - * Initialize the event list for the resource. - * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. + * All available events. 
Architecture code marks the ones that + * are supported by a system using resctrl_enable_mon_event() + * to set .enabled. */ -static void l3_mon_evt_init(struct rdt_resource *r) +struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { + [QOS_L3_OCCUP_EVENT_ID] = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_TOTAL_EVENT_ID] = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_LOCAL_EVENT_ID] = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, +}; + +void resctrl_enable_mon_event(enum resctrl_event_id eventid) { - INIT_LIST_HEAD(&r->evt_list); + if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) + return; + if (mon_event_all[eventid].enabled) { + pr_warn("Duplicate enable for event %d\n", eventid); + return; + } - if (resctrl_arch_is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); + mon_event_all[eventid].enabled = true; } /** @@ -900,15 +901,13 @@ int resctrl_mon_resource_init(void) if (ret) return ret; - l3_mon_evt_init(r); - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { - mbm_total_event.configurable = true; + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { - mbm_local_event.configurable = true; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_local_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 77d08229d8550..b95501d4b5de8 100644 --- 
a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1152,7 +1152,9 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, struct rdt_resource *r = rdt_kn_parent_priv(of->kn); struct mon_evt *mevt; - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; seq_printf(seq, "%s\n", mevt->name); if (mevt->configurable) seq_printf(seq, "%s_config\n", mevt->name); @@ -3057,10 +3059,9 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, struct mon_evt *mevt; int ret, domid; - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; domid = do_sum ? d->ci_id : d->hdr.id; priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); if (WARN_ON_ONCE(!priv)) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 6fb4894b8cfd1..2944042bd84c5 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -269,7 +269,6 @@ enum resctrl_schema_fmt { * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. - * @evt_list: List of monitoring events * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth * monitoring events can be configured. 
* @cdp_capable: Is the CDP feature available on this resource @@ -287,7 +286,6 @@ struct rdt_resource { struct list_head mon_domains; char *name; enum resctrl_schema_fmt schema_fmt; - struct list_head evt_list; unsigned int mbm_cfg_mask; bool cdp_capable; }; @@ -372,6 +370,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); +void resctrl_enable_mon_event(enum resctrl_event_id eventid); + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); /** diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index a25fb9c4070d3..2dadbc54e4b35 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -34,11 +34,15 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) -/* - * Event IDs, the values match those used to program IA32_QM_EVTSEL before - * reading IA32_QM_CTR on RDT systems. - */ +/* Event IDs */ enum resctrl_event_id { + /* Must match value of first event below */ + QOS_FIRST_EVENT = 0x01, + + /* + * These values match those used to program IA32_QM_EVTSEL before + * reading IA32_QM_CTR on RDT systems. + */ QOS_L3_OCCUP_EVENT_ID = 0x01, QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, From dae00077630b185a5eec447c668563c2b8c20664 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:01 -0500 Subject: [PATCH 032/247] x86,fs/resctrl: Replace architecture event enabled checks BugLink: https://bugs.launchpad.net/bugs/2122432 The resctrl file system now has complete knowledge of the status of every event. So there is no need for per-event function calls to check. Replace each of the resctrl_arch_is_{event}enabled() calls with resctrl_is_mon_event_enabled(QOS_{EVENT}). No functional change. 
Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit d257cc2e5c8bb8236cb161360d6c0529fd442712) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/x86/include/asm/resctrl.h | 15 --------------- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- arch/x86/kernel/cpu/resctrl/monitor.c | 4 ++-- fs/resctrl/ctrlmondata.c | 4 ++-- fs/resctrl/monitor.c | 16 +++++++++++----- fs/resctrl/rdtgroup.c | 18 +++++++++--------- include/linux/resctrl.h | 2 ++ 7 files changed, 28 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index feb93b50e990a..b1dd5d6b87db4 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -84,21 +84,6 @@ static inline void resctrl_arch_disable_mon(void) static_branch_dec_cpuslocked(&rdt_enable_key); } -static inline bool resctrl_arch_is_llc_occupancy_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_total_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_local_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); -} - /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7fcae25874fea..1a319ce9328c7 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -402,13 +402,13 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { size_t tsize; - if (resctrl_arch_is_mbm_total_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { tsize = sizeof(*hw_dom->arch_mbm_total); 
hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); if (!hw_dom->arch_mbm_total) return -ENOMEM; } - if (resctrl_arch_is_mbm_local_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { tsize = sizeof(*hw_dom->arch_mbm_local); hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); if (!hw_dom->arch_mbm_local) { diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index eed0f8417b8c5..bd8b45359e245 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -207,11 +207,11 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) memset(hw_dom->arch_mbm_total, 0, sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) memset(hw_dom->arch_mbm_local, 0, sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); } diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 3c39cfacb2518..42b281b3852ff 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -473,12 +473,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, rdt_last_cmd_clear(); if (!strcmp(buf, "mbm_local_bytes")) { - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; else ret = -EINVAL; } else if (!strcmp(buf, "mbm_total_bytes")) { - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; else ret = -EINVAL; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index d5bf3e0334b69..b0b1dcc367955 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ 
-336,7 +336,7 @@ void free_rmid(u32 closid, u32 rmid) entry = __rmid_entry(idx); - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) add_rmid_to_limbo(entry); else list_add_tail(&entry->list, &rmid_free_lru); @@ -635,10 +635,10 @@ static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, * This is protected from concurrent reads from user as both * the user and overflow handler hold the global mutex. */ - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); } @@ -877,6 +877,12 @@ void resctrl_enable_mon_event(enum resctrl_event_id eventid) mon_event_all[eventid].enabled = true; } +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) +{ + return eventid >= QOS_FIRST_EVENT && eventid < QOS_NUM_EVENTS && + mon_event_all[eventid].enabled; +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. 
* @@ -912,9 +918,9 @@ int resctrl_mon_resource_init(void) RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (resctrl_arch_is_mbm_total_enabled()) + else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index b95501d4b5de8..a7eeb33501da7 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -123,8 +123,8 @@ void rdt_staged_configs_clear(void) static bool resctrl_is_mbm_enabled(void) { - return (resctrl_arch_is_mbm_total_enabled() || - resctrl_arch_is_mbm_local_enabled()); + return (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID) || + resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)); } static bool resctrl_is_mbm_event(int e) @@ -196,7 +196,7 @@ static int closid_alloc(void) lockdep_assert_held(&rdtgroup_mutex); if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && - resctrl_arch_is_llc_occupancy_enabled()) { + resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { cleanest_closid = resctrl_find_cleanest_closid(); if (cleanest_closid < 0) return cleanest_closid; @@ -4051,7 +4051,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. 
There is no way to know @@ -4087,12 +4087,12 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize; - if (resctrl_arch_is_llc_occupancy_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (resctrl_arch_is_mbm_total_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { tsize = sizeof(*d->mbm_total); d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_total) { @@ -4100,7 +4100,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain return -ENOMEM; } } - if (resctrl_arch_is_mbm_local_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { tsize = sizeof(*d->mbm_local); d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_local) { @@ -4145,7 +4145,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) RESCTRL_PICK_ANY_CPU); } - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); /* @@ -4220,7 +4220,7 @@ void resctrl_offline_cpu(unsigned int cpu) cancel_delayed_work(&d->mbm_over); mbm_setup_overflow_handler(d, 0, cpu); } - if (resctrl_arch_is_llc_occupancy_enabled() && + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && cpu == d->cqm_work_cpu && has_busy_rmid(d)) { cancel_delayed_work(&d->cqm_limbo); cqm_setup_limbo_handler(d, 0, cpu); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 2944042bd84c5..40aba6b5d4f08 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -372,6 +372,8 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); void resctrl_enable_mon_event(enum resctrl_event_id eventid); +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); + 
bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); /** From f114eb71ed3ab57a378a4aa0e3683816d7ea6c6f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:02 -0500 Subject: [PATCH 033/247] x86/resctrl: Remove the rdt_mon_features global variable BugLink: https://bugs.launchpad.net/bugs/2122432 rdt_mon_features is used as a bitmask of enabled monitor events. A monitor event's status is now maintained in mon_evt::enabled with all monitor events' mon_evt structures found in the filesystem's mon_event_all[] array. Remove the remaining uses of rdt_mon_features. Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 63cc9811aa874e6fab671599ba93a989f4f93a5d) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/x86/include/asm/resctrl.h | 1 - arch/x86/kernel/cpu/resctrl/core.c | 9 +++++---- arch/x86/kernel/cpu/resctrl/monitor.c | 5 ----- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index b1dd5d6b87db4..575f8408a9e7c 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -44,7 +44,6 @@ DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); extern bool rdt_alloc_capable; extern bool rdt_mon_capable; -extern unsigned int rdt_mon_features; DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 1a319ce9328c7..5d14f9a14eda5 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -863,21 +863,22 @@ static __init bool get_rdt_alloc_resources(void) static __init bool get_rdt_mon_resources(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + bool ret 
= false; if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); - rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); + ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); - rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); + ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); - rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + ret = true; } - if (!rdt_mon_features) + if (!ret) return false; return !rdt_get_mon_l3_config(r); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index bd8b45359e245..c6156620b2f57 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -31,11 +31,6 @@ */ bool rdt_mon_capable; -/* - * Global to indicate which monitoring events are enabled. - */ -unsigned int rdt_mon_features; - #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) static int snc_nodes_per_l3_cache = 1; From adeca22e45bbd4408142ff4bc96dc62c66e6f108 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:03 -0500 Subject: [PATCH 034/247] x86,fs/resctrl: Prepare for more monitor events BugLink: https://bugs.launchpad.net/bugs/2122432 There's a rule in computer programming that objects appear zero, once, or many times. So code accordingly. There are two MBM events and resctrl is coded with a lot of if (local) do one thing if (total) do a different thing Change the rdt_mon_domain and rdt_hw_mon_domain structures to hold arrays of pointers to per event data instead of explicit fields for total and local bandwidth. Simplify by coding for many events using loops on which are enabled. Move resctrl_is_mbm_event() to so it can be used more widely. Also provide a for_each_mbm_event_id() helper macro. Cleanup variable names in functions touched to consistently use "eventid" for those with type enum resctrl_event_id. 
Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 83b039877310ae1eb614eef17b780df1e10d9fb5) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/x86/kernel/cpu/resctrl/core.c | 40 +++++++++++---------- arch/x86/kernel/cpu/resctrl/internal.h | 8 ++--- arch/x86/kernel/cpu/resctrl/monitor.c | 36 +++++++++---------- fs/resctrl/monitor.c | 13 ++++--- fs/resctrl/rdtgroup.c | 50 +++++++++++++------------- include/linux/resctrl.h | 23 +++++++++--- include/linux/resctrl_types.h | 3 ++ 7 files changed, 96 insertions(+), 77 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 5d14f9a14eda5..fbf019c1ff11b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -365,8 +365,10 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) { - kfree(hw_dom->arch_mbm_total); - kfree(hw_dom->arch_mbm_local); + int idx; + + for_each_mbm_idx(idx) + kfree(hw_dom->arch_mbm_states[idx]); kfree(hw_dom); } @@ -400,25 +402,27 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * */ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { - size_t tsize; - - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { - tsize = sizeof(*hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_total) - return -ENOMEM; - } - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { - tsize = sizeof(*hw_dom->arch_mbm_local); - hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_local) { - kfree(hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = NULL; - return -ENOMEM; - } + 
size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_states[idx]) + goto cleanup; } return 0; +cleanup: + for_each_mbm_idx(idx) { + kfree(hw_dom->arch_mbm_states[idx]); + hw_dom->arch_mbm_states[idx] = NULL; + } + + return -ENOMEM; } static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5e3c41b364373..58dca892a5df2 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -54,15 +54,15 @@ struct rdt_hw_ctrl_domain { * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share * a resource for a monitor function * @d_resctrl: Properties exposed to the resctrl file system - * @arch_mbm_total: arch private state for MBM total bandwidth - * @arch_mbm_local: arch private state for MBM local bandwidth + * @arch_mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct arch_mbm_state + * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. 
*/ struct rdt_hw_mon_domain { struct rdt_mon_domain d_resctrl; - struct arch_mbm_state *arch_mbm_total; - struct arch_mbm_state *arch_mbm_local; + struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c6156620b2f57..92b3d5403d0c4 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -161,18 +161,14 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do u32 rmid, enum resctrl_event_id eventid) { - switch (eventid) { - case QOS_L3_OCCUP_EVENT_ID: - return NULL; - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &hw_dom->arch_mbm_total[rmid]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &hw_dom->arch_mbm_local[rmid]; - default: - /* Never expect to get here */ - WARN_ON_ONCE(1); + struct arch_mbm_state *state; + + if (!resctrl_is_mbm_event(eventid)) return NULL; - } + + state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)]; + + return state ? 
&state[rmid] : NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -201,14 +197,16 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) - memset(hw_dom->arch_mbm_total, 0, - sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) - memset(hw_dom->arch_mbm_local, 0, - sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + memset(hw_dom->arch_mbm_states[idx], 0, + sizeof(*hw_dom->arch_mbm_states[0]) * r->num_rmid); + } } static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index b0b1dcc367955..e0dfa5fb969e4 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -346,15 +346,14 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct mbm_state *state; - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: + if (!resctrl_is_mbm_event(evtid)) return NULL; - } + + state = d->mbm_states[MBM_STATE_IDX(evtid)]; + + return state ? 
&state[idx] : NULL; } static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index a7eeb33501da7..77336d5e4915e 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -127,12 +127,6 @@ static bool resctrl_is_mbm_enabled(void) resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)); } -static bool resctrl_is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); -} - /* * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap * of free CLOSIDs. @@ -4023,9 +4017,13 @@ static void rdtgroup_setup_default(void) static void domain_destroy_mon_state(struct rdt_mon_domain *d) { + int idx; + bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } } void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -4085,32 +4083,34 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; + size_t tsize = sizeof(*d->mbm_states[0]); + enum resctrl_event_id eventid; + int idx; if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } - } - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } + + 
for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + d->mbm_states[idx] = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_states[idx]) + goto cleanup; } return 0; +cleanup: + bitmap_free(d->rmid_busy_llc); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } + + return -ENOMEM; } int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 40aba6b5d4f08..478d7a935ca36 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -161,8 +161,9 @@ struct rdt_ctrl_domain { * @hdr: common header for different domain types * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold - * @mbm_total: saved state for MBM total bandwidth - * @mbm_local: saved state for MBM local bandwidth + * @mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct mbm_state + * indexed by RMID on x86 or combined CLOSID, RMID on Arm. 
* @mbm_over: worker to periodically read MBM h/w counters * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters @@ -172,8 +173,7 @@ struct rdt_mon_domain { struct rdt_domain_hdr hdr; unsigned int ci_id; unsigned long *rmid_busy_llc; - struct mbm_state *mbm_total; - struct mbm_state *mbm_local; + struct mbm_state *mbm_states[QOS_NUM_L3_MBM_EVENTS]; struct delayed_work mbm_over; struct delayed_work cqm_limbo; int mbm_work_cpu; @@ -376,6 +376,21 @@ bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); +static inline bool resctrl_is_mbm_event(enum resctrl_event_id eventid) +{ + return (eventid >= QOS_L3_MBM_TOTAL_EVENT_ID && + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/* Iterate over all memory bandwidth events */ +#define for_each_mbm_event_id(eventid) \ + for (eventid = QOS_L3_MBM_TOTAL_EVENT_ID; \ + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID; eventid++) + +/* Iterate over memory bandwidth arrays in domain structures */ +#define for_each_mbm_idx(idx) \ + for (idx = 0; idx < QOS_NUM_L3_MBM_EVENTS; idx++) + /** * resctrl_arch_mon_event_config_write() - Write the config for an event. 
* @config_info: struct resctrl_mon_config_info describing the resource, domain diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index 2dadbc54e4b35..d98351663c2c6 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -51,4 +51,7 @@ enum resctrl_event_id { QOS_NUM_EVENTS, }; +#define QOS_NUM_L3_MBM_EVENTS (QOS_L3_MBM_LOCAL_EVENT_ID - QOS_L3_MBM_TOTAL_EVENT_ID + 1) +#define MBM_STATE_IDX(evt) ((evt) - QOS_L3_MBM_TOTAL_EVENT_ID) + #endif /* __LINUX_RESCTRL_TYPES_H */ From c9b8bb8d693da99f2b50357197f4f908258ba323 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:04 -0500 Subject: [PATCH 035/247] x86/cpufeatures: Add support for Assignable Bandwidth Monitoring Counters (ABMC) BugLink: https://bugs.launchpad.net/bugs/2122432 Users can create as many monitor groups as RMIDs supported by the hardware. However, the bandwidth monitoring feature on AMD only guarantees that RMIDs currently assigned to a processor will be tracked by hardware. The counters of any other RMIDs which are no longer being tracked will be reset to zero. The MBM event counters return "Unavailable" for the RMIDs that are not tracked by hardware. So, there can be only limited number of groups that can give guaranteed monitoring numbers. With ever changing configurations there is no way to definitely know which of these groups are being tracked during a particular time. Users do not have the option to monitor a group or set of groups for a certain period of time without worrying about RMID being reset in between. The ABMC feature allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. There is no need to worry about counters being reset during this period. Additionally, the user can specify the type of memory transactions (e.g., reads, writes) for the counter to track. 
Without ABMC enabled, monitoring will work in current mode without assignment option. The Linux resctrl subsystem provides an interface that allows monitoring of up to two memory bandwidth events per group, selected from a combination of available total and local events. When ABMC is enabled, two events will be assigned to each group by default, in line with the current interface design. Users will also have the option to configure which types of memory transactions are counted by these events. Due to the limited number of available counters (32), users may quickly exhaust the available counters. If the system runs out of assignable ABMC counters, the kernel will report an error. In such cases, users will need to unassign one or more active counters to free up counters for new assignments. resctrl will provide options to assign or unassign events through the group-specific interface file. The feature is detected via CPUID_Fn80000020_EBX_x00 bit 5: ABMC (Assignable Bandwidth Monitoring Counters). The ABMC feature details are documented in APM [1] available from [2]. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming Publication # 24593 Revision 3.41 section 19.3.3.3 Assignable Bandwidth Monitoring (ABMC). [ bp: Massage commit message, fixup enumeration due to VMSCAPE ] Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] (cherry picked from commit e19c06219985f2beb9d71959d80f56e318abf744) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 751ca35386b0e..b2a562217d3ff 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -496,6 +496,7 @@ #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ +#define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ /* * BUG word(s) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 6b868afb26c31..4cee6213d6673 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -51,6 +51,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 }, { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 }, { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 }, { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 }, { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, From 33c441fbd29e6379e8784d31bc500208aacac156 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:05 -0500 Subject: [PATCH 036/247] x86/resctrl: Add ABMC feature in the command line options BugLink: https://bugs.launchpad.net/bugs/2122432 Add a kernel command-line parameter to enable or disable the exposure of the ABMC (Assignable Bandwidth Monitoring Counters) hardware feature to resctrl. 
Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit bebf57bf054b561a62f3440142b2eddab2b0bbff) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/filesystems/resctrl.rst | 1 + arch/x86/kernel/cpu/resctrl/core.c | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 330f8560c6c14..44a70e1ab59a2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6173,7 +6173,7 @@ rdt= [HW,X86,RDT] Turn on/off individual RDT features. List is: cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp, - mba, smba, bmec. + mba, smba, bmec, abmc. E.g. to turn on cmt and turn off mba use: rdt=cmt,!mba diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index c7949dd44f2f3..c97fd77a107dc 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -26,6 +26,7 @@ MBM (Memory Bandwidth Monitoring) "cqm_mbm_total", "cqm_mbm_local" MBA (Memory Bandwidth Allocation) "mba" SMBA (Slow Memory Bandwidth Allocation) "" BMEC (Bandwidth Monitoring Event Configuration) "" +ABMC (Assignable Bandwidth Monitoring Counters) "" =============================================== ================================ Historically, new features were made visible by default in /proc/cpuinfo. 
This diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index fbf019c1ff11b..b07b12a058862 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -711,6 +711,7 @@ enum { RDT_FLAG_MBA, RDT_FLAG_SMBA, RDT_FLAG_BMEC, + RDT_FLAG_ABMC, }; #define RDT_OPT(idx, n, f) \ @@ -736,6 +737,7 @@ static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), + RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) From f11bd5c744eb47fd94e7e4064afff1fc10649fbc Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:06 -0500 Subject: [PATCH 037/247] x86,fs/resctrl: Consolidate monitoring related data from rdt_resource BugLink: https://bugs.launchpad.net/bugs/2122432 The cache allocation and memory bandwidth allocation feature properties are consolidated into struct resctrl_cache and struct resctrl_membw respectively. In preparation for more monitoring properties that will clobber the existing resource struct more, re-organize the monitoring specific properties to also be in a separate structure. Also convert "bandwidth sources" terminology to "memory transactions" to have consistency within resctrl for related monitoring features. [ bp: Massage commit message. ] Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 5ad68c8f965fed78c61f2ac7aea933f06bb50032) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- arch/x86/kernel/cpu/resctrl/monitor.c | 10 +++++----- fs/resctrl/rdtgroup.c | 6 +++--- include/linux/resctrl.h | 18 +++++++++++++----- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index b07b12a058862..267e9206a9992 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -107,7 +107,7 @@ u32 resctrl_arch_system_num_rmid_idx(void) struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return r->num_rmid; + return r->mon.num_rmid; } struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) @@ -541,7 +541,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { mon_domain_free(hw_dom); return; } diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 92b3d5403d0c4..e50d8516d9a72 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -130,7 +130,7 @@ static int logical_rmid_to_physical_rmid(int cpu, int lrmid) if (snc_nodes_per_l3_cache == 1) return lrmid; - return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid; } static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) @@ -205,7 +205,7 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * continue; idx = MBM_STATE_IDX(eventid); memset(hw_dom->arch_mbm_states[idx], 0, - sizeof(*hw_dom->arch_mbm_states[0]) * r->num_rmid); + sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid); } } @@ -358,7 +358,7 @@ int __init rdt_get_mon_l3_config(struct 
rdt_resource *r) resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; - r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; + r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) @@ -373,7 +373,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - threshold = resctrl_rmid_realloc_limit / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid; /* * Because num_rmid may not be a power of two, round the value @@ -387,7 +387,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); - r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } r->mon_capable = true; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 77336d5e4915e..ca0475b75390c 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1135,7 +1135,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of, { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - seq_printf(seq, "%d\n", r->num_rmid); + seq_printf(seq, "%d\n", r->mon.num_rmid); return 0; } @@ -1731,9 +1731,9 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) } /* Value from user cannot be more than the supported set of events */ - if ((val & r->mbm_cfg_mask) != val) { + if ((val & r->mon.mbm_cfg_mask) != val) { rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - r->mbm_cfg_mask); + r->mon.mbm_cfg_mask); return -EINVAL; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 478d7a935ca36..fe2af6cb96d4b 100644 --- a/include/linux/resctrl.h +++ 
b/include/linux/resctrl.h @@ -255,38 +255,46 @@ enum resctrl_schema_fmt { RESCTRL_SCHEMA_RANGE, }; +/** + * struct resctrl_mon - Monitoring related data of a resctrl resource. + * @num_rmid: Number of RMIDs available. + * @mbm_cfg_mask: Memory transactions that can be tracked when bandwidth + * monitoring events can be configured. + */ +struct resctrl_mon { + int num_rmid; + unsigned int mbm_cfg_mask; +}; + /** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine - * @num_rmid: Number of RMIDs available * @ctrl_scope: Scope of this resource for control functions * @mon_scope: Scope of this resource for monitor functions * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. + * @mon: Monitoring related data. * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. - * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth - * monitoring events can be configured. 
* @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { int rid; bool alloc_capable; bool mon_capable; - int num_rmid; enum resctrl_scope ctrl_scope; enum resctrl_scope mon_scope; struct resctrl_cache cache; struct resctrl_membw membw; + struct resctrl_mon mon; struct list_head ctrl_domains; struct list_head mon_domains; char *name; enum resctrl_schema_fmt schema_fmt; - unsigned int mbm_cfg_mask; bool cdp_capable; }; From 2cacbfe2a811423b15356c961283c17ef4a5783a Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:07 -0500 Subject: [PATCH 038/247] x86,fs/resctrl: Detect Assignable Bandwidth Monitoring feature details BugLink: https://bugs.launchpad.net/bugs/2122432 ABMC feature details are reported via CPUID Fn8000_0020_EBX_x5. Bits Description 15:0 MAX_ABMC Maximum Supported Assignable Bandwidth Monitoring Counter ID + 1 The ABMC feature details are documented in APM [1] available from [2]. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming Publication # 24593 Revision 3.41 section 19.3.3.3 Assignable Bandwidth Monitoring (ABMC). Detect the feature and number of assignable counters supported. For backward compatibility, upon detecting the assignable counter feature, enable the mbm_total_bytes and mbm_local_bytes events that users are familiar with as part of original L3 MBM support. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] (cherry picked from commit 13390861b426e936db20d675804a5b405622bc79) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- arch/x86/kernel/cpu/resctrl/core.c | 7 +++++-- arch/x86/kernel/cpu/resctrl/monitor.c | 11 ++++++++--- fs/resctrl/monitor.c | 7 +++++++ include/linux/resctrl.h | 4 ++++ 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 267e9206a9992..2e68aa02ad3f4 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -883,6 +883,8 @@ static __init bool get_rdt_mon_resources(void) resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); ret = true; } + if (rdt_cpu_has(X86_FEATURE_ABMC)) + ret = true; if (!ret) return false; @@ -978,7 +980,7 @@ static enum cpuhp_state rdt_online; /* Runs once on the BSP during boot. */ void resctrl_cpu_detect(struct cpuinfo_x86 *c) { - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; c->x86_cache_mbm_width_offset = -1; @@ -990,7 +992,8 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) || + cpu_has(c, X86_FEATURE_ABMC)) { u32 eax, ebx, ecx, edx; /* QoS sub-leaf, EAX=0Fh, ECX=1 */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index e50d8516d9a72..62fc2a81f3430 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -353,6 +353,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int threshold; + u32 eax, ebx, ecx, edx; snc_nodes_per_l3_cache = snc_get_config(); @@ -382,14 +383,18 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) */ resctrl_rmid_realloc_threshold = 
resctrl_arch_round_mon_val(threshold); - if (rdt_cpu_has(X86_FEATURE_BMEC)) { - u32 eax, ebx, ecx, edx; - + if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) { /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } + if (rdt_cpu_has(X86_FEATURE_ABMC)) { + r->mon.mbm_cntr_assignable = true; + cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); + r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; + } + r->mon_capable = true; return 0; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index e0dfa5fb969e4..b578451de2b50 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -922,6 +922,13 @@ int resctrl_mon_resource_init(void) else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; + if (r->mon.mbm_cntr_assignable) { + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + } + return 0; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index fe2af6cb96d4b..eb80cc233be43 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -260,10 +260,14 @@ enum resctrl_schema_fmt { * @num_rmid: Number of RMIDs available. * @mbm_cfg_mask: Memory transactions that can be tracked when bandwidth * monitoring events can be configured. + * @num_mbm_cntrs: Number of assignable counters. + * @mbm_cntr_assignable:Is system capable of supporting counter assignment? 
*/ struct resctrl_mon { int num_rmid; unsigned int mbm_cfg_mask; + int num_mbm_cntrs; + bool mbm_cntr_assignable; }; /** From 5bfb45670aa94a71da569ddeb07e3435bcbfbe20 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:08 -0500 Subject: [PATCH 039/247] x86/resctrl: Add support to enable/disable AMD ABMC feature BugLink: https://bugs.launchpad.net/bugs/2122432 Add the functionality to enable/disable the AMD ABMC feature. The AMD ABMC feature is enabled by setting enabled bit(0) in the L3_QOS_EXT_CFG MSR. When the state of ABMC is changed, the MSR needs to be updated on all the logical processors in the QOS Domain. Hardware counters will reset when ABMC state is changed. [ bp: Massage commit message. ] Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] (cherry picked from commit faebbc58cde9d8f6050ac152c34c88195ed4abaa) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/resctrl/internal.h | 5 +++ arch/x86/kernel/cpu/resctrl/monitor.c | 45 ++++++++++++++++++++++++++ include/linux/resctrl.h | 20 ++++++++++++ 4 files changed, 71 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 20fa4a79df137..ac4933363905c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1224,6 +1224,7 @@ /* - AMD: */ #define MSR_IA32_MBA_BW_BASE 0xc0000200 #define MSR_IA32_SMBA_BW_BASE 0xc0000280 +#define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff #define MSR_IA32_EVT_CFG_BASE 0xc0000400 /* AMD-V MSRs */ diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 58dca892a5df2..a79a487e639c0 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -37,6 +37,9 @@ struct arch_mbm_state { u64 prev_msr; }; +/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */ +#define ABMC_ENABLE_BIT 0 + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function @@ -102,6 +105,7 @@ struct msr_param { * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. * @cdp_enabled: CDP state of this resource + * @mbm_cntr_assign_enabled: ABMC feature is enabled * * Members of this structure are either private to the architecture * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. 
@@ -115,6 +119,7 @@ struct rdt_hw_resource { unsigned int mon_scale; unsigned int mbm_width; bool cdp_enabled; + bool mbm_cntr_assign_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 62fc2a81f3430..b1ef5940b9e99 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -413,3 +413,48 @@ void __init intel_rdt_mbm_apply_quirk(void) mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; mbm_cf = mbm_cf_table[cf_index].cf; } + +static void resctrl_abmc_set_one_amd(void *arg) +{ + bool *enable = arg; + + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); +} + +/* + * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs + * associated with all monitor domains. + */ +static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd, + &enable, 1); + resctrl_arch_reset_rmid_all(r, d); + } +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (r->mon.mbm_cntr_assignable && + hw_res->mbm_cntr_assign_enabled != enable) { + _resctrl_abmc_enable(r, enable); + hw_res->mbm_cntr_assign_enabled = enable; + } + + return 0; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; +} diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index eb80cc233be43..919806122c509 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -445,6 +445,26 @@ static inline u32 resctrl_get_config_index(u32 closid, bool 
resctrl_arch_get_cdp_enabled(enum resctrl_res_level l); int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); +/** + * resctrl_arch_mbm_cntr_assign_enabled() - Check if MBM counter assignment + * mode is enabled. + * @r: Pointer to the resource structure. + * + * Return: + * true if the assignment mode is enabled, false otherwise. + */ +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); + +/** + * resctrl_arch_mbm_cntr_assign_set() - Configure the MBM counter assignment mode. + * @r: Pointer to the resource structure. + * @enable: Set to true to enable, false to disable the assignment mode. + * + * Return: + * 0 on success, < 0 on error. + */ +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. From ee5ef089ecfd8e9574e335a325b4f28c9a87bb9a Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:09 -0500 Subject: [PATCH 040/247] fs/resctrl: Introduce the interface to display monitoring modes BugLink: https://bugs.launchpad.net/bugs/2122432 Introduce the resctrl file "mbm_assign_mode" to list the supported counter assignment modes. The "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Each event within a resctrl group can be assigned independently in this mode. On AMD systems "mbm_event" mode is backed by the ABMC (Assignable Bandwidth Monitoring Counters) hardware feature and is enabled by default. The "default" mode is the existing mode that works without the explicit counter assignment, instead relying on dynamic counter assignment by hardware that may result in hardware not dedicating a counter resulting in monitoring data reads returning "Unavailable". 
Provide an interface to display the monitor modes on the system. $ cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode [mbm_event] default Add IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED) check to support Arm64. On x86, CONFIG_RESCTRL_ASSIGN_FIXED is not defined. On Arm64, it will be defined when the "mbm_event" mode is supported. Add IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED) check early to ensure the user interface remains compatible with upcoming Arm64 support. IS_ENABLED() safely evaluates to 0 when the configuration is not defined. As a result, for MPAM, the display would be either: [default] or [mbm_event] Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 3b497c3f4f0427d940ec5c8600e840c8adc5cfbf) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- Documentation/filesystems/resctrl.rst | 31 +++++++++++++++++++++++++++ fs/resctrl/internal.h | 4 ++++ fs/resctrl/monitor.c | 30 ++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 9 +++++++- 4 files changed, 73 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index c97fd77a107dc..b692829fec5f6 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -257,6 +257,37 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/mbm_local_bytes_config 0=0x30;1=0x30;3=0x15;4=0x15 +"mbm_assign_mode": + The supported counter assignment modes. The enclosed brackets indicate which mode + is enabled. + :: + + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + [mbm_event] + default + + "mbm_event": + + mbm_event mode allows users to assign a hardware counter to an RMID, event + pair and monitor the bandwidth usage as long as it is assigned. The hardware + continues to track the assigned counter until it is explicitly unassigned by + the user. 
Each event within a resctrl group can be assigned independently. + + In this mode, a monitoring event can only accumulate data while it is backed + by a hardware counter. Use "mbm_L3_assignments" found in each CTRL_MON and MON + group to specify which of the events should have a counter assigned. The number + of counters available is described in the "num_mbm_cntrs" file. Changing the + mode may cause all counters on the resource to reset. + + "default": + + In default mode, resctrl assumes there is a hardware counter for each + event within every CTRL_MON and MON group. On AMD platforms, it is + recommended to use the mbm_event mode, if supported, to prevent reset of MBM + events between reads resulting from hardware re-allocating counters. This can + result in misleading values or display "Unavailable" if no counter is assigned + to the event. + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 7a57366d1abce..78aeb7ea38af2 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -382,6 +382,10 @@ bool closid_allocated(unsigned int closid); int resctrl_find_cleanest_closid(void); +void *rdt_kn_parent_priv(struct kernfs_node *kn); + +int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index b578451de2b50..96231d517e711 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -882,6 +882,36 @@ bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) mon_event_all[eventid].enabled; } +int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool enabled; + + mutex_lock(&rdtgroup_mutex); + enabled = 
resctrl_arch_mbm_cntr_assign_enabled(r); + + if (r->mon.mbm_cntr_assignable) { + if (enabled) + seq_puts(s, "[mbm_event]\n"); + else + seq_puts(s, "[default]\n"); + + if (!IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED)) { + if (enabled) + seq_puts(s, "default\n"); + else + seq_puts(s, "mbm_event\n"); + } + } else { + seq_puts(s, "[default]\n"); + } + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. * diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index ca0475b75390c..90bf57910aae7 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -975,7 +975,7 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of, return 0; } -static void *rdt_kn_parent_priv(struct kernfs_node *kn) +void *rdt_kn_parent_priv(struct kernfs_node *kn) { /* * The parent pointer is only valid within RCU section since it can be @@ -1911,6 +1911,13 @@ static struct rftype res_common_files[] = { .seq_show = mbm_local_bytes_config_show, .write = mbm_local_bytes_config_write, }, + { + .name = "mbm_assign_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_mode_show, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, + }, { .name = "cpus", .mode = 0644, From 6b1c5d644db487c631898ddffac69f08498b84c4 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:10 -0500 Subject: [PATCH 041/247] fs/resctrl: Add resctrl file to display number of assignable counters BugLink: https://bugs.launchpad.net/bugs/2122432 The "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Create 'num_mbm_cntrs' resctrl file that displays the number of counters supported in each domain. 'num_mbm_cntrs' is only visible to user space when the system supports "mbm_event" mode. 
Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 8c793336eaf8893a29626155d74615fe9f03e7f2) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- Documentation/filesystems/resctrl.rst | 11 +++++++++++ fs/resctrl/internal.h | 2 ++ fs/resctrl/monitor.c | 26 ++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 6 ++++++ 4 files changed, 45 insertions(+) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index b692829fec5f6..4eb27530be6ff 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -288,6 +288,17 @@ with the following files: result in misleading values or display "Unavailable" if no counter is assigned to the event. +"num_mbm_cntrs": + The maximum number of counters (total of available and assigned counters) in + each domain when the system supports mbm_event mode. 
+ + For example, on a system with maximum of 32 memory bandwidth monitoring + counters in each of its L3 domains: + :: + + # cat /sys/fs/resctrl/info/L3_MON/num_mbm_cntrs + 0=32;1=32 + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 78aeb7ea38af2..7a12187eced82 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -386,6 +386,8 @@ void *rdt_kn_parent_priv(struct kernfs_node *kn); int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 96231d517e711..112979e9c4445 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -912,6 +912,30 @@ int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, return 0; } +int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *dom; + bool sep = false; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + seq_printf(s, "%d=%d", dom->hdr.id, r->mon.num_mbm_cntrs); + sep = true; + } + seq_putc(s, '\n'); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return 0; +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. 
* @@ -957,6 +981,8 @@ int resctrl_mon_resource_init(void) resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + resctrl_file_fflags_init("num_mbm_cntrs", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 90bf57910aae7..9b97cadbb5c58 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1836,6 +1836,12 @@ static struct rftype res_common_files[] = { .seq_show = rdt_default_ctrl_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, + { + .name = "num_mbm_cntrs", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_num_mbm_cntrs_show, + }, { .name = "min_cbm_bits", .mode = 0444, From 6f5bf920ac257c47451c57e2c9496c19f79cda3b Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:11 -0500 Subject: [PATCH 042/247] fs/resctrl: Introduce mbm_cntr_cfg to track assignable counters per domain BugLink: https://bugs.launchpad.net/bugs/2122432 The "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Counters are assigned/unassigned at monitoring domain level. Manage a monitoring domain's hardware counters using a per monitoring domain array of struct mbm_cntr_cfg that is indexed by the hardware counter ID. A hardware counter's configuration contains the MBM event ID and points to the monitoring group that it is assigned to, with a NULL pointer meaning that the hardware counter is available for assignment. There is no direct way to determine which hardware counters are assigned to a particular monitoring group. 
Check every entry of every hardware counter configuration array in every monitoring domain to query which MBM events of a monitoring group is tracked by hardware. Such queries are acceptable because of a very small number of assignable counters (32 to 64). Suggested-by: Peter Newman Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 4d32c24a74f2c12ff440d381ba01de574f6631ce) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- fs/resctrl/rdtgroup.c | 8 ++++++++ include/linux/resctrl.h | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 9b97cadbb5c58..ce1c894dab180 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -4032,6 +4032,7 @@ static void domain_destroy_mon_state(struct rdt_mon_domain *d) { int idx; + kfree(d->cntr_cfg); bitmap_free(d->rmid_busy_llc); for_each_mbm_idx(idx) { kfree(d->mbm_states[idx]); @@ -4115,6 +4116,13 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain goto cleanup; } + if (resctrl_is_mbm_enabled() && r->mon.mbm_cntr_assignable) { + tsize = sizeof(*d->cntr_cfg); + d->cntr_cfg = kcalloc(r->mon.num_mbm_cntrs, tsize, GFP_KERNEL); + if (!d->cntr_cfg) + goto cleanup; + } + return 0; cleanup: bitmap_free(d->rmid_busy_llc); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 919806122c509..e013caba66414 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -156,6 +156,18 @@ struct rdt_ctrl_domain { u32 *mbps_val; }; +/** + * struct mbm_cntr_cfg - Assignable counter configuration. + * @evtid: MBM event to which the counter is assigned. Only valid + * if @rdtgrp is not NULL. + * @rdtgrp: resctrl group assigned to the counter. NULL if the + * counter is free. 
+ */ +struct mbm_cntr_cfg { + enum resctrl_event_id evtid; + struct rdtgroup *rdtgrp; +}; + /** * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types @@ -168,6 +180,8 @@ struct rdt_ctrl_domain { * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters * @cqm_work_cpu: worker CPU for CQM h/w counters + * @cntr_cfg: array of assignable counters' configuration (indexed + * by counter ID) */ struct rdt_mon_domain { struct rdt_domain_hdr hdr; @@ -178,6 +192,7 @@ struct rdt_mon_domain { struct delayed_work cqm_limbo; int mbm_work_cpu; int cqm_work_cpu; + struct mbm_cntr_cfg *cntr_cfg; }; /** From 284a74fd51dd9ca69f485bc49ec864edcfc82d71 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:12 -0500 Subject: [PATCH 043/247] fs/resctrl: Introduce interface to display number of free MBM counters BugLink: https://bugs.launchpad.net/bugs/2122432 Introduce the "available_mbm_cntrs" resctrl file to display the number of counters available for assignment in each domain when "mbm_event" mode is enabled. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 16ff6b038fb3b64aa033efdc95d673239610e1a6) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- Documentation/filesystems/resctrl.rst | 11 +++++++ fs/resctrl/internal.h | 3 ++ fs/resctrl/monitor.c | 44 +++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 6 ++++ 4 files changed, 64 insertions(+) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 4eb27530be6ff..446736dbd97f1 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -299,6 +299,17 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/num_mbm_cntrs 0=32;1=32 +"available_mbm_cntrs": + The number of counters available for assignment in each domain when mbm_event + mode is enabled on the system. + + For example, on a system with 30 available [hardware] assignable counters + in each of its L3 domains: + :: + + # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs + 0=30;1=30 + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 7a12187eced82..4f372e80bf373 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -388,6 +388,9 @@ int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, + void *v); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 112979e9c4445..1fa82a62b2e57 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -936,6 +936,48 @@ int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, return 0; } +int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *dom; + bool sep = false; + u32 
cntrs, i; + int ret = 0; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + cntrs = 0; + for (i = 0; i < r->mon.num_mbm_cntrs; i++) { + if (!dom->cntr_cfg[i].rdtgrp) + cntrs++; + } + + seq_printf(s, "%d=%u", dom->hdr.id, cntrs); + sep = true; + } + seq_putc(s, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret; +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. * @@ -983,6 +1025,8 @@ int resctrl_mon_resource_init(void) resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("available_mbm_cntrs", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index ce1c894dab180..8eaad45b28ead 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1822,6 +1822,12 @@ static struct rftype res_common_files[] = { .seq_show = rdt_mon_features_show, .fflags = RFTYPE_MON_INFO, }, + { + .name = "available_mbm_cntrs", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_available_mbm_cntrs_show, + }, { .name = "num_rmids", .mode = 0444, From 82d98e11a70d81f518d0b3ce68023c1091a8fa65 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:13 -0500 Subject: [PATCH 044/247] x86/resctrl: Add data structures and definitions for ABMC assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BugLink: https://bugs.launchpad.net/bugs/2122432 The ABMC feature allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. 
The hardware continues to track the assigned counter until it is explicitly unassigned by the user. The ABMC feature implements an MSR L3_QOS_ABMC_CFG (C000_03FDh). ABMC counter assignment is done by setting the counter id, bandwidth source (RMID) and bandwidth configuration. Attempts to read or write the MSR when ABMC is not enabled will result in a #GP(0) exception. Introduce the data structures and definitions for MSR L3_QOS_ABMC_CFG (0xC000_03FDh): ========================================================================= Bits Mnemonic Description Access Reset Type Value ========================================================================= 63 CfgEn Configuration Enable R/W 0 62 CtrEn Enable/disable counting R/W 0 61:53 – Reserved MBZ 0 52:48 CtrID Counter Identifier R/W 0 47 IsCOS BwSrc field is a CLOSID R/W 0 (not an RMID) 46:44 – Reserved MBZ 0 43:32 BwSrc Bandwidth Source R/W 0 (RMID or CLOSID) 31:0 BwType Bandwidth configuration R/W 0 tracked by the CtrID ========================================================================== The ABMC feature details are documented in APM [1] available from [2]. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming Publication # 24593 Revision 3.41 section 19.3.3.3 Assignable Bandwidth Monitoring (ABMC). [ bp: Touchups. ] Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] (cherry picked from commit 84ecefb766748916099f5b7444a973a623611d63) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/resctrl/internal.h | 36 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ac4933363905c..97420e1c92e1a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1224,6 +1224,7 @@ /* - AMD: */ #define MSR_IA32_MBA_BW_BASE 0xc0000200 #define MSR_IA32_SMBA_BW_BASE 0xc0000280 +#define MSR_IA32_L3_QOS_ABMC_CFG 0xc00003fd #define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff #define MSR_IA32_EVT_CFG_BASE 0xc0000400 diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index a79a487e639c0..0444fea49b11f 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -164,6 +164,42 @@ union cpuid_0x10_x_edx { unsigned int full; }; +/* + * ABMC counters are configured by writing to MSR_IA32_L3_QOS_ABMC_CFG. + * + * @bw_type : Event configuration that represents the memory + * transactions being tracked by the @cntr_id. + * @bw_src : Bandwidth source (RMID or CLOSID). + * @reserved1 : Reserved. + * @is_clos : @bw_src field is a CLOSID (not an RMID). + * @cntr_id : Counter identifier. + * @reserved : Reserved. + * @cntr_en : Counting enable bit. + * @cfg_en : Configuration enable bit. + * + * Configuration and counting: + * Counter can be configured across multiple writes to MSR. Configuration + * is applied only when @cfg_en = 1. Counter @cntr_id is reset when the + * configuration is applied. + * @cfg_en = 1, @cntr_en = 0 : Apply @cntr_id configuration but do not + * count events. + * @cfg_en = 1, @cntr_en = 1 : Apply @cntr_id configuration and start + * counting events. 
+ */ +union l3_qos_abmc_cfg { + struct { + unsigned long bw_type :32, + bw_src :12, + reserved1: 3, + is_clos : 1, + cntr_id : 5, + reserved : 9, + cntr_en : 1, + cfg_en : 1; + } split; + unsigned long full; +}; + void rdt_ctrl_update(void *arg); int rdt_get_mon_l3_config(struct rdt_resource *r); From eb706ab132fdc5ea2bb8fc4f7ccb673203ae89d3 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:14 -0500 Subject: [PATCH 045/247] fs/resctrl: Introduce event configuration field in struct mon_evt BugLink: https://bugs.launchpad.net/bugs/2122432 When supported, mbm_event counter assignment mode allows the user to configure events to track specific types of memory transactions. Introduce an evt_cfg field in struct mon_evt to define the type of memory transactions tracked by a monitoring event. Also add a helper function to get the evt_cfg value. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit ebebda853633de389ba2c6737f8ca38405713e90) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- fs/resctrl/internal.h | 5 +++++ fs/resctrl/monitor.c | 10 ++++++++++ include/linux/resctrl.h | 2 ++ 3 files changed, 17 insertions(+) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 4f372e80bf373..1cddfff007a24 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -56,6 +56,10 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) * @evtid: event id * @rid: resource id for this event * @name: name of the event + * @evt_cfg: Event configuration value that represents the + * memory transactions (e.g., READS_TO_LOCAL_MEM, + * READS_TO_REMOTE_MEM) being tracked by @evtid. + * Only valid if @evtid is an MBM event. 
* @configurable: true if the event is configurable * @enabled: true if the event is enabled */ @@ -63,6 +67,7 @@ struct mon_evt { enum resctrl_event_id evtid; enum resctrl_res_level rid; char *name; + u32 evt_cfg; bool configurable; bool enabled; }; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 1fa82a62b2e57..f714e7baaea65 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -882,6 +882,11 @@ bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) mon_event_all[eventid].enabled; } +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) +{ + return mon_event_all[evtid].evt_cfg; +} + int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { @@ -1023,6 +1028,11 @@ int resctrl_mon_resource_init(void) resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("available_mbm_cntrs", diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index e013caba66414..87daa4ca312dd 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -409,6 +409,8 @@ static inline bool resctrl_is_mbm_event(enum resctrl_event_id eventid) eventid <= QOS_L3_MBM_LOCAL_EVENT_ID); } +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id eventid); + /* Iterate over all memory bandwidth events */ #define for_each_mbm_event_id(eventid) \ for (eventid = QOS_L3_MBM_TOTAL_EVENT_ID; \ From 60e2f0c76f1a168822271c31a1dc3e811b48b162 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:15 -0500 Subject: [PATCH 046/247] x86,fs/resctrl: Implement resctrl_arch_config_cntr() 
to assign a counter with ABMC BugLink: https://bugs.launchpad.net/bugs/2122432 The ABMC feature allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Implement an x86 architecture-specific handler to configure a counter. This architecture specific handler is called by resctrl fs when a counter is assigned or unassigned as well as when an already assigned counter's configuration should be updated. Configure counters by writing to the L3_QOS_ABMC_CFG MSR, specifying the counter ID, bandwidth source (RMID), and event configuration. The ABMC feature details are documented in APM [1] available from [2]. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming Publication # 24593 Revision 3.41 section 19.3.3.3 Assignable Bandwidth Monitoring (ABMC). Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] (cherry picked from commit f7a4fb22312646329ba21bc58958fd83fb9fc15d) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- arch/x86/kernel/cpu/resctrl/monitor.c | 36 +++++++++++++++++++++++++++ include/linux/resctrl.h | 19 ++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index b1ef5940b9e99..a5ba079948329 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -458,3 +458,39 @@ bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; } + +static void resctrl_abmc_config_one_amd(void *info) +{ + union l3_qos_abmc_cfg *abmc_cfg = info; + + wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full); +} + +/* + * Send an IPI to the domain to assign the counter to RMID, event pair. + */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + union l3_qos_abmc_cfg abmc_cfg = { 0 }; + struct arch_mbm_state *am; + + abmc_cfg.split.cfg_en = 1; + abmc_cfg.split.cntr_en = assign ? 1 : 0; + abmc_cfg.split.cntr_id = cntr_id; + abmc_cfg.split.bw_src = rmid; + if (assign) + abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid); + + smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1); + + /* + * The hardware counter is reset (because cfg_en == 1) so there is no + * need to record initial non-zero counts. 
+ */ + am = get_arch_mbm_state(hw_dom, rmid, evtid); + if (am) + memset(am, 0, sizeof(*am)); +} diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 87daa4ca312dd..50e38445183a8 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -594,6 +594,25 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * */ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); +/** + * resctrl_arch_config_cntr() - Configure the counter with its new RMID + * and event details. + * @r: Resource structure. + * @d: The domain in which counter with ID @cntr_id should be configured. + * @evtid: Monitoring event type (e.g., QOS_L3_MBM_TOTAL_EVENT_ID + * or QOS_L3_MBM_LOCAL_EVENT_ID). + * @rmid: RMID. + * @closid: CLOSID. + * @cntr_id: Counter ID to configure. + * @assign: True to assign the counter or update an existing assignment, + * false to unassign the counter. + * + * This can be called from any CPU. + */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; From 26f86009b302161840263fc23bf4fb3e31f80b11 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:16 -0500 Subject: [PATCH 047/247] fs/resctrl: Add the functionality to assign MBM events BugLink: https://bugs.launchpad.net/bugs/2122432 When supported, "mbm_event" counter assignment mode offers "num_mbm_cntrs" number of counters that can be assigned to RMID, event pairs and monitor bandwidth usage as long as it is assigned. Add the functionality to allocate and assign a counter to an RMID, event pair in the domain. Also, add the helper rdtgroup_assign_cntrs() to assign counters in the group. Log the error message "Failed to allocate counter for in domain " in /sys/fs/resctrl/info/last_cmd_status if all the counters are in use. 
Exit on the first failure when assigning counters across all the domains. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit bd85310efd71b9e7809e1b95fe7a60fde42e62db) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- fs/resctrl/internal.h | 2 + fs/resctrl/monitor.c | 156 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 1cddfff007a24..762705d7eb8db 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -396,6 +396,8 @@ int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index f714e7baaea65..106e9bdb8a9d8 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -356,6 +356,55 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, return state ? &state[idx] : NULL; } +/* + * mbm_cntr_get() - Return the counter ID for the matching @evtid and @rdtgrp. + * + * Return: + * Valid counter ID on success, or -ENOENT on failure. 
+ */ +static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) +{ + int cntr_id; + + if (!r->mon.mbm_cntr_assignable) + return -ENOENT; + + if (!resctrl_is_mbm_event(evtid)) + return -ENOENT; + + for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) { + if (d->cntr_cfg[cntr_id].rdtgrp == rdtgrp && + d->cntr_cfg[cntr_id].evtid == evtid) + return cntr_id; + } + + return -ENOENT; +} + +/* + * mbm_cntr_alloc() - Initialize and return a new counter ID in the domain @d. + * Caller must ensure that the specified event is not assigned already. + * + * Return: + * Valid counter ID on success, or -ENOSPC on failure. + */ +static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) +{ + int cntr_id; + + for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) { + if (!d->cntr_cfg[cntr_id].rdtgrp) { + d->cntr_cfg[cntr_id].rdtgrp = rdtgrp; + d->cntr_cfg[cntr_id].evtid = evtid; + return cntr_id; + } + } + + return -ENOSPC; +} + static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { int cpu = smp_processor_id(); @@ -887,6 +936,113 @@ u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) return mon_event_all[evtid].evt_cfg; } +/* + * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID + * pair in the domain. + * + * Assign the counter if @assign is true else unassign the counter. Reset the + * associated non-architectural state. 
+ */ +static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct mbm_state *m; + + resctrl_arch_config_cntr(r, d, evtid, rmid, closid, cntr_id, assign); + + m = get_mbm_state(d, closid, rmid, evtid); + if (m) + memset(m, 0, sizeof(*m)); +} + +/* + * rdtgroup_alloc_assign_cntr() - Allocate a counter ID and assign it to the event + * pointed to by @mevt and the resctrl group @rdtgrp within the domain @d. + * + * Return: + * 0 on success, < 0 on failure. + */ +static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int cntr_id; + + /* No action required if the counter is assigned already. */ + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); + if (cntr_id >= 0) + return 0; + + cntr_id = mbm_cntr_alloc(r, d, rdtgrp, mevt->evtid); + if (cntr_id < 0) { + rdt_last_cmd_printf("Failed to allocate counter for %s in domain %d\n", + mevt->name, d->hdr.id); + return cntr_id; + } + + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, true); + + return 0; +} + +/* + * rdtgroup_assign_cntr_event() - Assign a hardware counter for the event in + * @mevt to the resctrl group @rdtgrp. Assign counters to all domains if @d is + * NULL; otherwise, assign the counter to the specified domain @d. + * + * If all counters in a domain are already in use, rdtgroup_alloc_assign_cntr() + * will fail. The assignment process will abort at the first failure encountered + * during domain traversal, which may result in the event being only partially + * assigned. + * + * Return: + * 0 on success, < 0 on failure. 
+ */ +static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + int ret = 0; + + if (!d) { + list_for_each_entry(d, &r->mon_domains, hdr.list) { + ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt); + if (ret) + return ret; + } + } else { + ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt); + } + + return ret; +} + +/* + * rdtgroup_assign_cntrs() - Assign counters to MBM events. Called when + * a new group is created. + * + * Each group can accommodate two counters per domain: one for the total + * event and one for the local event. Assignments may fail due to the limited + * number of counters. However, it is not necessary to fail the group creation + * and thus no failure is returned. Users have the option to modify the + * counter assignments after the group has been created. + */ +void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) + return; + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + rdtgroup_assign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + rdtgroup_assign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); +} + int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { From bf0b7d9e857e8bc7c111d508f554a7a181f6c4b3 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:17 -0500 Subject: [PATCH 048/247] fs/resctrl: Add the functionality to unassign MBM events BugLink: https://bugs.launchpad.net/bugs/2122432 The "mbm_event" counter assignment mode offers "num_mbm_cntrs" number of counters that can be assigned to RMID, event pairs and monitor bandwidth usage as long as it is assigned. 
If all the counters are in use, the kernel logs the error message "Failed to allocate counter for in domain " in /sys/fs/resctrl/info/last_cmd_status when a new assignment is requested. To make space for a new assignment, users must unassign an already assigned counter and retry the assignment again. Add the functionality to unassign and free the counters in the domain. Also, add the helper rdtgroup_unassign_cntrs() to unassign counters in the group. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit aab2c5088cdb26e80d51ffbe72d24ab23fa1533e) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- fs/resctrl/internal.h | 2 ++ fs/resctrl/monitor.c | 66 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 762705d7eb8db..c6b66d4a6a375 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -398,6 +398,8 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_fil void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 106e9bdb8a9d8..2ed29ae831a4c 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -405,6 +405,14 @@ static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, return -ENOSPC; } +/* + * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d. 
+ */ +static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) +{ + memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); +} + static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { int cpu = smp_processor_id(); @@ -1043,6 +1051,64 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); } +/* + * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration + * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp. + */ +static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int cntr_id; + + cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid); + + /* If there is no cntr_id assigned, nothing to do */ + if (cntr_id < 0) + return; + + rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, false); + + mbm_cntr_free(d, cntr_id); +} + +/* + * rdtgroup_unassign_cntr_event() - Unassign a hardware counter associated with + * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign + * the counters from all the domains if @d is NULL else unassign from @d. + */ +static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + + if (!d) { + list_for_each_entry(d, &r->mon_domains, hdr.list) + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); + } else { + rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt); + } +} + +/* + * rdtgroup_unassign_cntrs() - Unassign the counters associated with MBM events. + * Called when a group is deleted. 
+ */ +void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) + return; + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + rdtgroup_unassign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]); + + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + rdtgroup_unassign_cntr_event(NULL, rdtgrp, + &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); +} + int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { From 5260485396cf5897b8f4e7df7a2a97ba08703c77 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:18 -0500 Subject: [PATCH 049/247] fs/resctrl: Pass struct rdtgroup instead of individual members BugLink: https://bugs.launchpad.net/bugs/2122432 Reading monitoring data for a monitoring group requires both the RMID and CLOSID. The RMID and CLOSID are members of struct rdtgroup but passed separately to several functions involved in retrieving event data. When "mbm_event" counter assignment mode is enabled, a counter ID is required to read event data. The counter ID is obtained through mbm_cntr_get(), which expects a struct rdtgroup pointer. Provide a pointer to the struct rdtgroup as parameter to functions involved in retrieving event data to simplify access to RMID, CLOSID, and counter ID. Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit bc53eea6c2a1dea152a0073a2f2814b697ad197e) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- fs/resctrl/monitor.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 2ed29ae831a4c..c815153cad074 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -413,9 +413,11 @@ static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); } -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) +static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { int cpu = smp_processor_id(); + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; struct rdt_mon_domain *d; struct mbm_state *m; int err, ret; @@ -475,8 +477,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). - * @closid: The closid used to identify the cached mbm_state. - * @rmid: The rmid used to identify the cached mbm_state. + * @rdtgrp: resctrl group associated with the CLOSID and RMID to identify + * the cached mbm_state. * @rr: The struct rmid_read populated by __mon_event_count(). * * Supporting function to calculate the memory bandwidth @@ -484,9 +486,11 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) * __mon_event_count() is compared with the chunks value from the previous * invocation. This must be called once per second to maintain values in MBps. 
*/ -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) +static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { u64 cur_bw, bytes, cur_bytes; + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; struct mbm_state *m; m = get_mbm_state(rr->d, closid, rmid, rr->evtid); @@ -515,7 +519,7 @@ void mon_event_count(void *info) rdtgrp = rr->rgrp; - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp, rr); /* * For Ctrl groups read data from child monitor groups and @@ -526,8 +530,7 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->closid, entry->mon.rmid, - rr) == 0) + if (__mon_event_count(entry, rr) == 0) ret = 0; } } @@ -658,7 +661,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) } static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid, enum resctrl_event_id evtid) + struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { struct rmid_read rr = {0}; @@ -672,30 +675,30 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain * return; } - __mon_event_count(closid, rmid, &rr); + __mon_event_count(rdtgrp, &rr); /* * If the software controller is enabled, compute the * bandwidth for this event id. */ if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); + mbm_bw_count(rdtgrp, &rr); resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid) + struct rdtgroup *rdtgrp) { /* * This is protected from concurrent reads from user as both * the user and overflow handler hold the global mutex. 
*/ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); + mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_TOTAL_EVENT_ID); if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); + mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_LOCAL_EVENT_ID); } /* @@ -768,11 +771,11 @@ void mbm_handle_overflow(struct work_struct *work) d = container_of(work, struct rdt_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); + mbm_update(r, d, prgrp); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); + mbm_update(r, d, crgrp); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); From 36f479d6f0e9c11c2ea737b75defe4f0924aa0db Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:19 -0500 Subject: [PATCH 050/247] fs/resctrl: Introduce counter ID read, reset calls in mbm_event mode BugLink: https://bugs.launchpad.net/bugs/2122432 When supported, "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor the bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Introduce the architecture calls resctrl_arch_cntr_read() and resctrl_arch_reset_cntr() to read and reset event counters when "mbm_event" mode is supported. Function names match existing resctrl_arch_rmid_read() and resctrl_arch_reset_rmid(). 
Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 862314fd1f93d96eddb0559a807c66cb1f6ee520) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- include/linux/resctrl.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 50e38445183a8..04152654827d4 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -613,6 +613,44 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign); +/** + * resctrl_arch_cntr_read() - Read the event data corresponding to the counter ID + * assigned to the RMID, event pair for this resource + * and domain. + * @r: Resource that the counter should be read from. + * @d: Domain that the counter should be read from. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to read. + * @eventid: The MBM event to which @cntr_id is assigned. + * @val: Result of the counter read in bytes. + * + * Called on a CPU that belongs to domain @d when "mbm_event" mode is enabled. + * Called from a non-migrateable process context via smp_call_on_cpu() unless all + * CPUs are nohz_full, in which case it is called via IPI (smp_call_function_any()). + * + * Return: + * 0 on success, or -EIO, -EINVAL etc on error. + */ +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val); + +/** + * resctrl_arch_reset_cntr() - Reset any private state associated with counter ID. + * @r: The domain's resource. + * @d: The counter ID's domain. + * @closid: CLOSID that matches the RMID. 
+ * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to reset. + * @eventid: The MBM event to which @cntr_id is assigned. + * + * This can be called from any CPU. + */ +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; From e7c03990060cc7991101eb7448b3fed4f6984cd5 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:21 -0500 Subject: [PATCH 051/247] x86/resctrl: Implement resctrl_arch_reset_cntr() and resctrl_arch_cntr_read() BugLink: https://bugs.launchpad.net/bugs/2122432 System software reads resctrl event data for a particular resource by writing the RMID and Event Identifier (EvtID) to the QM_EVTSEL register and then reading the event data from the QM_CTR register. In ABMC mode, the event data of a specific counter ID is read by setting the following fields: QM_EVTSEL.ExtendedEvtID = 1, QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID to the desired counter ID. Reading the QM_CTR then returns the contents of the specified counter ID. RMID_VAL_ERROR bit is set if the counter configuration is invalid, or if an invalid counter ID is set in the QM_EVTSEL.RMID field. RMID_VAL_UNAVAIL bit is set if the counter data is unavailable. Introduce resctrl_arch_reset_cntr() and resctrl_arch_cntr_read() to reset and read event data for a specific counter. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 2a65b72c1603a74f35228acbb8de2ecff9c13efe) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- arch/x86/kernel/cpu/resctrl/internal.h | 6 +++ arch/x86/kernel/cpu/resctrl/monitor.c | 69 ++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 0444fea49b11f..e5edddb290c9d 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -40,6 +40,12 @@ struct arch_mbm_state { /* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */ #define ABMC_ENABLE_BIT 0 +/* + * Qos Event Identifiers. + */ +#define ABMC_EXTENDED_EVT_ID BIT(31) +#define ABMC_EVT_ID BIT(0) + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index a5ba079948329..6eb4b6f5fd911 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -265,6 +265,75 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, return ret; } +static int __cntr_id_read(u32 cntr_id, u64 *val) +{ + u64 msr_val; + + /* + * QM_EVTSEL Register definition: + * ======================================================= + * Bits Mnemonic Description + * ======================================================= + * 63:44 -- Reserved + * 43:32 RMID RMID or counter ID in ABMC mode + * when reading an MBM event + * 31 ExtendedEvtID Extended Event Identifier + * 30:8 -- Reserved + * 7:0 EvtID Event Identifier + * ======================================================= + * The contents of a specific counter can be read by setting the + * following fields in QM_EVTSEL.ExtendedEvtID(=1) and + * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID + * to the desired counter ID. Reading the QM_CTR then returns the + * contents of the specified counter. 
The RMID_VAL_ERROR bit is set + * if the counter configuration is invalid, or if an invalid counter + * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit + * is set if the counter data is unavailable. + */ + wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id); + rdmsrl(MSR_IA32_QM_CTR, msr_val); + + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; + + *val = msr_val; + return 0; +} + +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct arch_mbm_state *am; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + memset(am, 0, sizeof(*am)); + + /* Record any initial, non-zero count value. */ + __cntr_id_read(cntr_id, &am->prev_msr); + } +} + +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) +{ + u64 msr_val; + int ret; + + ret = __cntr_id_read(cntr_id, &msr_val); + if (ret) + return ret; + + *val = get_corrected_val(r, d, rmid, eventid, msr_val); + + return 0; +} + /* * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 * which indicates that RMIDs are configured in legacy mode. From bcacd479b90e7a5ec40939b1792b24559d829b91 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:22 -0500 Subject: [PATCH 052/247] fs/resctrl: Support counter read/reset with mbm_event assignment mode BugLink: https://bugs.launchpad.net/bugs/2122432 When "mbm_event" counter assignment mode is enabled, the architecture requires a counter ID to read the event data. Introduce an is_mbm_cntr field in struct rmid_read to indicate whether counter assignment mode is in use. Update the logic to call resctrl_arch_cntr_read() and resctrl_arch_reset_cntr() when the assignment mode is active. 
Report 'Unassigned' in case the user attempts to read an event without assigning a hardware counter. Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 159f36cd4de7718779fd0b232de5137b4ffd2d1e) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- Documentation/filesystems/resctrl.rst | 6 ++++ fs/resctrl/ctrlmondata.c | 22 ++++++++++--- fs/resctrl/internal.h | 3 ++ fs/resctrl/monitor.c | 47 ++++++++++++++++++++------- 4 files changed, 62 insertions(+), 16 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 446736dbd97f1..4c24c5f3f4c19 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -434,6 +434,12 @@ When monitoring is enabled all MON groups will also contain: for the L3 cache they occupy). These are named "mon_sub_L3_YY" where "YY" is the node number. + When the 'mbm_event' counter assignment mode is enabled, reading + an MBM event of a MON group returns 'Unassigned' if no hardware + counter is assigned to it. For CTRL_MON groups, 'Unassigned' is + returned if the MBM event does not have an assigned counter in the + CTRL_MON group nor in any of its associated MON groups. + "mon_hw_id": Available only with debug option. The identifier used by hardware for the monitor group. On x86 this is the RMID. 
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 42b281b3852ff..0d0ef54fc4de1 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -563,10 +563,15 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->r = r; rr->d = d; rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); - if (IS_ERR(rr->arch_mon_ctx)) { - rr->err = -EINVAL; - return; + if (resctrl_arch_mbm_cntr_assign_enabled(r) && + resctrl_is_mbm_event(evtid)) { + rr->is_mbm_cntr = true; + } else { + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } } cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); @@ -582,7 +587,8 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); + if (rr->arch_mon_ctx) + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) @@ -653,10 +659,16 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) checkresult: + /* + * -ENOENT is a special case, set only when "mbm_event" counter assignment + * mode is enabled and no counter has been assigned. + */ if (rr.err == -EIO) seq_puts(m, "Error\n"); else if (rr.err == -EINVAL) seq_puts(m, "Unavailable\n"); + else if (rr.err == -ENOENT) + seq_puts(m, "Unassigned\n"); else seq_printf(m, "%llu\n", rr.val); diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index c6b66d4a6a375..2f1f2efe2f40f 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -111,6 +111,8 @@ struct mon_data { * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @is_mbm_cntr: true if "mbm_event" counter assignment mode is enabled and it + * is an MBM event. 
* @err: Error encountered when reading counter. * @val: Returned value of event counter. If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. @@ -125,6 +127,7 @@ struct rmid_read { enum resctrl_event_id evtid; bool first; struct cacheinfo *ci; + bool is_mbm_cntr; int err; u64 val; void *arch_mon_ctx; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index c815153cad074..55327056596ee 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -419,12 +419,24 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; struct rdt_mon_domain *d; + int cntr_id = -ENOENT; struct mbm_state *m; int err, ret; u64 tval = 0; + if (rr->is_mbm_cntr) { + cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid); + if (cntr_id < 0) { + rr->err = -ENOENT; + return -EINVAL; + } + } + if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + if (rr->is_mbm_cntr) + resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid); + else + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); m = get_mbm_state(rr->d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); @@ -435,8 +447,12 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) /* Reading a single domain, must be on a CPU in that domain. 
*/ if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) return -EINVAL; - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->is_mbm_cntr) + rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -460,8 +476,12 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { if (d->ci_id != rr->ci->id) continue; - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->is_mbm_cntr) + err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -668,11 +688,15 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain * rr.r = r; rr.d = d; rr.evtid = evtid; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; + if (resctrl_arch_mbm_cntr_assign_enabled(r)) { + rr.is_mbm_cntr = true; + } else { + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } } __mon_event_count(rdtgrp, &rr); @@ -684,7 +708,8 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain * if (is_mba_sc(NULL)) mbm_bw_count(rdtgrp, &rr); - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); + if (rr.arch_mon_ctx) + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, From 
8cca9f0caf30d509726abf384dd33233cdc72195 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:23 -0500 Subject: [PATCH 053/247] fs/resctrl: Add event configuration directory under info/L3_MON/ BugLink: https://bugs.launchpad.net/bugs/2122432 The "mbm_event" counter assignment mode allows the user to assign a hardware counter to an RMID, event pair and monitor the bandwidth as long as it is assigned. The user can specify the memory transaction(s) for the counter to track. When this mode is supported, the /sys/fs/resctrl/info/L3_MON/event_configs directory contains a sub-directory for each MBM event that can be assigned to a counter. The MBM event sub-directory contains a file named "event_filter" that is used to view and modify which memory transactions the MBM event is configured with. Create /sys/fs/resctrl/info/L3_MON/event_configs directory on resctrl mount and pre-populate it with directories for the two existing MBM events: mbm_total_bytes and mbm_local_bytes. Create the "event_filter" file within each MBM event directory with the needed *show() that displays the memory transactions with which the MBM event is configured. Example: $ mount -t resctrl resctrl /sys/fs/resctrl $ cd /sys/fs/resctrl/ $ cat info/L3_MON/event_configs/mbm_total_bytes/event_filter local_reads,remote_reads,local_non_temporal_writes, remote_non_temporal_writes,local_reads_slow_memory, remote_reads_slow_memory,dirty_victim_writes_all $ cat info/L3_MON/event_configs/mbm_local_bytes/event_filter local_reads,local_non_temporal_writes,local_reads_slow_memory Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit ea274cbeaf8f0667267b347e3f84797439cdab4e) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- Documentation/filesystems/resctrl.rst | 33 +++++++++++++++ fs/resctrl/internal.h | 4 ++ fs/resctrl/monitor.c | 56 +++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 59 ++++++++++++++++++++++++++- include/linux/resctrl_types.h | 3 ++ 5 files changed, 153 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 4c24c5f3f4c19..ddd95f1472e67 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -310,6 +310,39 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs 0=30;1=30 +"event_configs": + Directory that exists when "mbm_event" counter assignment mode is supported. + Contains a sub-directory for each MBM event that can be assigned to a counter. + + Two MBM events are supported by default: mbm_local_bytes and mbm_total_bytes. + Each MBM event's sub-directory contains a file named "event_filter" that is + used to view and modify which memory transactions the MBM event is configured + with. The file is accessible only when "mbm_event" counter assignment mode is + enabled. 
+ + List of memory transaction types supported: + + ========================== ======================================================== + Name Description + ========================== ======================================================== + dirty_victim_writes_all Dirty Victims from the QOS domain to all types of memory + remote_reads_slow_memory Reads to slow memory in the non-local NUMA domain + local_reads_slow_memory Reads to slow memory in the local NUMA domain + remote_non_temporal_writes Non-temporal writes to non-local NUMA domain + local_non_temporal_writes Non-temporal writes to local NUMA domain + remote_reads Reads to memory in the non-local NUMA domain + local_reads Reads to memory in the local NUMA domain + ========================== ======================================================== + + For example:: + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + local_reads,remote_reads,local_non_temporal_writes,remote_non_temporal_writes, + local_reads_slow_memory,remote_reads_slow_memory,dirty_victim_writes_all + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + local_reads,local_non_temporal_writes,local_reads_slow_memory + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 2f1f2efe2f40f..9bf2e2fd5c19a 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -241,6 +241,8 @@ struct rdtgroup { #define RFTYPE_DEBUG BIT(10) +#define RFTYPE_ASSIGN_CONFIG BIT(11) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) @@ -403,6 +405,8 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct 
rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 55327056596ee..7179f9865a48b 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -972,6 +972,61 @@ u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) return mon_event_all[evtid].evt_cfg; } +/** + * struct mbm_transaction - Memory transaction an MBM event can be configured with. + * @name: Name of memory transaction (read, write ...). + * @val: The bit (eg. READS_TO_LOCAL_MEM or READS_TO_REMOTE_MEM) used to + * represent the memory transaction within an event's configuration. + */ +struct mbm_transaction { + char name[32]; + u32 val; +}; + +/* Decoded values for each type of memory transaction. */ +static struct mbm_transaction mbm_transactions[NUM_MBM_TRANSACTIONS] = { + {"local_reads", READS_TO_LOCAL_MEM}, + {"remote_reads", READS_TO_REMOTE_MEM}, + {"local_non_temporal_writes", NON_TEMP_WRITE_TO_LOCAL_MEM}, + {"remote_non_temporal_writes", NON_TEMP_WRITE_TO_REMOTE_MEM}, + {"local_reads_slow_memory", READS_TO_LOCAL_S_MEM}, + {"remote_reads_slow_memory", READS_TO_REMOTE_S_MEM}, + {"dirty_victim_writes_all", DIRTY_VICTIMS_TO_ALL_MEM}, +}; + +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + bool sep = false; + int ret = 0, i; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (mevt->evt_cfg & mbm_transactions[i].val) { + if (sep) + seq_putc(seq, ','); + seq_printf(seq, "%s", mbm_transactions[i].name); + sep = true; + } + } + seq_putc(seq, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + /* * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, 
RMID * pair in the domain. @@ -1287,6 +1342,7 @@ int resctrl_mon_resource_init(void) RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("available_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 8eaad45b28ead..25a653847f499 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1923,6 +1923,12 @@ static struct rftype res_common_files[] = { .seq_show = mbm_local_bytes_config_show, .write = mbm_local_bytes_config_write, }, + { + .name = "event_filter", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = event_filter_show, + }, { .name = "mbm_assign_mode", .mode = 0444, @@ -2183,10 +2189,48 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, return ret; } +static int resctrl_mkdir_event_configs(struct rdt_resource *r, struct kernfs_node *l3_mon_kn) +{ + struct kernfs_node *kn_subdir, *kn_subdir2; + struct mon_evt *mevt; + int ret; + + kn_subdir = kernfs_create_dir(l3_mon_kn, "event_configs", l3_mon_kn->mode, NULL); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + kn_subdir2 = kernfs_create_dir(kn_subdir, mevt->name, kn_subdir->mode, mevt); + if (IS_ERR(kn_subdir2)) { + ret = PTR_ERR(kn_subdir2); + goto out; + } + + ret = rdtgroup_kn_set_ugid(kn_subdir2); + if (ret) + goto out; + + ret = rdtgroup_add_files(kn_subdir2, RFTYPE_ASSIGN_CONFIG); + if (ret) + break; + } + +out: + return ret; +} + static int rdtgroup_mkdir_info_resdir(void *priv, char *name, unsigned long fflags) { struct kernfs_node *kn_subdir; + struct rdt_resource *r; int ret; kn_subdir = kernfs_create_dir(kn_info, name, @@ -2199,8 +2243,19 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name, return 
ret; ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); + if (ret) + return ret; + + if ((fflags & RFTYPE_MON_INFO) == RFTYPE_MON_INFO) { + r = priv; + if (r->mon.mbm_cntr_assignable) { + ret = resctrl_mkdir_event_configs(r, kn_subdir); + if (ret) + return ret; + } + } + + kernfs_activate(kn_subdir); return ret; } diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index d98351663c2c6..acfe07860b346 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -34,6 +34,9 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) +/* Number of memory transactions that an MBM event can be configured with */ +#define NUM_MBM_TRANSACTIONS 7 + /* Event IDs */ enum resctrl_event_id { /* Must match value of first event below */ From c364873df15c574b842623a40dee3b287ebac341 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:24 -0500 Subject: [PATCH 054/247] fs/resctrl: Provide interface to update the event configurations BugLink: https://bugs.launchpad.net/bugs/2122432 When "mbm_event" counter assignment mode is enabled, users can modify the event configuration by writing to the 'event_filter' resctrl file. The event configurations for mbm_event mode are located in /sys/fs/resctrl/info/L3_MON/event_configs/. Update the assignments of all CTRL_MON and MON resource groups when the event configuration is modified. 
Example: $ mount -t resctrl resctrl /sys/fs/resctrl $ cd /sys/fs/resctrl/ $ cat info/L3_MON/event_configs/mbm_local_bytes/event_filter local_reads,local_non_temporal_writes,local_reads_slow_memory $ echo "local_reads,local_non_temporal_writes" > info/L3_MON/event_configs/mbm_total_bytes/event_filter $ cat info/L3_MON/event_configs/mbm_total_bytes/event_filter local_reads,local_non_temporal_writes Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit f9ae5913d47cda67481a4f54cc3273d3d1d00a01) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- Documentation/filesystems/resctrl.rst | 12 +++ fs/resctrl/internal.h | 3 + fs/resctrl/monitor.c | 114 ++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 3 +- 4 files changed, 131 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index ddd95f1472e67..2e840ef26f682 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -343,6 +343,18 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter local_reads,local_non_temporal_writes,local_reads_slow_memory + Modify the event configuration by writing to the "event_filter" file within + the "event_configs" directory. The read/write "event_filter" file contains the + configuration of the event that reflects which memory transactions are counted by it. 
+ + For example:: + + # echo "local_reads, local_non_temporal_writes" > + /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + local_reads,local_non_temporal_writes + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 9bf2e2fd5c19a..90d3e4ab335b6 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -407,6 +407,9 @@ void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); +ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 7179f9865a48b..ccb9726bba545 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1192,6 +1192,120 @@ void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]); } +static int resctrl_parse_mem_transactions(char *tok, u32 *val) +{ + u32 temp_val = 0; + char *evt_str; + bool found; + int i; + +next_config: + if (!tok || tok[0] == '\0') { + *val = temp_val; + return 0; + } + + /* Start processing the strings for each memory transaction type */ + evt_str = strim(strsep(&tok, ",")); + found = false; + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (!strcmp(mbm_transactions[i].name, evt_str)) { + temp_val |= mbm_transactions[i].val; + found = true; + break; + } + } + + if (!found) { + rdt_last_cmd_printf("Invalid memory transaction type %s\n", evt_str); + return -EINVAL; + } + + goto next_config; +} + +/* + * rdtgroup_update_cntr_event - Update the counter assignments for the event + * in a group. + * @r: Resource to which update needs to be done. + * @rdtgrp: Resctrl group. 
+ * @evtid: MBM monitor event. + */ +static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp, + enum resctrl_event_id evtid) +{ + struct rdt_mon_domain *d; + int cntr_id; + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + cntr_id = mbm_cntr_get(r, d, rdtgrp, evtid); + if (cntr_id >= 0) + rdtgroup_assign_cntr(r, d, evtid, rdtgrp->mon.rmid, + rdtgrp->closid, cntr_id, true); + } +} + +/* + * resctrl_update_cntr_allrdtgrp - Update the counter assignments for the event + * for all the groups. + * @mevt MBM Monitor event. + */ +static void resctrl_update_cntr_allrdtgrp(struct mon_evt *mevt) +{ + struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); + struct rdtgroup *prgrp, *crgrp; + + /* + * Find all the groups where the event is assigned and update the + * configuration of existing assignments. + */ + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + rdtgroup_update_cntr_event(r, prgrp, mevt->evtid); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + rdtgroup_update_cntr_event(r, crgrp, mevt->evtid); + } +} + +ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + u32 evt_cfg = 0; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + ret = resctrl_parse_mem_transactions(buf, &evt_cfg); + if (!ret && mevt->evt_cfg != evt_cfg) { + mevt->evt_cfg = evt_cfg; + resctrl_update_cntr_allrdtgrp(mevt); + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + 
return ret ?: nbytes; +} + int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 25a653847f499..8187df7b85d25 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1925,9 +1925,10 @@ static struct rftype res_common_files[] = { }, { .name = "event_filter", - .mode = 0444, + .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = event_filter_show, + .write = event_filter_write, }, { .name = "mbm_assign_mode", From caf1a4037bf91cf865aefbaa9be8a9d8970f0d1c Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:25 -0500 Subject: [PATCH 055/247] fs/resctrl: Introduce mbm_assign_on_mkdir to enable assignments on mkdir BugLink: https://bugs.launchpad.net/bugs/2122432 The "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor the bandwidth as long as it is assigned. Introduce a user-configurable option that determines if a counter will automatically be assigned to an RMID, event pair when its associated monitor group is created via mkdir. Accessible when "mbm_event" counter assignment mode is enabled. Suggested-by: Peter Newman Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit ac1df9bb0ba3ae94137fb494cd9efc598f65d826) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- Documentation/filesystems/resctrl.rst | 20 ++++++++++ fs/resctrl/internal.h | 6 +++ fs/resctrl/monitor.c | 53 +++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 7 ++++ include/linux/resctrl.h | 3 ++ 5 files changed, 89 insertions(+) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 2e840ef26f682..1de815b3a07b4 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -355,6 +355,26 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter local_reads,local_non_temporal_writes +"mbm_assign_on_mkdir": + Exists when "mbm_event" counter assignment mode is supported. Accessible + only when "mbm_event" counter assignment mode is enabled. + + Determines if a counter will automatically be assigned to an RMID, MBM event + pair when its associated monitor group is created via mkdir. Enabled by default + on boot, also when switched from "default" mode to "mbm_event" counter assignment + mode. Users can disable this capability by writing to the interface. + + "0": + Auto assignment is disabled. + "1": + Auto assignment is enabled. 
+ + Example:: + + # echo 0 > /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir + 0 + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 90d3e4ab335b6..66c677c1b8588 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -410,6 +410,12 @@ int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index ccb9726bba545..deca9535fbbb8 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1027,6 +1027,57 @@ int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v return ret; } +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, struct seq_file *s, + void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + seq_printf(s, "%u\n", r->mon.mbm_assign_on_mkdir); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool value; + int ret; + + ret = kstrtobool(buf, &value); + if (ret) + return ret; + + 
mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + r->mon.mbm_assign_on_mkdir = value; + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret ?: nbytes; +} + /* * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID * pair in the domain. @@ -1457,6 +1508,8 @@ int resctrl_mon_resource_init(void) resctrl_file_fflags_init("available_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); + resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | + RFTYPE_RES_CACHE); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 8187df7b85d25..ced46eb4a959b 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1808,6 +1808,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_last_cmd_status_show, .fflags = RFTYPE_TOP_INFO, }, + { + .name = "mbm_assign_on_mkdir", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_on_mkdir_show, + .write = resctrl_mbm_assign_on_mkdir_write, + }, { .name = "num_closids", .mode = 0444, diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 04152654827d4..a7d92718b653f 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -277,12 +277,15 @@ enum resctrl_schema_fmt { * monitoring events can be configured. * @num_mbm_cntrs: Number of assignable counters. * @mbm_cntr_assignable:Is system capable of supporting counter assignment? + * @mbm_assign_on_mkdir:True if counters should automatically be assigned to MBM + * events of monitor groups created via mkdir. 
*/ struct resctrl_mon { int num_rmid; unsigned int mbm_cfg_mask; int num_mbm_cntrs; bool mbm_cntr_assignable; + bool mbm_assign_on_mkdir; }; /** From 3f87575107cb4735f8aeda1cc68a12572a962f0a Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:26 -0500 Subject: [PATCH 056/247] fs/resctrl: Auto assign counters on mkdir and clean up on group removal BugLink: https://bugs.launchpad.net/bugs/2122432 Resctrl provides a user-configurable option mbm_assign_on_mkdir that determines if a counter will automatically be assigned to an RMID, event pair when its associated monitor group is created via mkdir. Enable mbm_assign_on_mkdir by default to automatically assign counters to the two default events (MBM total and MBM local) of a new monitoring group created via mkdir. This maintains backward compatibility with original resctrl support for these two events. Unassign and free counters belonging to a monitoring group when the group is deleted. Monitor group creation does not fail if a counter cannot be assigned to one or both events. There may be limited counters and users have the flexibility to modify counter assignments at a later time. Log the error message "Failed to allocate counter for in domain " in /sys/fs/resctrl/info/last_cmd_status when a new monitoring group is created but counter assignment failed. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit ef712fe97ec575657abb12d76837867dd8b8a0ed) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- fs/resctrl/monitor.c | 4 +++- fs/resctrl/rdtgroup.c | 22 ++++++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index deca9535fbbb8..9cb334136d210 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1231,7 +1231,8 @@ void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) || + !r->mon.mbm_assign_on_mkdir) return; if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) @@ -1503,6 +1504,7 @@ int resctrl_mon_resource_init(void) (READS_TO_LOCAL_MEM | READS_TO_LOCAL_S_MEM | NON_TEMP_WRITE_TO_LOCAL_MEM); + r->mon.mbm_assign_on_mkdir = true; resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("available_mbm_cntrs", diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index ced46eb4a959b..c6fd3bb45c199 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2715,6 +2715,8 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_info; + rdtgroup_assign_cntrs(&rdtgroup_default); + ret = mkdir_mondata_all(rdtgroup_default.kn, &rdtgroup_default, &kn_mondata); if (ret < 0) @@ -2753,8 +2755,10 @@ static int rdt_get_tree(struct fs_context *fc) if (resctrl_arch_mon_capable()) kernfs_remove(kn_mondata); out_mongrp: - if (resctrl_arch_mon_capable()) + if (resctrl_arch_mon_capable()) { + rdtgroup_unassign_cntrs(&rdtgroup_default); kernfs_remove(kn_mongrp); + } out_info: kernfs_remove(kn_info); out_closid_exit: @@ -2900,6 +2904,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) head = &rdtgrp->mon.crdtgrp_list; list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + rdtgroup_unassign_cntrs(sentry); free_rmid(sentry->closid, sentry->mon.rmid); 
list_del(&sentry->mon.crdtgrp_list); @@ -2940,6 +2945,8 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); kernfs_remove(rdtgrp->kn); @@ -3024,6 +3031,7 @@ static void resctrl_fs_teardown(void) return; rmdir_all_sub(); + rdtgroup_unassign_cntrs(&rdtgroup_default); mon_put_kn_priv(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; @@ -3504,9 +3512,12 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) } rdtgrp->mon.rmid = ret; + rdtgroup_assign_cntrs(rdtgrp); + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); if (ret) { rdt_last_cmd_puts("kernfs subdir error\n"); + rdtgroup_unassign_cntrs(rdtgrp); free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); return ret; } @@ -3516,8 +3527,10 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) { - if (resctrl_arch_mon_capable()) + if (resctrl_arch_mon_capable()) { + rdtgroup_unassign_cntrs(rgrp); free_rmid(rgrp->closid, rgrp->mon.rmid); + } } /* @@ -3793,6 +3806,9 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; + + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); /* @@ -3840,6 +3856,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); update_closid_rmid(tmpmask, NULL); + rdtgroup_unassign_cntrs(rdtgrp); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); closid_free(rdtgrp->closid); From 07f97dd8dd090bdae6106b0755fc50c4d4905172 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:27 -0500 Subject: [PATCH 057/247] fs/resctrl: Introduce mbm_L3_assignments to list assignments in a group BugLink: 
https://bugs.launchpad.net/bugs/2122432 Introduce the mbm_L3_assignments resctrl file associated with CTRL_MON and MON resource groups to display the counter assignment states of the resource group when "mbm_event" counter assignment mode is enabled. Display the list in the following format: :=;= Event: A valid MBM event listed in /sys/fs/resctrl/info/L3_MON/event_configs directory. Domain ID: A valid domain ID. The assignment state can be one of the following: _ : No counter assigned. e : Counter assigned exclusively. Example: To list the assignment states for the default group $ cd /sys/fs/resctrl $ cat /sys/fs/resctrl/mbm_L3_assignments mbm_total_bytes:0=e;1=e mbm_local_bytes:0=e;1=e Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit cba8222880b88d96f3ed6c8a115e335e552b83a1) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- Documentation/filesystems/resctrl.rst | 31 +++++++++++++++++ fs/resctrl/internal.h | 2 ++ fs/resctrl/monitor.c | 49 +++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 6 ++++ 4 files changed, 88 insertions(+) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 1de815b3a07b4..a2b7240b0818b 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -509,6 +509,37 @@ When monitoring is enabled all MON groups will also contain: Available only with debug option. The identifier used by hardware for the monitor group. On x86 this is the RMID. +When monitoring is enabled all MON groups may also contain: + +"mbm_L3_assignments": + Exists when "mbm_event" counter assignment mode is supported and lists the + counter assignment states of the group. 
+ + The assignment list is displayed in the following format: + + :=;= + + Event: A valid MBM event in the + /sys/fs/resctrl/info/L3_MON/event_configs directory. + + Domain ID: A valid domain ID. + + Assignment states: + + _ : No counter assigned. + + e : Counter assigned exclusively. + + Example: + + To display the counter assignment states for the default group. + :: + + # cd /sys/fs/resctrl + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=e;1=e + mbm_local_bytes:0=e;1=e + When the "mba_MBps" mount option is used all CTRL_MON groups will also contain: "mba_MBps_event": diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 66c677c1b8588..43297b36f5dda 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -416,6 +416,8 @@ int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 9cb334136d210..b692a0ef17107 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1454,6 +1454,54 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, return ret; } +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_mon_domain *d; + struct rdtgroup *rdtgrp; + struct mon_evt *mevt; + int ret = 0; + bool sep; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out_unlock; + } + + rdt_last_cmd_clear(); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid 
|| !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + sep = false; + seq_printf(s, "%s:", mevt->name); + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + + if (mbm_cntr_get(r, d, rdtgrp, mevt->evtid) < 0) + seq_printf(s, "%d=_", d->hdr.id); + else + seq_printf(s, "%d=e", d->hdr.id); + + sep = true; + } + seq_putc(s, '\n'); + } + +out_unlock: + rdtgroup_kn_unlock(of->kn); + + return ret; +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. * @@ -1512,6 +1560,7 @@ int resctrl_mon_resource_init(void) resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index c6fd3bb45c199..9d3a58c76fe27 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1937,6 +1937,12 @@ static struct rftype res_common_files[] = { .seq_show = event_filter_show, .write = event_filter_write, }, + { + .name = "mbm_L3_assignments", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_L3_assignments_show, + }, { .name = "mbm_assign_mode", .mode = 0444, From 024e75fec718f66e11e73f42e5bac88059a3ac11 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:28 -0500 Subject: [PATCH 058/247] fs/resctrl: Introduce the interface to modify assignments in a group BugLink: https://bugs.launchpad.net/bugs/2122432 Enable the mbm_l3_assignments resctrl file to be used to modify counter assignments of CTRL_MON and MON groups when the "mbm_event" counter assignment mode is enabled. Process the assignment modifications in the following format: :=;= Event: A valid MBM event in the /sys/fs/resctrl/info/L3_MON/event_configs directory. Domain ID: A valid domain ID. When writing, '*' applies the changes to all domains. Assignment states: _ : Unassign a counter. 
e : Assign a counter exclusively. Examples: $ cd /sys/fs/resctrl $ cat /sys/fs/resctrl/mbm_L3_assignments mbm_total_bytes:0=e;1=e mbm_local_bytes:0=e;1=e To unassign the counter associated with the mbm_total_bytes event on domain 0: $ echo "mbm_total_bytes:0=_" > mbm_L3_assignments $ cat /sys/fs/resctrl/mbm_L3_assignments mbm_total_bytes:0=_;1=e mbm_local_bytes:0=e;1=e To unassign the counter associated with the mbm_total_bytes event on all the domains: $ echo "mbm_total_bytes:*=_" > mbm_L3_assignments $ cat /sys/fs/resctrl/mbm_L3_assignments mbm_total_bytes:0=_;1=_ mbm_local_bytes:0=e;1=e Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 88bee79640aea6192f98c8faae30e0013453479d) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- Documentation/filesystems/resctrl.rst | 151 +++++++++++++++++++++++++- fs/resctrl/internal.h | 3 + fs/resctrl/monitor.c | 139 ++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 3 +- 4 files changed, 294 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index a2b7240b0818b..f60f6a96cb6b8 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -522,7 +522,8 @@ When monitoring is enabled all MON groups may also contain: Event: A valid MBM event in the /sys/fs/resctrl/info/L3_MON/event_configs directory. - Domain ID: A valid domain ID. + Domain ID: A valid domain ID. When writing, '*' applies the changes + to all the domains. Assignment states: @@ -540,6 +541,35 @@ When monitoring is enabled all MON groups may also contain: mbm_total_bytes:0=e;1=e mbm_local_bytes:0=e;1=e + Assignments can be modified by writing to the interface. 
+ + Examples: + + To unassign the counter associated with the mbm_total_bytes event on domain 0: + :: + + # echo "mbm_total_bytes:0=_" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=_;1=e + mbm_local_bytes:0=e;1=e + + To unassign the counter associated with the mbm_total_bytes event on all the domains: + :: + + # echo "mbm_total_bytes:*=_" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=_;1=_ + mbm_local_bytes:0=e;1=e + + To assign a counter associated with the mbm_total_bytes event on all domains in + exclusive mode: + :: + + # echo "mbm_total_bytes:*=e" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=e;1=e + mbm_local_bytes:0=e;1=e + When the "mba_MBps" mount option is used all CTRL_MON groups will also contain: "mba_MBps_event": @@ -1585,6 +1615,125 @@ View the llc occupancy snapshot:: # cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy 11234000 + +Examples on working with mbm_assign_mode +======================================== + +a. Check if MBM counter assignment mode is supported. +:: + + # mount -t resctrl resctrl /sys/fs/resctrl/ + + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + [mbm_event] + default + +The "mbm_event" mode is detected and enabled. + +b. Check how many assignable counters are supported. +:: + + # cat /sys/fs/resctrl/info/L3_MON/num_mbm_cntrs + 0=32;1=32 + +c. Check how many assignable counters are available for assignment in each domain. +:: + + # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs + 0=30;1=30 + +d. To list the default group's assign states. +:: + + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=e;1=e + mbm_local_bytes:0=e;1=e + +e. To unassign the counter associated with the mbm_total_bytes event on domain 0. 
+:: + + # echo "mbm_total_bytes:0=_" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=_;1=e + mbm_local_bytes:0=e;1=e + +f. To unassign the counter associated with the mbm_total_bytes event on all domains. +:: + + # echo "mbm_total_bytes:*=_" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignment + mbm_total_bytes:0=_;1=_ + mbm_local_bytes:0=e;1=e + +g. To assign a counter associated with the mbm_total_bytes event on all domains in +exclusive mode. +:: + + # echo "mbm_total_bytes:*=e" > /sys/fs/resctrl/mbm_L3_assignments + # cat /sys/fs/resctrl/mbm_L3_assignments + mbm_total_bytes:0=e;1=e + mbm_local_bytes:0=e;1=e + +h. Read the events mbm_total_bytes and mbm_local_bytes of the default group. There is +no change in reading the events with the assignment. +:: + + # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_total_bytes + 779247936 + # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_total_bytes + 562324232 + # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes + 212122123 + # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes + 121212144 + +i. Check the event configurations. +:: + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + local_reads,remote_reads,local_non_temporal_writes,remote_non_temporal_writes, + local_reads_slow_memory,remote_reads_slow_memory,dirty_victim_writes_all + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + local_reads,local_non_temporal_writes,local_reads_slow_memory + +j. Change the event configuration for mbm_local_bytes. +:: + + # echo "local_reads, local_non_temporal_writes, local_reads_slow_memory, remote_reads" > + /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + local_reads,local_non_temporal_writes,local_reads_slow_memory,remote_reads + +k. Now read the local events again. 
The first read may come back with "Unavailable" +status. The subsequent read of mbm_local_bytes will display the current value. +:: + + # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes + Unavailable + # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes + 2252323 + # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes + Unavailable + # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes + 1566565 + +l. Users have the option to go back to 'default' mbm_assign_mode if required. This can be +done using the following command. Note that switching the mbm_assign_mode may reset all +the MBM counters (and thus all MBM events) of all the resctrl groups. +:: + + # echo "default" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + mbm_event + [default] + +m. Unmount the resctrl filesystem. +:: + + # umount /sys/fs/resctrl/ + Intel RDT Errata ================ diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 43297b36f5dda..c69b1da80d3f9 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -418,6 +418,9 @@ ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index b692a0ef17107..f388dbcdbdcd6 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1502,6 +1502,145 @@ int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, voi return ret; } +/* + * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching + * event name. 
+ */ +static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *name) +{ + struct mon_evt *mevt; + + for_each_mon_event(mevt) { + if (mevt->rid == r->rid && mevt->enabled && + resctrl_is_mbm_event(mevt->evtid) && + !strcmp(mevt->name, name)) + return mevt; + } + + return NULL; +} + +static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, + struct rdtgroup *rdtgrp, struct mon_evt *mevt) +{ + int ret = 0; + + if (!assign || strlen(assign) != 1) + return -EINVAL; + + switch (*assign) { + case 'e': + ret = rdtgroup_assign_cntr_event(d, rdtgrp, mevt); + break; + case '_': + rdtgroup_unassign_cntr_event(d, rdtgrp, mevt); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp, + char *event, char *tok) +{ + struct rdt_mon_domain *d; + unsigned long dom_id = 0; + char *dom_str, *id_str; + struct mon_evt *mevt; + int ret; + + mevt = mbm_get_mon_event_by_name(r, event); + if (!mevt) { + rdt_last_cmd_printf("Invalid event %s\n", event); + return -ENOENT; + } + +next: + if (!tok || tok[0] == '\0') + return 0; + + /* Start processing the strings for each domain */ + dom_str = strim(strsep(&tok, ";")); + + id_str = strsep(&dom_str, "="); + + /* Check for domain id '*' which means all domains */ + if (id_str && *id_str == '*') { + ret = rdtgroup_modify_assign_state(dom_str, NULL, rdtgrp, mevt); + if (ret) + rdt_last_cmd_printf("Assign operation '%s:*=%s' failed\n", + event, dom_str); + return ret; + } else if (!id_str || kstrtoul(id_str, 10, &dom_id)) { + rdt_last_cmd_puts("Missing domain id\n"); + return -EINVAL; + } + + /* Verify if the dom_id is valid */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->hdr.id == dom_id) { + ret = rdtgroup_modify_assign_state(dom_str, d, rdtgrp, mevt); + if (ret) { + rdt_last_cmd_printf("Assign operation '%s:%ld=%s' failed\n", + event, dom_id, dom_str); + return ret; + } + goto next; 
+ } + } + + rdt_last_cmd_printf("Invalid domain id %ld\n", dom_id); + return -EINVAL; +} + +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdtgroup *rdtgrp; + char *token, *event; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event mode is not enabled\n"); + rdtgroup_kn_unlock(of->kn); + return -EINVAL; + } + + while ((token = strsep(&buf, "\n")) != NULL) { + /* + * The write command follows the following format: + * ":=" + * Extract the event name first. + */ + event = strsep(&token, ":"); + + ret = resctrl_parse_mbm_assignment(r, rdtgrp, event, token); + if (ret) + break; + } + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. * diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 9d3a58c76fe27..dd5e6f720b984 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1939,9 +1939,10 @@ static struct rftype res_common_files[] = { }, { .name = "mbm_L3_assignments", - .mode = 0444, + .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = mbm_L3_assignments_show, + .write = mbm_L3_assignments_write, }, { .name = "mbm_assign_mode", From a7ce1567331f49efe454c3e7a29aca8523a2d704 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:29 -0500 Subject: [PATCH 059/247] fs/resctrl: Disable BMEC event configuration when mbm_event mode is enabled BugLink: https://bugs.launchpad.net/bugs/2122432 The BMEC (Bandwidth Monitoring Event Configuration) feature enables per-domain event configuration. 
With BMEC the MBM events are configured using the mbm_total_bytes_config or mbm_local_bytes_config files in /sys/fs/resctrl/info/L3_MON/ and the per-domain event configuration affects all monitor resource groups. The mbm_event counter assignment mode enables counters to be assigned to RMID (i.e. a monitor resource group), event pairs, with potentially unique event configurations associated with every counter. There may be systems that support both BMEC and mbm_event counter assignment mode, but resctrl supporting both concurrently will present a conflicting interface to the user with both per-domain and per RMID, event configurations active at the same time. The mbm_event counter assignment provides most flexibility to user space and aligns with Arm's counter support. On systems that support both, disable BMEC event configuration when mbm_event mode is enabled by hiding the mbm_total_bytes_config or mbm_local_bytes_config files when mbm_event mode is enabled. Ensure mon_features always displays accurate information about monitor features. Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 9f0209b857d2fc99c2a32bff1d7bfd54b314c29c) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- fs/resctrl/rdtgroup.c | 47 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index dd5e6f720b984..72a19e0e4fc2f 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1150,7 +1150,8 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, if (mevt->rid != r->rid || !mevt->enabled) continue; seq_printf(seq, "%s\n", mevt->name); - if (mevt->configurable) + if (mevt->configurable && + !resctrl_arch_mbm_cntr_assign_enabled(r)) seq_printf(seq, "%s_config\n", mevt->name); } @@ -1799,6 +1800,44 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, return ret ?: nbytes; } +/* + * resctrl_bmec_files_show() — Controls the visibility of BMEC-related resctrl + * files. When @show is true, the files are displayed; when false, the files + * are hidden. + * Don't treat kernfs_find_and_get failure as an error, since this function may + * be called regardless of whether BMEC is supported or the event is enabled. + */ +static void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn, + bool show) +{ + struct kernfs_node *kn_config, *mon_kn = NULL; + char name[32]; + + if (!l3_mon_kn) { + sprintf(name, "%s_MON", r->name); + mon_kn = kernfs_find_and_get(kn_info, name); + if (!mon_kn) + return; + l3_mon_kn = mon_kn; + } + + kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_total_bytes_config"); + if (kn_config) { + kernfs_show(kn_config, show); + kernfs_put(kn_config); + } + + kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_local_bytes_config"); + if (kn_config) { + kernfs_show(kn_config, show); + kernfs_put(kn_config); + } + + /* Release the reference only if it was acquired */ + if (mon_kn) + kernfs_put(mon_kn); +} + /* rdtgroup information files for one cache resource. 
*/ static struct rftype res_common_files[] = { { @@ -2267,6 +2306,12 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name, ret = resctrl_mkdir_event_configs(r, kn_subdir); if (ret) return ret; + /* + * Hide BMEC related files if mbm_event mode + * is enabled. + */ + if (resctrl_arch_mbm_cntr_assign_enabled(r)) + resctrl_bmec_files_show(r, kn_subdir, false); } } From 4df9e3d89a78e697cadded5999915450a46a7426 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:30 -0500 Subject: [PATCH 060/247] fs/resctrl: Introduce the interface to switch between monitor modes BugLink: https://bugs.launchpad.net/bugs/2122432 Resctrl subsystem can support two monitoring modes, "mbm_event" or "default". In mbm_event mode, monitoring event can only accumulate data while it is backed by a hardware counter. In "default" mode, resctrl assumes there is a hardware counter for each event within every CTRL_MON and MON group. Introduce mbm_assign_mode resctrl file to switch between mbm_event and default modes. Example: To list the MBM monitor modes supported: $ cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode [mbm_event] default To enable the "mbm_event" counter assignment mode: $ echo "mbm_event" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode To enable the "default" monitoring mode: $ echo "default" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode Reset MBM event counters automatically as part of changing the mode. Clear both architectural and non-architectural event states to prevent overflow conditions during the next event read. Clear assignable counter configuration on all the domains. Also, enable auto assignment when switching to "mbm_event" mode. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 8004ea01cf6338298e0c6ab055bc3ec659ce381b) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- Documentation/filesystems/resctrl.rst | 22 +++++- fs/resctrl/internal.h | 6 ++ fs/resctrl/monitor.c | 100 ++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 7 +- 4 files changed, 131 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index f60f6a96cb6b8..006d23af66e19 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -259,7 +259,8 @@ with the following files: "mbm_assign_mode": The supported counter assignment modes. The enclosed brackets indicate which mode - is enabled. + is enabled. The MBM events associated with counters may reset when "mbm_assign_mode" + is changed. :: # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode @@ -279,6 +280,15 @@ with the following files: of counters available is described in the "num_mbm_cntrs" file. Changing the mode may cause all counters on the resource to reset. + Moving to mbm_event counter assignment mode requires users to assign the counters + to the events. Otherwise, the MBM event counters will return 'Unassigned' when read. + + The mode is beneficial for AMD platforms that support more CTRL_MON + and MON groups than available hardware counters. By default, this + feature is enabled on AMD platforms with the ABMC (Assignable Bandwidth + Monitoring Counters) capability, ensuring counters remain assigned even + when the corresponding RMID is not actively used by any processor. + "default": In default mode, resctrl assumes there is a hardware counter for each @@ -288,6 +298,16 @@ with the following files: result in misleading values or display "Unavailable" if no counter is assigned to the event. 
+ * To enable "mbm_event" counter assignment mode: + :: + + # echo "mbm_event" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + + * To enable "default" monitoring mode: + :: + + # echo "default" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode + "num_mbm_cntrs": The maximum number of counters (total of available and assigned counters) in each domain when the system supports mbm_event mode. diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index c69b1da80d3f9..cf1fd82dc5a99 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -396,6 +396,12 @@ void *rdt_kn_parent_priv(struct kernfs_node *kn); int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v); +ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + +void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn, + bool show); + int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v); int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index f388dbcdbdcd6..50c24460d992a 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1078,6 +1078,33 @@ ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf return ret ?: nbytes; } +/* + * mbm_cntr_free_all() - Clear all the counter ID configuration details in the + * domain @d. Called when mbm_assign_mode is changed. + */ +static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs); +} + +/* + * resctrl_reset_rmid_all() - Reset all non-architecture states for all the + * supported RMIDs. 
+ */ +static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + enum resctrl_event_id evt; + int idx; + + for_each_mbm_event_id(evt) { + if (!resctrl_is_mon_event_enabled(evt)) + continue; + idx = MBM_STATE_IDX(evt); + memset(d->mbm_states[idx], 0, sizeof(*d->mbm_states[0]) * idx_limit); + } +} + /* * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID * pair in the domain. @@ -1388,6 +1415,79 @@ int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, return 0; } +ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct rdt_mon_domain *d; + int ret = 0; + bool enable; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!strcmp(buf, "default")) { + enable = 0; + } else if (!strcmp(buf, "mbm_event")) { + if (r->mon.mbm_cntr_assignable) { + enable = 1; + } else { + ret = -EINVAL; + rdt_last_cmd_puts("mbm_event mode is not supported\n"); + goto out_unlock; + } + } else { + ret = -EINVAL; + rdt_last_cmd_puts("Unsupported assign mode\n"); + goto out_unlock; + } + + if (enable != resctrl_arch_mbm_cntr_assign_enabled(r)) { + ret = resctrl_arch_mbm_cntr_assign_set(r, enable); + if (ret) + goto out_unlock; + + /* Update the visibility of BMEC related files */ + resctrl_bmec_files_show(r, NULL, !enable); + + /* + * Initialize the default memory transaction values for + * total and local events. 
+ */ + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); + /* Enable auto assignment when switching to "mbm_event" mode */ + if (enable) + r->mon.mbm_assign_on_mkdir = true; + /* + * Reset all the non-achitectural RMID state and assignable counters. + */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + mbm_cntr_free_all(r, d); + resctrl_reset_rmid_all(r, d); + } + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 72a19e0e4fc2f..ce4e716e6404a 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1807,8 +1807,8 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, * Don't treat kernfs_find_and_get failure as an error, since this function may * be called regardless of whether BMEC is supported or the event is enabled. 
*/ -static void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn, - bool show) +void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn, + bool show) { struct kernfs_node *kn_config, *mon_kn = NULL; char name[32]; @@ -1985,9 +1985,10 @@ static struct rftype res_common_files[] = { }, { .name = "mbm_assign_mode", - .mode = 0444, + .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = resctrl_mbm_assign_mode_show, + .write = resctrl_mbm_assign_mode_write, .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, }, { From 284a5ee23e9a21edff6492fba8649f987c129661 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:31 -0500 Subject: [PATCH 061/247] x86/resctrl: Configure mbm_event mode if supported BugLink: https://bugs.launchpad.net/bugs/2122432 Configure mbm_event mode on AMD platforms. On AMD platforms, it is recommended to use the mbm_event mode, if supported, to prevent the hardware from resetting counters between reads. This can result in misleading values or display "Unavailable" if no counter is assigned to the event. Enable mbm_event mode, known as ABMC (Assignable Bandwidth Monitoring Counters) on AMD, by default if the system supports it. Update ABMC across all logical processors within the resctrl domain to ensure proper functionality. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit 0f1576e43adc62756879a240e66e89ce386b6eb9) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- arch/x86/kernel/cpu/resctrl/core.c | 7 +++++++ arch/x86/kernel/cpu/resctrl/internal.h | 1 + arch/x86/kernel/cpu/resctrl/monitor.c | 8 ++++++++ 3 files changed, 16 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 2e68aa02ad3f4..06ca5a30140c2 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -520,6 +520,9 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = container_of(hdr, struct rdt_mon_domain, hdr); cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); return; } @@ -539,6 +542,10 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); + arch_mon_domain_online(r, d); if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index e5edddb290c9d..9f4c2f0aaf5c8 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -215,5 +215,6 @@ bool rdt_cpu_has(int flag); void __init intel_rdt_mbm_apply_quirk(void); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 6eb4b6f5fd911..2cd25a0d4637e 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -462,6 +462,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) r->mon.mbm_cntr_assignable = true; cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); 
r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; + hw_res->mbm_cntr_assign_enabled = true; } r->mon_capable = true; @@ -563,3 +564,10 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, if (am) memset(am, 0, sizeof(*am)); } + +void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled); +} From 0eea27ad0e11baccd0e0bcb60311f7e9f2068ff5 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:32 -0500 Subject: [PATCH 062/247] MAINTAINERS: resctrl: Add myself as reviewer BugLink: https://bugs.launchpad.net/bugs/2122432 I have been contributing to resctrl for sometime now and I would like to help with code reviews as well. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Acked-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com (cherry picked from commit d79bab8a48bfcf5495f72d10bf609478a4a3b916) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 263288f73850d..85bd4a19a6c42 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21216,6 +21216,7 @@ M: Tony Luck M: Reinette Chatre R: Dave Martin R: James Morse +R: Babu Moger L: linux-kernel@vger.kernel.org S: Supported F: Documentation/filesystems/resctrl.rst From 7f539c830e0d63aca39f5cb69632f369c1b28ff5 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Tue, 16 Sep 2025 12:25:49 -0500 Subject: [PATCH 063/247] fs/resctrl: Fix counter auto-assignment on mkdir with mbm_event enabled BugLink: https://bugs.launchpad.net/bugs/2122432 rdt_resource::resctrl_mon::mbm_assign_on_mkdir determines if a counter will automatically be assigned to an RMID, MBM event pair when its associated monitor group is created via mkdir. 
Testing shows that counters are always automatically assigned to new monitor groups, whether mbm_assign_on_mkdir is set or not. To support automatic counter assignment the check for mbm_assign_on_mkdir should be in rdtgroup_assign_cntrs() that assigns counters during monitor group creation. Instead, the check for mbm_assign_on_mkdir is in rdtgroup_unassign_cntrs() that is called on monitor group deletion from where counters should always be unassigned, whether mbm_assign_on_mkdir is set or not. Fix automatic counter assignment by moving the mbm_assign_on_mkdir check from rdtgroup_unassign_cntrs() to rdtgroup_assign_cntrs(). [ bp: Replace commit message with Reinette's version. ] Fixes: ef712fe97ec57 ("fs/resctrl: Auto assign counters on mkdir and clean up on group removal") Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Acked-by: Reinette Chatre (cherry picked from commit dd86b69d20fb9fa7e941ed01ff05f1e662fcc3ff) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- fs/resctrl/monitor.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 50c24460d992a..4076336fbba6d 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1200,7 +1200,8 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) || + !r->mon.mbm_assign_on_mkdir) return; if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) @@ -1258,8 +1259,7 @@ void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) || - !r->mon.mbm_assign_on_mkdir) + if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r)) return; if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) From 828f896555bce377c73910b2a4539ea7e2a1b95e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Oct 2021 16:04:55 +0100 Subject: [PATCH 064/247] NVIDIA: SAUCE: DT: cacheinfo: Expose the code to generate a cache-id from a device_node BugLink: https://bugs.launchpad.net/bugs/2122432 The MPAM driver identifies caches by id for use with resctrl. It needs to know the cache-id when probe-ing, but the value isn't set in cacheinfo until device_initcall(). Even after device_initcall(), the cache-id is only available if at least one CPU associated with the cache is online. Instead of making the driver wait, expose the code that generates the cache-id. The parts of the MPAM driver that run early can use this to set up the resctrl structures before cacheinfo is ready in device_initcall(). 
Signed-off-by: James Morse (cherry picked from commit 2af39084438cebc0053e8ddcc4a855873125b518 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/base/cacheinfo.c | 17 ++++++++++++----- include/linux/cacheinfo.h | 1 + 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index 9a16b4e7c2fc3..1d0094c3d8729 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -210,8 +210,7 @@ static bool match_cache_node(struct device_node *cpu, #define arch_compact_of_hwid(_x) (_x) #endif -static void cache_of_set_id(struct cacheinfo *this_leaf, - struct device_node *cache_node) +u32 cache_of_calculate_id(struct device_node *cache_node) { struct device_node *cpu; u32 min_id = ~0; @@ -222,15 +221,23 @@ static void cache_of_set_id(struct cacheinfo *this_leaf, id = arch_compact_of_hwid(id); if (FIELD_GET(GENMASK_ULL(63, 32), id)) { of_node_put(cpu); - return; + return ~0; } if (match_cache_node(cpu, cache_node)) min_id = min(min_id, id); } - if (min_id != ~0) { - this_leaf->id = min_id; + return min_id; +} + +static void cache_of_set_id(struct cacheinfo *this_leaf, + struct device_node *cache_node) +{ + u32 id = cache_of_calculate_id(cache_node); + + if (id != ~0) { + this_leaf->id = id; this_leaf->attributes |= CACHE_ID; } } diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index c8f4f0a0b874e..6c8b6a4559312 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -112,6 +112,7 @@ int acpi_get_cache_info(unsigned int cpu, #endif const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf); +u32 cache_of_calculate_id(struct device_node *np); /* * Get the cacheinfo structure for the cache associated with @cpu at From 31ee2bd1cec3bb0eb2fe18714d0e3e5e7aae598d Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 4 Dec 2023 14:33:19 +0000 
Subject: [PATCH 065/247] NVIDIA: SAUCE: ACPI / PPTT: Add a helper to fill a cpumask from a processor container BugLink: https://bugs.launchpad.net/bugs/2122432 The ACPI MPAM table uses the UID of a processor container specified in the PPTT to indicate the subset of CPUs and cache topology that can access each MPAM System Component (MSC). This information is not directly useful to the kernel. The equivalent cpumask is needed instead. Add a helper to find the processor container by its id, then walk the possible CPUs to fill a cpumask with the CPUs that have this processor container as a parent. CC: Dave Martin Reviewed-by: Sudeep Holla Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit a6aad1476a772dc16291c3e6efc2ca9af0ed428c https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/acpi/pptt.c | 82 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 3 ++ 2 files changed, 85 insertions(+) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index 54676e3d82dd5..58cfa3916a13e 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -817,3 +817,85 @@ int find_acpi_cpu_topology_hetero_id(unsigned int cpu) return find_acpi_cpu_topology_tag(cpu, PPTT_ABORT_PACKAGE, ACPI_PPTT_ACPI_IDENTICAL); } + +/** + * acpi_pptt_get_child_cpus() - Find all the CPUs below a PPTT processor node + * @table_hdr: A reference to the PPTT table. + * @parent_node: A pointer to the processor node in the @table_hdr. + * @cpus: A cpumask to fill with the CPUs below @parent_node. + * + * Walks up the PPTT from every possible CPU to find if the provided + * @parent_node is a parent of this CPU. 
+ */ +static void acpi_pptt_get_child_cpus(struct acpi_table_header *table_hdr, + struct acpi_pptt_processor *parent_node, + cpumask_t *cpus) +{ + struct acpi_pptt_processor *cpu_node; + u32 acpi_id; + int cpu; + + cpumask_clear(cpus); + + for_each_possible_cpu(cpu) { + acpi_id = get_acpi_id_for_cpu(cpu); + cpu_node = acpi_find_processor_node(table_hdr, acpi_id); + + while (cpu_node) { + if (cpu_node == parent_node) { + cpumask_set_cpu(cpu, cpus); + break; + } + cpu_node = fetch_pptt_node(table_hdr, cpu_node->parent); + } + } +} + +/** + * acpi_pptt_get_cpus_from_container() - Populate a cpumask with all CPUs in a + * processor container + * @acpi_cpu_id: The UID of the processor container. + * @cpus: The resulting CPU mask. + * + * Find the specified Processor Container, and fill @cpus with all the cpus + * below it. + * + * Not all 'Processor' entries in the PPTT are either a CPU or a Processor + * Container, they may exist purely to describe a Private resource. CPUs + * have to be leaves, so a Processor Container is a non-leaf that has the + * 'ACPI Processor ID valid' flag set. 
+ */ +void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) +{ + struct acpi_table_header *table_hdr; + struct acpi_subtable_header *entry; + unsigned long table_end; + u32 proc_sz; + + cpumask_clear(cpus); + + table_hdr = acpi_get_pptt(); + if (!table_hdr) + return; + + table_end = (unsigned long)table_hdr + table_hdr->length; + entry = ACPI_ADD_PTR(struct acpi_subtable_header, table_hdr, + sizeof(struct acpi_table_pptt)); + proc_sz = sizeof(struct acpi_pptt_processor); + while ((unsigned long)entry + proc_sz <= table_end) { + + if (entry->type == ACPI_PPTT_TYPE_PROCESSOR) { + struct acpi_pptt_processor *cpu_node; + + cpu_node = (struct acpi_pptt_processor *)entry; + if (cpu_node->flags & ACPI_PPTT_ACPI_PROCESSOR_ID_VALID && + !acpi_pptt_leaf_node(table_hdr, cpu_node) && + cpu_node->acpi_processor_id == acpi_cpu_id) { + acpi_pptt_get_child_cpus(table_hdr, cpu_node, cpus); + break; + } + } + entry = ACPI_ADD_PTR(struct acpi_subtable_header, entry, + entry->length); + } +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 219ef1b5970fc..bb03b52a13ab6 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1541,6 +1541,7 @@ int find_acpi_cpu_topology(unsigned int cpu, int level); int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); +void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { @@ -1562,6 +1563,8 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) { return -EINVAL; } +static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, + cpumask_t *cpus) { } #endif void acpi_arch_init(void); From 4942fc06df76268e8144d83f946e8dcc8ce2a8f7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 11 Jun 2025 17:02:00 +0100 Subject: [PATCH 066/247] NVIDIA: SAUCE: ACPI / PPTT: Stop acpi_count_levels() expecting 
callers to clear levels BugLink: https://bugs.launchpad.net/bugs/2122432 In acpi_count_levels(), the initial value of *levels passed by the caller is really an implementation detail of acpi_count_levels(), so it is unreasonable to expect the callers of this function to know what to pass in for this parameter. The only sensible initial value is 0, which is what the only upstream caller (acpi_get_cache_info()) passes. Use a local variable for the starting cache level in acpi_count_levels(), and pass the result back to the caller via the function return value. Get rid of the levels parameter, which has no remaining purpose. Fix acpi_get_cache_info() to match. Suggested-by: Jonathan Cameron Reviewed-by: Lorenzo Pieralisi Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit c29acdf93d972a6653d94ee109e7e6a8c5134e20 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/acpi/pptt.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index 58cfa3916a13e..63c3a344c075d 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -177,14 +177,14 @@ acpi_find_cache_level(struct acpi_table_header *table_hdr, } /** - * acpi_count_levels() - Given a PPTT table, and a CPU node, count the cache - * levels and split cache levels (data/instruction). + * acpi_count_levels() - Given a PPTT table, and a CPU node, count the + * total number of levels and split cache levels (data/instruction). * @table_hdr: Pointer to the head of the PPTT table * @cpu_node: processor node we wish to count caches for - * @levels: Number of levels if success. * @split_levels: Number of split cache levels (data/instruction) if * success. Can by NULL. * + * Return: number of levels. 
* Given a processor node containing a processing unit, walk into it and count * how many levels exist solely for it, and then walk up each level until we hit * the root node (ignore the package level because it may be possible to have @@ -192,14 +192,18 @@ acpi_find_cache_level(struct acpi_table_header *table_hdr, * split cache levels (data/instruction) that exist at each level on the way * up. */ -static void acpi_count_levels(struct acpi_table_header *table_hdr, - struct acpi_pptt_processor *cpu_node, - unsigned int *levels, unsigned int *split_levels) +static int acpi_count_levels(struct acpi_table_header *table_hdr, + struct acpi_pptt_processor *cpu_node, + unsigned int *split_levels) { + int starting_level = 0; + do { - acpi_find_cache_level(table_hdr, cpu_node, levels, split_levels, 0, 0); + acpi_find_cache_level(table_hdr, cpu_node, &starting_level, split_levels, 0, 0); cpu_node = fetch_pptt_node(table_hdr, cpu_node->parent); } while (cpu_node); + + return starting_level; } /** @@ -645,7 +649,7 @@ int acpi_get_cache_info(unsigned int cpu, unsigned int *levels, if (!cpu_node) return -ENOENT; - acpi_count_levels(table, cpu_node, levels, split_levels); + *levels = acpi_count_levels(table, cpu_node, split_levels); pr_debug("Cache Setup: last_level=%d split_levels=%d\n", *levels, split_levels ? *split_levels : -1); From d65a17db8579ce74f491aad28205cba86aaf5c3c Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 30 Nov 2020 13:29:56 +0000 Subject: [PATCH 067/247] NVIDIA: SAUCE: ACPI / PPTT: Find cache level by cache-id BugLink: https://bugs.launchpad.net/bugs/2122432 The MPAM table identifies caches by id. The MPAM driver also wants to know the cache level to determine if the platform is of the shape that can be managed via resctrl. Cacheinfo has this information, but only for CPUs that are online. Waiting for all CPUs to come online is a problem for platforms where CPUs are brought online late by user-space. 
Add a helper that walks every possible cache, until it finds the one identified by cache-id, then return the level. Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 0dd98ac483407b74641bbb2551b72ca09ac042bf https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/acpi/pptt.c | 82 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 5 +++ 2 files changed, 87 insertions(+) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index 63c3a344c075d..50c8f2a3c927e 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -350,6 +350,27 @@ static struct acpi_pptt_cache *acpi_find_cache_node(struct acpi_table_header *ta return found; } +static struct acpi_pptt_cache * +acpi_find_any_type_cache_node(struct acpi_table_header *table_hdr, + u32 acpi_cpu_id, unsigned int level, + struct acpi_pptt_processor **node) +{ + struct acpi_pptt_cache *cache; + + cache = acpi_find_cache_node(table_hdr, acpi_cpu_id, CACHE_TYPE_UNIFIED, + level, node); + if (cache) + return cache; + + cache = acpi_find_cache_node(table_hdr, acpi_cpu_id, CACHE_TYPE_DATA, + level, node); + if (cache) + return cache; + + return acpi_find_cache_node(table_hdr, acpi_cpu_id, CACHE_TYPE_INST, + level, node); +} + /** * update_cache_properties() - Update cacheinfo for the given processor * @this_leaf: Kernel cache info structure being updated @@ -903,3 +924,64 @@ void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) entry->length); } } + +/* + * find_acpi_cache_level_from_id() - Get the level of the specified cache + * @cache_id: The id field of the cache + * + * Determine the level relative to any CPU for the cache identified by + * cache_id. This allows the property to be found even if the CPUs are offline. + * + * The returned level can be used to group caches that are peers. 
+ * + * The PPTT table must be rev 3 or later. + * + * If one CPU's L2 is shared with another CPU as L3, this function will return + * an unpredictable value. + * + * Return: -ENOENT if the PPTT doesn't exist, the revision isn't supported or + * the cache cannot be found. + * Otherwise returns a value which represents the level of the specified cache. + */ +int find_acpi_cache_level_from_id(u32 cache_id) +{ + int level, cpu; + u32 acpi_cpu_id; + struct acpi_pptt_cache *cache; + struct acpi_table_header *table; + struct acpi_pptt_cache_v1 *cache_v1; + struct acpi_pptt_processor *cpu_node; + + table = acpi_get_pptt(); + if (!table) + return -ENOENT; + + if (table->revision < 3) + return -ENOENT; + + for_each_possible_cpu(cpu) { + acpi_cpu_id = get_acpi_id_for_cpu(cpu); + cpu_node = acpi_find_processor_node(table, acpi_cpu_id); + if (!cpu_node) + continue; + + /* Start at 1 for L1 */ + level = 1; + cache = acpi_find_any_type_cache_node(table, acpi_cpu_id, level, + &cpu_node); + while (cache) { + cache_v1 = ACPI_ADD_PTR(struct acpi_pptt_cache_v1, + cache, sizeof(*cache)); + + if (cache->flags & ACPI_PPTT_CACHE_ID_VALID && + cache_v1->cache_id == cache_id) + return level; + + level++; + cache = acpi_find_any_type_cache_node(table, acpi_cpu_id, + level, &cpu_node); + } + } + + return -ENOENT; +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index bb03b52a13ab6..decc3672be996 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1542,6 +1542,7 @@ int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); +int find_acpi_cache_level_from_id(u32 cache_id); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { @@ -1565,6 +1566,10 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) } static inline void acpi_pptt_get_cpus_from_container(u32 
acpi_cpu_id, cpumask_t *cpus) { } +static inline int find_acpi_cache_level_from_id(u32 cache_id) +{ + return -ENOENT; +} #endif void acpi_arch_init(void); From 599bccb54e24d63b4b869a127dde5cb83dc0cc56 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 May 2021 15:16:28 +0100 Subject: [PATCH 068/247] NVIDIA: SAUCE: ACPI / PPTT: Add a helper to fill a cpumask from a cache_id BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM identifies CPUs by the cache_id in the PPTT cache structure. The driver needs to know which CPUs are associated with the cache. The CPUs may not all be online, so cacheinfo does not have the information. Add a helper to pull this information out of the PPTT. CC: Rohit Mathew Reviewed-by: Sudeep Holla Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit e8467e9a04989988b121753e9408eb06c32a42c9 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/acpi/pptt.c | 64 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 6 +++++ 2 files changed, 70 insertions(+) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index 50c8f2a3c927e..2f86f58699a69 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -985,3 +985,67 @@ int find_acpi_cache_level_from_id(u32 cache_id) return -ENOENT; } + +/** + * acpi_pptt_get_cpumask_from_cache_id() - Get the cpus associated with the + * specified cache + * @cache_id: The id field of the cache + * @cpus: Where to build the cpumask + * + * Determine which CPUs are below this cache in the PPTT. This allows the property + * to be found even if the CPUs are offline. + * + * The PPTT table must be rev 3 or later, + * + * Return: -ENOENT if the PPTT doesn't exist, or the cache cannot be found. + * Otherwise returns 0 and sets the cpus in the provided cpumask. 
+ */ +int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus) +{ + int level, cpu; + u32 acpi_cpu_id; + struct acpi_pptt_cache *cache; + struct acpi_table_header *table; + struct acpi_pptt_cache_v1 *cache_v1; + struct acpi_pptt_processor *cpu_node; + + cpumask_clear(cpus); + + table = acpi_get_pptt(); + if (!table) + return -ENOENT; + + if (table->revision < 3) + return -ENOENT; + + for_each_possible_cpu(cpu) { + acpi_cpu_id = get_acpi_id_for_cpu(cpu); + cpu_node = acpi_find_processor_node(table, acpi_cpu_id); + if (!cpu_node) + continue; + + /* Start at 1 for L1 */ + level = 1; + cache = acpi_find_any_type_cache_node(table, acpi_cpu_id, level, + &cpu_node); + while (cache) { + cache_v1 = ACPI_ADD_PTR(struct acpi_pptt_cache_v1, + cache, sizeof(*cache)); + if (!cache) + continue; + + cache_v1 = ACPI_ADD_PTR(struct acpi_pptt_cache_v1, + cache, sizeof(*cache)); + + if (cache->flags & ACPI_PPTT_CACHE_ID_VALID && + cache_v1->cache_id == cache_id) + cpumask_set_cpu(cpu, cpus); + + level++; + cache = acpi_find_any_type_cache_node(table, acpi_cpu_id, + level, &cpu_node); + } + } + + return 0; +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index decc3672be996..dc95b0d0e0e4d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1543,6 +1543,7 @@ int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); int find_acpi_cache_level_from_id(u32 cache_id); +int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { @@ -1570,6 +1571,11 @@ static inline int find_acpi_cache_level_from_id(u32 cache_id) { return -ENOENT; } +static inline int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, + cpumask_t *cpus) +{ + return -ENOENT; +} #endif void acpi_arch_init(void); From 661ced4a395b63b9446e6a2eef3db7389c028e5f Mon Sep 17 00:00:00 2001 From: 
James Morse Date: Wed, 17 Sep 2025 16:34:31 +0100 Subject: [PATCH 069/247] NVIDIA: SAUCE: DROP: ACPI / PPTT: Add a for_each_acpi_pptt_entry() helper BugLink: https://bugs.launchpad.net/bugs/2122432 The PPTT has three functions that loop over the table looking at each entry. This adds a fair amount of visual distraction which isn't relevant to what each of these functions do. Add a for_each_acpi_pptt_entry() helper to do this work making the users easier on the eye. Signed-off-by: James Morse (cherry picked from commit f549ad6ffcb4b39411f0fd7674de365fe7a0d8f8 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/acpi/pptt.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index 2f86f58699a69..4adb3de10c3de 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -21,6 +21,15 @@ #include #include +#define for_each_acpi_pptt_entry(table, entry) \ + for ((entry = ACPI_ADD_PTR(struct acpi_subtable_header, table, \ + sizeof(struct acpi_table_pptt))); \ + ((unsigned long)entry + sizeof(struct acpi_subtable_header)) \ + <= ((unsigned long)table + table->length); \ + (entry = ACPI_ADD_PTR(struct acpi_subtable_header, entry, \ + (entry)->length))) + + static struct acpi_subtable_header *fetch_pptt_subtable(struct acpi_table_header *table_hdr, u32 pptt_ref) { @@ -221,22 +230,16 @@ static int acpi_pptt_leaf_node(struct acpi_table_header *table_hdr, struct acpi_pptt_processor *node) { struct acpi_subtable_header *entry; - unsigned long table_end; u32 node_entry; struct acpi_pptt_processor *cpu_node; - u32 proc_sz; if (table_hdr->revision > 1) return (node->flags & ACPI_PPTT_ACPI_LEAF_NODE); - table_end = (unsigned long)table_hdr + table_hdr->length; node_entry = ACPI_PTR_DIFF(node, table_hdr); - entry = ACPI_ADD_PTR(struct acpi_subtable_header, table_hdr, - 
sizeof(struct acpi_table_pptt)); - proc_sz = sizeof(struct acpi_pptt_processor); /* ignore subtable types that are smaller than a processor node */ - while ((unsigned long)entry + proc_sz <= table_end) { + for_each_acpi_pptt_entry(table_hdr, entry) { cpu_node = (struct acpi_pptt_processor *)entry; if (entry->type == ACPI_PPTT_TYPE_PROCESSOR && @@ -244,9 +247,6 @@ static int acpi_pptt_leaf_node(struct acpi_table_header *table_hdr, return 0; if (entry->length == 0) return 0; - - entry = ACPI_ADD_PTR(struct acpi_subtable_header, entry, - entry->length); } return 1; } @@ -274,12 +274,10 @@ static struct acpi_pptt_processor *acpi_find_processor_node(struct acpi_table_he u32 proc_sz; table_end = (unsigned long)table_hdr + table_hdr->length; - entry = ACPI_ADD_PTR(struct acpi_subtable_header, table_hdr, - sizeof(struct acpi_table_pptt)); proc_sz = sizeof(struct acpi_pptt_processor); /* find the processor structure associated with this cpuid */ - while ((unsigned long)entry + proc_sz <= table_end) { + for_each_acpi_pptt_entry(table_hdr, entry) { cpu_node = (struct acpi_pptt_processor *)entry; if (entry->length == 0) { @@ -294,9 +292,6 @@ static struct acpi_pptt_processor *acpi_find_processor_node(struct acpi_table_he acpi_pptt_leaf_node(table_hdr, cpu_node)) { return (struct acpi_pptt_processor *)entry; } - - entry = ACPI_ADD_PTR(struct acpi_subtable_header, entry, - entry->length); } return NULL; @@ -894,8 +889,6 @@ void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) { struct acpi_table_header *table_hdr; struct acpi_subtable_header *entry; - unsigned long table_end; - u32 proc_sz; cpumask_clear(cpus); @@ -903,12 +896,7 @@ void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) if (!table_hdr) return; - table_end = (unsigned long)table_hdr + table_hdr->length; - entry = ACPI_ADD_PTR(struct acpi_subtable_header, table_hdr, - sizeof(struct acpi_table_pptt)); - proc_sz = sizeof(struct acpi_pptt_processor); - while ((unsigned long)entry + 
proc_sz <= table_end) { - + for_each_acpi_pptt_entry(table_hdr, entry) { if (entry->type == ACPI_PPTT_TYPE_PROCESSOR) { struct acpi_pptt_processor *cpu_node; @@ -920,8 +908,6 @@ void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) break; } } - entry = ACPI_ADD_PTR(struct acpi_subtable_header, entry, - entry->length); } } From 80849246234edf13b5ae617029339d9b208c97b5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 4 Jul 2025 15:45:24 +0100 Subject: [PATCH 070/247] NVIDIA: SAUCE: arm64: kconfig: Add Kconfig entry for MPAM BugLink: https://bugs.launchpad.net/bugs/2122432 The bulk of the MPAM driver lives outside the arch code because it largely manages MMIO devices that generate interrupts. The driver needs a Kconfig symbol to enable it. As MPAM is only found on arm64 platforms, the arm64 tree is the most natural home for the Kconfig option. This Kconfig option will later be used by the arch code to enable or disable the MPAM context-switch code, and to register properties of CPUs with the MPAM driver. Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Tested-by: Fenghua Yu CC: Dave Martin Signed-off-by: James Morse (cherry picked from commit 1a0142e7fa9f7ca1e4209b03b00d15331b402a5d https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/arm64/Kconfig | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c61d85dfa07c4..bd3b75f379b52 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2061,6 +2061,29 @@ config ARM64_TLB_RANGE ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a range of input addresses. 
+config ARM64_MPAM + bool "Enable support for MPAM" + help + Memory System Resource Partitioning and Monitoring (MPAM) is an + optional extension to the Arm architecture that allows each + transaction issued to the memory system to be labelled with a + Partition identifier (PARTID) and Performance Monitoring Group + identifier (PMG). + + Memory system components, such as the caches, can be configured with + policies to control how much of various physical resources (such as + memory bandwidth or cache memory) the transactions labelled with each + PARTID can consume. Depending on the capabilities of the hardware, + the PARTID and PMG can also be used as filtering criteria to measure + the memory system resource consumption of different parts of a + workload. + + Use of this extension requires CPU support, support in the + Memory System Components (MSC), and a description from firmware + of where the MSCs are in the address space. + + MPAM is exposed to user-space via the resctrl pseudo filesystem. + endmenu # "ARMv8.4 architectural features" menu "ARMv8.5 architectural features" From ecd69ed7ebf572e9ac50d7992b468414e4a80ac4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 7 Jul 2025 11:13:54 +0100 Subject: [PATCH 071/247] NVIDIA: SAUCE: ACPI / MPAM: Parse the MPAM table BugLink: https://bugs.launchpad.net/bugs/2122432 Add code to parse the arm64 specific MPAM table, looking up the cache level from the PPTT and feeding the end result into the MPAM driver. This happens in two stages. Platform devices are created first for the MSC devices. Once the driver probes it calls acpi_mpam_parse_resources() to discover the RIS entries the MSC contains. For now the MPAM hook mpam_ris_create() is stubbed out, but will update the MPAM driver with optional discovered data about the RIS entries. 
CC: Carl Worth Link: https://developer.arm.com/documentation/den0065/3-0bet/?lang=en Reviewed-by: Lorenzo Pieralisi Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 22f9e3b01379d6b820853d650a197ab76868172d https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/arm64/Kconfig | 1 + drivers/acpi/arm64/Kconfig | 3 + drivers/acpi/arm64/Makefile | 1 + drivers/acpi/arm64/mpam.c | 377 ++++++++++++++++++++++++++++++++ drivers/acpi/tables.c | 2 +- include/linux/acpi.h | 12 + include/linux/arm_mpam.h | 48 ++++ include/linux/platform_device.h | 1 + 8 files changed, 444 insertions(+), 1 deletion(-) create mode 100644 drivers/acpi/arm64/mpam.c create mode 100644 include/linux/arm_mpam.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index bd3b75f379b52..79e7a80a792d2 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2063,6 +2063,7 @@ config ARM64_TLB_RANGE config ARM64_MPAM bool "Enable support for MPAM" + select ACPI_MPAM if ACPI help Memory System Resource Partitioning and Monitoring (MPAM) is an optional extension to the Arm architecture that allows each diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig index b3ed6212244c1..f2fd79f22e7d8 100644 --- a/drivers/acpi/arm64/Kconfig +++ b/drivers/acpi/arm64/Kconfig @@ -21,3 +21,6 @@ config ACPI_AGDI config ACPI_APMT bool + +config ACPI_MPAM + bool diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile index 05ecde9eaabe9..9390b57cb5648 100644 --- a/drivers/acpi/arm64/Makefile +++ b/drivers/acpi/arm64/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI_APMT) += apmt.o obj-$(CONFIG_ACPI_FFH) += ffh.o obj-$(CONFIG_ACPI_GTDT) += gtdt.o obj-$(CONFIG_ACPI_IORT) += iort.o +obj-$(CONFIG_ACPI_MPAM) += mpam.o obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o obj-$(CONFIG_ARM_AMBA) += amba.o obj-y += dma.o init.o diff --git a/drivers/acpi/arm64/mpam.c 
b/drivers/acpi/arm64/mpam.c new file mode 100644 index 0000000000000..59712397025d4 --- /dev/null +++ b/drivers/acpi/arm64/mpam.c @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +/* Parse the MPAM ACPI table feeding the discovered nodes into the driver */ + +#define pr_fmt(fmt) "ACPI MPAM: " fmt + +#include +#include +#include +#include +#include +#include + +#include + +/* + * Flags for acpi_table_mpam_msc.*_interrupt_flags. + * See 2.1.1 Interrupt Flags, Table 5, of DEN0065B_MPAM_ACPI_3.0-bet. + */ +#define ACPI_MPAM_MSC_IRQ_MODE BIT(0) +#define ACPI_MPAM_MSC_IRQ_TYPE_MASK GENMASK(2, 1) +#define ACPI_MPAM_MSC_IRQ_TYPE_WIRED 0 +#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_MASK BIT(3) +#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR 0 +#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR_CONTAINER 1 +#define ACPI_MPAM_MSC_IRQ_AFFINITY_VALID BIT(4) + +/* + * Encodings for the MSC node body interface type field. + * See 2.1 MPAM MSC node, Table 4 of DEN0065B_MPAM_ACPI_3.0-bet. 
+ */ +#define ACPI_MPAM_MSC_IFACE_MMIO 0x00 +#define ACPI_MPAM_MSC_IFACE_PCC 0x0a + +static bool _is_ppi_partition(u32 flags) +{ + u32 aff_type, is_ppi; + bool ret; + + is_ppi = FIELD_GET(ACPI_MPAM_MSC_IRQ_AFFINITY_VALID, flags); + if (!is_ppi) + return false; + + aff_type = FIELD_GET(ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_MASK, flags); + ret = (aff_type == ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR_CONTAINER); + if (ret) + pr_err_once("Partitioned interrupts not supported\n"); + + return ret; +} + +static bool acpi_mpam_register_irq(struct platform_device *pdev, int intid, + u32 flags, int *irq) +{ + u32 int_type; + int sense; + + if (!intid) + return false; + + if (_is_ppi_partition(flags)) + return false; + + sense = FIELD_GET(ACPI_MPAM_MSC_IRQ_MODE, flags); + int_type = FIELD_GET(ACPI_MPAM_MSC_IRQ_TYPE_MASK, flags); + if (int_type != ACPI_MPAM_MSC_IRQ_TYPE_WIRED) + return false; + + *irq = acpi_register_gsi(&pdev->dev, intid, sense, ACPI_ACTIVE_HIGH); + if (*irq <= 0) { + pr_err_once("Failed to register interrupt 0x%x with ACPI\n", + intid); + return false; + } + + return true; +} + +static void acpi_mpam_parse_irqs(struct platform_device *pdev, + struct acpi_mpam_msc_node *tbl_msc, + struct resource *res, int *res_idx) +{ + u32 flags, intid; + int irq; + + intid = tbl_msc->overflow_interrupt; + flags = tbl_msc->overflow_interrupt_flags; + if (acpi_mpam_register_irq(pdev, intid, flags, &irq)) + res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "overflow"); + + intid = tbl_msc->error_interrupt; + flags = tbl_msc->error_interrupt_flags; + if (acpi_mpam_register_irq(pdev, intid, flags, &irq)) + res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "error"); +} + +static int acpi_mpam_parse_resource(struct mpam_msc *msc, + struct acpi_mpam_resource_node *res) +{ + int level, nid; + u32 cache_id; + + switch (res->locator_type) { + case ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE: + cache_id = res->locator.cache_locator.cache_reference; + level = find_acpi_cache_level_from_id(cache_id); + 
if (level <= 0) { + pr_err_once("Bad level (%d) for cache with id %u\n", level, cache_id); + return -EINVAL; + } + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE, + level, cache_id); + case ACPI_MPAM_LOCATION_TYPE_MEMORY: + nid = pxm_to_node(res->locator.memory_locator.proximity_domain); + if (nid == NUMA_NO_NODE) { + pr_debug("Bad proxmity domain %lld, using node 0 instead\n", + res->locator.memory_locator.proximity_domain); + nid = 0; + } + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY, + 255, nid); + default: + /* These get discovered later and are treated as unknown */ + return 0; + } +} + +int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc) +{ + int i, err; + char *ptr, *table_end; + struct acpi_mpam_resource_node *resource; + + ptr = (char *)(tbl_msc + 1); + table_end = ptr + tbl_msc->length; + for (i = 0; i < tbl_msc->num_resource_nodes; i++) { + u64 max_deps, remaining_table; + + if (ptr + sizeof(*resource) > table_end) + return -EINVAL; + + resource = (struct acpi_mpam_resource_node *)ptr; + + remaining_table = table_end - ptr; + max_deps = remaining_table / sizeof(struct acpi_mpam_func_deps); + if (resource->num_functional_deps > max_deps) { + pr_debug("MSC has impossible number of functional dependencies\n"); + return -EINVAL; + } + + err = acpi_mpam_parse_resource(msc, resource); + if (err) + return err; + + ptr += sizeof(*resource); + ptr += resource->num_functional_deps * sizeof(struct acpi_mpam_func_deps); + } + + return 0; +} + +static bool __init parse_msc_pm_link(struct acpi_mpam_msc_node *tbl_msc, + struct platform_device *pdev, + u32 *acpi_id) +{ + char hid[sizeof(tbl_msc->hardware_id_linked_device) + 1] = { 0 }; + bool acpi_id_valid = false; + struct acpi_device *buddy; + char uid[11]; + int err; + + memcpy(hid, &tbl_msc->hardware_id_linked_device, + sizeof(tbl_msc->hardware_id_linked_device)); + + if (!strcmp(hid, ACPI_PROCESSOR_CONTAINER_HID)) { + *acpi_id = 
tbl_msc->instance_id_linked_device; + acpi_id_valid = true; + } + + err = snprintf(uid, sizeof(uid), "%u", + tbl_msc->instance_id_linked_device); + if (err >= sizeof(uid)) { + pr_debug("Failed to convert uid of device for power management."); + return acpi_id_valid; + } + + buddy = acpi_dev_get_first_match_dev(hid, uid, -1); + if (buddy) + device_link_add(&pdev->dev, &buddy->dev, DL_FLAG_STATELESS); + + return acpi_id_valid; +} + +static int decode_interface_type(struct acpi_mpam_msc_node *tbl_msc, + enum mpam_msc_iface *iface) +{ + switch (tbl_msc->interface_type) { + case ACPI_MPAM_MSC_IFACE_MMIO: + *iface = MPAM_IFACE_MMIO; + return 0; + case ACPI_MPAM_MSC_IFACE_PCC: + *iface = MPAM_IFACE_PCC; + return 0; + default: + return -EINVAL; + } +} + +static struct platform_device * __init acpi_mpam_parse_msc(struct acpi_mpam_msc_node *tbl_msc) +{ + struct platform_device *pdev __free(platform_device_put) = platform_device_alloc("mpam_msc", tbl_msc->identifier); + int next_res = 0, next_prop = 0, err; + /* pcc, nrdy, affinity and a sentinel */ + struct property_entry props[4] = { 0 }; + /* mmio, 2xirq, no sentinel. 
*/ + struct resource res[3] = { 0 }; + struct acpi_device *companion; + enum mpam_msc_iface iface; + char uid[16]; + u32 acpi_id; + + if (!pdev) + return ERR_PTR(-ENOMEM); + + /* Some power management is described in the namespace: */ + err = snprintf(uid, sizeof(uid), "%u", tbl_msc->identifier); + if (err > 0 && err < sizeof(uid)) { + companion = acpi_dev_get_first_match_dev("ARMHAA5C", uid, -1); + if (companion) + ACPI_COMPANION_SET(&pdev->dev, companion); + else + pr_debug("MSC.%u: missing namespace entry\n", + tbl_msc->identifier); + } + + if (decode_interface_type(tbl_msc, &iface)) { + pr_debug("MSC.%u: unknown interface type\n", tbl_msc->identifier); + return ERR_PTR(-EINVAL); + } + + if (iface == MPAM_IFACE_MMIO) + res[next_res++] = DEFINE_RES_MEM_NAMED(tbl_msc->base_address, + tbl_msc->mmio_size, + "MPAM:MSC"); + else if (iface == MPAM_IFACE_PCC) + props[next_prop++] = PROPERTY_ENTRY_U32("pcc-channel", + tbl_msc->base_address); + + acpi_mpam_parse_irqs(pdev, tbl_msc, res, &next_res); + + WARN_ON_ONCE(next_res > ARRAY_SIZE(res)); + err = platform_device_add_resources(pdev, res, next_res); + if (err) + return ERR_PTR(err); + + props[next_prop++] = PROPERTY_ENTRY_U32("arm,not-ready-us", + tbl_msc->max_nrdy_usec); + + /* + * The MSC's CPU affinity is described via its linked power + * management device, but only if it points at a Processor or + * Processor Container. + */ + if (parse_msc_pm_link(tbl_msc, pdev, &acpi_id)) + props[next_prop++] = PROPERTY_ENTRY_U32("cpu_affinity", acpi_id); + + WARN_ON_ONCE(next_prop > ARRAY_SIZE(props)); + err = device_create_managed_software_node(&pdev->dev, props, NULL); + if (err) + return ERR_PTR(err); + + /* + * Stash the table entry for acpi_mpam_parse_resources() to discover + * what this MSC controls. 
+ */ + err = platform_device_add_data(pdev, tbl_msc, tbl_msc->length); + if (err) + return ERR_PTR(err); + + err = platform_device_add(pdev); + if (err) + return ERR_PTR(err); + + return_ptr(pdev); +} + +static int __init acpi_mpam_parse(void) +{ + struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0); + char *table_end, *table_offset = (char *)(table + 1); + struct acpi_mpam_msc_node *tbl_msc; + struct platform_device *pdev; + + if (acpi_disabled || !system_supports_mpam() || IS_ERR(table)) + return 0; + + if (table->revision < 1) + return 0; + + table_end = (char *)table + table->length; + + while (table_offset < table_end) { + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; + table_offset += tbl_msc->length; + + if (table_offset > table_end) { + pr_err("MSC entry overlaps end of ACPI table\n"); + return -EINVAL; + } + + /* + * If any of the reserved fields are set, make no attempt to + * parse the MSC structure. This MSC will still be counted by + * acpi_mpam_count_msc(), meaning the MPAM driver can't probe + * against all MSC, and will never be enabled. There is no way + * to enable it safely, because we cannot determine safe + * system-wide partid and pmg ranges in this situation. 
+ */ + if (tbl_msc->reserved || tbl_msc->reserved1 || tbl_msc->reserved2) { + pr_err_once("Unrecognised MSC, MPAM not usable\n"); + pr_debug("MSC.%u: reserved field set\n", tbl_msc->identifier); + continue; + } + + if (!tbl_msc->mmio_size) { + pr_debug("MSC.%u: marked as disabled\n", tbl_msc->identifier); + continue; + } + + pdev = acpi_mpam_parse_msc(tbl_msc); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + } + + return 0; +} + +int acpi_mpam_count_msc(void) +{ + struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0); + char *table_end, *table_offset = (char *)(table + 1); + struct acpi_mpam_msc_node *tbl_msc; + int count = 0; + + if (IS_ERR(table)) + return 0; + + if (table->revision < 1) + return 0; + + table_end = (char *)table + table->length; + + while (table_offset < table_end) { + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; + if (!tbl_msc->mmio_size) + continue; + + if (tbl_msc->length < sizeof(*tbl_msc)) + return -EINVAL; + if (tbl_msc->length > table_end - table_offset) + return -EINVAL; + table_offset += tbl_msc->length; + + count++; + } + + return count; +} + +/* + * Call after ACPI devices have been created, which happens behind acpi_scan_init() + * called from subsys_initcall(). PCC requires the mailbox driver, which is + * initialised from postcore_initcall(). 
+ */ +subsys_initcall_sync(acpi_mpam_parse); diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index fa9bb8c8ce953..f0cf9d7562e08 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -408,7 +408,7 @@ static const char table_sigs[][ACPI_NAMESEG_SIZE] __nonstring_array __initconst ACPI_SIG_PSDT, ACPI_SIG_RSDT, ACPI_SIG_XSDT, ACPI_SIG_SSDT, ACPI_SIG_IORT, ACPI_SIG_NFIT, ACPI_SIG_HMAT, ACPI_SIG_PPTT, ACPI_SIG_NHLT, ACPI_SIG_AEST, ACPI_SIG_CEDT, ACPI_SIG_AGDI, - ACPI_SIG_NBFT }; + ACPI_SIG_NBFT, ACPI_SIG_MPAM}; #define ACPI_HEADER_SIZE sizeof(struct acpi_table_header) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index dc95b0d0e0e4d..03ba4992ba54f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -8,6 +8,7 @@ #ifndef _LINUX_ACPI_H #define _LINUX_ACPI_H +#include #include #include /* for struct resource */ #include @@ -221,6 +222,17 @@ void acpi_reserve_initial_tables (void); void acpi_table_init_complete (void); int acpi_table_init (void); +static inline struct acpi_table_header *acpi_get_table_ret(char *signature, u32 instance) +{ + struct acpi_table_header *table; + int status = acpi_get_table(signature, instance, &table); + + if (ACPI_FAILURE(status)) + return ERR_PTR(-ENOENT); + return table; +} +DEFINE_FREE(acpi_table, struct acpi_table_header *, if (!IS_ERR(_T)) acpi_put_table(_T)) + int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init_or_acpilib acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h new file mode 100644 index 0000000000000..3d6c39c667c39 --- /dev/null +++ b/include/linux/arm_mpam.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2025 Arm Ltd. 
*/ + +#ifndef __LINUX_ARM_MPAM_H +#define __LINUX_ARM_MPAM_H + +#include +#include + +#define GLOBAL_AFFINITY ~0 + +struct mpam_msc; + +enum mpam_msc_iface { + MPAM_IFACE_MMIO, /* a real MPAM MSC */ + MPAM_IFACE_PCC, /* a fake MPAM MSC */ +}; + +enum mpam_class_types { + MPAM_CLASS_CACHE, /* Well known caches, e.g. L2 */ + MPAM_CLASS_MEMORY, /* Main memory */ + MPAM_CLASS_UNKNOWN, /* Everything else, e.g. SMMU */ +}; + +#ifdef CONFIG_ACPI_MPAM +/* Parse the ACPI description of resources entries for this MSC. */ +int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc); + +int acpi_mpam_count_msc(void); +#else +static inline int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc) +{ + return -EINVAL; +} + +static inline int acpi_mpam_count_msc(void) { return -EINVAL; } +#endif + +static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, + int component_id) +{ + return -EINVAL; +} + +#endif /* __LINUX_ARM_MPAM_H */ diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 074754c23d330..23a30ada2d4cf 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -232,6 +232,7 @@ extern int platform_device_add_data(struct platform_device *pdev, extern int platform_device_add(struct platform_device *pdev); extern void platform_device_del(struct platform_device *pdev); extern void platform_device_put(struct platform_device *pdev); +DEFINE_FREE(platform_device_put, struct platform_device *, if (_T) platform_device_put(_T)) struct platform_driver { int (*probe)(struct platform_device *); From 10b5598d816ad7833a55886d41c3ec8a9eb313c4 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 12 Nov 2021 13:24:35 -0600 Subject: [PATCH 072/247] NVIDIA: SAUCE: DT: dt-bindings: arm: Add MPAM MSC binding BugLink: https://bugs.launchpad.net/bugs/2122432 The binding is designed around the assumption that an MSC will 
be a sub-block of something else such as a memory controller, cache controller, or IOMMU. However, it's certainly possible a design does not have that association or has a mixture of both, so the binding illustrates how we can support that with RIS child nodes. A key part of MPAM is we need to know about all of the MSCs in the system before it can be enabled. This drives the need for the genericish 'arm,mpam-msc' compatible. Though we can't assume an MSC is accessible until a h/w specific driver potentially enables the h/w. Cc: James Morse Signed-off-by: Rob Herring Signed-off-by: James Morse (cherry picked from commit b38bed339681b3d90ff7508f8a585127bd721d90 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- .../devicetree/bindings/arm/arm,mpam-msc.yaml | 199 ++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 Documentation/devicetree/bindings/arm/arm,mpam-msc.yaml diff --git a/Documentation/devicetree/bindings/arm/arm,mpam-msc.yaml b/Documentation/devicetree/bindings/arm/arm,mpam-msc.yaml new file mode 100644 index 0000000000000..53a6fdbbf05fe --- /dev/null +++ b/Documentation/devicetree/bindings/arm/arm,mpam-msc.yaml @@ -0,0 +1,199 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/arm/arm,mpam-msc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Arm Memory System Resource Partitioning and Monitoring (MPAM) + +description: | + The Arm MPAM specification can be found here: + + https://developer.arm.com/documentation/ddi0598/latest + +maintainers: + - Rob Herring + +properties: + compatible: + items: + - const: arm,mpam-msc # Further details are discoverable + - const: arm,mpam-memory-controller-msc + + reg: + maxItems: 1 + description: A memory region containing registers as defined in the MPAM + specification. 
+ + interrupts: + minItems: 1 + items: + - description: error (optional) + - description: overflow (optional, only for monitoring) + + interrupt-names: + oneOf: + - items: + - enum: [ error, overflow ] + - items: + - const: error + - const: overflow + + arm,not-ready-us: + description: The maximum time in microseconds for monitoring data to be + accurate after a settings change. For more information, see the + Not-Ready (NRDY) bit description in the MPAM specification. + + numa-node-id: true # see NUMA binding + + '#address-cells': + const: 1 + + '#size-cells': + const: 0 + +patternProperties: + '^ris@[0-9a-f]+$': + type: object + additionalProperties: false + description: + RIS nodes for each resource instance in an MSC. These nodes are required + for each resource instance implementing known MPAM controls + + properties: + compatible: + enum: + - arm,mpam-cache + # Memory bandwidth + - arm,mpam-memory + + reg: + minimum: 0 + maximum: 0xf + + cpus: + description: + Phandle(s) to the CPU node(s) this RIS belongs to. By default, the parent + device's affinity is used. + + arm,mpam-device: + $ref: /schemas/types.yaml#/definitions/phandle + description: + By default, the MPAM enabled device associated with a RIS is the MSC's + parent node. It is possible for each RIS to be associated with different + devices in which case 'arm,mpam-device' should be used. 
+ + required: + - compatible + - reg + +required: + - compatible + - reg + +dependencies: + interrupts: [ interrupt-names ] + +additionalProperties: false + +examples: + - | + L3: cache-controller@30000000 { + compatible = "arm,dsu-l3-cache", "cache"; + cache-level = <3>; + cache-unified; + + ranges = <0x0 0x30000000 0x800000>; + #address-cells = <1>; + #size-cells = <1>; + + msc@10000 { + compatible = "arm,mpam-msc"; + + reg = <0x10000 0x2000>; + interrupts = <1>, <2>; + interrupt-names = "error", "overflow"; + arm,not-ready-us = <1>; + /* CPU affinity implied by parent cache node */ + }; + }; + + mem: memory-controller@20000 { + compatible = "foo,a-memory-controller"; + reg = <0x20000 0x1000>; + + #address-cells = <1>; + #size-cells = <1>; + ranges; + + msc@21000 { + compatible = "arm,mpam-memory-controller-msc", "arm,mpam-msc"; + reg = <0x21000 0x1000>; + interrupts = <3>; + interrupt-names = "error"; + arm,not-ready-us = <1>; + numa-node-id = <1>; + }; + }; + + iommu@40000 { + reg = <0x40000 0x1000>; + + ranges; + #address-cells = <1>; + #size-cells = <1>; + + msc@41000 { + compatible = "arm,mpam-msc"; + reg = <0 0x1000>; + interrupts = <5>, <6>; + interrupt-names = "error", "overflow"; + arm,not-ready-us = <1>; + + #address-cells = <1>; + #size-cells = <0>; + + ris@2 { + compatible = "arm,mpam-cache"; + reg = <0>; + // TODO: How to map to device(s)? + }; + }; + }; + + msc@80000 { + compatible = "foo,a-standalone-msc"; + reg = <0x80000 0x1000>; + + clocks = <&clks 123>; + + ranges; + #address-cells = <1>; + #size-cells = <1>; + + msc@10000 { + compatible = "arm,mpam-msc"; + + reg = <0x10000 0x2000>; + interrupts = <7>; + interrupt-names = "overflow"; + arm,not-ready-us = <1>; + + #address-cells = <1>; + #size-cells = <0>; + + ris@0 { + compatible = "arm,mpam-cache"; + reg = <0>; + arm,mpam-device = <&L2_0>; + }; + + ris@1 { + compatible = "arm,mpam-memory"; + reg = <1>; + arm,mpam-device = <&mem>; + }; + }; + }; + +... 
From 24f2485b45d3c16b3c2e7ca825373020498a186d Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 14 Aug 2018 15:03:34 +0100 Subject: [PATCH 073/247] NVIDIA: SAUCE: arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate BugLink: https://bugs.launchpad.net/bugs/2122432 Probing MPAM is convoluted. MSCs that are integrated with a CPU may only be accessible from those CPUs, and they may not be online. Touching the hardware early is pointless as MPAM can't be used until the system-wide common values for num_partid and num_pmg have been discovered. Start with driver probe/remove and mapping the MSC. CC: Carl Worth Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit ac46acae13756a05118b806ef6061f47e51d01c5 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/arm64/Kconfig | 1 + drivers/Kconfig | 2 + drivers/Makefile | 1 + drivers/acpi/arm64/mpam.c | 7 ++ drivers/resctrl/Kconfig | 13 +++ drivers/resctrl/Makefile | 4 + drivers/resctrl/mpam_devices.c | 190 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 52 +++++++++ include/linux/acpi.h | 2 +- 9 files changed, 271 insertions(+), 1 deletion(-) create mode 100644 drivers/resctrl/Kconfig create mode 100644 drivers/resctrl/Makefile create mode 100644 drivers/resctrl/mpam_devices.c create mode 100644 drivers/resctrl/mpam_internal.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 79e7a80a792d2..17394c637adbf 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2063,6 +2063,7 @@ config ARM64_TLB_RANGE config ARM64_MPAM bool "Enable support for MPAM" + select ARM64_MPAM_DRIVER if EXPERT # does nothing yet select ACPI_MPAM if ACPI help Memory System Resource Partitioning and Monitoring (MPAM) is an diff --git a/drivers/Kconfig b/drivers/Kconfig index 4915a63866b01..3054b50a2f4cb 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -251,4 
+251,6 @@ source "drivers/hte/Kconfig" source "drivers/cdx/Kconfig" +source "drivers/resctrl/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 9afe024f2d755..6c2af14c22b87 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -194,5 +194,6 @@ obj-$(CONFIG_HTE) += hte/ obj-$(CONFIG_DRM_ACCEL) += accel/ obj-$(CONFIG_CDX_BUS) += cdx/ obj-$(CONFIG_DPLL) += dpll/ +obj-y += resctrl/ obj-$(CONFIG_S390) += s390/ diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c index 59712397025d4..51c6f5fd4a5e0 100644 --- a/drivers/acpi/arm64/mpam.c +++ b/drivers/acpi/arm64/mpam.c @@ -337,6 +337,13 @@ static int __init acpi_mpam_parse(void) return 0; } +/** + * acpi_mpam_count_msc() - Count the number of MSC described by firmware. + * + * Returns the number of of MSC, or zero for an error. + * + * This can be called before or in parallel with acpi_mpam_parse(). + */ int acpi_mpam_count_msc(void) { struct acpi_table_header *table __free(acpi_table) = acpi_get_table_ret(ACPI_SIG_MPAM, 0); diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig new file mode 100644 index 0000000000000..58c83b5c8bfdf --- /dev/null +++ b/drivers/resctrl/Kconfig @@ -0,0 +1,13 @@ +menuconfig ARM64_MPAM_DRIVER + bool "MPAM driver" + depends on ARM64 && ARM64_MPAM && EXPERT + help + MPAM driver for System IP, e,g. caches and memory controllers. + +if ARM64_MPAM_DRIVER +config ARM64_MPAM_DRIVER_DEBUG + bool "Enable debug messages from the MPAM driver" + help + Say yes here to enable debug messages from the MPAM driver. 
+ +endif diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile new file mode 100644 index 0000000000000..898199dcf80d5 --- /dev/null +++ b/drivers/resctrl/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o +mpam-y += mpam_devices.o + +ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c new file mode 100644 index 0000000000000..d18eeec95f795 --- /dev/null +++ b/drivers/resctrl/mpam_devices.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mpam_internal.h" + +/* + * mpam_list_lock protects the SRCU lists when writing. Once the + * mpam_enabled key is enabled these lists are read-only, + * unless the error interrupt disables the driver. + */ +static DEFINE_MUTEX(mpam_list_lock); +static LIST_HEAD(mpam_all_msc); + +static struct srcu_struct mpam_srcu; + +/* + * Number of MSCs that have been probed. Once all MSC have been probed MPAM + * can be enabled. + */ +static atomic_t mpam_num_msc; + +/* + * An MSC can control traffic from a set of CPUs, but may only be accessible + * from a (hopefully wider) set of CPUs. The common reason for this is power + * management. If all the CPUs in a cluster are in PSCI:CPU_SUSPEND, the + * corresponding cache may also be powered off. By making accesses from + * one of those CPUs, we ensure this isn't the case. 
+ */ +static int update_msc_accessibility(struct mpam_msc *msc) +{ + u32 affinity_id; + int err; + + err = device_property_read_u32(&msc->pdev->dev, "cpu_affinity", + &affinity_id); + if (err) + cpumask_copy(&msc->accessibility, cpu_possible_mask); + else + acpi_pptt_get_cpus_from_container(affinity_id, + &msc->accessibility); + return err; +} + +static int fw_num_msc; + +static void mpam_msc_destroy(struct mpam_msc *msc) +{ + struct platform_device *pdev = msc->pdev; + + lockdep_assert_held(&mpam_list_lock); + + list_del_rcu(&msc->all_msc_list); + platform_set_drvdata(pdev, NULL); +} + +static void mpam_msc_drv_remove(struct platform_device *pdev) +{ + struct mpam_msc *msc = platform_get_drvdata(pdev); + + if (!msc) + return; + + mutex_lock(&mpam_list_lock); + mpam_msc_destroy(msc); + mutex_unlock(&mpam_list_lock); + + synchronize_srcu(&mpam_srcu); +} + +static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) +{ + int err; + u32 tmp; + struct mpam_msc *msc; + struct resource *msc_res; + struct device *dev = &pdev->dev; + + lockdep_assert_held(&mpam_list_lock); + + msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL); + if (!msc) + return ERR_PTR(-ENOMEM); + + mutex_init(&msc->probe_lock); + mutex_init(&msc->part_sel_lock); + msc->id = pdev->id; + msc->pdev = pdev; + INIT_LIST_HEAD_RCU(&msc->all_msc_list); + INIT_LIST_HEAD_RCU(&msc->ris); + + err = update_msc_accessibility(msc); + if (err) + return ERR_PTR(err); + if (cpumask_empty(&msc->accessibility)) { + dev_err_once(dev, "MSC is not accessible from any CPU!"); + return ERR_PTR(-EINVAL); + } + + if (device_property_read_u32(&pdev->dev, "pcc-channel", &tmp)) + msc->iface = MPAM_IFACE_MMIO; + else + msc->iface = MPAM_IFACE_PCC; + + if (msc->iface == MPAM_IFACE_MMIO) { + void __iomem *io; + + io = devm_platform_get_and_ioremap_resource(pdev, 0, + &msc_res); + if (IS_ERR(io)) { + dev_err_once(dev, "Failed to map MSC base address\n"); + return (void *)io; + } + msc->mapped_hwpage_sz = 
msc_res->end - msc_res->start; + msc->mapped_hwpage = io; + } + + list_add_rcu(&msc->all_msc_list, &mpam_all_msc); + platform_set_drvdata(pdev, msc); + + return msc; +} + +static int mpam_msc_drv_probe(struct platform_device *pdev) +{ + int err; + struct mpam_msc *msc = NULL; + void *plat_data = pdev->dev.platform_data; + + mutex_lock(&mpam_list_lock); + msc = do_mpam_msc_drv_probe(pdev); + mutex_unlock(&mpam_list_lock); + if (!IS_ERR(msc)) { + /* Create RIS entries described by firmware */ + err = acpi_mpam_parse_resources(msc, plat_data); + if (err) + mpam_msc_drv_remove(pdev); + } else { + err = PTR_ERR(msc); + } + + if (!err && atomic_add_return(1, &mpam_num_msc) == fw_num_msc) + pr_info("Discovered all MSC\n"); + + return err; +} + +static struct platform_driver mpam_msc_driver = { + .driver = { + .name = "mpam_msc", + }, + .probe = mpam_msc_drv_probe, + .remove = mpam_msc_drv_remove, +}; + +static int __init mpam_msc_driver_init(void) +{ + if (!system_supports_mpam()) + return -EOPNOTSUPP; + + init_srcu_struct(&mpam_srcu); + + fw_num_msc = acpi_mpam_count_msc(); + + if (fw_num_msc <= 0) { + pr_err("No MSC devices found in firmware\n"); + return -EINVAL; + } + + return platform_driver_register(&mpam_msc_driver); +} +subsys_initcall(mpam_msc_driver_init); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h new file mode 100644 index 0000000000000..6ac75f3613c36 --- /dev/null +++ b/drivers/resctrl/mpam_internal.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (C) 2025 Arm Ltd. 
+ +#ifndef MPAM_INTERNAL_H +#define MPAM_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include + +struct platform_device; + +struct mpam_msc { + /* member of mpam_all_msc */ + struct list_head all_msc_list; + + int id; + struct platform_device *pdev; + + /* Not modified after mpam_is_enabled() becomes true */ + enum mpam_msc_iface iface; + u32 nrdy_usec; + cpumask_t accessibility; + + /* + * probe_lock is only taken during discovery. After discovery these + * properties become read-only and the lists are protected by SRCU. + */ + struct mutex probe_lock; + unsigned long ris_idxs; + u32 ris_max; + + /* mpam_msc_ris of this component */ + struct list_head ris; + + /* + * part_sel_lock protects access to the MSC hardware registers that are + * affected by MPAMCFG_PART_SEL. (including the ID registers that vary + * by RIS). + * If needed, take msc->probe_lock first. + */ + struct mutex part_sel_lock; + + void __iomem *mapped_hwpage; + size_t mapped_hwpage_sz; +}; +#endif /* MPAM_INTERNAL_H */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 03ba4992ba54f..42cbeaba2a510 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -231,7 +231,7 @@ static inline struct acpi_table_header *acpi_get_table_ret(char *signature, u32 return ERR_PTR(-ENOENT); return table; } -DEFINE_FREE(acpi_table, struct acpi_table_header *, if (!IS_ERR(_T)) acpi_put_table(_T)) +DEFINE_FREE(acpi_table, struct acpi_table_header *, if (!IS_ERR_OR_NULL(_T)) acpi_put_table(_T)) int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init_or_acpilib acpi_table_parse_entries(char *id, From 49e1ff7d25ca2236d41c51658d96423b3c07fcff Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 23 Sep 2025 17:33:40 +0100 Subject: [PATCH 074/247] NVIDIA: SAUCE: arm_mpam: parse resources BugLink: https://bugs.launchpad.net/bugs/2122432 Parse resources from either MPAM ACPI table or device tree. The parsed resources are stored in ris[] per msc. 
The author is James. He didn't add Signed-off-by. (backported from commit a6ab8b6c77cbc78a57015363abad5a68b2c7f18b https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) [fenghuay: Change subject and add commit message.] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 245 ++++++++++++++++++++++++++++++-- drivers/resctrl/mpam_internal.h | 4 + 2 files changed, 241 insertions(+), 8 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index d18eeec95f795..b76eac1be17da 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -38,6 +41,164 @@ static struct srcu_struct mpam_srcu; */ static atomic_t mpam_num_msc; +/* Called recursively to walk the list of caches from a particular CPU */ +static void __mpam_get_cpumask_from_cache_id(int cpu, struct device_node *cache_node, + unsigned long cache_id, + u32 cache_level, + cpumask_t *affinity) +{ + int err; + u32 iter_level; + unsigned long iter_cache_id; + struct device_node *iter_node __free(device_node) = of_find_next_cache_node(cache_node); + + if (!iter_node) + return; + + err = of_property_read_u32(iter_node, "cache-level", &iter_level); + if (err) + return; + + /* + * get_cpu_cacheinfo_id() isn't ready until sometime + * during device_initcall(). Use cache_of_calculate_id(). + */ + iter_cache_id = cache_of_calculate_id(iter_node); + if (iter_cache_id == ~0UL) + return; + + if (iter_level == cache_level && iter_cache_id == cache_id) + cpumask_set_cpu(cpu, affinity); + + if (iter_level < cache_level) + __mpam_get_cpumask_from_cache_id(cpu, iter_node, cache_id, + cache_level, affinity); +} + +/* + * The cacheinfo structures are only populated when CPUs are online. + * This helper walks the device tree to include offline CPUs too. 
+ */ +int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, + cpumask_t *affinity) +{ + int cpu; + + if (!acpi_disabled) + return acpi_pptt_get_cpumask_from_cache_id(cache_id, affinity); + + for_each_possible_cpu(cpu) { + struct device_node *cpu_node __free(device_node) = of_get_cpu_node(cpu, NULL); + if (!cpu_node) { + pr_err("Failed to find cpu%d device node\n", cpu); + return -ENOENT; + } + + __mpam_get_cpumask_from_cache_id(cpu, cpu_node, cache_id, + cache_level, affinity); + continue; + } + + return 0; +} + +static int get_cpumask_from_cache(struct device_node *cache, + cpumask_t *affinity) +{ + int err; + u32 cache_level; + unsigned long cache_id; + + err = of_property_read_u32(cache, "cache-level", &cache_level); + if (err) { + pr_err("Failed to read cache-level from cache node\n"); + return -ENOENT; + } + + cache_id = cache_of_calculate_id(cache); + if (cache_id == ~0UL) { + pr_err("Failed to calculate cache-id from cache node\n"); + return -ENOENT; + } + + return mpam_get_cpumask_from_cache_id(cache_id, cache_level, affinity); +} + +static int mpam_dt_count_msc(void) +{ + int count = 0; + struct device_node *np; + + for_each_compatible_node(np, NULL, "arm,mpam-msc") { + if (of_device_is_available(np)) + count++; + } + + return count; +} + +static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np, + u32 ris_idx) +{ + int err = 0; + u32 level = 0; + unsigned long cache_id; + struct device *dev = &msc->pdev->dev; + struct device_node *cache __free(device_node) = NULL; + struct device_node *parent __free(device_node) = of_get_parent(np); + + if (of_device_is_compatible(np, "arm,mpam-cache")) { + cache = of_parse_phandle(np, "arm,mpam-device", 0); + if (!cache) { + dev_err_once(dev, "Failed to read phandle\n"); + return -EINVAL; + } + } else if (of_device_is_compatible(parent, "cache")) { + cache = parent; + } else { + /* For now, only caches are supported */ + cache = NULL; + return err; + } + + err = 
of_property_read_u32(cache, "cache-level", &level); + if (err) { + dev_err_once(dev, "Failed to read cache-level\n"); + return err; + } + + cache_id = cache_of_calculate_id(cache); + if (cache_id == ~0) { + dev_err_once(dev, "Failed to calculate cache-id\n"); + return -ENOENT; + } + + return mpam_ris_create(msc, ris_idx, MPAM_CLASS_CACHE, level, cache_id); +} + +static int mpam_dt_parse_resources(struct mpam_msc *msc, void *ignored) +{ + u64 ris_idx = 0; + int err, num_ris = 0; + struct device_node *np; + + np = msc->pdev->dev.of_node; + for_each_available_child_of_node_scoped(np, iter) { + err = of_property_read_reg(iter, 0, &ris_idx, NULL); + if (!err) { + num_ris++; + err = mpam_dt_parse_resource(msc, iter, ris_idx); + if (err) + return err; + } + } + + if (!num_ris) + err = mpam_dt_parse_resource(msc, np, 0); + + return err; +} + /* * An MSC can control traffic from a set of CPUs, but may only be accessible * from a (hopefully wider) set of CPUs. The common reason for this is power @@ -47,16 +208,39 @@ static atomic_t mpam_num_msc; */ static int update_msc_accessibility(struct mpam_msc *msc) { + struct device *dev = &msc->pdev->dev; + struct device_node *parent; u32 affinity_id; int err; - err = device_property_read_u32(&msc->pdev->dev, "cpu_affinity", - &affinity_id); - if (err) + if (!acpi_disabled) { + err = device_property_read_u32(&msc->pdev->dev, "cpu_affinity", + &affinity_id); + if (err) + cpumask_copy(&msc->accessibility, cpu_possible_mask); + else + acpi_pptt_get_cpus_from_container(affinity_id, + &msc->accessibility); + + return 0; + } + + /* Where an MSC can be accessed from depends on the path to of_node. 
*/ + parent = of_get_parent(msc->pdev->dev.of_node); + if (parent == of_root) { cpumask_copy(&msc->accessibility, cpu_possible_mask); - else - acpi_pptt_get_cpus_from_container(affinity_id, - &msc->accessibility); + err = 0; + } else { + if (of_device_is_compatible(parent, "cache")) { + err = get_cpumask_from_cache(parent, + &msc->accessibility); + } else { + err = -EINVAL; + dev_err_once(dev, "Cannot determine accessibility of MSC.\n"); + } + } + of_node_put(parent); + return err; } @@ -150,7 +334,10 @@ static int mpam_msc_drv_probe(struct platform_device *pdev) mutex_unlock(&mpam_list_lock); if (!IS_ERR(msc)) { /* Create RIS entries described by firmware */ - err = acpi_mpam_parse_resources(msc, plat_data); + if (!acpi_disabled) + err = acpi_mpam_parse_resources(msc, plat_data); + else + err = mpam_dt_parse_resources(msc, plat_data); if (err) mpam_msc_drv_remove(pdev); } else { @@ -163,14 +350,50 @@ static int mpam_msc_drv_probe(struct platform_device *pdev) return err; } +static const struct of_device_id mpam_of_match[] = { + { .compatible = "arm,mpam-msc", }, + {}, +}; +MODULE_DEVICE_TABLE(of, mpam_of_match); + static struct platform_driver mpam_msc_driver = { .driver = { .name = "mpam_msc", + .of_match_table = of_match_ptr(mpam_of_match), }, .probe = mpam_msc_drv_probe, .remove = mpam_msc_drv_remove, }; +/* + * MSCs that are declared by the firmware as being part of a cache may not + * be created automatically as platform devices, since there is no + * dedicated cache driver. + * + * Deal with theo MSCs here. 
+ */ +static void mpam_dt_create_foundling_msc(void) +{ + struct platform_device *pdev; + struct device_node *cache; + + for_each_compatible_node(cache, NULL, "cache") { + struct device_node *cache_device; + + if (of_node_check_flag(cache, OF_POPULATED)) + continue; + + cache_device = of_find_matching_node_and_match(cache, mpam_of_match, NULL); + if (!cache_device) + continue; + of_node_put(cache_device); + + pdev = of_platform_device_create(cache, "cache", NULL); + if (!pdev) + pr_err_once("Failed to create MSC devices under caches\n"); + } +} + static int __init mpam_msc_driver_init(void) { if (!system_supports_mpam()) @@ -178,13 +401,19 @@ static int __init mpam_msc_driver_init(void) init_srcu_struct(&mpam_srcu); - fw_num_msc = acpi_mpam_count_msc(); + if (!acpi_disabled) + fw_num_msc = acpi_mpam_count_msc(); + else + fw_num_msc = mpam_dt_count_msc(); if (fw_num_msc <= 0) { pr_err("No MSC devices found in firmware\n"); return -EINVAL; } + if (acpi_disabled) + mpam_dt_create_foundling_msc(); + return platform_driver_register(&mpam_msc_driver); } subsys_initcall(mpam_msc_driver_init); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 6ac75f3613c36..f1c7180fe9aab 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -49,4 +49,8 @@ struct mpam_msc { void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; }; + +int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, + cpumask_t *affinity); + #endif /* MPAM_INTERNAL_H */ From aa8b88ae09b96d300814784ffb4c33148c39b2b6 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 8 Apr 2025 17:17:04 +0100 Subject: [PATCH 075/247] NVIDIA: SAUCE: DT: arm_mpam: Add support for memory controller MSC on DT platforms BugLink: https://bugs.launchpad.net/bugs/2122432 The device-tree binding has two examples for MSC associated with memory controllers. Add the support to discover the component_id from the device-tree and create 'memory' RIS. 
[ morse: split out of a bigger patch, added affinity piece ] Signed-off-by: Shanker Donthineni Signed-off-by: James Morse (cherry picked from commit c1be40782ace54798333d5148ab8e82fc002fca5 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 56 +++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index b76eac1be17da..3d198e0dc7e89 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -141,10 +141,12 @@ static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np, u32 ris_idx) { int err = 0; - u32 level = 0; - unsigned long cache_id; + u32 class_id = 0; + unsigned long component_id = 0; struct device *dev = &msc->pdev->dev; + enum mpam_class_types type = MPAM_CLASS_UNKNOWN; struct device_node *cache __free(device_node) = NULL; + struct device_node *memory __free(device_node) = NULL; struct device_node *parent __free(device_node) = of_get_parent(np); if (of_device_is_compatible(np, "arm,mpam-cache")) { @@ -153,27 +155,48 @@ static int mpam_dt_parse_resource(struct mpam_msc *msc, struct device_node *np, dev_err_once(dev, "Failed to read phandle\n"); return -EINVAL; } + type = MPAM_CLASS_CACHE; + } else if (of_device_is_compatible(parent, "cache")) { cache = parent; + type = MPAM_CLASS_CACHE; + } else if (of_device_is_compatible(np, "arm,mpam-memory")) { + memory = of_parse_phandle(np, "arm,mpam-device", 0); + if (!memory) { + dev_err_once(dev, "Failed to read phandle\n"); + return -EINVAL; + } + type = MPAM_CLASS_MEMORY; + } else if (of_device_is_compatible(np, "arm,mpam-memory-controller-msc")) { + memory = parent; + type = MPAM_CLASS_MEMORY; } else { - /* For now, only caches are supported */ - cache = NULL; + /* + * For now, only caches and memory controllers are + * 
supported. + */ return err; } - err = of_property_read_u32(cache, "cache-level", &level); - if (err) { - dev_err_once(dev, "Failed to read cache-level\n"); - return err; - } - - cache_id = cache_of_calculate_id(cache); - if (cache_id == ~0) { - dev_err_once(dev, "Failed to calculate cache-id\n"); - return -ENOENT; + /* Determine the class and component ids, based on type. */ + if (type == MPAM_CLASS_CACHE) { + err = of_property_read_u32(cache, "cache-level", &class_id); + if (err) { + dev_err_once(dev, "Failed to read cache-level\n"); + return err; + } + component_id = cache_of_calculate_id(cache); + if (component_id == ~0) { + dev_err_once(dev, "Failed to calculate cache-id\n"); + return -ENOENT; + } + } else if (type == MPAM_CLASS_MEMORY) { + err = of_node_to_nid(np); + component_id = (err == NUMA_NO_NODE) ? 0 : err; + class_id = 255; } - return mpam_ris_create(msc, ris_idx, MPAM_CLASS_CACHE, level, cache_id); + return mpam_ris_create(msc, ris_idx, type, class_id, component_id); } static int mpam_dt_parse_resources(struct mpam_msc *msc, void *ignored) @@ -234,6 +257,9 @@ static int update_msc_accessibility(struct mpam_msc *msc) if (of_device_is_compatible(parent, "cache")) { err = get_cpumask_from_cache(parent, &msc->accessibility); + } else if (of_device_is_compatible(parent, "memory")) { + cpumask_copy(&msc->accessibility, cpu_possible_mask); + err = 0; } else { err = -EINVAL; dev_err_once(dev, "Cannot determine accessibility of MSC.\n"); From 6ca26e401ea73e96d037efc9df5358f2b91c9239 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 5 May 2021 17:18:41 +0100 Subject: [PATCH 076/247] NVIDIA: SAUCE: arm_mpam: Add the class and component structures for firmware described ris BugLink: https://bugs.launchpad.net/bugs/2122432 An MSC is a container of resources, each identified by their RIS index. Some RIS are described by firmware to provide their position in the system. Others are discovered when the driver probes the hardware. 
To configure a resource it needs to be found by its class, e.g. 'L2'. There are two kinds of grouping, a class is a set of components, which are visible to user-space as there are likely to be multiple instances of the L2 cache. (e.g. one per cluster or package) Add support for creating and destroying structures to allow a hierarchy of resources to be created. CC: Ben Horgan Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 30ed801f46cb339d91a4524503247c3fb36bc627 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 380 +++++++++++++++++++++++++++++++- drivers/resctrl/mpam_internal.h | 89 ++++++++ include/linux/arm_mpam.h | 8 +- 3 files changed, 469 insertions(+), 8 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 3d198e0dc7e89..518b0fcd35f4d 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -33,7 +33,7 @@ static DEFINE_MUTEX(mpam_list_lock); static LIST_HEAD(mpam_all_msc); -static struct srcu_struct mpam_srcu; +struct srcu_struct mpam_srcu; /* * Number of MSCs that have been probed. Once all MSC have been probed MPAM @@ -41,6 +41,246 @@ static struct srcu_struct mpam_srcu; */ static atomic_t mpam_num_msc; +/* + * An MSC is a physical container for controls and monitors, each identified by + * their RIS index. These share a base-address, interrupts and some MMIO + * registers. A vMSC is a virtual container for RIS in an MSC that control or + * monitor the same thing. Members of a vMSC are all RIS in the same MSC, but + * not all RIS in an MSC share a vMSC. + * Components are a group of vMSC that control or monitor the same thing but + * are from different MSC, so have different base-address, interrupts etc. + * Classes are the set components of the same type. 
+ * + * The features of a vMSC is the union of the RIS it contains. + * The features of a Class and Component are the common subset of the vMSC + * they contain. + * + * e.g. The system cache may have bandwidth controls on multiple interfaces, + * for regulating traffic from devices independently of traffic from CPUs. + * If these are two RIS in one MSC, they will be treated as controlling + * different things, and will not share a vMSC/component/class. + * + * e.g. The L2 may have one MSC and two RIS, one for cache-controls another + * for bandwidth. These two RIS are members of the same vMSC. + * + * e.g. The set of RIS that make up the L2 are grouped as a component. These + * are sometimes termed slices. They should be configured the same, as if there + * were only one. + * + * e.g. The SoC probably has more than one L2, each attached to a distinct set + * of CPUs. All the L2 components are grouped as a class. + * + * When creating an MSC, struct mpam_msc is added to the all mpam_all_msc list, + * then linked via struct mpam_ris to a vmsc, component and class. + * The same MSC may exist under different class->component->vmsc paths, but the + * RIS index will be unique. 
+ */ +LIST_HEAD(mpam_classes); + +/* List of all objects that can be free()d after synchronise_srcu() */ +static LLIST_HEAD(mpam_garbage); + +static inline void init_garbage(struct mpam_garbage *garbage) +{ + init_llist_node(&garbage->llist); +} + +static struct mpam_vmsc * +mpam_vmsc_alloc(struct mpam_component *comp, struct mpam_msc *msc) +{ + struct mpam_vmsc *vmsc; + + lockdep_assert_held(&mpam_list_lock); + + vmsc = kzalloc(sizeof(*vmsc), GFP_KERNEL); + if (!vmsc) + return ERR_PTR(-ENOMEM); + init_garbage(&vmsc->garbage); + + INIT_LIST_HEAD_RCU(&vmsc->ris); + INIT_LIST_HEAD_RCU(&vmsc->comp_list); + vmsc->comp = comp; + vmsc->msc = msc; + + list_add_rcu(&vmsc->comp_list, &comp->vmsc); + + return vmsc; +} + +static struct mpam_vmsc *mpam_vmsc_find(struct mpam_component *comp, + struct mpam_msc *msc) +{ + struct mpam_vmsc *vmsc; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + if (vmsc->msc->id == msc->id) + return vmsc; + } + + return mpam_vmsc_alloc(comp, msc); +} + +static struct mpam_component * +mpam_component_alloc(struct mpam_class *class, int id) +{ + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + comp = kzalloc(sizeof(*comp), GFP_KERNEL); + if (!comp) + return ERR_PTR(-ENOMEM); + init_garbage(&comp->garbage); + + comp->comp_id = id; + INIT_LIST_HEAD_RCU(&comp->vmsc); + /* affinity is updated when ris are added */ + INIT_LIST_HEAD_RCU(&comp->class_list); + comp->class = class; + + list_add_rcu(&comp->class_list, &class->components); + + return comp; +} + +static struct mpam_component * +mpam_component_find(struct mpam_class *class, int id) +{ + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(comp, &class->components, class_list) { + if (comp->comp_id == id) + return comp; + } + + return mpam_component_alloc(class, id); +} + +static struct mpam_class * +mpam_class_alloc(u8 level_idx, enum mpam_class_types type) +{ + struct 
mpam_class *class; + + lockdep_assert_held(&mpam_list_lock); + + class = kzalloc(sizeof(*class), GFP_KERNEL); + if (!class) + return ERR_PTR(-ENOMEM); + init_garbage(&class->garbage); + + INIT_LIST_HEAD_RCU(&class->components); + /* affinity is updated when ris are added */ + class->level = level_idx; + class->type = type; + INIT_LIST_HEAD_RCU(&class->classes_list); + + list_add_rcu(&class->classes_list, &mpam_classes); + + return class; +} + +static struct mpam_class * +mpam_class_find(u8 level_idx, enum mpam_class_types type) +{ + struct mpam_class *class; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(class, &mpam_classes, classes_list) { + if (class->type == type && class->level == level_idx) + return class; + } + + return mpam_class_alloc(level_idx, type); +} + +#define add_to_garbage(x) \ +do { \ + __typeof__(x) _x = (x); \ + _x->garbage.to_free = _x; \ + llist_add(&_x->garbage.llist, &mpam_garbage); \ +} while (0) + +static void mpam_class_destroy(struct mpam_class *class) +{ + lockdep_assert_held(&mpam_list_lock); + + list_del_rcu(&class->classes_list); + add_to_garbage(class); +} + +static void mpam_comp_destroy(struct mpam_component *comp) +{ + struct mpam_class *class = comp->class; + + lockdep_assert_held(&mpam_list_lock); + + list_del_rcu(&comp->class_list); + add_to_garbage(comp); + + if (list_empty(&class->components)) + mpam_class_destroy(class); +} + +static void mpam_vmsc_destroy(struct mpam_vmsc *vmsc) +{ + struct mpam_component *comp = vmsc->comp; + + lockdep_assert_held(&mpam_list_lock); + + list_del_rcu(&vmsc->comp_list); + add_to_garbage(vmsc); + + if (list_empty(&comp->vmsc)) + mpam_comp_destroy(comp); +} + +static void mpam_ris_destroy(struct mpam_msc_ris *ris) +{ + struct mpam_vmsc *vmsc = ris->vmsc; + struct mpam_msc *msc = vmsc->msc; + struct mpam_component *comp = vmsc->comp; + struct mpam_class *class = comp->class; + + lockdep_assert_held(&mpam_list_lock); + + /* + * It is assumed affinities don't overlap. 
If they do the class becomes + * unusable immediately. + */ + cpumask_andnot(&comp->affinity, &comp->affinity, &ris->affinity); + cpumask_andnot(&class->affinity, &class->affinity, &ris->affinity); + clear_bit(ris->ris_idx, &msc->ris_idxs); + list_del_rcu(&ris->vmsc_list); + list_del_rcu(&ris->msc_list); + add_to_garbage(ris); + + if (list_empty(&vmsc->ris)) + mpam_vmsc_destroy(vmsc); +} + +static void mpam_free_garbage(void) +{ + struct mpam_garbage *iter, *tmp; + struct llist_node *to_free = llist_del_all(&mpam_garbage); + + if (!to_free) + return; + + synchronize_srcu(&mpam_srcu); + + llist_for_each_entry_safe(iter, tmp, to_free, llist) { + if (iter->pdev) + devm_kfree(&iter->pdev->dev, iter->to_free); + else + kfree(iter->to_free); + } +} + /* Called recursively to walk the list of caches from a particular CPU */ static void __mpam_get_cpumask_from_cache_id(int cpu, struct device_node *cache_node, unsigned long cache_id, @@ -124,6 +364,129 @@ static int get_cpumask_from_cache(struct device_node *cache, return mpam_get_cpumask_from_cache_id(cache_id, cache_level, affinity); } +/* + * cpumask_of_node() only knows about online CPUs. This can't tell us whether + * a class is represented on all possible CPUs. 
+ */ +static void get_cpumask_from_node_id(u32 node_id, cpumask_t *affinity) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (node_id == cpu_to_node(cpu)) + cpumask_set_cpu(cpu, affinity); + } +} + +static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, + enum mpam_class_types type, + struct mpam_class *class, + struct mpam_component *comp) +{ + int err; + + switch (type) { + case MPAM_CLASS_CACHE: + err = mpam_get_cpumask_from_cache_id(comp->comp_id, class->level, + affinity); + if (err) + return err; + + if (cpumask_empty(affinity)) + dev_warn_once(&msc->pdev->dev, + "no CPUs associated with cache node\n"); + + break; + case MPAM_CLASS_MEMORY: + get_cpumask_from_node_id(comp->comp_id, affinity); + /* affinity may be empty for CPU-less memory nodes */ + break; + case MPAM_CLASS_UNKNOWN: + return 0; + } + + cpumask_and(affinity, affinity, &msc->accessibility); + + return 0; +} + +static int mpam_ris_create_locked(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, + int component_id) +{ + int err; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + struct mpam_class *class; + struct mpam_component *comp; + struct platform_device *pdev = msc->pdev; + + lockdep_assert_held(&mpam_list_lock); + + if (ris_idx > MPAM_MSC_MAX_NUM_RIS) + return -EINVAL; + + if (test_and_set_bit(ris_idx, &msc->ris_idxs)) + return -EBUSY; + + ris = devm_kzalloc(&msc->pdev->dev, sizeof(*ris), GFP_KERNEL); + if (!ris) + return -ENOMEM; + init_garbage(&ris->garbage); + ris->garbage.pdev = pdev; + + class = mpam_class_find(class_id, type); + if (IS_ERR(class)) + return PTR_ERR(class); + + comp = mpam_component_find(class, component_id); + if (IS_ERR(comp)) { + if (list_empty(&class->components)) + mpam_class_destroy(class); + return PTR_ERR(comp); + } + + vmsc = mpam_vmsc_find(comp, msc); + if (IS_ERR(vmsc)) { + if (list_empty(&comp->vmsc)) + mpam_comp_destroy(comp); + return PTR_ERR(vmsc); + } + + err = mpam_ris_get_affinity(msc, 
&ris->affinity, type, class, comp); + if (err) { + if (list_empty(&vmsc->ris)) + mpam_vmsc_destroy(vmsc); + return err; + } + + ris->ris_idx = ris_idx; + INIT_LIST_HEAD_RCU(&ris->msc_list); + INIT_LIST_HEAD_RCU(&ris->vmsc_list); + ris->vmsc = vmsc; + + cpumask_or(&comp->affinity, &comp->affinity, &ris->affinity); + cpumask_or(&class->affinity, &class->affinity, &ris->affinity); + list_add_rcu(&ris->vmsc_list, &vmsc->ris); + + return 0; +} + +int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, int component_id) +{ + int err; + + mutex_lock(&mpam_list_lock); + err = mpam_ris_create_locked(msc, ris_idx, type, class_id, + component_id); + mutex_unlock(&mpam_list_lock); + if (err) + mpam_free_garbage(); + + return err; +} + static int mpam_dt_count_msc(void) { int count = 0; @@ -272,14 +635,25 @@ static int update_msc_accessibility(struct mpam_msc *msc) static int fw_num_msc; +/* + * There are two ways of reaching a struct mpam_msc_ris. Via the + * class->component->vmsc->ris, or via the msc. + * When destroying the msc, the other side needs unlinking and cleaning up too. 
+ */ static void mpam_msc_destroy(struct mpam_msc *msc) { struct platform_device *pdev = msc->pdev; + struct mpam_msc_ris *ris, *tmp; lockdep_assert_held(&mpam_list_lock); + list_for_each_entry_safe(ris, tmp, &msc->ris, msc_list) + mpam_ris_destroy(ris); + list_del_rcu(&msc->all_msc_list); platform_set_drvdata(pdev, NULL); + + add_to_garbage(msc); } static void mpam_msc_drv_remove(struct platform_device *pdev) @@ -293,7 +667,7 @@ static void mpam_msc_drv_remove(struct platform_device *pdev) mpam_msc_destroy(msc); mutex_unlock(&mpam_list_lock); - synchronize_srcu(&mpam_srcu); + mpam_free_garbage(); } static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) @@ -309,6 +683,8 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL); if (!msc) return ERR_PTR(-ENOMEM); + init_garbage(&msc->garbage); + msc->garbage.pdev = pdev; mutex_init(&msc->probe_lock); mutex_init(&msc->part_sel_lock); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index f1c7180fe9aab..1a5d96660382f 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -7,14 +7,32 @@ #include #include #include +#include #include #include #include #include #include +#define MPAM_MSC_MAX_NUM_RIS 16 + struct platform_device; +/* + * Structures protected by SRCU may not be freed for a surprising amount of + * time (especially if perf is running). To ensure the MPAM error interrupt can + * tear down all the structures, build a list of objects that can be gargbage + * collected once synchronize_srcu() has returned. + * If pdev is non-NULL, use devm_kfree(). 
+ */ +struct mpam_garbage { + /* member of mpam_garbage */ + struct llist_node llist; + + void *to_free; + struct platform_device *pdev; +}; + struct mpam_msc { /* member of mpam_all_msc */ struct list_head all_msc_list; @@ -48,8 +66,79 @@ struct mpam_msc { void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; + + struct mpam_garbage garbage; }; +struct mpam_class { + /* mpam_components in this class */ + struct list_head components; + + cpumask_t affinity; + + u8 level; + enum mpam_class_types type; + + /* member of mpam_classes */ + struct list_head classes_list; + + struct mpam_garbage garbage; +}; + +struct mpam_component { + u32 comp_id; + + /* mpam_vmsc in this component */ + struct list_head vmsc; + + cpumask_t affinity; + + /* member of mpam_class:components */ + struct list_head class_list; + + /* parent: */ + struct mpam_class *class; + + struct mpam_garbage garbage; +}; + +struct mpam_vmsc { + /* member of mpam_component:vmsc_list */ + struct list_head comp_list; + + /* mpam_msc_ris in this vmsc */ + struct list_head ris; + + /* All RIS in this vMSC are members of this MSC */ + struct mpam_msc *msc; + + /* parent: */ + struct mpam_component *comp; + + struct mpam_garbage garbage; +}; + +struct mpam_msc_ris { + u8 ris_idx; + + cpumask_t affinity; + + /* member of mpam_vmsc:ris */ + struct list_head vmsc_list; + + /* member of mpam_msc:ris */ + struct list_head msc_list; + + /* parent: */ + struct mpam_vmsc *vmsc; + + struct mpam_garbage garbage; +}; + +/* List of all classes - protected by srcu*/ +extern struct srcu_struct mpam_srcu; +extern struct list_head mpam_classes; + int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 3d6c39c667c39..3206f5ddc147a 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -38,11 +38,7 @@ static inline int acpi_mpam_parse_resources(struct mpam_msc *msc, static inline int 
acpi_mpam_count_msc(void) { return -EINVAL; } #endif -static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, - enum mpam_class_types type, u8 class_id, - int component_id) -{ - return -EINVAL; -} +int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, int component_id); #endif /* __LINUX_ARM_MPAM_H */ From 4ede65ad5017d34ad21e18cc512213d0c9567c94 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 13 Dec 2018 11:41:37 +0000 Subject: [PATCH 077/247] NVIDIA: SAUCE: arm_mpam: Add MPAM MSC register layout definitions BugLink: https://bugs.launchpad.net/bugs/2122432 Memory Partitioning and Monitoring (MPAM) has memory mapped devices (MSCs) with an identity/configuration page. Add the definitions for these registers as offset within the page(s). Link: https://developer.arm.com/documentation/ihi0099/latest/ Reviewed-by: Ben Horgan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 5da0d5b259df91842a3df81f08e658a942913f78 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_internal.h | 268 ++++++++++++++++++++++++++++++++ 1 file changed, 268 insertions(+) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 1a5d96660382f..1ef3e8e1d0564 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -142,4 +142,272 @@ extern struct list_head mpam_classes; int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); +/* + * MPAM MSCs have the following register layout. See: + * Arm Memory System Resource Partitioning and Monitoring (MPAM) System + * Component Specification. 
+ * https://developer.arm.com/documentation/ihi0099/latest/ + */ +#define MPAM_ARCHITECTURE_V1 0x10 + +/* Memory mapped control pages */ +/* ID Register offsets in the memory mapped page */ +#define MPAMF_IDR 0x0000 /* features id register */ +#define MPAMF_IIDR 0x0018 /* implementer id register */ +#define MPAMF_AIDR 0x0020 /* architectural id register */ +#define MPAMF_IMPL_IDR 0x0028 /* imp-def partitioning */ +#define MPAMF_CPOR_IDR 0x0030 /* cache-portion partitioning */ +#define MPAMF_CCAP_IDR 0x0038 /* cache-capacity partitioning */ +#define MPAMF_MBW_IDR 0x0040 /* mem-bw partitioning */ +#define MPAMF_PRI_IDR 0x0048 /* priority partitioning */ +#define MPAMF_MSMON_IDR 0x0080 /* performance monitoring features */ +#define MPAMF_CSUMON_IDR 0x0088 /* cache-usage monitor */ +#define MPAMF_MBWUMON_IDR 0x0090 /* mem-bw usage monitor */ +#define MPAMF_PARTID_NRW_IDR 0x0050 /* partid-narrowing */ + +/* Configuration and Status Register offsets in the memory mapped page */ +#define MPAMCFG_PART_SEL 0x0100 /* partid to configure */ +#define MPAMCFG_CPBM 0x1000 /* cache-portion config */ +#define MPAMCFG_CMAX 0x0108 /* cache-capacity config */ +#define MPAMCFG_CMIN 0x0110 /* cache-capacity config */ +#define MPAMCFG_CASSOC 0x0118 /* cache-associativity config */ +#define MPAMCFG_MBW_MIN 0x0200 /* min mem-bw config */ +#define MPAMCFG_MBW_MAX 0x0208 /* max mem-bw config */ +#define MPAMCFG_MBW_WINWD 0x0220 /* mem-bw accounting window config */ +#define MPAMCFG_MBW_PBM 0x2000 /* mem-bw portion bitmap config */ +#define MPAMCFG_PRI 0x0400 /* priority partitioning config */ +#define MPAMCFG_MBW_PROP 0x0500 /* mem-bw stride config */ +#define MPAMCFG_INTPARTID 0x0600 /* partid-narrowing config */ + +#define MSMON_CFG_MON_SEL 0x0800 /* monitor selector */ +#define MSMON_CFG_CSU_FLT 0x0810 /* cache-usage monitor filter */ +#define MSMON_CFG_CSU_CTL 0x0818 /* cache-usage monitor config */ +#define MSMON_CFG_MBWU_FLT 0x0820 /* mem-bw monitor filter */ +#define 
MSMON_CFG_MBWU_CTL 0x0828 /* mem-bw monitor config */ +#define MSMON_CSU 0x0840 /* current cache-usage */ +#define MSMON_CSU_CAPTURE 0x0848 /* last cache-usage value captured */ +#define MSMON_MBWU 0x0860 /* current mem-bw usage value */ +#define MSMON_MBWU_CAPTURE 0x0868 /* last mem-bw value captured */ +#define MSMON_MBWU_L 0x0880 /* current long mem-bw usage value */ +#define MSMON_MBWU_CAPTURE_L 0x0890 /* last long mem-bw value captured */ +#define MSMON_CAPT_EVNT 0x0808 /* signal a capture event */ +#define MPAMF_ESR 0x00F8 /* error status register */ +#define MPAMF_ECR 0x00F0 /* error control register */ + +/* MPAMF_IDR - MPAM features ID register */ +#define MPAMF_IDR_PARTID_MAX GENMASK(15, 0) +#define MPAMF_IDR_PMG_MAX GENMASK(23, 16) +#define MPAMF_IDR_HAS_CCAP_PART BIT(24) +#define MPAMF_IDR_HAS_CPOR_PART BIT(25) +#define MPAMF_IDR_HAS_MBW_PART BIT(26) +#define MPAMF_IDR_HAS_PRI_PART BIT(27) +#define MPAMF_IDR_EXT BIT(28) +#define MPAMF_IDR_HAS_IMPL_IDR BIT(29) +#define MPAMF_IDR_HAS_MSMON BIT(30) +#define MPAMF_IDR_HAS_PARTID_NRW BIT(31) +#define MPAMF_IDR_HAS_RIS BIT(32) +#define MPAMF_IDR_HAS_EXTD_ESR BIT(38) +#define MPAMF_IDR_HAS_ESR BIT(39) +#define MPAMF_IDR_RIS_MAX GENMASK(59, 56) + +/* MPAMF_MSMON_IDR - MPAM performance monitoring ID register */ +#define MPAMF_MSMON_IDR_MSMON_CSU BIT(16) +#define MPAMF_MSMON_IDR_MSMON_MBWU BIT(17) +#define MPAMF_MSMON_IDR_HAS_LOCAL_CAPT_EVNT BIT(31) + +/* MPAMF_CPOR_IDR - MPAM features cache portion partitioning ID register */ +#define MPAMF_CPOR_IDR_CPBM_WD GENMASK(15, 0) + +/* MPAMF_CCAP_IDR - MPAM features cache capacity partitioning ID register */ +#define MPAMF_CCAP_IDR_CMAX_WD GENMASK(5, 0) +#define MPAMF_CCAP_IDR_CASSOC_WD GENMASK(12, 8) +#define MPAMF_CCAP_IDR_HAS_CASSOC BIT(28) +#define MPAMF_CCAP_IDR_HAS_CMIN BIT(29) +#define MPAMF_CCAP_IDR_NO_CMAX BIT(30) +#define MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM BIT(31) + +/* MPAMF_MBW_IDR - MPAM features memory bandwidth partitioning ID register */ +#define 
MPAMF_MBW_IDR_BWA_WD GENMASK(5, 0) +#define MPAMF_MBW_IDR_HAS_MIN BIT(10) +#define MPAMF_MBW_IDR_HAS_MAX BIT(11) +#define MPAMF_MBW_IDR_HAS_PBM BIT(12) +#define MPAMF_MBW_IDR_HAS_PROP BIT(13) +#define MPAMF_MBW_IDR_WINDWR BIT(14) +#define MPAMF_MBW_IDR_BWPBM_WD GENMASK(28, 16) + +/* MPAMF_PRI_IDR - MPAM features priority partitioning ID register */ +#define MPAMF_PRI_IDR_HAS_INTPRI BIT(0) +#define MPAMF_PRI_IDR_INTPRI_0_IS_LOW BIT(1) +#define MPAMF_PRI_IDR_INTPRI_WD GENMASK(9, 4) +#define MPAMF_PRI_IDR_HAS_DSPRI BIT(16) +#define MPAMF_PRI_IDR_DSPRI_0_IS_LOW BIT(17) +#define MPAMF_PRI_IDR_DSPRI_WD GENMASK(25, 20) + +/* MPAMF_CSUMON_IDR - MPAM cache storage usage monitor ID register */ +#define MPAMF_CSUMON_IDR_NUM_MON GENMASK(15, 0) +#define MPAMF_CSUMON_IDR_HAS_OFLOW_CAPT BIT(24) +#define MPAMF_CSUMON_IDR_HAS_CEVNT_OFLW BIT(25) +#define MPAMF_CSUMON_IDR_HAS_OFSR BIT(26) +#define MPAMF_CSUMON_IDR_HAS_OFLOW_LNKG BIT(27) +#define MPAMF_CSUMON_IDR_HAS_XCL BIT(29) +#define MPAMF_CSUMON_IDR_CSU_RO BIT(30) +#define MPAMF_CSUMON_IDR_HAS_CAPTURE BIT(31) + +/* MPAMF_MBWUMON_IDR - MPAM memory bandwidth usage monitor ID register */ +#define MPAMF_MBWUMON_IDR_NUM_MON GENMASK(15, 0) +#define MPAMF_MBWUMON_IDR_HAS_RWBW BIT(28) +#define MPAMF_MBWUMON_IDR_LWD BIT(29) +#define MPAMF_MBWUMON_IDR_HAS_LONG BIT(30) +#define MPAMF_MBWUMON_IDR_HAS_CAPTURE BIT(31) + +/* MPAMF_PARTID_NRW_IDR - MPAM PARTID narrowing ID register */ +#define MPAMF_PARTID_NRW_IDR_INTPARTID_MAX GENMASK(15, 0) + +/* MPAMF_IIDR - MPAM implementation ID register */ +#define MPAMF_IIDR_IMPLEMENTER GENMASK(11, 0) +#define MPAMF_IIDR_REVISION GENMASK(15, 12) +#define MPAMF_IIDR_VARIANT GENMASK(19, 16) +#define MPAMF_IIDR_PRODUCTID GENMASK(31, 20) + +/* MPAMF_AIDR - MPAM architecture ID register */ +#define MPAMF_AIDR_ARCH_MINOR_REV GENMASK(3, 0) +#define MPAMF_AIDR_ARCH_MAJOR_REV GENMASK(7, 4) + +/* MPAMCFG_PART_SEL - MPAM partition configuration selection register */ +#define MPAMCFG_PART_SEL_PARTID_SEL GENMASK(15, 
0) +#define MPAMCFG_PART_SEL_INTERNAL BIT(16) +#define MPAMCFG_PART_SEL_RIS GENMASK(27, 24) + +/* MPAMCFG_CASSOC - MPAM cache maximum associativity partition configuration register */ +#define MPAMCFG_CASSOC_CASSOC GENMASK(15, 0) + +/* MPAMCFG_CMAX - MPAM cache capacity configuration register */ +#define MPAMCFG_CMAX_SOFTLIM BIT(31) +#define MPAMCFG_CMAX_CMAX GENMASK(15, 0) + +/* MPAMCFG_CMIN - MPAM cache capacity configuration register */ +#define MPAMCFG_CMIN_CMIN GENMASK(15, 0) + +/* + * MPAMCFG_MBW_MIN - MPAM memory minimum bandwidth partitioning configuration + * register + */ +#define MPAMCFG_MBW_MIN_MIN GENMASK(15, 0) + +/* + * MPAMCFG_MBW_MAX - MPAM memory maximum bandwidth partitioning configuration + * register + */ +#define MPAMCFG_MBW_MAX_MAX GENMASK(15, 0) +#define MPAMCFG_MBW_MAX_HARDLIM BIT(31) + +/* + * MPAMCFG_MBW_WINWD - MPAM memory bandwidth partitioning window width + * register + */ +#define MPAMCFG_MBW_WINWD_US_FRAC GENMASK(7, 0) +#define MPAMCFG_MBW_WINWD_US_INT GENMASK(23, 8) + +/* MPAMCFG_PRI - MPAM priority partitioning configuration register */ +#define MPAMCFG_PRI_INTPRI GENMASK(15, 0) +#define MPAMCFG_PRI_DSPRI GENMASK(31, 16) + +/* + * MPAMCFG_MBW_PROP - Memory bandwidth proportional stride partitioning + * configuration register + */ +#define MPAMCFG_MBW_PROP_STRIDEM1 GENMASK(15, 0) +#define MPAMCFG_MBW_PROP_EN BIT(31) + +/* + * MPAMCFG_INTPARTID - MPAM internal partition narrowing configuration register + */ +#define MPAMCFG_INTPARTID_INTPARTID GENMASK(15, 0) +#define MPAMCFG_INTPARTID_INTERNAL BIT(16) + +/* MSMON_CFG_MON_SEL - Memory system performance monitor selection register */ +#define MSMON_CFG_MON_SEL_MON_SEL GENMASK(15, 0) +#define MSMON_CFG_MON_SEL_RIS GENMASK(27, 24) + +/* MPAMF_ESR - MPAM Error Status Register */ +#define MPAMF_ESR_PARTID_MON GENMASK(15, 0) +#define MPAMF_ESR_PMG GENMASK(23, 16) +#define MPAMF_ESR_ERRCODE GENMASK(27, 24) +#define MPAMF_ESR_OVRWR BIT(31) +#define MPAMF_ESR_RIS GENMASK(35, 32) + +/* 
MPAMF_ECR - MPAM Error Control Register */ +#define MPAMF_ECR_INTEN BIT(0) + +/* Error conditions in accessing memory mapped registers */ +#define MPAM_ERRCODE_NONE 0 +#define MPAM_ERRCODE_PARTID_SEL_RANGE 1 +#define MPAM_ERRCODE_REQ_PARTID_RANGE 2 +#define MPAM_ERRCODE_MSMONCFG_ID_RANGE 3 +#define MPAM_ERRCODE_REQ_PMG_RANGE 4 +#define MPAM_ERRCODE_MONITOR_RANGE 5 +#define MPAM_ERRCODE_INTPARTID_RANGE 6 +#define MPAM_ERRCODE_UNEXPECTED_INTERNAL 7 +#define MPAM_ERRCODE_UNDEFINED_RIS_PART_SEL 8 +#define MPAM_ERRCODE_RIS_NO_CONTROL 9 +#define MPAM_ERRCODE_UNDEFINED_RIS_MON_SEL 10 +#define MPAM_ERRCODE_RIS_NO_MONITOR 11 + +/* + * MSMON_CFG_CSU_CTL - Memory system performance monitor configure cache storage + * usage monitor control register + * MSMON_CFG_MBWU_CTL - Memory system performance monitor configure memory + * bandwidth usage monitor control register + */ +#define MSMON_CFG_x_CTL_TYPE GENMASK(7, 0) +#define MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L BIT(15) +#define MSMON_CFG_x_CTL_MATCH_PARTID BIT(16) +#define MSMON_CFG_x_CTL_MATCH_PMG BIT(17) +#define MSMON_CFG_x_CTL_SUBTYPE GENMASK(22, 20) +#define MSMON_CFG_x_CTL_OFLOW_FRZ BIT(24) +#define MSMON_CFG_x_CTL_OFLOW_INTR BIT(25) +#define MSMON_CFG_x_CTL_OFLOW_STATUS BIT(26) +#define MSMON_CFG_x_CTL_CAPT_RESET BIT(27) +#define MSMON_CFG_x_CTL_CAPT_EVNT GENMASK(30, 28) +#define MSMON_CFG_x_CTL_EN BIT(31) + +#define MSMON_CFG_MBWU_CTL_TYPE_MBWU 0x42 +#define MSMON_CFG_CSU_CTL_TYPE_CSU 0x43 + +#define MSMON_CFG_MBWU_CTL_SCLEN BIT(19) + +/* + * MSMON_CFG_CSU_FLT - Memory system performance monitor configure cache storage + * usage monitor filter register + * MSMON_CFG_MBWU_FLT - Memory system performance monitor configure memory + * bandwidth usage monitor filter register + */ +#define MSMON_CFG_x_FLT_PARTID GENMASK(15, 0) +#define MSMON_CFG_x_FLT_PMG GENMASK(23, 16) + +#define MSMON_CFG_MBWU_FLT_RWBW GENMASK(31, 30) +#define MSMON_CFG_CSU_FLT_XCL BIT(31) + +/* + * MSMON_CSU - Memory system performance monitor cache storage 
usage monitor + * register + * MSMON_CSU_CAPTURE - Memory system performance monitor cache storage usage + * capture register + * MSMON_MBWU - Memory system performance monitor memory bandwidth usage + * monitor register + * MSMON_MBWU_CAPTURE - Memory system performance monitor memory bandwidth usage + * capture register + */ +#define MSMON___VALUE GENMASK(30, 0) +#define MSMON___NRDY BIT(31) +#define MSMON___NRDY_L BIT(63) +#define MSMON___L_VALUE GENMASK(43, 0) +#define MSMON___LWD_VALUE GENMASK(62, 0) + +/* + * MSMON_CAPT_EVNT - Memory system performance monitoring capture event + * generation register + */ +#define MSMON_CAPT_EVNT_NOW BIT(0) + #endif /* MPAM_INTERNAL_H */ From a9e6dc34137478bc803debcc50761ec0b6914222 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 4 May 2021 18:12:42 +0100 Subject: [PATCH 078/247] NVIDIA: SAUCE: arm_mpam: Add cpuhp callbacks to probe MSC hardware BugLink: https://bugs.launchpad.net/bugs/2122432 Because an MSC can only by accessed from the CPUs in its cpu-affinity set we need to be running on one of those CPUs to probe the MSC hardware. Do this work in the cpuhp callback. Probing the hardware will only happen before MPAM is enabled, walk all the MSCs and probe those we can reach that haven't already been probed as each CPU's online call is made. This adds the low-level MSC register accessors. Once all MSCs reported by the firmware have been probed from a CPU in their respective cpu-affinity set, the probe-time cpuhp callbacks are replaced. The replacement callbacks will ultimately need to handle save/restore of the runtime MSC state across power transitions, but for now there is nothing to do in them: so do nothing. The architecture's context switch code will be enabled by a static-key, this can be set by mpam_enable(), but must be done from process context, not a cpuhp callback because both take the cpuhp lock. 
Whenever a new MSC has been probed, the mpam_enable() work is scheduled to test if all the MSCs have been probed. If probing fails, mpam_disable() is scheduled to unregister the cpuhp callbacks and free memory. CC: Lecopzer Chen Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 8164f1cf4f4c50e2fe39e9091066176e6bcdf7f2 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 174 +++++++++++++++++++++++++++++++- drivers/resctrl/mpam_internal.h | 5 + 2 files changed, 178 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 518b0fcd35f4d..ab983158d1fba 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -4,6 +4,7 @@ #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ #include +#include #include #include #include @@ -22,6 +23,7 @@ #include #include #include +#include #include "mpam_internal.h" @@ -41,6 +43,25 @@ struct srcu_struct mpam_srcu; */ static atomic_t mpam_num_msc; +static int mpam_cpuhp_state; +static DEFINE_MUTEX(mpam_cpuhp_state_lock); + +/* + * mpam is enabled once all devices have been probed from CPU online callbacks, + * scheduled via this work_struct. If access to an MSC depends on a CPU that + * was not brought online at boot, this can happen surprisingly late. + */ +static DECLARE_WORK(mpam_enable_work, &mpam_enable); + +/* + * All mpam error interrupts indicate a software bug. On receipt, disable the + * driver. + */ +static DECLARE_WORK(mpam_broken_work, &mpam_disable); + +/* When mpam is disabled, the printed reason to aid debugging */ +static char *mpam_disable_reason; + /* * An MSC is a physical container for controls and monitors, each identified by * their RIS index. 
These share a base-address, interrupts and some MMIO @@ -84,6 +105,20 @@ static inline void init_garbage(struct mpam_garbage *garbage) { init_llist_node(&garbage->llist); } +static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg) +{ + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + return readl_relaxed(msc->mapped_hwpage + reg); +} + +static inline u32 _mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg) +{ + lockdep_assert_held_once(&msc->part_sel_lock); + return __mpam_read_reg(msc, reg); +} + +#define mpam_read_partsel_reg(msc, reg) _mpam_read_partsel_reg(msc, MPAMF_##reg) static struct mpam_vmsc * mpam_vmsc_alloc(struct mpam_component *comp, struct mpam_msc *msc) @@ -487,6 +522,86 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, return err; } +static int mpam_msc_hw_probe(struct mpam_msc *msc) +{ + u64 idr; + struct device *dev = &msc->pdev->dev; + + lockdep_assert_held(&msc->probe_lock); + + idr = __mpam_read_reg(msc, MPAMF_AIDR); + if ((idr & MPAMF_AIDR_ARCH_MAJOR_REV) != MPAM_ARCHITECTURE_V1) { + dev_err_once(dev, "MSC does not match MPAM architecture v1.x\n"); + return -EIO; + } + + msc->probed = true; + + return 0; +} + +static int mpam_cpu_online(unsigned int cpu) +{ + return 0; +} + +/* Before mpam is enabled, try to probe new MSC */ +static int mpam_discovery_cpu_online(unsigned int cpu) +{ + int err = 0; + struct mpam_msc *msc; + bool new_device_probed = false; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &msc->accessibility)) + continue; + + mutex_lock(&msc->probe_lock); + if (!msc->probed) + err = mpam_msc_hw_probe(msc); + mutex_unlock(&msc->probe_lock); + + if (err) + break; + new_device_probed = true; + } + + if (new_device_probed && !err) + schedule_work(&mpam_enable_work); + if (err) { + mpam_disable_reason = "error during probing"; + schedule_work(&mpam_broken_work); + } + + return err; +} + 
+static int mpam_cpu_offline(unsigned int cpu) +{ + return 0; +} + +static void mpam_register_cpuhp_callbacks(int (*online)(unsigned int online), + int (*offline)(unsigned int offline), + char *name) +{ + mutex_lock(&mpam_cpuhp_state_lock); + if (mpam_cpuhp_state) { + cpuhp_remove_state(mpam_cpuhp_state); + mpam_cpuhp_state = 0; + } + + mpam_cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, name, online, + offline); + if (mpam_cpuhp_state <= 0) { + pr_err("Failed to register cpuhp callbacks"); + mpam_cpuhp_state = 0; + } + mutex_unlock(&mpam_cpuhp_state_lock); +} + static int mpam_dt_count_msc(void) { int count = 0; @@ -747,7 +862,8 @@ static int mpam_msc_drv_probe(struct platform_device *pdev) } if (!err && atomic_add_return(1, &mpam_num_msc) == fw_num_msc) - pr_info("Discovered all MSC\n"); + mpam_register_cpuhp_callbacks(mpam_discovery_cpu_online, NULL, + "mpam:drv_probe"); return err; } @@ -767,6 +883,62 @@ static struct platform_driver mpam_msc_driver = { .remove = mpam_msc_drv_remove, }; +static void mpam_enable_once(void) +{ + mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, + "mpam:online"); + + pr_info("MPAM enabled\n"); +} + +void mpam_disable(struct work_struct *ignored) +{ + struct mpam_msc *msc, *tmp; + + mutex_lock(&mpam_cpuhp_state_lock); + if (mpam_cpuhp_state) { + cpuhp_remove_state(mpam_cpuhp_state); + mpam_cpuhp_state = 0; + } + mutex_unlock(&mpam_cpuhp_state_lock); + + mutex_lock(&mpam_list_lock); + list_for_each_entry_safe(msc, tmp, &mpam_all_msc, all_msc_list) + mpam_msc_destroy(msc); + mutex_unlock(&mpam_list_lock); + mpam_free_garbage(); + + pr_err_once("MPAM disabled due to %s\n", mpam_disable_reason); +} + +/* + * Enable mpam once all devices have been probed. + * Scheduled by mpam_discovery_cpu_online() once all devices have been created. + * Also scheduled when new devices are probed when new CPUs come online. 
+ */ +void mpam_enable(struct work_struct *work) +{ + static atomic_t once; + struct mpam_msc *msc; + bool all_devices_probed = true; + + /* Have we probed all the hw devices? */ + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + mutex_lock(&msc->probe_lock); + if (!msc->probed) + all_devices_probed = false; + mutex_unlock(&msc->probe_lock); + + if (!all_devices_probed) + break; + } + + if (all_devices_probed && !atomic_fetch_inc(&once)) + mpam_enable_once(); +} + /* * MSCs that are declared by the firmware as being part of a cache may not * be created automatically as platform devices, since there is no diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 1ef3e8e1d0564..8865a7d81dd1b 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -50,6 +50,7 @@ struct mpam_msc { * properties become read-only and the lists are protected by SRCU. */ struct mutex probe_lock; + bool probed; unsigned long ris_idxs; u32 ris_max; @@ -139,6 +140,10 @@ struct mpam_msc_ris { extern struct srcu_struct mpam_srcu; extern struct list_head mpam_classes; +/* Scheduled work callback to enable mpam once all MSC have been probed */ +void mpam_enable(struct work_struct *work); +void mpam_disable(struct work_struct *work); + int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); From 05d5591eee21bd625f8515d9d0c95e2c0019731e Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 5 Dec 2023 14:04:33 +0000 Subject: [PATCH 079/247] NVIDIA: SAUCE: arm_mpam: Probe hardware to find the supported partid/pmg values BugLink: https://bugs.launchpad.net/bugs/2122432 CPUs can generate traffic with a range of PARTID and PMG values, but each MSC may also have its own maximum size for these fields. Before MPAM can be used, the driver needs to probe each RIS on each MSC, to find the system-wide smallest value that can be used. 
The limits from requestors (e.g. CPUs) also need taking into account. While doing this, RIS entries that firmware didn't describe are created under MPAM_CLASS_UNKNOWN. While we're here, implement the mpam_register_requestor() call for the arch code to register the CPU limits. Future callers of this will tell us about the SMMU and ITS. Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 517cf73f94c7101c07315c7086384b30e6889365 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 148 +++++++++++++++++++++++++++++++- drivers/resctrl/mpam_internal.h | 6 ++ include/linux/arm_mpam.h | 14 +++ 3 files changed, 167 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index ab983158d1fba..27753aa482585 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -46,6 +47,15 @@ static atomic_t mpam_num_msc; static int mpam_cpuhp_state; static DEFINE_MUTEX(mpam_cpuhp_state_lock); +/* + * The smallest common values for any CPU or MSC in the system. + * Generating traffic outside this range will result in screaming interrupts. + */ +u16 mpam_partid_max; +u8 mpam_pmg_max; +static bool partid_max_init, partid_max_published; +static DEFINE_SPINLOCK(partid_max_lock); + /* * mpam is enabled once all devices have been probed from CPU online callbacks, * scheduled via this work_struct. 
If access to an MSC depends on a CPU that @@ -120,6 +130,69 @@ static inline u32 _mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg) #define mpam_read_partsel_reg(msc, reg) _mpam_read_partsel_reg(msc, MPAMF_##reg) +static void __mpam_write_reg(struct mpam_msc *msc, u16 reg, u32 val) +{ + WARN_ON_ONCE(reg + sizeof(u32) >= msc->mapped_hwpage_sz); + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + writel_relaxed(val, msc->mapped_hwpage + reg); +} + +static inline void _mpam_write_partsel_reg(struct mpam_msc *msc, u16 reg, u32 val) +{ + lockdep_assert_held_once(&msc->part_sel_lock); + __mpam_write_reg(msc, reg, val); +} +#define mpam_write_partsel_reg(msc, reg, val) _mpam_write_partsel_reg(msc, MPAMCFG_##reg, val) + +static u64 mpam_msc_read_idr(struct mpam_msc *msc) +{ + u64 idr_high = 0, idr_low; + + lockdep_assert_held(&msc->part_sel_lock); + + idr_low = mpam_read_partsel_reg(msc, IDR); + if (FIELD_GET(MPAMF_IDR_EXT, idr_low)) + idr_high = mpam_read_partsel_reg(msc, IDR + 4); + + return (idr_high << 32) | idr_low; +} + +static void __mpam_part_sel_raw(u32 partsel, struct mpam_msc *msc) +{ + lockdep_assert_held(&msc->part_sel_lock); + + mpam_write_partsel_reg(msc, PART_SEL, partsel); +} + +static void __mpam_part_sel(u8 ris_idx, u16 partid, struct mpam_msc *msc) +{ + u32 partsel = FIELD_PREP(MPAMCFG_PART_SEL_RIS, ris_idx) | + FIELD_PREP(MPAMCFG_PART_SEL_PARTID_SEL, partid); + + __mpam_part_sel_raw(partsel, msc); +} + +int mpam_register_requestor(u16 partid_max, u8 pmg_max) +{ + guard(spinlock)(&partid_max_lock); + if (!partid_max_init) { + mpam_partid_max = partid_max; + mpam_pmg_max = pmg_max; + partid_max_init = true; + } else if (!partid_max_published) { + mpam_partid_max = min(mpam_partid_max, partid_max); + mpam_pmg_max = min(mpam_pmg_max, pmg_max); + } else { + /* New requestors can't lower the values */ + if (partid_max < mpam_partid_max || pmg_max < mpam_pmg_max) + return -EBUSY; + } + + return 0; +} 
+EXPORT_SYMBOL(mpam_register_requestor); + static struct mpam_vmsc * mpam_vmsc_alloc(struct mpam_component *comp, struct mpam_msc *msc) { @@ -503,6 +576,7 @@ static int mpam_ris_create_locked(struct mpam_msc *msc, u8 ris_idx, cpumask_or(&comp->affinity, &comp->affinity, &ris->affinity); cpumask_or(&class->affinity, &class->affinity, &ris->affinity); list_add_rcu(&ris->vmsc_list, &vmsc->ris); + list_add_rcu(&ris->msc_list, &msc->ris); return 0; } @@ -522,9 +596,36 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, return err; } +static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, + u8 ris_idx) +{ + int err; + struct mpam_msc_ris *ris; + + lockdep_assert_held(&mpam_list_lock); + + if (!test_bit(ris_idx, &msc->ris_idxs)) { + err = mpam_ris_create_locked(msc, ris_idx, MPAM_CLASS_UNKNOWN, + 0, 0); + if (err) + return ERR_PTR(err); + } + + list_for_each_entry(ris, &msc->ris, msc_list) { + if (ris->ris_idx == ris_idx) { + return ris; + } + } + + return ERR_PTR(-ENOENT); +} + static int mpam_msc_hw_probe(struct mpam_msc *msc) { u64 idr; + u16 partid_max; + u8 ris_idx, pmg_max; + struct mpam_msc_ris *ris; struct device *dev = &msc->pdev->dev; lockdep_assert_held(&msc->probe_lock); @@ -535,6 +636,40 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) return -EIO; } + /* Grab an IDR value to find out how many RIS there are */ + mutex_lock(&msc->part_sel_lock); + idr = mpam_msc_read_idr(msc); + mutex_unlock(&msc->part_sel_lock); + + msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr); + + /* Use these values so partid/pmg always starts with a valid value */ + msc->partid_max = FIELD_GET(MPAMF_IDR_PARTID_MAX, idr); + msc->pmg_max = FIELD_GET(MPAMF_IDR_PMG_MAX, idr); + + for (ris_idx = 0; ris_idx <= msc->ris_max; ris_idx++) { + mutex_lock(&msc->part_sel_lock); + __mpam_part_sel(ris_idx, 0, msc); + idr = mpam_msc_read_idr(msc); + mutex_unlock(&msc->part_sel_lock); + + partid_max = FIELD_GET(MPAMF_IDR_PARTID_MAX, idr); + pmg_max = 
FIELD_GET(MPAMF_IDR_PMG_MAX, idr); + msc->partid_max = min(msc->partid_max, partid_max); + msc->pmg_max = min(msc->pmg_max, pmg_max); + + mutex_lock(&mpam_list_lock); + ris = mpam_get_or_create_ris(msc, ris_idx); + mutex_unlock(&mpam_list_lock); + if (IS_ERR(ris)) + return PTR_ERR(ris); + } + + spin_lock(&partid_max_lock); + mpam_partid_max = min(mpam_partid_max, msc->partid_max); + mpam_pmg_max = min(mpam_pmg_max, msc->pmg_max); + spin_unlock(&partid_max_lock); + msc->probed = true; return 0; @@ -885,10 +1020,20 @@ static struct platform_driver mpam_msc_driver = { static void mpam_enable_once(void) { + /* + * Once the cpuhp callbacks have been changed, mpam_partid_max can no + * longer change. + */ + spin_lock(&partid_max_lock); + partid_max_published = true; + spin_unlock(&partid_max_lock); + mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, "mpam:online"); - pr_info("MPAM enabled\n"); + /* Use printk() to avoid the pr_fmt adding the function name. */ + printk(KERN_INFO, "MPAM enabled with %u PARTIDs and %u PMGs\n", + mpam_partid_max + 1, mpam_pmg_max + 1); } void mpam_disable(struct work_struct *ignored) @@ -990,4 +1135,5 @@ static int __init mpam_msc_driver_init(void) return platform_driver_register(&mpam_msc_driver); } +/* Must occur after arm64_mpam_register_cpus() from arch_initcall() */ subsys_initcall(mpam_msc_driver_init); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 8865a7d81dd1b..9c08502e9c768 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -51,6 +51,8 @@ struct mpam_msc { */ struct mutex probe_lock; bool probed; + u16 partid_max; + u8 pmg_max; unsigned long ris_idxs; u32 ris_max; @@ -140,6 +142,10 @@ struct mpam_msc_ris { extern struct srcu_struct mpam_srcu; extern struct list_head mpam_classes; +/* System wide partid/pmg values */ +extern u16 mpam_partid_max; +extern u8 mpam_pmg_max; + /* Scheduled work callback to enable mpam once all MSC have been probed */ 
void mpam_enable(struct work_struct *work); void mpam_disable(struct work_struct *work); diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 3206f5ddc147a..cb6e6cfbea0bc 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -41,4 +41,18 @@ static inline int acpi_mpam_count_msc(void) { return -EINVAL; } int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, enum mpam_class_types type, u8 class_id, int component_id); +/** + * mpam_register_requestor() - Register a requestor with the MPAM driver + * @partid_max: The maximum PARTID value the requestor can generate. + * @pmg_max: The maximum PMG value the requestor can generate. + * + * Registers a requestor with the MPAM driver to ensure the chosen system-wide + * minimum PARTID and PMG values will allow the requestors features to be used. + * + * Returns an error if the registration is too late, and a larger PARTID/PMG + * value has been advertised to user-space. In this case the requestor should + * not use its MPAM features. Returns 0 on success. + */ +int mpam_register_requestor(u16 partid_max, u8 pmg_max); + #endif /* __LINUX_ARM_MPAM_H */ From 2b4f793e187d3e07d9a1d91587a7472abd710779 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 15 Jul 2024 11:18:57 +0100 Subject: [PATCH 080/247] NVIDIA: SAUCE: arm_mpam: Add helpers for managing the locking around the mon_sel registers BugLink: https://bugs.launchpad.net/bugs/2122432 The MSC MON_SEL register needs to be accessed from hardirq for the overflow interrupt, and when taking an IPI to access these registers on platforms where MSC are not accessible from every CPU. This makes an irqsave spinlock the obvious lock to protect these registers. On systems with SCMI or PCC mailboxes it must be able to sleep, meaning a mutex must be used. The SCMI or PCC platforms can't support an overflow interrupt, and can't access the registers from hardirq context. Clearly these two can't exist for one MSC at the same time. 
Add helpers for the MON_SEL locking. For now, use a irqsave spinlock and only support 'real' MMIO platforms. In the future this lock will be split in two allowing SCMI/PCC platforms to take a mutex. Because there are contexts where the SCMI/PCC platforms can't make an access, mpam_mon_sel_lock() needs to be able to fail. Do this now, so that all the error handling on these paths is present. This allows the relevant paths to fail if they are needed on a platform where this isn't possible, instead of having to make explicit checks of the interface type. Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 65f75f143d41bf1d00ec2064ae63b0760a41c217 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 3 ++- drivers/resctrl/mpam_internal.h | 38 +++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 27753aa482585..ff5df084ee05c 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -938,6 +938,7 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) mutex_init(&msc->probe_lock); mutex_init(&msc->part_sel_lock); + mpam_mon_sel_lock_init(msc); msc->id = pdev->id; msc->pdev = pdev; INIT_LIST_HEAD_RCU(&msc->all_msc_list); @@ -1032,7 +1033,7 @@ static void mpam_enable_once(void) "mpam:online"); /* Use printk() to avoid the pr_fmt adding the function name. 
*/ - printk(KERN_INFO, "MPAM enabled with %u PARTIDs and %u PMGs\n", + printk(KERN_INFO "MPAM enabled with %u PARTIDs and %u PMGs\n", mpam_partid_max + 1, mpam_pmg_max + 1); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 9c08502e9c768..1afc52b363287 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -67,12 +67,50 @@ struct mpam_msc { */ struct mutex part_sel_lock; + /* + * mon_sel_lock protects access to the MSC hardware registers that are + * affected by MPAMCFG_MON_SEL, and the mbwu_state. + * Access to mon_sel is needed from both process and interrupt contexts, + * but is complicated by firmware-backed platforms that can't make any + * access unless they can sleep. + * Always use the mpam_mon_sel_lock() helpers. + * Accesses to mon_sel need to be able to fail if they occur in the wrong + * context. + * If needed, take msc->probe_lock first. + */ + raw_spinlock_t _mon_sel_lock; + unsigned long _mon_sel_flags; + void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; struct mpam_garbage garbage; }; +/* Returning false here means accesses to mon_sel must fail and report an error. 
*/ +static inline bool __must_check mpam_mon_sel_lock(struct mpam_msc *msc) +{ + WARN_ON_ONCE(msc->iface != MPAM_IFACE_MMIO); + + raw_spin_lock_irqsave(&msc->_mon_sel_lock, msc->_mon_sel_flags); + return true; +} + +static inline void mpam_mon_sel_unlock(struct mpam_msc *msc) +{ + raw_spin_unlock_irqrestore(&msc->_mon_sel_lock, msc->_mon_sel_flags); +} + +static inline void mpam_mon_sel_lock_held(struct mpam_msc *msc) +{ + lockdep_assert_held_once(&msc->_mon_sel_lock); +} + +static inline void mpam_mon_sel_lock_init(struct mpam_msc *msc) +{ + raw_spin_lock_init(&msc->_mon_sel_lock); +} + struct mpam_class { /* mpam_components in this class */ struct list_head components; From 94602a8b410dc0369d9b6a15b9dd0f1c6d4d5a20 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 26 Jan 2021 17:10:44 +0000 Subject: [PATCH 081/247] NVIDIA: SAUCE: arm_mpam: Probe the hardware features resctrl supports BugLink: https://bugs.launchpad.net/bugs/2122432 Expand the probing support with the control and monitor types we can use with resctrl. CC: Dave Martin Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit f5a241b9b8cd085b492914b1ed0f9a71b55e9406 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 147 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 33 +++++++ 2 files changed, 180 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index ff5df084ee05c..eac7e37dfc159 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -145,6 +145,20 @@ static inline void _mpam_write_partsel_reg(struct mpam_msc *msc, u16 reg, u32 va } #define mpam_write_partsel_reg(msc, reg, val) _mpam_write_partsel_reg(msc, MPAMCFG_##reg, val) +static inline u32 _mpam_read_monsel_reg(struct mpam_msc *msc, u16 reg) +{ + mpam_mon_sel_lock_held(msc); + return __mpam_read_reg(msc, reg); +} +#define mpam_read_monsel_reg(msc, reg) _mpam_read_monsel_reg(msc, MSMON_##reg) + +static inline void _mpam_write_monsel_reg(struct mpam_msc *msc, u16 reg, u32 val) +{ + mpam_mon_sel_lock_held(msc); + __mpam_write_reg(msc, reg, val); +} +#define mpam_write_monsel_reg(msc, reg, val) _mpam_write_monsel_reg(msc, MSMON_##reg, val) + static u64 mpam_msc_read_idr(struct mpam_msc *msc) { u64 idr_high = 0, idr_low; @@ -620,6 +634,133 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, return ERR_PTR(-ENOENT); } +/* + * IHI009A.a has this nugget: "If a monitor does not support automatic behaviour + * of NRDY, software can use this bit for any purpose" - so hardware might not + * implement this - but it isn't RES0. + * + * Try and see what values stick in this bit. If we can write either value, + * its probably not implemented by hardware. 
+ */ +static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) +{ + u32 now; + u64 mon_sel; + bool can_set, can_clear; + struct mpam_msc *msc = ris->vmsc->msc; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + return false; + + mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, 0) | + FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); + _mpam_write_monsel_reg(msc, mon_reg, mon_sel); + + _mpam_write_monsel_reg(msc, mon_reg, MSMON___NRDY); + now = _mpam_read_monsel_reg(msc, mon_reg); + can_set = now & MSMON___NRDY; + + _mpam_write_monsel_reg(msc, mon_reg, 0); + now = _mpam_read_monsel_reg(msc, mon_reg); + can_clear = !(now & MSMON___NRDY); + mpam_mon_sel_unlock(msc); + + return (!can_set || !can_clear); +} + +#define mpam_ris_hw_probe_hw_nrdy(_ris, _mon_reg) \ + _mpam_ris_hw_probe_hw_nrdy(_ris, MSMON_##_mon_reg) + +static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) +{ + int err; + struct mpam_msc *msc = ris->vmsc->msc; + struct device *dev = &msc->pdev->dev; + struct mpam_props *props = &ris->props; + + lockdep_assert_held(&msc->probe_lock); + lockdep_assert_held(&msc->part_sel_lock); + + /* Cache Portion partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_CPOR_PART, ris->idr)) { + u32 cpor_features = mpam_read_partsel_reg(msc, CPOR_IDR); + + props->cpbm_wd = FIELD_GET(MPAMF_CPOR_IDR_CPBM_WD, cpor_features); + if (props->cpbm_wd) + mpam_set_feature(mpam_feat_cpor_part, props); + } + + /* Memory bandwidth partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_MBW_PART, ris->idr)) { + u32 mbw_features = mpam_read_partsel_reg(msc, MBW_IDR); + + /* portion bitmap resolution */ + props->mbw_pbm_bits = FIELD_GET(MPAMF_MBW_IDR_BWPBM_WD, mbw_features); + if (props->mbw_pbm_bits && + FIELD_GET(MPAMF_MBW_IDR_HAS_PBM, mbw_features)) + mpam_set_feature(mpam_feat_mbw_part, props); + + props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features); + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) + mpam_set_feature(mpam_feat_mbw_max, props); + } + + /* 
Performance Monitoring */ + if (FIELD_GET(MPAMF_IDR_HAS_MSMON, ris->idr)) { + u32 msmon_features = mpam_read_partsel_reg(msc, MSMON_IDR); + + /* + * If the firmware max-nrdy-us property is missing, the + * CSU counters can't be used. Should we wait forever? + */ + err = device_property_read_u32(&msc->pdev->dev, + "arm,not-ready-us", + &msc->nrdy_usec); + + if (FIELD_GET(MPAMF_MSMON_IDR_MSMON_CSU, msmon_features)) { + u32 csumonidr; + + csumonidr = mpam_read_partsel_reg(msc, CSUMON_IDR); + props->num_csu_mon = FIELD_GET(MPAMF_CSUMON_IDR_NUM_MON, csumonidr); + if (props->num_csu_mon) { + bool hw_managed; + + mpam_set_feature(mpam_feat_msmon_csu, props); + + /* Is NRDY hardware managed? */ + hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, CSU); + if (hw_managed) + mpam_set_feature(mpam_feat_msmon_csu_hw_nrdy, props); + } + + /* + * Accept the missing firmware property if NRDY appears + * un-implemented. + */ + if (err && mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, props)) + dev_err_once(dev, "Counters are not usable because not-ready timeout was not provided by firmware."); + } + if (FIELD_GET(MPAMF_MSMON_IDR_MSMON_MBWU, msmon_features)) { + bool hw_managed; + u32 mbwumon_idr = mpam_read_partsel_reg(msc, MBWUMON_IDR); + + props->num_mbwu_mon = FIELD_GET(MPAMF_MBWUMON_IDR_NUM_MON, mbwumon_idr); + if (props->num_mbwu_mon) + mpam_set_feature(mpam_feat_msmon_mbwu, props); + + /* Is NRDY hardware managed? */ + hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); + if (hw_managed) + mpam_set_feature(mpam_feat_msmon_mbwu_hw_nrdy, props); + + /* + * Don't warn about any missing firmware property for + * MBWU NRDY - it doesn't make any sense! 
+ */ + } + } +} + static int mpam_msc_hw_probe(struct mpam_msc *msc) { u64 idr; @@ -663,6 +804,12 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) mutex_unlock(&mpam_list_lock); if (IS_ERR(ris)) return PTR_ERR(ris); + ris->idr = idr; + + mutex_lock(&msc->part_sel_lock); + __mpam_part_sel(ris_idx, 0, msc); + mpam_ris_hw_probe(ris); + mutex_unlock(&msc->part_sel_lock); } spin_lock(&partid_max_lock); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 1afc52b363287..be9ea0aab6d21 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -5,6 +5,7 @@ #define MPAM_INTERNAL_H #include +#include #include #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #define MPAM_MSC_MAX_NUM_RIS 16 @@ -111,6 +113,33 @@ static inline void mpam_mon_sel_lock_init(struct mpam_msc *msc) raw_spin_lock_init(&msc->_mon_sel_lock); } +/* Bits for mpam features bitmaps */ +enum mpam_device_features { + mpam_feat_cpor_part = 0, + mpam_feat_mbw_part, + mpam_feat_mbw_min, + mpam_feat_mbw_max, + mpam_feat_msmon, + mpam_feat_msmon_csu, + mpam_feat_msmon_csu_hw_nrdy, + mpam_feat_msmon_mbwu, + mpam_feat_msmon_mbwu_hw_nrdy, + MPAM_FEATURE_LAST +}; + +struct mpam_props { + DECLARE_BITMAP(features, MPAM_FEATURE_LAST); + + u16 cpbm_wd; + u16 mbw_pbm_bits; + u16 bwa_wd; + u16 num_csu_mon; + u16 num_mbwu_mon; +}; + +#define mpam_has_feature(_feat, x) test_bit(_feat, (x)->features) +#define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) + struct mpam_class { /* mpam_components in this class */ struct list_head components; @@ -150,6 +179,8 @@ struct mpam_vmsc { /* mpam_msc_ris in this vmsc */ struct list_head ris; + struct mpam_props props; + /* All RIS in this vMSC are members of this MSC */ struct mpam_msc *msc; @@ -161,6 +192,8 @@ struct mpam_vmsc { struct mpam_msc_ris { u8 ris_idx; + u64 idr; + struct mpam_props props; cpumask_t affinity; From c668fac3f2904489f4a49abf5ba695f609093b69 Mon Sep 17 00:00:00 2001 
From: James Morse Date: Fri, 7 May 2021 12:45:15 +0100 Subject: [PATCH 082/247] NVIDIA: SAUCE: arm_mpam: Merge supported features during mpam_enable() into mpam_class BugLink: https://bugs.launchpad.net/bugs/2122432 To make a decision about whether to expose an mpam class as a resctrl resource we need to know its overall supported features and properties. Once we've probed all the resources, we can walk the tree and produce overall values by merging the bitmaps. This eliminates features that are only supported by some MSC that make up a component or class. If bitmap properties are mismatched within a component we cannot support the mismatched feature. Care has to be taken as vMSC may hold mismatched RIS. Reviewed-by: Ben Horgan Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit e06f0b201b617255c7e02f3673380b78a7096bff https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 214 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 3 + 2 files changed, 217 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index eac7e37dfc159..62a7101b8d82b 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1166,8 +1166,222 @@ static struct platform_driver mpam_msc_driver = { .remove = mpam_msc_drv_remove, }; +/* Any of these features mean the BWA_WD field is valid. 
*/ +static bool mpam_has_bwa_wd_feature(struct mpam_props *props) +{ + if (mpam_has_feature(mpam_feat_mbw_min, props)) + return true; + if (mpam_has_feature(mpam_feat_mbw_max, props)) + return true; + return false; +} + +#define MISMATCHED_HELPER(parent, child, helper, field, alias) \ + helper(parent) && \ + ((helper(child) && (parent)->field != (child)->field) || \ + (!helper(child) && !(alias))) + +#define MISMATCHED_FEAT(parent, child, feat, field, alias) \ + mpam_has_feature((feat), (parent)) && \ + ((mpam_has_feature((feat), (child)) && (parent)->field != (child)->field) || \ + (!mpam_has_feature((feat), (child)) && !(alias))) + +#define CAN_MERGE_FEAT(parent, child, feat, alias) \ + (alias) && !mpam_has_feature((feat), (parent)) && \ + mpam_has_feature((feat), (child)) + +/* + * Combine two props fields. + * If this is for controls that alias the same resource, it is safe to just + * copy the values over. If two aliasing controls implement the same scheme + * a safe value must be picked. + * For non-aliasing controls, these control different resources, and the + * resulting safe value must be compatible with both. When merging values in + * the tree, all the aliasing resources must be handled first. + * On mismatch, parent is modified. 
+ */ +static void __props_mismatch(struct mpam_props *parent, + struct mpam_props *child, bool alias) +{ + if (CAN_MERGE_FEAT(parent, child, mpam_feat_cpor_part, alias)) { + parent->cpbm_wd = child->cpbm_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_cpor_part, + cpbm_wd, alias)) { + pr_debug("cleared cpor_part\n"); + mpam_clear_feature(mpam_feat_cpor_part, parent); + parent->cpbm_wd = 0; + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_mbw_part, alias)) { + parent->mbw_pbm_bits = child->mbw_pbm_bits; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_mbw_part, + mbw_pbm_bits, alias)) { + pr_debug("cleared mbw_part\n"); + mpam_clear_feature(mpam_feat_mbw_part, parent); + parent->mbw_pbm_bits = 0; + } + + /* bwa_wd is a count of bits, fewer bits means less precision */ + if (alias && !mpam_has_bwa_wd_feature(parent) && + mpam_has_bwa_wd_feature(child)) { + parent->bwa_wd = child->bwa_wd; + } else if (MISMATCHED_HELPER(parent, child, mpam_has_bwa_wd_feature, + bwa_wd, alias)) { + pr_debug("took the min bwa_wd\n"); + parent->bwa_wd = min(parent->bwa_wd, child->bwa_wd); + } + + /* For num properties, take the minimum */ + if (CAN_MERGE_FEAT(parent, child, mpam_feat_msmon_csu, alias)) { + parent->num_csu_mon = child->num_csu_mon; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_msmon_csu, + num_csu_mon, alias)) { + pr_debug("took the min num_csu_mon\n"); + parent->num_csu_mon = min(parent->num_csu_mon, + child->num_csu_mon); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_msmon_mbwu, alias)) { + parent->num_mbwu_mon = child->num_mbwu_mon; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_msmon_mbwu, + num_mbwu_mon, alias)) { + pr_debug("took the min num_mbwu_mon\n"); + parent->num_mbwu_mon = min(parent->num_mbwu_mon, + child->num_mbwu_mon); + } + + if (alias) { + /* Merge features for aliased resources */ + bitmap_or(parent->features, parent->features, child->features, MPAM_FEATURE_LAST); + } else { + /* Clear missing features for non 
aliasing */ + bitmap_and(parent->features, parent->features, child->features, MPAM_FEATURE_LAST); + } +} + +/* + * If a vmsc doesn't match class feature/configuration, do the right thing(tm). + * For 'num' properties we can just take the minimum. + * For properties where the mismatched unused bits would make a difference, we + * nobble the class feature, as we can't configure all the resources. + * e.g. The L3 cache is composed of two resources with 13 and 17 portion + * bitmaps respectively. + */ +static void +__class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) +{ + struct mpam_props *cprops = &class->props; + struct mpam_props *vprops = &vmsc->props; + struct device *dev = &vmsc->msc->pdev->dev; + + lockdep_assert_held(&mpam_list_lock); /* we modify class */ + + dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n", + (long)cprops->features, (long)vprops->features); + + /* Take the safe value for any common features */ + __props_mismatch(cprops, vprops, false); +} + +static void +__vmsc_props_mismatch(struct mpam_vmsc *vmsc, struct mpam_msc_ris *ris) +{ + struct mpam_props *rprops = &ris->props; + struct mpam_props *vprops = &vmsc->props; + struct device *dev = &vmsc->msc->pdev->dev; + + lockdep_assert_held(&mpam_list_lock); /* we modify vmsc */ + + dev_dbg(dev, "Merging features for vmsc:0x%lx |= ris:0x%lx\n", + (long)vprops->features, (long)rprops->features); + + /* + * Merge mismatched features - Copy any features that aren't common, + * but take the safe value for any common features. + */ + __props_mismatch(vprops, rprops, true); +} + +/* + * Copy the first component's first vMSC's properties and features to the + * class. __class_props_mismatch() will remove conflicts. + * It is not possible to have a class with no components, or a component with + * no resources. The vMSC properties have already been built. 
+ */ +static void mpam_enable_init_class_features(struct mpam_class *class) +{ + struct mpam_vmsc *vmsc; + struct mpam_component *comp; + + comp = list_first_entry(&class->components, + struct mpam_component, class_list); + vmsc = list_first_entry(&comp->vmsc, + struct mpam_vmsc, comp_list); + + class->props = vmsc->props; +} + +static void mpam_enable_merge_vmsc_features(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + struct mpam_class *class = comp->class; + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + list_for_each_entry(ris, &vmsc->ris, vmsc_list) { + __vmsc_props_mismatch(vmsc, ris); + class->nrdy_usec = max(class->nrdy_usec, + vmsc->msc->nrdy_usec); + } + } +} + +static void mpam_enable_merge_class_features(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + struct mpam_class *class = comp->class; + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) + __class_props_mismatch(class, vmsc); +} + +/* + * Merge all the common resource features into class. + * vmsc features are bitwise-or'd together by mpam_enable_merge_vmsc_features() + * as the first step so that mpam_enable_init_class_features() can initialise + * the class with a representive set of features. + * Next the mpam_enable_merge_class_features() bitwise-and's all the vmsc + * features to form the class features. + * Other features are the min/max as appropriate. + * + * To avoid walking the whole tree twice, the class->nrdy_usec property is + * updated when working with the vmsc as it is a max(), and doesn't need + * initialising first. 
+ */ +static void mpam_enable_merge_features(struct list_head *all_classes_list) +{ + struct mpam_class *class; + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(class, all_classes_list, classes_list) { + list_for_each_entry(comp, &class->components, class_list) + mpam_enable_merge_vmsc_features(comp); + + mpam_enable_init_class_features(class); + + list_for_each_entry(comp, &class->components, class_list) + mpam_enable_merge_class_features(comp); + } +} + static void mpam_enable_once(void) { + mutex_lock(&mpam_list_lock); + mpam_enable_merge_features(&mpam_classes); + mutex_unlock(&mpam_list_lock); + /* * Once the cpuhp callbacks have been changed, mpam_partid_max can no * longer change. diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index be9ea0aab6d21..39331d81c4818 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -139,6 +139,7 @@ struct mpam_props { #define mpam_has_feature(_feat, x) test_bit(_feat, (x)->features) #define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) +#define mpam_clear_feature(_feat, x) clear_bit(_feat, (x)->features) struct mpam_class { /* mpam_components in this class */ @@ -146,6 +147,8 @@ struct mpam_class { cpumask_t affinity; + struct mpam_props props; + u32 nrdy_usec; u8 level; enum mpam_class_types type; From 35c6ce4549e1f051eee8209518c605334d0d0073 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 28 Feb 2019 18:06:57 +0000 Subject: [PATCH 083/247] NVIDIA: SAUCE: arm_mpam: Reset MSC controls from cpuhp callbacks BugLink: https://bugs.launchpad.net/bugs/2122432 When a CPU comes online, it may bring a newly accessible MSC with it. Only the default partid has its value reset by hardware, and even then the MSC might not have been reset since its config was previously dirtied. e.g. Kexec. Any in-use partid must have its configuration restored, or reset. In-use partids may be held in caches and evicted later. 
MSC are also reset when CPUs are taken offline to cover cases where firmware doesn't reset the MSC over reboot using UEFI, or kexec where there is no firmware involvement. If the configuration for a RIS has not been touched since it was brought online, it does not need resetting again. To reset, write the maximum values for all discovered controls. CC: Rohit Mathew Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 8ed3d7ab1b69b1501e6445688171d0315b6a7919 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 109 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 3 + 2 files changed, 112 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 62a7101b8d82b..f70afa908bc23 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -822,8 +823,104 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) return 0; } +static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) +{ + u32 num_words, msb; + u32 bm = ~0; + int i; + + lockdep_assert_held(&msc->part_sel_lock); + + if (wd == 0) + return; + + /* + * Write all ~0 to all but the last 32bit-word, which may + * have fewer bits... + */ + num_words = DIV_ROUND_UP(wd, 32); + for (i = 0; i < num_words - 1; i++, reg += sizeof(bm)) + __mpam_write_reg(msc, reg, bm); + + /* + * ....and then the last (maybe) partial 32bit word. When wd is a + * multiple of 32, msb should be 31 to write a full 32bit word. 
+ */ + msb = (wd - 1) % 32; + bm = GENMASK(msb, 0); + __mpam_write_reg(msc, reg, bm); +} + +static void mpam_reset_ris_partid(struct mpam_msc_ris *ris, u16 partid) +{ + struct mpam_msc *msc = ris->vmsc->msc; + struct mpam_props *rprops = &ris->props; + + WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); + + mutex_lock(&msc->part_sel_lock); + __mpam_part_sel(ris->ris_idx, partid, msc); + + if (mpam_has_feature(mpam_feat_cpor_part, rprops)) + mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); + + if (mpam_has_feature(mpam_feat_mbw_part, rprops)) + mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits); + + if (mpam_has_feature(mpam_feat_mbw_min, rprops)) + mpam_write_partsel_reg(msc, MBW_MIN, 0); + + if (mpam_has_feature(mpam_feat_mbw_max, rprops)) + mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); + + mutex_unlock(&msc->part_sel_lock); +} + +static void mpam_reset_ris(struct mpam_msc_ris *ris) +{ + u16 partid, partid_max; + + WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); + + if (ris->in_reset_state) + return; + + spin_lock(&partid_max_lock); + partid_max = mpam_partid_max; + spin_unlock(&partid_max_lock); + for (partid = 0; partid < partid_max + 1; partid++) + mpam_reset_ris_partid(ris, partid); +} + +static void mpam_reset_msc(struct mpam_msc *msc, bool online) +{ + struct mpam_msc_ris *ris; + + list_for_each_entry_srcu(ris, &msc->ris, msc_list, srcu_read_lock_held(&mpam_srcu)) { + mpam_reset_ris(ris); + + /* + * Set in_reset_state when coming online. The reset state + * for non-zero partid may be lost while the CPUs are offline. 
+ */ + ris->in_reset_state = online; + } +} + static int mpam_cpu_online(unsigned int cpu) { + struct mpam_msc *msc; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &msc->accessibility)) + continue; + + if (atomic_fetch_inc(&msc->online_refs) == 0) + mpam_reset_msc(msc, true); + } + return 0; } @@ -862,6 +959,18 @@ static int mpam_discovery_cpu_online(unsigned int cpu) static int mpam_cpu_offline(unsigned int cpu) { + struct mpam_msc *msc; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &msc->accessibility)) + continue; + + if (atomic_dec_and_test(&msc->online_refs)) + mpam_reset_msc(msc, false); + } + return 0; } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 39331d81c4818..9f062dd5a0bbc 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -5,6 +5,7 @@ #define MPAM_INTERNAL_H #include +#include #include #include #include @@ -46,6 +47,7 @@ struct mpam_msc { enum mpam_msc_iface iface; u32 nrdy_usec; cpumask_t accessibility; + atomic_t online_refs; /* * probe_lock is only taken during discovery. After discovery these @@ -197,6 +199,7 @@ struct mpam_msc_ris { u8 ris_idx; u64 idr; struct mpam_props props; + bool in_reset_state; cpumask_t affinity; From 4b302380720020a79f7b38622543893097a8d0e5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 May 2021 12:45:16 +0100 Subject: [PATCH 084/247] NVIDIA: SAUCE: arm_mpam: Add a helper to touch an MSC from any CPU BugLink: https://bugs.launchpad.net/bugs/2122432 Resetting RIS entries from the cpuhp callback is easy as the callback occurs on the correct CPU. This won't be true for any other caller that wants to reset or configure an MSC. Add a helper that schedules the provided function if necessary. 
Callers should take the cpuhp lock to prevent the cpuhp callbacks from changing the MSC state. Reviewed-by: Ben Horgan Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 2ac4287d617339c32c82e012b7548811994f802f https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 37 +++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index f70afa908bc23..3d34e5e4d51cc 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -876,20 +876,51 @@ static void mpam_reset_ris_partid(struct mpam_msc_ris *ris, u16 partid) mutex_unlock(&msc->part_sel_lock); } -static void mpam_reset_ris(struct mpam_msc_ris *ris) +/* + * Called via smp_call_on_cpu() to prevent migration, while still being + * pre-emptible. + */ +static int mpam_reset_ris(void *arg) { u16 partid, partid_max; + struct mpam_msc_ris *ris = arg; WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); if (ris->in_reset_state) - return; + return 0; spin_lock(&partid_max_lock); partid_max = mpam_partid_max; spin_unlock(&partid_max_lock); for (partid = 0; partid < partid_max + 1; partid++) mpam_reset_ris_partid(ris, partid); + + return 0; +} + +/* + * Get the preferred CPU for this MSC. If it is accessible from this CPU, + * this CPU is preferred. This can be preempted/migrated, it will only result + * in more work. 
+ */ +static int mpam_get_msc_preferred_cpu(struct mpam_msc *msc) +{ + int cpu = raw_smp_processor_id(); + + if (cpumask_test_cpu(cpu, &msc->accessibility)) + return cpu; + + return cpumask_first_and(&msc->accessibility, cpu_online_mask); +} + +static int mpam_touch_msc(struct mpam_msc *msc, int (*fn)(void *a), void *arg) +{ + lockdep_assert_irqs_enabled(); + lockdep_assert_cpus_held(); + WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); + + return smp_call_on_cpu(mpam_get_msc_preferred_cpu(msc), fn, arg, true); } static void mpam_reset_msc(struct mpam_msc *msc, bool online) @@ -897,7 +928,7 @@ static void mpam_reset_msc(struct mpam_msc *msc, bool online) struct mpam_msc_ris *ris; list_for_each_entry_srcu(ris, &msc->ris, msc_list, srcu_read_lock_held(&mpam_srcu)) { - mpam_reset_ris(ris); + mpam_touch_msc(msc, &mpam_reset_ris, ris); /* * Set in_reset_state when coming online. The reset state From 69e75fd62307cb9249de36ca7791387ca23ff982 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 9 Feb 2021 13:46:35 +0000 Subject: [PATCH 085/247] NVIDIA: SAUCE: arm_mpam: Extend reset logic to allow devices to be reset any time BugLink: https://bugs.launchpad.net/bugs/2122432 cpuhp callbacks aren't the only time the MSC configuration may need to be reset. Resctrl has an API call to reset a class. If an MPAM error interrupt arrives it indicates the driver has misprogrammed an MSC. The safest thing to do is reset all the MSCs and disable MPAM. Add a helper to reset RIS via their class. Call this from mpam_disable(), which can be scheduled from the error interrupt handler. Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 888b77167d8bd718e2733865285c3b53a0d4af56 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 58 ++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 3d34e5e4d51cc..6ba277b44c4a5 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -878,15 +878,13 @@ static void mpam_reset_ris_partid(struct mpam_msc_ris *ris, u16 partid) /* * Called via smp_call_on_cpu() to prevent migration, while still being - * pre-emptible. + * pre-emptible. Caller must hold mpam_srcu. */ static int mpam_reset_ris(void *arg) { u16 partid, partid_max; struct mpam_msc_ris *ris = arg; - WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); - if (ris->in_reset_state) return 0; @@ -1538,8 +1536,56 @@ static void mpam_enable_once(void) mpam_partid_max + 1, mpam_pmg_max + 1); } +static void mpam_reset_component_locked(struct mpam_component *comp) +{ + + struct mpam_vmsc *vmsc; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_msc *msc = vmsc->msc; + struct mpam_msc_ris *ris; + + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!ris->in_reset_state) + mpam_touch_msc(msc, mpam_reset_ris, ris); + ris->in_reset_state = true; + } + } +} + +static void mpam_reset_class_locked(struct mpam_class *class) +{ + struct mpam_component *comp; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) + mpam_reset_component_locked(comp); +} + +static void mpam_reset_class(struct mpam_class *class) +{ + cpus_read_lock(); + mpam_reset_class_locked(class); + cpus_read_unlock(); +} + +/* + * Called in response to an error IRQ. + * All of MPAMs errors indicate a software bug, restore any modified + * controls to their reset values. 
+ */ void mpam_disable(struct work_struct *ignored) { + int idx; + struct mpam_class *class; struct mpam_msc *msc, *tmp; mutex_lock(&mpam_cpuhp_state_lock); @@ -1549,6 +1595,12 @@ void mpam_disable(struct work_struct *ignored) } mutex_unlock(&mpam_cpuhp_state_lock); + idx = srcu_read_lock(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) + mpam_reset_class(class); + srcu_read_unlock(&mpam_srcu, idx); + mutex_lock(&mpam_list_lock); list_for_each_entry_safe(msc, tmp, &mpam_all_msc, all_msc_list) mpam_msc_destroy(msc); From 75d60db87012c0753b4d5d7dcbb94f5e08b4a97a Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 8 Feb 2021 13:09:09 +0000 Subject: [PATCH 086/247] NVIDIA: SAUCE: arm_mpam: Register and enable IRQs BugLink: https://bugs.launchpad.net/bugs/2122432 Register and enable error IRQs. All the MPAM error interrupts indicate a software bug, e.g. out of range partid. If the error interrupt is ever signalled, attempt to disable MPAM. Only the irq handler accesses the MPAMF_ESR register, so no locking is needed. The work to disable MPAM after an error needs to happen at process context as it takes mutex. It also unregisters the interrupts, meaning it can't be done from the threaded part of a threaded interrupt. Instead, mpam_disable() gets scheduled. Enabling the IRQs in the MSC may involve cross calling to a CPU that can access the MSC. Once the IRQ is requested, the mpam_disable() path can be called asynchronously, which will walk structures sized by max_partid. Ensure this size is fixed before the interrupt is requested. CC: Rohit Mathew Tested-by: Rohit Mathew Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 9cc38597ebd5423a1e85c773aca85c099a4c47e0 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 283 +++++++++++++++++++++++++++++++- drivers/resctrl/mpam_internal.h | 13 ++ 2 files changed, 293 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 6ba277b44c4a5..e7e8d7927e869 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -173,6 +176,34 @@ static u64 mpam_msc_read_idr(struct mpam_msc *msc) return (idr_high << 32) | idr_low; } +static void mpam_msc_clear_esr(struct mpam_msc *msc) +{ + u64 esr_low = __mpam_read_reg(msc, MPAMF_ESR); + if (!esr_low) + return; + + /* + * Clearing the high/low bits of MPAMF_ESR can not be atomic. + * Clear the top half first, so that the pending error bits in the + * lower half prevent hardware from updating either half of the + * register. + */ + if (msc->has_extd_esr) + __mpam_write_reg(msc, MPAMF_ESR + 4, 0); + __mpam_write_reg(msc, MPAMF_ESR, 0); +} + +static u64 mpam_msc_read_esr(struct mpam_msc *msc) +{ + u64 esr_high = 0, esr_low; + + esr_low = __mpam_read_reg(msc, MPAMF_ESR); + if (msc->has_extd_esr) + esr_high = __mpam_read_reg(msc, MPAMF_ESR + 4); + + return (esr_high << 32) | esr_low; +} + static void __mpam_part_sel_raw(u32 partsel, struct mpam_msc *msc) { lockdep_assert_held(&msc->part_sel_lock); @@ -799,6 +830,7 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) pmg_max = FIELD_GET(MPAMF_IDR_PMG_MAX, idr); msc->partid_max = min(msc->partid_max, partid_max); msc->pmg_max = min(msc->pmg_max, pmg_max); + msc->has_extd_esr = FIELD_GET(MPAMF_IDR_HAS_EXTD_ESR, idr); mutex_lock(&mpam_list_lock); ris = mpam_get_or_create_ris(msc, ris_idx); @@ -813,6 +845,9 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) mutex_unlock(&msc->part_sel_lock); } + /* Clear any stale errors */ + mpam_msc_clear_esr(msc); + spin_lock(&partid_max_lock); mpam_partid_max = 
min(mpam_partid_max, msc->partid_max); mpam_pmg_max = min(mpam_pmg_max, msc->pmg_max); @@ -936,6 +971,13 @@ static void mpam_reset_msc(struct mpam_msc *msc, bool online) } } +static void _enable_percpu_irq(void *_irq) +{ + int *irq = _irq; + + enable_percpu_irq(*irq, IRQ_TYPE_NONE); +} + static int mpam_cpu_online(unsigned int cpu) { struct mpam_msc *msc; @@ -946,6 +988,9 @@ static int mpam_cpu_online(unsigned int cpu) if (!cpumask_test_cpu(cpu, &msc->accessibility)) continue; + if (msc->reenable_error_ppi) + _enable_percpu_irq(&msc->reenable_error_ppi); + if (atomic_fetch_inc(&msc->online_refs) == 0) mpam_reset_msc(msc, true); } @@ -996,6 +1041,9 @@ static int mpam_cpu_offline(unsigned int cpu) if (!cpumask_test_cpu(cpu, &msc->accessibility)) continue; + if (msc->reenable_error_ppi) + disable_percpu_irq(msc->reenable_error_ppi); + if (atomic_dec_and_test(&msc->online_refs)) mpam_reset_msc(msc, false); } @@ -1022,6 +1070,42 @@ static void mpam_register_cpuhp_callbacks(int (*online)(unsigned int online), mutex_unlock(&mpam_cpuhp_state_lock); } +static int __setup_ppi(struct mpam_msc *msc) +{ + int cpu; + + msc->error_dev_id = alloc_percpu(struct mpam_msc *); + if (!msc->error_dev_id) + return -ENOMEM; + + for_each_cpu(cpu, &msc->accessibility) + *per_cpu_ptr(msc->error_dev_id, cpu) = msc; + + return 0; +} + +static int mpam_msc_setup_error_irq(struct mpam_msc *msc) +{ + int irq; + + irq = platform_get_irq_byname_optional(msc->pdev, "error"); + if (irq <= 0) + return 0; + + /* Allocate and initialise the percpu device pointer for PPI */ + if (irq_is_percpu(irq)) + return __setup_ppi(msc); + + /* sanity check: shared interrupts can be routed anywhere? 
*/ + if (!cpumask_equal(&msc->accessibility, cpu_possible_mask)) { + pr_err_once("msc:%u is a private resource with a shared error interrupt", + msc->id); + return -EINVAL; + } + + return 0; +} + static int mpam_dt_count_msc(void) { int count = 0; @@ -1223,6 +1307,7 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) mutex_init(&msc->probe_lock); mutex_init(&msc->part_sel_lock); + mutex_init(&msc->error_irq_lock); mpam_mon_sel_lock_init(msc); msc->id = pdev->id; msc->pdev = pdev; @@ -1237,6 +1322,10 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) return ERR_PTR(-EINVAL); } + err = mpam_msc_setup_error_irq(msc); + if (err) + return ERR_PTR(err); + if (device_property_read_u32(&pdev->dev, "pcc-channel", &tmp)) msc->iface = MPAM_IFACE_MMIO; else @@ -1514,11 +1603,176 @@ static void mpam_enable_merge_features(struct list_head *all_classes_list) } } +static char *mpam_errcode_names[16] = { + [MPAM_ERRCODE_NONE] = "No error", + [MPAM_ERRCODE_PARTID_SEL_RANGE] = "PARTID_SEL_Range", + [MPAM_ERRCODE_REQ_PARTID_RANGE] = "Req_PARTID_Range", + [MPAM_ERRCODE_MSMONCFG_ID_RANGE] = "MSMONCFG_ID_RANGE", + [MPAM_ERRCODE_REQ_PMG_RANGE] = "Req_PMG_Range", + [MPAM_ERRCODE_MONITOR_RANGE] = "Monitor_Range", + [MPAM_ERRCODE_INTPARTID_RANGE] = "intPARTID_Range", + [MPAM_ERRCODE_UNEXPECTED_INTERNAL] = "Unexpected_INTERNAL", + [MPAM_ERRCODE_UNDEFINED_RIS_PART_SEL] = "Undefined_RIS_PART_SEL", + [MPAM_ERRCODE_RIS_NO_CONTROL] = "RIS_No_Control", + [MPAM_ERRCODE_UNDEFINED_RIS_MON_SEL] = "Undefined_RIS_MON_SEL", + [MPAM_ERRCODE_RIS_NO_MONITOR] = "RIS_No_Monitor", + [12 ... 
15] = "Reserved" +}; + +static int mpam_enable_msc_ecr(void *_msc) +{ + struct mpam_msc *msc = _msc; + + __mpam_write_reg(msc, MPAMF_ECR, MPAMF_ECR_INTEN); + + return 0; +} + +/* This can run in mpam_disable(), and the interrupt handler on the same CPU */ +static int mpam_disable_msc_ecr(void *_msc) +{ + struct mpam_msc *msc = _msc; + + __mpam_write_reg(msc, MPAMF_ECR, 0); + + return 0; +} + +static irqreturn_t __mpam_irq_handler(int irq, struct mpam_msc *msc) +{ + u64 reg; + u16 partid; + u8 errcode, pmg, ris; + + if (WARN_ON_ONCE(!msc) || + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), + &msc->accessibility))) + return IRQ_NONE; + + reg = mpam_msc_read_esr(msc); + + errcode = FIELD_GET(MPAMF_ESR_ERRCODE, reg); + if (!errcode) + return IRQ_NONE; + + /* Clear level triggered irq */ + mpam_msc_clear_esr(msc); + + partid = FIELD_GET(MPAMF_ESR_PARTID_MON, reg); + pmg = FIELD_GET(MPAMF_ESR_PMG, reg); + ris = FIELD_GET(MPAMF_ESR_RIS, reg); + + pr_err_ratelimited("error irq from msc:%u '%s', partid:%u, pmg: %u, ris: %u\n", + msc->id, mpam_errcode_names[errcode], partid, pmg, + ris); + + /* Disable this interrupt. */ + mpam_disable_msc_ecr(msc); + + /* + * Schedule the teardown work. Don't use a threaded IRQ as we can't + * unregister the interrupt from the threaded part of the handler. 
+ */ + mpam_disable_reason = "hardware error interrupt"; + schedule_work(&mpam_broken_work); + + return IRQ_HANDLED; +} + +static irqreturn_t mpam_ppi_handler(int irq, void *dev_id) +{ + struct mpam_msc *msc = *(struct mpam_msc **)dev_id; + + return __mpam_irq_handler(irq, msc); +} + +static irqreturn_t mpam_spi_handler(int irq, void *dev_id) +{ + struct mpam_msc *msc = dev_id; + + return __mpam_irq_handler(irq, msc); +} + +static int mpam_register_irqs(void) +{ + int err, irq; + struct mpam_msc *msc; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + irq = platform_get_irq_byname_optional(msc->pdev, "error"); + if (irq <= 0) + continue; + + /* The MPAM spec says the interrupt can be SPI, PPI or LPI */ + /* We anticipate sharing the interrupt with other MSCs */ + if (irq_is_percpu(irq)) { + err = request_percpu_irq(irq, &mpam_ppi_handler, + "mpam:msc:error", + msc->error_dev_id); + if (err) + return err; + + msc->reenable_error_ppi = irq; + smp_call_function_many(&msc->accessibility, + &_enable_percpu_irq, &irq, + true); + } else { + err = devm_request_irq(&msc->pdev->dev,irq, + &mpam_spi_handler, IRQF_SHARED, + "mpam:msc:error", msc); + if (err) + return err; + } + + mutex_lock(&msc->error_irq_lock); + msc->error_irq_req = true; + mpam_touch_msc(msc, mpam_enable_msc_ecr, msc); + msc->error_irq_hw_enabled = true; + mutex_unlock(&msc->error_irq_lock); + } + + return 0; +} + +static void mpam_unregister_irqs(void) +{ + int irq; + struct mpam_msc *msc; + + guard(cpus_read_lock)(); + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + irq = platform_get_irq_byname_optional(msc->pdev, "error"); + if (irq <= 0) + continue; + + mutex_lock(&msc->error_irq_lock); + if (msc->error_irq_hw_enabled) { + mpam_touch_msc(msc, mpam_disable_msc_ecr, msc); + msc->error_irq_hw_enabled = false; + } + + if 
(msc->error_irq_req) { + if (irq_is_percpu(irq)) { + msc->reenable_error_ppi = 0; + free_percpu_irq(irq, msc->error_dev_id); + } else { + devm_free_irq(&msc->pdev->dev, irq, msc); + } + msc->error_irq_req = false; + } + mutex_unlock(&msc->error_irq_lock); + } +} + static void mpam_enable_once(void) { - mutex_lock(&mpam_list_lock); - mpam_enable_merge_features(&mpam_classes); - mutex_unlock(&mpam_list_lock); + int err; /* * Once the cpuhp callbacks have been changed, mpam_partid_max can no @@ -1528,6 +1782,27 @@ static void mpam_enable_once(void) partid_max_published = true; spin_unlock(&partid_max_lock); + /* + * If all the MSC have been probed, enabling the IRQs happens next. + * That involves cross-calling to a CPU that can reach the MSC, and + * the locks must be taken in this order: + */ + cpus_read_lock(); + mutex_lock(&mpam_list_lock); + mpam_enable_merge_features(&mpam_classes); + + err = mpam_register_irqs(); + + mutex_unlock(&mpam_list_lock); + cpus_read_unlock(); + + if (err) { + pr_warn("Failed to register irqs: %d\n", err); + mpam_disable_reason = "Failed to enable."; + schedule_work(&mpam_broken_work); + return; + } + mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, "mpam:online"); @@ -1595,6 +1870,8 @@ void mpam_disable(struct work_struct *ignored) } mutex_unlock(&mpam_cpuhp_state_lock); + mpam_unregister_irqs(); + idx = srcu_read_lock(&mpam_srcu); list_for_each_entry_srcu(class, &mpam_classes, classes_list, srcu_read_lock_held(&mpam_srcu)) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 9f062dd5a0bbc..a04b09abd814d 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -47,6 +47,11 @@ struct mpam_msc { enum mpam_msc_iface iface; u32 nrdy_usec; cpumask_t accessibility; + bool has_extd_esr; + + int reenable_error_ppi; + struct mpam_msc * __percpu *error_dev_id; + atomic_t online_refs; /* @@ -60,6 +65,14 @@ struct mpam_msc { unsigned long ris_idxs; u32 ris_max; + /* + * 
error_irq_lock is taken when registering/unregistering the error + * interrupt and maniupulating the below flags. + */ + struct mutex error_irq_lock; + bool error_irq_req; + bool error_irq_hw_enabled; + /* mpam_msc_ris of this component */ struct list_head ris; From 6deb76a9394cf170a9894449cafa1d85b9de22db Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 13 May 2021 15:21:13 +0100 Subject: [PATCH 087/247] NVIDIA: SAUCE: arm_mpam: Use a static key to indicate when mpam is enabled BugLink: https://bugs.launchpad.net/bugs/2122432 Once all the MSC have been probed, the system wide usable number of PARTID is known and the configuration arrays can be allocated. After this point, checking all the MSC have been probed is pointless, and the cpuhp callbacks should restore the configuration, instead of just resetting the MSC. Add a static key to enable this behaviour. This will also allow MPAM to be disabled in response to an error, and the architecture code to enable/disable the context switch of the MPAM system registers. Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 34e36d87fe274396f66f0b73f9846e2036e1a4d8 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 12 ++++++++++++ drivers/resctrl/mpam_internal.h | 9 +++++++++ 2 files changed, 21 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index e7e8d7927e869..6f4e0092d44ff 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -32,6 +32,8 @@ #include "mpam_internal.h" +DEFINE_STATIC_KEY_FALSE(mpam_enabled); /* This moves to arch code */ + /* * mpam_list_lock protects the SRCU lists when writing. 
Once the * mpam_enabled key is enabled these lists are read-only, @@ -1005,6 +1007,9 @@ static int mpam_discovery_cpu_online(unsigned int cpu) struct mpam_msc *msc; bool new_device_probed = false; + if (mpam_is_enabled()) + return 0; + guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, srcu_read_lock_held(&mpam_srcu)) { @@ -1669,6 +1674,10 @@ static irqreturn_t __mpam_irq_handler(int irq, struct mpam_msc *msc) /* Disable this interrupt. */ mpam_disable_msc_ecr(msc); + /* Are we racing with the thread disabling MPAM? */ + if (!mpam_is_enabled()) + return IRQ_HANDLED; + /* * Schedule the teardown work. Don't use a threaded IRQ as we can't * unregister the interrupt from the threaded part of the handler. @@ -1803,6 +1812,7 @@ static void mpam_enable_once(void) return; } + static_branch_enable(&mpam_enabled); mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, "mpam:online"); @@ -1870,6 +1880,8 @@ void mpam_disable(struct work_struct *ignored) } mutex_unlock(&mpam_cpuhp_state_lock); + static_branch_disable(&mpam_enabled); + mpam_unregister_irqs(); idx = srcu_read_lock(&mpam_srcu); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index a04b09abd814d..d492df9a1735e 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -19,8 +20,16 @@ #define MPAM_MSC_MAX_NUM_RIS 16 + struct platform_device; +DECLARE_STATIC_KEY_FALSE(mpam_enabled); + +static inline bool mpam_is_enabled(void) +{ + return static_branch_likely(&mpam_enabled); +} + /* * Structures protected by SRCU may not be freed for a surprising amount of * time (especially if perf is running). 
To ensure the MPAM error interrupt can From 0a70329e0d350adb028de95552c46dad5eac09e7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 10 Feb 2021 18:11:20 +0000 Subject: [PATCH 088/247] NVIDIA: SAUCE: arm_mpam: Allow configuration to be applied and restored during cpu online BugLink: https://bugs.launchpad.net/bugs/2122432 When CPUs come online the MSC's original configuration should be restored. Add struct mpam_config to hold the configuration. This has a bitmap of features that were modified. Once the maximum partid is known, allocate a configuration array for each component, and reprogram each RIS configuration from this. CC: Dave Martin Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 6af806f712f02b54c3e617686dc288f280ffc61b https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 284 +++++++++++++++++++++++++++++--- drivers/resctrl/mpam_internal.h | 23 +++ 2 files changed, 287 insertions(+), 20 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 6f4e0092d44ff..6bcc97fb51252 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -121,6 +121,17 @@ static inline void init_garbage(struct mpam_garbage *garbage) { init_llist_node(&garbage->llist); } + +/* + * Once mpam is enabled, new requestors cannot further reduce the available + * partid. Assert that the size is fixed, and new requestors will be turned + * away. 
+ */ +static void mpam_assert_partid_sizes_fixed(void) +{ + WARN_ON_ONCE(!partid_max_published); +} + static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg) { WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); @@ -369,12 +380,16 @@ static void mpam_class_destroy(struct mpam_class *class) add_to_garbage(class); } +static void __destroy_component_cfg(struct mpam_component *comp); + static void mpam_comp_destroy(struct mpam_component *comp) { struct mpam_class *class = comp->class; lockdep_assert_held(&mpam_list_lock); + __destroy_component_cfg(comp); + list_del_rcu(&comp->class_list); add_to_garbage(comp); @@ -888,48 +903,102 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) __mpam_write_reg(msc, reg, bm); } -static void mpam_reset_ris_partid(struct mpam_msc_ris *ris, u16 partid) +/* Called via IPI. Call while holding an SRCU reference */ +static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, + struct mpam_config *cfg) { struct mpam_msc *msc = ris->vmsc->msc; struct mpam_props *rprops = &ris->props; - WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); - mutex_lock(&msc->part_sel_lock); __mpam_part_sel(ris->ris_idx, partid, msc); - if (mpam_has_feature(mpam_feat_cpor_part, rprops)) - mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); + if (mpam_has_feature(mpam_feat_cpor_part, rprops) && + mpam_has_feature(mpam_feat_cpor_part, cfg)) { + if (cfg->reset_cpbm) + mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, + rprops->cpbm_wd); + else + mpam_write_partsel_reg(msc, CPBM, cfg->cpbm); + } - if (mpam_has_feature(mpam_feat_mbw_part, rprops)) - mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits); + if (mpam_has_feature(mpam_feat_mbw_part, rprops) && + mpam_has_feature(mpam_feat_mbw_part, cfg)) { + if (cfg->reset_mbw_pbm) + mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, + rprops->mbw_pbm_bits); + else + mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); + } - if 
(mpam_has_feature(mpam_feat_mbw_min, rprops)) + if (mpam_has_feature(mpam_feat_mbw_min, rprops) && + mpam_has_feature(mpam_feat_mbw_min, cfg)) mpam_write_partsel_reg(msc, MBW_MIN, 0); - if (mpam_has_feature(mpam_feat_mbw_max, rprops)) - mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); + if (mpam_has_feature(mpam_feat_mbw_max, rprops) && + mpam_has_feature(mpam_feat_mbw_max, cfg)) + mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max); mutex_unlock(&msc->part_sel_lock); } +struct reprogram_ris { + struct mpam_msc_ris *ris; + struct mpam_config *cfg; +}; + +/* Call with MSC lock held */ +static int mpam_reprogram_ris(void *_arg) +{ + u16 partid, partid_max; + struct reprogram_ris *arg = _arg; + struct mpam_msc_ris *ris = arg->ris; + struct mpam_config *cfg = arg->cfg; + + if (ris->in_reset_state) + return 0; + + spin_lock(&partid_max_lock); + partid_max = mpam_partid_max; + spin_unlock(&partid_max_lock); + for (partid = 0; partid <= partid_max + 1; partid++) + mpam_reprogram_ris_partid(ris, partid, cfg); + + return 0; +} + +static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) +{ + *reset_cfg = (struct mpam_config) { + .cpbm = ~0, + .mbw_pbm = ~0, + .mbw_max = MPAMCFG_MBW_MAX_MAX, + + .reset_cpbm = true, + .reset_mbw_pbm = true, + }; + bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST); +} + /* * Called via smp_call_on_cpu() to prevent migration, while still being * pre-emptible. Caller must hold mpam_srcu. 
*/ static int mpam_reset_ris(void *arg) { - u16 partid, partid_max; + struct mpam_config reset_cfg; struct mpam_msc_ris *ris = arg; + struct reprogram_ris reprogram_arg; if (ris->in_reset_state) return 0; - spin_lock(&partid_max_lock); - partid_max = mpam_partid_max; - spin_unlock(&partid_max_lock); - for (partid = 0; partid < partid_max + 1; partid++) - mpam_reset_ris_partid(ris, partid); + mpam_init_reset_cfg(&reset_cfg); + + reprogram_arg.ris = ris; + reprogram_arg.cfg = &reset_cfg; + + mpam_reprogram_ris(&reprogram_arg); return 0; } @@ -973,6 +1042,39 @@ static void mpam_reset_msc(struct mpam_msc *msc, bool online) } } +static void mpam_reprogram_msc(struct mpam_msc *msc) +{ + u16 partid; + bool reset; + struct mpam_config *cfg; + struct mpam_msc_ris *ris; + + /* + * No lock for mpam_partid_max as partid_max_published has been + * set by mpam_enabled(), so the values can no longer change. + */ + mpam_assert_partid_sizes_fixed(); + + list_for_each_entry_srcu(ris, &msc->ris, msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!mpam_is_enabled() && !ris->in_reset_state) { + mpam_touch_msc(msc, &mpam_reset_ris, ris); + ris->in_reset_state = true; + continue; + } + + reset = true; + for (partid = 0; partid <= mpam_partid_max; partid++) { + cfg = &ris->vmsc->comp->cfg[partid]; + if (!bitmap_empty(cfg->features, MPAM_FEATURE_LAST)) + reset = false; + + mpam_reprogram_ris_partid(ris, partid, cfg); + } + ris->in_reset_state = reset; + } +} + static void _enable_percpu_irq(void *_irq) { int *irq = _irq; @@ -994,7 +1096,7 @@ static int mpam_cpu_online(unsigned int cpu) _enable_percpu_irq(&msc->reenable_error_ppi); if (atomic_fetch_inc(&msc->online_refs) == 0) - mpam_reset_msc(msc, true); + mpam_reprogram_msc(msc); } return 0; @@ -1779,6 +1881,64 @@ static void mpam_unregister_irqs(void) } } +static void __destroy_component_cfg(struct mpam_component *comp) +{ + add_to_garbage(comp->cfg); +} + +static void mpam_reset_component_cfg(struct mpam_component *comp) +{ + int i; + 
+ mpam_assert_partid_sizes_fixed(); + + if (!comp->cfg) + return; + + for (i = 0; i < mpam_partid_max + 1; i++) + mpam_init_reset_cfg(&comp->cfg[i]); +} + +static int __allocate_component_cfg(struct mpam_component *comp) +{ + mpam_assert_partid_sizes_fixed(); + + if (comp->cfg) + return 0; + + comp->cfg = kcalloc(mpam_partid_max + 1, sizeof(*comp->cfg), GFP_KERNEL); + if (!comp->cfg) + return -ENOMEM; + + /* + * The array is free()d in one go, so only cfg[0]'s struture needs + * to be initialised. + */ + init_garbage(&comp->cfg[0].garbage); + + mpam_reset_component_cfg(comp); + + return 0; +} + +static int mpam_allocate_config(void) +{ + struct mpam_class *class; + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(class, &mpam_classes, classes_list) { + list_for_each_entry(comp, &class->components, class_list) { + int err = __allocate_component_cfg(comp); + if (err) + return err; + } + } + + return 0; +} + static void mpam_enable_once(void) { int err; @@ -1798,15 +1958,25 @@ static void mpam_enable_once(void) */ cpus_read_lock(); mutex_lock(&mpam_list_lock); - mpam_enable_merge_features(&mpam_classes); + do { + mpam_enable_merge_features(&mpam_classes); - err = mpam_register_irqs(); + err = mpam_register_irqs(); + if (err) { + pr_warn("Failed to register irqs: %d\n", err); + break; + } + err = mpam_allocate_config(); + if (err) { + pr_err("Failed to allocate configuration arrays.\n"); + break; + } + } while (0); mutex_unlock(&mpam_list_lock); cpus_read_unlock(); if (err) { - pr_warn("Failed to register irqs: %d\n", err); mpam_disable_reason = "Failed to enable."; schedule_work(&mpam_broken_work); return; @@ -1827,6 +1997,9 @@ static void mpam_reset_component_locked(struct mpam_component *comp) struct mpam_vmsc *vmsc; lockdep_assert_cpus_held(); + mpam_assert_partid_sizes_fixed(); + + mpam_reset_component_cfg(comp); guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, @@ -1927,6 +2100,77 @@ void 
mpam_enable(struct work_struct *work) mpam_enable_once(); } +struct mpam_write_config_arg { + struct mpam_msc_ris *ris; + struct mpam_component *comp; + u16 partid; +}; + +static int __write_config(void *arg) +{ + struct mpam_write_config_arg *c = arg; + + mpam_reprogram_ris_partid(c->ris, c->partid, &c->comp->cfg[c->partid]); + + return 0; +} + +#define maybe_update_config(cfg, feature, newcfg, member, changes) do { \ + if (mpam_has_feature(feature, newcfg) && \ + (newcfg)->member != (cfg)->member) { \ + (cfg)->member = (newcfg)->member; \ + mpam_set_feature(feature, cfg); \ + \ + (changes) = true; \ + } \ +} while (0) + +static bool mpam_update_config(struct mpam_config *cfg, + const struct mpam_config *newcfg) +{ + bool has_changes = false; + + maybe_update_config(cfg, mpam_feat_cpor_part, newcfg, cpbm, has_changes); + maybe_update_config(cfg, mpam_feat_mbw_part, newcfg, mbw_pbm, has_changes); + maybe_update_config(cfg, mpam_feat_mbw_max, newcfg, mbw_max, has_changes); + + return has_changes; +} + +int mpam_apply_config(struct mpam_component *comp, u16 partid, + struct mpam_config *cfg) +{ + struct mpam_write_config_arg arg; + struct mpam_msc_ris *ris; + struct mpam_vmsc *vmsc; + struct mpam_msc *msc; + + lockdep_assert_cpus_held(); + + /* Don't pass in the current config! 
*/ + WARN_ON_ONCE(&comp->cfg[partid] == cfg); + + if (!mpam_update_config(&comp->cfg[partid], cfg)) + return 0; + + arg.comp = comp; + arg.partid = partid; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + msc = vmsc->msc; + + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + arg.ris = ris; + mpam_touch_msc(msc, __write_config, &arg); + } + } + + return 0; +} + /* * MSCs that are declared by the firmware as being part of a cache may not * be created automatically as platform devices, since there is no diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index d492df9a1735e..2f2a7369107bb 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -182,6 +182,20 @@ struct mpam_class { struct mpam_garbage garbage; }; +struct mpam_config { + /* Which configuration values are valid. */ + DECLARE_BITMAP(features, MPAM_FEATURE_LAST); + + u32 cpbm; + u32 mbw_pbm; + u16 mbw_max; + + bool reset_cpbm; + bool reset_mbw_pbm; + + struct mpam_garbage garbage; +}; + struct mpam_component { u32 comp_id; @@ -190,6 +204,12 @@ struct mpam_component { cpumask_t affinity; + /* + * Array of configuration values, indexed by partid. + * Read from cpuhp callbacks, hold the cpuhp lock when writing. 
+ */ + struct mpam_config *cfg; + /* member of mpam_class:components */ struct list_head class_list; @@ -249,6 +269,9 @@ extern u8 mpam_pmg_max; void mpam_enable(struct work_struct *work); void mpam_disable(struct work_struct *work); +int mpam_apply_config(struct mpam_component *comp, u16 partid, + struct mpam_config *cfg); + int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); From 1043a5e2c21d177b1ddd780da5fc33fc87f4df54 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 17 Oct 2025 18:38:45 +0100 Subject: [PATCH 089/247] NVIDIA: SAUCE: arm_mpam: Probe and reset the rest of the features BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM supports more features than are going to be exposed to resctrl. For partid other than 0, the reset values of these controls isn't known. Discover the rest of the features so they can be reset to avoid any side effects when resctrl is in use. PARTID narrowing allows MSC/RIS to support less configuration space than is usable. If this feature is found on a class of device we are likely to use, then reduce the partid_max to make it usable. This allows us to map a PARTID to itself. CC: Rohit Mathew CC: Zeng Heng CC: Dave Martin Reviewed-by: Jonathan Cameron Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 342bfa69997131bebce86f273bc0b12014ffc519 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 188 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 22 +++- 2 files changed, 208 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 6bcc97fb51252..75acb1a0c81d7 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -232,6 +232,15 @@ static void __mpam_part_sel(u8 ris_idx, u16 partid, struct mpam_msc *msc) __mpam_part_sel_raw(partsel, msc); } +static void __mpam_intpart_sel(u8 ris_idx, u16 intpartid, struct mpam_msc *msc) +{ + u32 partsel = FIELD_PREP(MPAMCFG_PART_SEL_RIS, ris_idx) | + FIELD_PREP(MPAMCFG_PART_SEL_PARTID_SEL, intpartid) | + MPAMCFG_PART_SEL_INTERNAL; + + __mpam_part_sel_raw(partsel, msc); +} + int mpam_register_requestor(u16 partid_max, u8 pmg_max) { guard(spinlock)(&partid_max_lock); @@ -726,10 +735,34 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) struct mpam_msc *msc = ris->vmsc->msc; struct device *dev = &msc->pdev->dev; struct mpam_props *props = &ris->props; + struct mpam_class *class = ris->vmsc->comp->class; lockdep_assert_held(&msc->probe_lock); lockdep_assert_held(&msc->part_sel_lock); + /* Cache Capacity Partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_CCAP_PART, ris->idr)) { + u32 ccap_features = mpam_read_partsel_reg(msc, CCAP_IDR); + + props->cmax_wd = FIELD_GET(MPAMF_CCAP_IDR_CMAX_WD, ccap_features); + if (props->cmax_wd && + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM, ccap_features)) + mpam_set_feature(mpam_feat_cmax_softlim, props); + + if (props->cmax_wd && + !FIELD_GET(MPAMF_CCAP_IDR_NO_CMAX, ccap_features)) + mpam_set_feature(mpam_feat_cmax_cmax, props); + + if (props->cmax_wd && + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMIN, ccap_features)) + mpam_set_feature(mpam_feat_cmax_cmin, props); + + props->cassoc_wd = FIELD_GET(MPAMF_CCAP_IDR_CASSOC_WD, ccap_features); + if (props->cassoc_wd && + FIELD_GET(MPAMF_CCAP_IDR_HAS_CASSOC, ccap_features)) + 
mpam_set_feature(mpam_feat_cmax_cassoc, props); + } + /* Cache Portion partitioning */ if (FIELD_GET(MPAMF_IDR_HAS_CPOR_PART, ris->idr)) { u32 cpor_features = mpam_read_partsel_reg(msc, CPOR_IDR); @@ -752,6 +785,31 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features); if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) mpam_set_feature(mpam_feat_mbw_max, props); + + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MIN, mbw_features)) + mpam_set_feature(mpam_feat_mbw_min, props); + + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_PROP, mbw_features)) + mpam_set_feature(mpam_feat_mbw_prop, props); + } + + /* Priority partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_PRI_PART, ris->idr)) { + u32 pri_features = mpam_read_partsel_reg(msc, PRI_IDR); + + props->intpri_wd = FIELD_GET(MPAMF_PRI_IDR_INTPRI_WD, pri_features); + if (props->intpri_wd && FIELD_GET(MPAMF_PRI_IDR_HAS_INTPRI, pri_features)) { + mpam_set_feature(mpam_feat_intpri_part, props); + if (FIELD_GET(MPAMF_PRI_IDR_INTPRI_0_IS_LOW, pri_features)) + mpam_set_feature(mpam_feat_intpri_part_0_low, props); + } + + props->dspri_wd = FIELD_GET(MPAMF_PRI_IDR_DSPRI_WD, pri_features); + if (props->dspri_wd && FIELD_GET(MPAMF_PRI_IDR_HAS_DSPRI, pri_features)) { + mpam_set_feature(mpam_feat_dspri_part, props); + if (FIELD_GET(MPAMF_PRI_IDR_DSPRI_0_IS_LOW, pri_features)) + mpam_set_feature(mpam_feat_dspri_part_0_low, props); + } } /* Performance Monitoring */ @@ -776,6 +834,9 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) mpam_set_feature(mpam_feat_msmon_csu, props); + if (FIELD_GET(MPAMF_CSUMON_IDR_HAS_XCL, csumonidr)) + mpam_set_feature(mpam_feat_msmon_csu_xcl, props); + /* Is NRDY hardware managed? 
*/ hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, CSU); if (hw_managed) @@ -797,6 +858,9 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) if (props->num_mbwu_mon) mpam_set_feature(mpam_feat_msmon_mbwu, props); + if (FIELD_GET(MPAMF_MBWUMON_IDR_HAS_RWBW, mbwumon_idr)) + mpam_set_feature(mpam_feat_msmon_mbwu_rwbw, props); + /* Is NRDY hardware managed? */ hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); if (hw_managed) @@ -808,6 +872,21 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) */ } } + + /* + * RIS with PARTID narrowing don't have enough storage for one + * configuration per PARTID. If these are in a class we could use, + * reduce the supported partid_max to match the number of intpartid. + * If the class is unknown, just ignore it. + */ + if (FIELD_GET(MPAMF_IDR_HAS_PARTID_NRW, ris->idr) && + class->type != MPAM_CLASS_UNKNOWN) { + u32 nrwidr = mpam_read_partsel_reg(msc, PARTID_NRW_IDR); + u16 partid_max = FIELD_GET(MPAMF_PARTID_NRW_IDR_INTPARTID_MAX, nrwidr); + + mpam_set_feature(mpam_feat_partid_nrw, props); + msc->partid_max = min(msc->partid_max, partid_max); + } } static int mpam_msc_hw_probe(struct mpam_msc *msc) @@ -907,12 +986,28 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) { + u32 pri_val = 0; + u16 cmax = MPAMCFG_CMAX_CMAX; struct mpam_msc *msc = ris->vmsc->msc; struct mpam_props *rprops = &ris->props; + u16 dspri = GENMASK(rprops->dspri_wd, 0); + u16 intpri = GENMASK(rprops->intpri_wd, 0); mutex_lock(&msc->part_sel_lock); __mpam_part_sel(ris->ris_idx, partid, msc); + if (mpam_has_feature(mpam_feat_partid_nrw, rprops)) { + /* Update the intpartid mapping */ + mpam_write_partsel_reg(msc, INTPARTID, + MPAMCFG_INTPARTID_INTERNAL | partid); + + /* + * Then switch to the 'internal' partid to update the + * configuration. 
+ */ + __mpam_intpart_sel(ris->ris_idx, partid, msc); + } + if (mpam_has_feature(mpam_feat_cpor_part, rprops) && mpam_has_feature(mpam_feat_cpor_part, cfg)) { if (cfg->reset_cpbm) @@ -939,6 +1034,35 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_has_feature(mpam_feat_mbw_max, cfg)) mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max); + if (mpam_has_feature(mpam_feat_mbw_prop, rprops) && + mpam_has_feature(mpam_feat_mbw_prop, cfg)) + mpam_write_partsel_reg(msc, MBW_PROP, 0); + + if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) + mpam_write_partsel_reg(msc, CMAX, cmax); + + if (mpam_has_feature(mpam_feat_cmax_cmin, rprops)) + mpam_write_partsel_reg(msc, CMIN, 0); + + if (mpam_has_feature(mpam_feat_cmax_cassoc, rprops)) + mpam_write_partsel_reg(msc, CASSOC, MPAMCFG_CASSOC_CASSOC); + + if (mpam_has_feature(mpam_feat_intpri_part, rprops) || + mpam_has_feature(mpam_feat_dspri_part, rprops)) { + /* aces high? */ + if (!mpam_has_feature(mpam_feat_intpri_part_0_low, rprops)) + intpri = 0; + if (!mpam_has_feature(mpam_feat_dspri_part_0_low, rprops)) + dspri = 0; + + if (mpam_has_feature(mpam_feat_intpri_part, rprops)) + pri_val |= FIELD_PREP(MPAMCFG_PRI_INTPRI, intpri); + if (mpam_has_feature(mpam_feat_dspri_part, rprops)) + pri_val |= FIELD_PREP(MPAMCFG_PRI_DSPRI, dspri); + + mpam_write_partsel_reg(msc, PRI, pri_val); + } + mutex_unlock(&msc->part_sel_lock); } @@ -1507,6 +1631,18 @@ static bool mpam_has_bwa_wd_feature(struct mpam_props *props) return true; if (mpam_has_feature(mpam_feat_mbw_max, props)) return true; + if (mpam_has_feature(mpam_feat_mbw_prop, props)) + return true; + return false; +} + +/* Any of these features mean the CMAX_WD field is valid. 
*/ +static bool mpam_has_cmax_wd_feature(struct mpam_props *props) +{ + if (mpam_has_feature(mpam_feat_cmax_cmax, props)) + return true; + if (mpam_has_feature(mpam_feat_cmax_cmin, props)) + return true; return false; } @@ -1565,6 +1701,23 @@ static void __props_mismatch(struct mpam_props *parent, parent->bwa_wd = min(parent->bwa_wd, child->bwa_wd); } + if (alias && !mpam_has_cmax_wd_feature(parent) && mpam_has_cmax_wd_feature(child)) { + parent->cmax_wd = child->cmax_wd; + } else if (MISMATCHED_HELPER(parent, child, mpam_has_cmax_wd_feature, + cmax_wd, alias)) { + pr_debug("%s took the min cmax_wd\n", __func__); + parent->cmax_wd = min(parent->cmax_wd, child->cmax_wd); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_cmax_cassoc, alias)) { + parent->cassoc_wd = child->cassoc_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_cmax_cassoc, + cassoc_wd, alias)) { + pr_debug("%s cleared cassoc_wd\n", __func__); + mpam_clear_feature(mpam_feat_cmax_cassoc, parent); + parent->cassoc_wd = 0; + } + /* For num properties, take the minimum */ if (CAN_MERGE_FEAT(parent, child, mpam_feat_msmon_csu, alias)) { parent->num_csu_mon = child->num_csu_mon; @@ -1584,6 +1737,41 @@ static void __props_mismatch(struct mpam_props *parent, child->num_mbwu_mon); } + if (CAN_MERGE_FEAT(parent, child, mpam_feat_intpri_part, alias)) { + parent->intpri_wd = child->intpri_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_intpri_part, + intpri_wd, alias)) { + pr_debug("%s took the min intpri_wd\n", __func__); + parent->intpri_wd = min(parent->intpri_wd, child->intpri_wd); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_dspri_part, alias)) { + parent->dspri_wd = child->dspri_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_dspri_part, + dspri_wd, alias)) { + pr_debug("%s took the min dspri_wd\n", __func__); + parent->dspri_wd = min(parent->dspri_wd, child->dspri_wd); + } + + /* TODO: alias support for these two */ + /* {int,ds}pri may not have differing 0-low 
behaviour */ + if (mpam_has_feature(mpam_feat_intpri_part, parent) && + (!mpam_has_feature(mpam_feat_intpri_part, child) || + mpam_has_feature(mpam_feat_intpri_part_0_low, parent) != + mpam_has_feature(mpam_feat_intpri_part_0_low, child))) { + pr_debug("%s cleared intpri_part\n", __func__); + mpam_clear_feature(mpam_feat_intpri_part, parent); + mpam_clear_feature(mpam_feat_intpri_part_0_low, parent); + } + if (mpam_has_feature(mpam_feat_dspri_part, parent) && + (!mpam_has_feature(mpam_feat_dspri_part, child) || + mpam_has_feature(mpam_feat_dspri_part_0_low, parent) != + mpam_has_feature(mpam_feat_dspri_part_0_low, child))) { + pr_debug("%s cleared dspri_part\n", __func__); + mpam_clear_feature(mpam_feat_dspri_part, parent); + mpam_clear_feature(mpam_feat_dspri_part_0_low, parent); + } + if (alias) { /* Merge features for aliased resources */ bitmap_or(parent->features, parent->features, child->features, MPAM_FEATURE_LAST); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 2f2a7369107bb..00edee9ebc6cb 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -139,16 +139,30 @@ static inline void mpam_mon_sel_lock_init(struct mpam_msc *msc) /* Bits for mpam features bitmaps */ enum mpam_device_features { - mpam_feat_cpor_part = 0, + mpam_feat_cmax_softlim, + mpam_feat_cmax_cmax, + mpam_feat_cmax_cmin, + mpam_feat_cmax_cassoc, + mpam_feat_cpor_part, mpam_feat_mbw_part, mpam_feat_mbw_min, mpam_feat_mbw_max, + mpam_feat_mbw_prop, + mpam_feat_intpri_part, + mpam_feat_intpri_part_0_low, + mpam_feat_dspri_part, + mpam_feat_dspri_part_0_low, mpam_feat_msmon, mpam_feat_msmon_csu, + mpam_feat_msmon_csu_capture, + mpam_feat_msmon_csu_xcl, mpam_feat_msmon_csu_hw_nrdy, mpam_feat_msmon_mbwu, + mpam_feat_msmon_mbwu_capture, + mpam_feat_msmon_mbwu_rwbw, mpam_feat_msmon_mbwu_hw_nrdy, - MPAM_FEATURE_LAST + mpam_feat_partid_nrw, + MPAM_FEATURE_LAST, }; struct mpam_props { @@ -157,6 +171,10 @@ struct mpam_props { u16 
cpbm_wd; u16 mbw_pbm_bits; u16 bwa_wd; + u16 cmax_wd; + u16 cassoc_wd; + u16 intpri_wd; + u16 dspri_wd; u16 num_csu_mon; u16 num_mbwu_mon; }; From ac21afecb8de24d1cd398e16d9e329c9198e816c Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 25 Jun 2021 12:53:12 +0100 Subject: [PATCH 090/247] NVIDIA: SAUCE: arm_mpam: Add helpers to allocate monitors BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM's MSC support a number of monitors, each of which supports bandwidth counters, or cache-storage-utilisation counters. To use a counter, a monitor needs to be configured. Add helpers to allocate and free CSU or MBWU monitors. Reviewed-by: Ben Horgan Reviewed-by: Jonathan Cameron Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 7d8120abd471b76d6c6523b7f9807381e8df18d7 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 2 ++ drivers/resctrl/mpam_internal.h | 35 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 75acb1a0c81d7..e54109378b571 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -353,6 +353,8 @@ mpam_class_alloc(u8 level_idx, enum mpam_class_types type) class->level = level_idx; class->type = type; INIT_LIST_HEAD_RCU(&class->classes_list); + ida_init(&class->ida_csu_mon); + ida_init(&class->ida_mbwu_mon); list_add_rcu(&class->classes_list, &mpam_classes); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 00edee9ebc6cb..96a02ea955839 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -197,6 +197,9 @@ struct mpam_class { /* member of mpam_classes */ struct list_head classes_list; + struct ida ida_csu_mon; + struct ida ida_mbwu_mon; + struct mpam_garbage garbage; }; @@ -275,6 +278,38 @@ struct 
mpam_msc_ris { struct mpam_garbage garbage; }; +static inline int mpam_alloc_csu_mon(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return -EOPNOTSUPP; + + return ida_alloc_max(&class->ida_csu_mon, cprops->num_csu_mon - 1, + GFP_KERNEL); +} + +static inline void mpam_free_csu_mon(struct mpam_class *class, int csu_mon) +{ + ida_free(&class->ida_csu_mon, csu_mon); +} + +static inline int mpam_alloc_mbwu_mon(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_mbwu, cprops)) + return -EOPNOTSUPP; + + return ida_alloc_max(&class->ida_mbwu_mon, cprops->num_mbwu_mon - 1, + GFP_KERNEL); +} + +static inline void mpam_free_mbwu_mon(struct mpam_class *class, int mbwu_mon) +{ + ida_free(&class->ida_mbwu_mon, mbwu_mon); +} + /* List of all classes - protected by srcu*/ extern struct srcu_struct mpam_srcu; extern struct list_head mpam_classes; From 22dbd24ce61930592b73dee9401f342a0196d2b0 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 24 Jun 2021 16:49:50 +0100 Subject: [PATCH 091/247] NVIDIA: SAUCE: arm_mpam: Add mpam_msmon_read() to read monitor value BugLink: https://bugs.launchpad.net/bugs/2122432 Reading a monitor involves configuring what you want to monitor, and reading the value. Components made up of multiple MSC may need values from each MSC. MSCs may take time to configure, returning 'not ready'. The maximum 'not ready' time should have been provided by firmware. Add mpam_msmon_read() to hide all this. If (one of) the MSC returns not ready, then wait the full timeout value before trying again. CC: Shanker Donthineni Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit c6bc912118f3aeceb03408d72f8fe66f643d9464 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 227 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 19 +++ 2 files changed, 246 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index e54109378b571..e9778a204d94a 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -956,6 +956,233 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) return 0; } +struct mon_read { + struct mpam_msc_ris *ris; + struct mon_cfg *ctx; + enum mpam_device_features type; + u64 *val; + int err; +}; + +static void gen_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, + u32 *flt_val) +{ + struct mon_cfg *ctx = m->ctx; + + /* + * For CSU counters its implementation-defined what happens when not + * filtering by partid. + */ + *ctl_val = MSMON_CFG_x_CTL_MATCH_PARTID; + + *flt_val = FIELD_PREP(MSMON_CFG_x_FLT_PARTID, ctx->partid); + + if (m->ctx->match_pmg) { + *ctl_val |= MSMON_CFG_x_CTL_MATCH_PMG; + *flt_val |= FIELD_PREP(MSMON_CFG_x_FLT_PMG, ctx->pmg); + } + + switch (m->type) { + case mpam_feat_msmon_csu: + *ctl_val |= MSMON_CFG_CSU_CTL_TYPE_CSU; + + if (mpam_has_feature(mpam_feat_msmon_csu_xcl, &m->ris->props)) + *flt_val |= FIELD_PREP(MSMON_CFG_CSU_FLT_XCL, + ctx->csu_exclude_clean); + + break; + case mpam_feat_msmon_mbwu: + *ctl_val |= MSMON_CFG_MBWU_CTL_TYPE_MBWU; + + if (mpam_has_feature(mpam_feat_msmon_mbwu_rwbw, &m->ris->props)) + *flt_val |= FIELD_PREP(MSMON_CFG_MBWU_FLT_RWBW, ctx->opts); + + break; + default: + return; + } +} + +static void read_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, + u32 *flt_val) +{ + struct mpam_msc *msc = m->ris->vmsc->msc; + + switch (m->type) { + case mpam_feat_msmon_csu: + *ctl_val = mpam_read_monsel_reg(msc, CFG_CSU_CTL); + *flt_val = mpam_read_monsel_reg(msc, CFG_CSU_FLT); + return; + case mpam_feat_msmon_mbwu: + *ctl_val = mpam_read_monsel_reg(msc, CFG_MBWU_CTL); + *flt_val = mpam_read_monsel_reg(msc, CFG_MBWU_FLT); + return; + 
default: + return; + } +} + +/* Remove values set by the hardware to prevent apparent mismatches. */ +static void clean_msmon_ctl_val(u32 *cur_ctl) +{ + *cur_ctl &= ~MSMON_CFG_x_CTL_OFLOW_STATUS; +} + +static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, + u32 flt_val) +{ + struct mpam_msc *msc = m->ris->vmsc->msc; + + /* + * Write the ctl_val with the enable bit cleared, reset the counter, + * then enable counter. + */ + switch (m->type) { + case mpam_feat_msmon_csu: + mpam_write_monsel_reg(msc, CFG_CSU_FLT, flt_val); + mpam_write_monsel_reg(msc, CFG_CSU_CTL, ctl_val); + mpam_write_monsel_reg(msc, CSU, 0); + mpam_write_monsel_reg(msc, CFG_CSU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); + break; + case mpam_feat_msmon_mbwu: + mpam_write_monsel_reg(msc, CFG_MBWU_FLT, flt_val); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val); + mpam_write_monsel_reg(msc, MBWU, 0); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); + break; + default: + return; + } +} + +/* Call with MSC lock held */ +static void __ris_msmon_read(void *arg) +{ + u64 now; + bool nrdy = false; + struct mon_read *m = arg; + struct mon_cfg *ctx = m->ctx; + struct mpam_msc_ris *ris = m->ris; + struct mpam_props *rprops = &ris->props; + struct mpam_msc *msc = m->ris->vmsc->msc; + u32 mon_sel, ctl_val, flt_val, cur_ctl, cur_flt; + + if (!mpam_mon_sel_lock(msc)) { + m->err = -EIO; + return; + } + mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, ctx->mon) | + FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); + mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); + + /* + * Read the existing configuration to avoid re-writing the same values. + * This saves waiting for 'nrdy' on subsequent reads. 
+ */ + read_msmon_ctl_flt_vals(m, &cur_ctl, &cur_flt); + clean_msmon_ctl_val(&cur_ctl); + gen_msmon_ctl_flt_vals(m, &ctl_val, &flt_val); + if (cur_flt != flt_val || cur_ctl != (ctl_val | MSMON_CFG_x_CTL_EN)) + write_msmon_ctl_flt_vals(m, ctl_val, flt_val); + + switch (m->type) { + case mpam_feat_msmon_csu: + now = mpam_read_monsel_reg(msc, CSU); + if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) + nrdy = now & MSMON___NRDY; + break; + case mpam_feat_msmon_mbwu: + now = mpam_read_monsel_reg(msc, MBWU); + if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) + nrdy = now & MSMON___NRDY; + break; + default: + m->err = -EINVAL; + break; + } + mpam_mon_sel_unlock(msc); + + if (nrdy) { + m->err = -EBUSY; + return; + } + + now = FIELD_GET(MSMON___VALUE, now); + *m->val += now; +} + +static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) +{ + int err; + struct mpam_vmsc *vmsc; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_msc *msc = vmsc->msc; + struct mpam_msc_ris *ris; + + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + arg->ris = ris; + + err = smp_call_function_any(&msc->accessibility, + __ris_msmon_read, arg, + true); + if (!err && arg->err) + err = arg->err; + if (err) + return err; + } + } + + return 0; +} + +int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, + enum mpam_device_features type, u64 *val) +{ + int err; + struct mon_read arg; + u64 wait_jiffies = 0; + struct mpam_props *cprops = &comp->class->props; + + might_sleep(); + + if (!mpam_is_enabled()) + return -EIO; + + if (!mpam_has_feature(type, cprops)) + return -EOPNOTSUPP; + + arg = (struct mon_read) { + .ctx = ctx, + .type = type, + .val = val, + }; + *val = 0; + + err = _msmon_read(comp, &arg); + if (err == -EBUSY && comp->class->nrdy_usec) + wait_jiffies = usecs_to_jiffies(comp->class->nrdy_usec); + + while 
(wait_jiffies) + wait_jiffies = schedule_timeout_uninterruptible(wait_jiffies); + + if (err == -EBUSY) { + arg = (struct mon_read) { + .ctx = ctx, + .type = type, + .val = val, + }; + *val = 0; + + err = _msmon_read(comp, &arg); + } + + return err; +} + static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) { u32 num_words, msb; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 96a02ea955839..0c84e945c8914 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -183,6 +183,22 @@ struct mpam_props { #define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) #define mpam_clear_feature(_feat, x) clear_bit(_feat, (x)->features) +/* The values for MSMON_CFG_MBWU_FLT.RWBW */ +enum mon_filter_options { + COUNT_BOTH = 0, + COUNT_WRITE = 1, + COUNT_READ = 2, +}; + +struct mon_cfg { + u16 mon; + u8 pmg; + bool match_pmg; + bool csu_exclude_clean; + u32 partid; + enum mon_filter_options opts; +}; + struct mpam_class { /* mpam_components in this class */ struct list_head components; @@ -325,6 +341,9 @@ void mpam_disable(struct work_struct *work); int mpam_apply_config(struct mpam_component *comp, u16 partid, struct mpam_config *cfg); +int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, + enum mpam_device_features, u64 *val); + int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); From 8ab0b1acbe4542d85794bac0584fac7a65f89236 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 19 Sep 2025 12:03:02 +0100 Subject: [PATCH 092/247] NVIDIA: SAUCE: fixup for _msmon_read, reported by Zeng BugLink: https://bugs.launchpad.net/bugs/2122432 (cherry picked from commit 107021b4f39c086b64a0d7e7142cea0c6c6bdf7a https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index e9778a204d94a..0a744b8df5080 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1113,7 +1113,7 @@ static void __ris_msmon_read(void *arg) static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) { - int err; + int err, any_err = 0; struct mpam_vmsc *vmsc; guard(srcu)(&mpam_srcu); @@ -1131,12 +1131,18 @@ static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) true); if (!err && arg->err) err = arg->err; + + /* + * Save one error to be returned to the caller, but + * keep reading counters so that get reprogrammed. On + * platforms with NRDY this lets us wait once. + */ if (err) - return err; + any_err = err; } } - return 0; + return any_err; } int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, From 4c0a422e022889014d29f5fafad8efcdb116607a Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 8 Sep 2021 12:23:40 +0100 Subject: [PATCH 093/247] NVIDIA: SAUCE: arm_mpam: Track bandwidth counter state for overflow and power management BugLink: https://bugs.launchpad.net/bugs/2122432 Bandwidth counters need to run continuously to correctly reflect the bandwidth. The value read may be lower than the previous value read in the case of overflow and when the hardware is reset due to CPU hotplug. Add struct mbwu_state to track the bandwidth counter to allow overflow and power management to be handled. Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit d2737c47cbdafc47494316a58c43c2f0c5d5bf5b https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 146 +++++++++++++++++++++++++++++++- drivers/resctrl/mpam_internal.h | 23 +++++ 2 files changed, 167 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 0a744b8df5080..180be23ebf262 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1031,6 +1031,7 @@ static void clean_msmon_ctl_val(u32 *cur_ctl) static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, u32 flt_val) { + struct msmon_mbwu_state *mbwu_state; struct mpam_msc *msc = m->ris->vmsc->msc; /* @@ -1049,20 +1050,31 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val); mpam_write_monsel_reg(msc, MBWU, 0); mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); + + mbwu_state = &m->ris->mbwu_state[m->ctx->mon]; + mbwu_state->prev_val = 0; + break; default: return; } } +static u64 mpam_msmon_overflow_val(struct mpam_msc_ris *ris) +{ + /* TODO: scaling, and long counters */ + return GENMASK_ULL(30, 0); +} + /* Call with MSC lock held */ static void __ris_msmon_read(void *arg) { - u64 now; bool nrdy = false; struct mon_read *m = arg; + u64 now, overflow_val = 0; struct mon_cfg *ctx = m->ctx; struct mpam_msc_ris *ris = m->ris; + struct msmon_mbwu_state *mbwu_state; struct mpam_props *rprops = &ris->props; struct mpam_msc *msc = m->ris->vmsc->msc; u32 mon_sel, ctl_val, flt_val, cur_ctl, cur_flt; @@ -1090,11 +1102,28 @@ static void __ris_msmon_read(void *arg) now = mpam_read_monsel_reg(msc, CSU); if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) nrdy = now & MSMON___NRDY; + now = FIELD_GET(MSMON___VALUE, now); break; case mpam_feat_msmon_mbwu: now = mpam_read_monsel_reg(msc, MBWU); if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) nrdy = now & MSMON___NRDY; + now = FIELD_GET(MSMON___VALUE, now); + + if (nrdy) + break; + + mbwu_state = 
&ris->mbwu_state[ctx->mon]; + + /* Add any pre-overflow value to the mbwu_state->val */ + if (mbwu_state->prev_val > now) + overflow_val = mpam_msmon_overflow_val(ris) - mbwu_state->prev_val; + + mbwu_state->prev_val = now; + mbwu_state->correction += overflow_val; + + /* Include bandwidth consumed before the last hardware reset */ + now += mbwu_state->correction; break; default: m->err = -EINVAL; @@ -1107,7 +1136,6 @@ static void __ris_msmon_read(void *arg) return; } - now = FIELD_GET(MSMON___VALUE, now); *m->val += now; } @@ -1326,6 +1354,67 @@ static int mpam_reprogram_ris(void *_arg) return 0; } +/* Call with MSC lock held */ +static int mpam_restore_mbwu_state(void *_ris) +{ + int i; + struct mon_read mwbu_arg; + struct mpam_msc_ris *ris = _ris; + + for (i = 0; i < ris->props.num_mbwu_mon; i++) { + if (ris->mbwu_state[i].enabled) { + mwbu_arg.ris = ris; + mwbu_arg.ctx = &ris->mbwu_state[i].cfg; + mwbu_arg.type = mpam_feat_msmon_mbwu; + + __ris_msmon_read(&mwbu_arg); + } + } + + return 0; +} + +/* Call with MSC lock and held */ +static int mpam_save_mbwu_state(void *arg) +{ + int i; + u64 val; + struct mon_cfg *cfg; + u32 cur_flt, cur_ctl, mon_sel; + struct mpam_msc_ris *ris = arg; + struct msmon_mbwu_state *mbwu_state; + struct mpam_msc *msc = ris->vmsc->msc; + + for (i = 0; i < ris->props.num_mbwu_mon; i++) { + mbwu_state = &ris->mbwu_state[i]; + cfg = &mbwu_state->cfg; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + return -EIO; + + mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, i) | + FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); + mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); + + cur_flt = mpam_read_monsel_reg(msc, CFG_MBWU_FLT); + cur_ctl = mpam_read_monsel_reg(msc, CFG_MBWU_CTL); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, 0); + + val = mpam_read_monsel_reg(msc, MBWU); + mpam_write_monsel_reg(msc, MBWU, 0); + + cfg->mon = i; + cfg->pmg = FIELD_GET(MSMON_CFG_x_FLT_PMG, cur_flt); + cfg->match_pmg = FIELD_GET(MSMON_CFG_x_CTL_MATCH_PMG, cur_ctl); + 
cfg->partid = FIELD_GET(MSMON_CFG_x_FLT_PARTID, cur_flt); + mbwu_state->correction += val; + mbwu_state->enabled = FIELD_GET(MSMON_CFG_x_CTL_EN, cur_ctl); + mpam_mon_sel_unlock(msc); + } + + return 0; +} + static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) { *reset_cfg = (struct mpam_config) { @@ -1398,6 +1487,9 @@ static void mpam_reset_msc(struct mpam_msc *msc, bool online) * for non-zero partid may be lost while the CPUs are offline. */ ris->in_reset_state = online; + + if (mpam_is_enabled() && !online) + mpam_touch_msc(msc, &mpam_save_mbwu_state, ris); } } @@ -1431,6 +1523,9 @@ static void mpam_reprogram_msc(struct mpam_msc *msc) mpam_reprogram_ris_partid(ris, partid, cfg); } ris->in_reset_state = reset; + + if (mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) + mpam_touch_msc(msc, &mpam_restore_mbwu_state, ris); } } @@ -2306,7 +2401,22 @@ static void mpam_unregister_irqs(void) static void __destroy_component_cfg(struct mpam_component *comp) { + struct mpam_msc *msc; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + + lockdep_assert_held(&mpam_list_lock); + add_to_garbage(comp->cfg); + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + msc = vmsc->msc; + + if (mpam_mon_sel_lock(msc)) { + list_for_each_entry(ris, &vmsc->ris, vmsc_list) + add_to_garbage(ris->mbwu_state); + mpam_mon_sel_unlock(msc); + } + } } static void mpam_reset_component_cfg(struct mpam_component *comp) @@ -2324,6 +2434,8 @@ static void mpam_reset_component_cfg(struct mpam_component *comp) static int __allocate_component_cfg(struct mpam_component *comp) { + struct mpam_vmsc *vmsc; + mpam_assert_partid_sizes_fixed(); if (comp->cfg) @@ -2341,6 +2453,36 @@ static int __allocate_component_cfg(struct mpam_component *comp) mpam_reset_component_cfg(comp); + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + struct mpam_msc *msc; + struct mpam_msc_ris *ris; + struct msmon_mbwu_state *mbwu_state; + + if (!vmsc->props.num_mbwu_mon) + continue; + + msc = vmsc->msc; + 
list_for_each_entry(ris, &vmsc->ris, vmsc_list) { + if (!ris->props.num_mbwu_mon) + continue; + + mbwu_state = kcalloc(ris->props.num_mbwu_mon, + sizeof(*ris->mbwu_state), + GFP_KERNEL); + if (!mbwu_state) { + __destroy_component_cfg(comp); + return -ENOMEM; + } + + init_garbage(&mbwu_state[0].garbage); + + if (mpam_mon_sel_lock(msc)) { + ris->mbwu_state = mbwu_state; + mpam_mon_sel_unlock(msc); + } + } + } + return 0; } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 0c84e945c8914..28c475d18d869 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -199,6 +199,26 @@ struct mon_cfg { enum mon_filter_options opts; }; +/* + * Changes to enabled and cfg are protected by the msc->lock. + * Changes to prev_val and correction are protected by the msc's mon_sel_lock. + */ +struct msmon_mbwu_state { + bool enabled; + struct mon_cfg cfg; + + /* The value last read from the hardware. Used to detect overflow. */ + u64 prev_val; + + /* + * The value to add to the new reading to account for power management, + * and shifts to trigger the overflow interrupt. + */ + u64 correction; + + struct mpam_garbage garbage; +}; + struct mpam_class { /* mpam_components in this class */ struct list_head components; @@ -291,6 +311,9 @@ struct mpam_msc_ris { /* parent: */ struct mpam_vmsc *vmsc; + /* msmon mbwu configuration is preserved over reset */ + struct msmon_mbwu_state *mbwu_state; + struct mpam_garbage garbage; }; From 37f801dc11562f2be67870fdbc5388541aff88a0 Mon Sep 17 00:00:00 2001 From: Rohit Mathew Date: Tue, 7 Feb 2023 19:14:17 +0000 Subject: [PATCH 094/247] NVIDIA: SAUCE: arm_mpam: Probe for long/lwd mbwu counters BugLink: https://bugs.launchpad.net/bugs/2122432 mpam v0.1 and versions above v1.0 support optional long counter for memory bandwidth monitoring. The MPAMF_MBWUMON_IDR register has fields indicating support for long counters. Probe these feature bits. 
The mpam_feat_msmon_mbwu feature is used to indicate that bandwidth monitors are supported, instead of muddling this with which size of bandwidth monitors, add an explicit 31 bit counter feature. [ morse: Added 31bit counter feature to simplify later logic ] Reviewed-by: Ben Horgan Tested-by: Fenghua Yu Signed-off-by: Rohit Mathew Signed-off-by: James Morse (cherry picked from commit f882fb5d0f37e8a8d13bd6fac173ee1fb6f9d7e5 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 13 +++++++++++-- drivers/resctrl/mpam_internal.h | 3 +++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 180be23ebf262..bb50ec2763bc1 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -853,16 +853,25 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) dev_err_once(dev, "Counters are not usable because not-ready timeout was not provided by firmware."); } if (FIELD_GET(MPAMF_MSMON_IDR_MSMON_MBWU, msmon_features)) { - bool hw_managed; + bool has_long, hw_managed; u32 mbwumon_idr = mpam_read_partsel_reg(msc, MBWUMON_IDR); props->num_mbwu_mon = FIELD_GET(MPAMF_MBWUMON_IDR_NUM_MON, mbwumon_idr); - if (props->num_mbwu_mon) + if (props->num_mbwu_mon) { mpam_set_feature(mpam_feat_msmon_mbwu, props); + mpam_set_feature(mpam_feat_msmon_mbwu_31counter, props); + } if (FIELD_GET(MPAMF_MBWUMON_IDR_HAS_RWBW, mbwumon_idr)) mpam_set_feature(mpam_feat_msmon_mbwu_rwbw, props); + has_long = FIELD_GET(MPAMF_MBWUMON_IDR_HAS_LONG, mbwumon_idr); + if (props->num_mbwu_mon && has_long) { + mpam_set_feature(mpam_feat_msmon_mbwu_44counter, props); + if (FIELD_GET(MPAMF_MBWUMON_IDR_LWD, mbwumon_idr)) + mpam_set_feature(mpam_feat_msmon_mbwu_63counter, props); + } + /* Is NRDY hardware managed? 
*/ hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); if (hw_managed) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 28c475d18d869..ff38b4bbfc2b5 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -158,6 +158,9 @@ enum mpam_device_features { mpam_feat_msmon_csu_xcl, mpam_feat_msmon_csu_hw_nrdy, mpam_feat_msmon_mbwu, + mpam_feat_msmon_mbwu_31counter, + mpam_feat_msmon_mbwu_44counter, + mpam_feat_msmon_mbwu_63counter, mpam_feat_msmon_mbwu_capture, mpam_feat_msmon_mbwu_rwbw, mpam_feat_msmon_mbwu_hw_nrdy, From a2e165656ccac3a30a0a1cb5278956c29c96ce2f Mon Sep 17 00:00:00 2001 From: Rohit Mathew Date: Mon, 20 Feb 2023 16:06:39 +0000 Subject: [PATCH 095/247] NVIDIA: SAUCE: arm_mpam: Use long MBWU counters if supported BugLink: https://bugs.launchpad.net/bugs/2122432 Now that the larger counter sizes are probed, make use of them. Callers of mpam_msmon_read() may not know (or care!) about the different counter sizes. Allow them to specify mpam_feat_msmon_mbwu and have the driver pick the counter to use. Only 32bit accesses to the MSC are required to be supported by the spec, but these registers are 64bits. The lower half may overflow into the higher half between two 32bit reads. To avoid this, use a helper that reads the top half multiple times to check for overflow. [morse: merged multiple patches from Rohit, added explicit counter selection ] Reviewed-by: Ben Horgan Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Signed-off-by: Rohit Mathew Signed-off-by: James Morse (cherry picked from commit 9afed066bb775f9a7c5bac8d4f382b261bc20ca9 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 134 ++++++++++++++++++++++++++++----- 1 file changed, 116 insertions(+), 18 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index bb50ec2763bc1..dbb17350ecc27 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -973,6 +973,48 @@ struct mon_read { int err; }; +static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris) +{ + return (mpam_has_feature(mpam_feat_msmon_mbwu_63counter, &ris->props) || + mpam_has_feature(mpam_feat_msmon_mbwu_44counter, &ris->props)); +} + +static u64 mpam_msc_read_mbwu_l(struct mpam_msc *msc) +{ + int retry = 3; + u32 mbwu_l_low; + u64 mbwu_l_high1, mbwu_l_high2; + + mpam_mon_sel_lock_held(msc); + + WARN_ON_ONCE((MSMON_MBWU_L + sizeof(u64)) > msc->mapped_hwpage_sz); + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + mbwu_l_high2 = __mpam_read_reg(msc, MSMON_MBWU_L + 4); + do { + mbwu_l_high1 = mbwu_l_high2; + mbwu_l_low = __mpam_read_reg(msc, MSMON_MBWU_L); + mbwu_l_high2 = __mpam_read_reg(msc, MSMON_MBWU_L + 4); + + retry--; + } while (mbwu_l_high1 != mbwu_l_high2 && retry > 0); + + if (mbwu_l_high1 == mbwu_l_high2) + return (mbwu_l_high1 << 32) | mbwu_l_low; + return MSMON___NRDY_L; +} + +static void mpam_msc_zero_mbwu_l(struct mpam_msc *msc) +{ + mpam_mon_sel_lock_held(msc); + + WARN_ON_ONCE((MSMON_MBWU_L + sizeof(u64)) > msc->mapped_hwpage_sz); + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + __mpam_write_reg(msc, MSMON_MBWU_L, 0); + __mpam_write_reg(msc, MSMON_MBWU_L + 4, 0); +} + static void gen_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, u32 *flt_val) { @@ -1000,7 +1042,9 @@ static void gen_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, ctx->csu_exclude_clean); break; - case mpam_feat_msmon_mbwu: + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: 
*ctl_val |= MSMON_CFG_MBWU_CTL_TYPE_MBWU; if (mpam_has_feature(mpam_feat_msmon_mbwu_rwbw, &m->ris->props)) @@ -1022,7 +1066,9 @@ static void read_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, *ctl_val = mpam_read_monsel_reg(msc, CFG_CSU_CTL); *flt_val = mpam_read_monsel_reg(msc, CFG_CSU_FLT); return; - case mpam_feat_msmon_mbwu: + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: *ctl_val = mpam_read_monsel_reg(msc, CFG_MBWU_CTL); *flt_val = mpam_read_monsel_reg(msc, CFG_MBWU_FLT); return; @@ -1035,6 +1081,9 @@ static void read_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, static void clean_msmon_ctl_val(u32 *cur_ctl) { *cur_ctl &= ~MSMON_CFG_x_CTL_OFLOW_STATUS; + + if (FIELD_GET(MSMON_CFG_x_CTL_TYPE, *cur_ctl) == MSMON_CFG_MBWU_CTL_TYPE_MBWU) + *cur_ctl &= ~MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L; } static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, @@ -1054,10 +1103,15 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, mpam_write_monsel_reg(msc, CSU, 0); mpam_write_monsel_reg(msc, CFG_CSU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); break; - case mpam_feat_msmon_mbwu: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + mpam_msc_zero_mbwu_l(m->ris->vmsc->msc); + fallthrough; + case mpam_feat_msmon_mbwu_31counter: mpam_write_monsel_reg(msc, CFG_MBWU_FLT, flt_val); mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val); mpam_write_monsel_reg(msc, MBWU, 0); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); mbwu_state = &m->ris->mbwu_state[m->ctx->mon]; @@ -1069,10 +1123,19 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, } } -static u64 mpam_msmon_overflow_val(struct mpam_msc_ris *ris) +static u64 mpam_msmon_overflow_val(enum mpam_device_features type) { - /* TODO: scaling, and long counters */ - return GENMASK_ULL(30, 0); + /* TODO: implement scaling counters */ + switch (type) { + case 
mpam_feat_msmon_mbwu_63counter: + return GENMASK_ULL(62, 0); + case mpam_feat_msmon_mbwu_44counter: + return GENMASK_ULL(43, 0); + case mpam_feat_msmon_mbwu_31counter: + return GENMASK_ULL(30, 0); + default: + return 0; + } } /* Call with MSC lock held */ @@ -1113,11 +1176,24 @@ static void __ris_msmon_read(void *arg) nrdy = now & MSMON___NRDY; now = FIELD_GET(MSMON___VALUE, now); break; - case mpam_feat_msmon_mbwu: - now = mpam_read_monsel_reg(msc, MBWU); - if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) - nrdy = now & MSMON___NRDY; - now = FIELD_GET(MSMON___VALUE, now); + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + if (m->type != mpam_feat_msmon_mbwu_31counter) { + now = mpam_msc_read_mbwu_l(msc); + if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) + nrdy = now & MSMON___NRDY_L; + + if (m->type == mpam_feat_msmon_mbwu_63counter) + now = FIELD_GET(MSMON___LWD_VALUE, now); + else + now = FIELD_GET(MSMON___L_VALUE, now); + } else { + now = mpam_read_monsel_reg(msc, MBWU); + if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) + nrdy = now & MSMON___NRDY; + now = FIELD_GET(MSMON___VALUE, now); + } if (nrdy) break; @@ -1126,7 +1202,7 @@ static void __ris_msmon_read(void *arg) /* Add any pre-overflow value to the mbwu_state->val */ if (mbwu_state->prev_val > now) - overflow_val = mpam_msmon_overflow_val(ris) - mbwu_state->prev_val; + overflow_val = mpam_msmon_overflow_val(m->type) - mbwu_state->prev_val; mbwu_state->prev_val = now; mbwu_state->correction += overflow_val; @@ -1182,13 +1258,26 @@ static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) return any_err; } +static enum mpam_device_features mpam_msmon_choose_counter(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (mpam_has_feature(mpam_feat_msmon_mbwu_44counter, cprops)) + return mpam_feat_msmon_mbwu_44counter; + if 
(mpam_has_feature(mpam_feat_msmon_mbwu_63counter, cprops)) + return mpam_feat_msmon_mbwu_63counter; + + return mpam_feat_msmon_mbwu_31counter; +} + int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, enum mpam_device_features type, u64 *val) { int err; struct mon_read arg; u64 wait_jiffies = 0; - struct mpam_props *cprops = &comp->class->props; + struct mpam_class *class = comp->class; + struct mpam_props *cprops = &class->props; might_sleep(); @@ -1205,9 +1294,12 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, }; *val = 0; + if (type == mpam_feat_msmon_mbwu) + type = mpam_msmon_choose_counter(class); + err = _msmon_read(comp, &arg); - if (err == -EBUSY && comp->class->nrdy_usec) - wait_jiffies = usecs_to_jiffies(comp->class->nrdy_usec); + if (err == -EBUSY && class->nrdy_usec) + wait_jiffies = usecs_to_jiffies(class->nrdy_usec); while (wait_jiffies) wait_jiffies = schedule_timeout_uninterruptible(wait_jiffies); @@ -1369,12 +1461,13 @@ static int mpam_restore_mbwu_state(void *_ris) int i; struct mon_read mwbu_arg; struct mpam_msc_ris *ris = _ris; + struct mpam_class *class = ris->vmsc->comp->class; for (i = 0; i < ris->props.num_mbwu_mon; i++) { if (ris->mbwu_state[i].enabled) { mwbu_arg.ris = ris; mwbu_arg.ctx = &ris->mbwu_state[i].cfg; - mwbu_arg.type = mpam_feat_msmon_mbwu; + mwbu_arg.type = mpam_msmon_choose_counter(class); __ris_msmon_read(&mwbu_arg); } @@ -1409,8 +1502,13 @@ static int mpam_save_mbwu_state(void *arg) cur_ctl = mpam_read_monsel_reg(msc, CFG_MBWU_CTL); mpam_write_monsel_reg(msc, CFG_MBWU_CTL, 0); - val = mpam_read_monsel_reg(msc, MBWU); - mpam_write_monsel_reg(msc, MBWU, 0); + if (mpam_ris_has_mbwu_long_counter(ris)) { + val = mpam_msc_read_mbwu_l(msc); + mpam_msc_zero_mbwu_l(msc); + } else { + val = mpam_read_monsel_reg(msc, MBWU); + mpam_write_monsel_reg(msc, MBWU, 0); + } cfg->mon = i; cfg->pmg = FIELD_GET(MSMON_CFG_x_FLT_PMG, cur_flt); From 68cb9f94fd06bd1f5d899d2bd5da007ed4d50eb9 Mon Sep 17 
00:00:00 2001 From: James Morse Date: Fri, 10 Sep 2021 12:00:01 +0100 Subject: [PATCH 096/247] NVIDIA: SAUCE: arm_mpam: Add helper to reset saved mbwu state BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl expects to reset the bandwidth counters when the filesystem is mounted. To allow this, add a helper that clears the saved mbwu state. Instead of cross calling to each CPU that can access the component MSC to write to the counter, set a flag that causes it to be zero'd on the the next read. This is easily done by forcing a configuration update. Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 6731e078c20836769f8b4bd9a2ebcac440586039 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 46 ++++++++++++++++++++++++++++++++- drivers/resctrl/mpam_internal.h | 7 ++++- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index dbb17350ecc27..fca76ddf014ef 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1142,9 +1142,11 @@ static u64 mpam_msmon_overflow_val(enum mpam_device_features type) static void __ris_msmon_read(void *arg) { bool nrdy = false; + bool config_mismatch; struct mon_read *m = arg; u64 now, overflow_val = 0; struct mon_cfg *ctx = m->ctx; + bool reset_on_next_read = false; struct mpam_msc_ris *ris = m->ris; struct msmon_mbwu_state *mbwu_state; struct mpam_props *rprops = &ris->props; @@ -1159,6 +1161,14 @@ static void __ris_msmon_read(void *arg) FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); + if (m->type == mpam_feat_msmon_mbwu) { + mbwu_state = &ris->mbwu_state[ctx->mon]; + if (mbwu_state) { + reset_on_next_read = mbwu_state->reset_on_next_read; + mbwu_state->reset_on_next_read = false; + } + } + 
/* * Read the existing configuration to avoid re-writing the same values. * This saves waiting for 'nrdy' on subsequent reads. @@ -1166,7 +1176,10 @@ static void __ris_msmon_read(void *arg) read_msmon_ctl_flt_vals(m, &cur_ctl, &cur_flt); clean_msmon_ctl_val(&cur_ctl); gen_msmon_ctl_flt_vals(m, &ctl_val, &flt_val); - if (cur_flt != flt_val || cur_ctl != (ctl_val | MSMON_CFG_x_CTL_EN)) + config_mismatch = cur_flt != flt_val || + cur_ctl != (ctl_val | MSMON_CFG_x_CTL_EN); + + if (config_mismatch || reset_on_next_read) write_msmon_ctl_flt_vals(m, ctl_val, flt_val); switch (m->type) { @@ -1318,6 +1331,37 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, return err; } +void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx) +{ + struct mpam_msc *msc; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + + if (!mpam_is_enabled()) + return; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!mpam_has_feature(mpam_feat_msmon_mbwu, &vmsc->props)) + continue; + + msc = vmsc->msc; + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) + continue; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + continue; + + ris->mbwu_state[ctx->mon].correction = 0; + ris->mbwu_state[ctx->mon].reset_on_next_read = true; + mpam_mon_sel_unlock(msc); + } + } +} + static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) { u32 num_words, msb; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index ff38b4bbfc2b5..6632699ae814b 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -204,10 +204,14 @@ struct mon_cfg { /* * Changes to enabled and cfg are protected by the msc->lock. - * Changes to prev_val and correction are protected by the msc's mon_sel_lock. 
+ * The msc's mon_sel_lock protects: + * - reset_on_next_read + * - prev_val + * - correction */ struct msmon_mbwu_state { bool enabled; + bool reset_on_next_read; struct mon_cfg cfg; /* The value last read from the hardware. Used to detect overflow. */ @@ -369,6 +373,7 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid, int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, enum mpam_device_features, u64 *val); +void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); From edbf82d25f2e6b09f01e188e67fc2ca7e0a7f8a4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 14 Feb 2024 18:04:49 +0000 Subject: [PATCH 097/247] NVIDIA: SAUCE: arm_mpam: Add kunit test for bitmap reset BugLink: https://bugs.launchpad.net/bugs/2122432 The bitmap reset code has been a source of bugs. Add a unit test. This currently has to be built in, as the rest of the driver is builtin. Suggested-by: Jonathan Cameron Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit 8cc7f34b3bc33ca6ccdc1a0e827c3ac34b590fe2 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/Kconfig | 9 ++++ drivers/resctrl/mpam_devices.c | 4 ++ drivers/resctrl/test_mpam_devices.c | 69 +++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 drivers/resctrl/test_mpam_devices.c diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index 58c83b5c8bfdf..a2e9a61304617 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -10,4 +10,13 @@ config ARM64_MPAM_DRIVER_DEBUG help Say yes here to enable debug messages from the MPAM driver. 
+config MPAM_KUNIT_TEST + bool "KUnit tests for MPAM driver " if !KUNIT_ALL_TESTS + depends on KUNIT=y + default KUNIT_ALL_TESTS + help + Enable this option to run tests in the MPAM driver. + + If unsure, say N. + endif diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index fca76ddf014ef..8c1fe791ef7fc 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2940,3 +2940,7 @@ static int __init mpam_msc_driver_init(void) } /* Must occur after arm64_mpam_register_cpus() from arch_initcall() */ subsys_initcall(mpam_msc_driver_init); + +#ifdef CONFIG_MPAM_KUNIT_TEST +#include "test_mpam_devices.c" +#endif diff --git a/drivers/resctrl/test_mpam_devices.c b/drivers/resctrl/test_mpam_devices.c new file mode 100644 index 0000000000000..0cfb41b665c4c --- /dev/null +++ b/drivers/resctrl/test_mpam_devices.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. +/* This file is intended to be included into mpam_devices.c */ + +#include + +static void test_mpam_reset_msc_bitmap(struct kunit *test) +{ + char __iomem *buf = kunit_kzalloc(test, SZ_16K, GFP_KERNEL); + struct mpam_msc fake_msc = {}; + u32 *test_result; + + if (!buf) + return; + + fake_msc.mapped_hwpage = buf; + fake_msc.mapped_hwpage_sz = SZ_16K; + cpumask_copy(&fake_msc.accessibility, cpu_possible_mask); + + /* Satisfy lockdep checks */ + mutex_init(&fake_msc.part_sel_lock); + mutex_lock(&fake_msc.part_sel_lock); + + test_result = (u32 *)(buf + MPAMCFG_CPBM); + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 0); + KUNIT_EXPECT_EQ(test, test_result[0], 0); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 1); + KUNIT_EXPECT_EQ(test, test_result[0], 1); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 16); + KUNIT_EXPECT_EQ(test, test_result[0], 0xffff); 
+ KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 32); + KUNIT_EXPECT_EQ(test, test_result[0], 0xffffffff); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 33); + KUNIT_EXPECT_EQ(test, test_result[0], 0xffffffff); + KUNIT_EXPECT_EQ(test, test_result[1], 1); + test_result[0] = 0; + test_result[1] = 0; + + mutex_unlock(&fake_msc.part_sel_lock); +} + +static struct kunit_case mpam_devices_test_cases[] = { + KUNIT_CASE(test_mpam_reset_msc_bitmap), + {} +}; + +static struct kunit_suite mpam_devices_test_suite = { + .name = "mpam_devices_test_suite", + .test_cases = mpam_devices_test_cases, +}; + +kunit_test_suites(&mpam_devices_test_suite); From 827c2fb0787afdcbe6878d07279c56dc1603e13f Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 27 Aug 2024 13:41:03 +0100 Subject: [PATCH 098/247] NVIDIA: SAUCE: arm_mpam: Add kunit tests for props_mismatch() BugLink: https://bugs.launchpad.net/bugs/2122432 When features are mismatched between MSC the way features are combined to the class determines whether resctrl can support this SoC. Add some tests to illustrate the sort of thing that is expected to work, and those that must be removed. Reviewed-by: Ben Horgan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Signed-off-by: James Morse (cherry picked from commit c31ec1bb514c90ed5610ed67de44257a8e9a5748 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_internal.h | 14 +- drivers/resctrl/test_mpam_devices.c | 320 ++++++++++++++++++++++++++++ 2 files changed, 333 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 6632699ae814b..4f25681b56abd 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -25,6 +25,12 @@ struct platform_device; DECLARE_STATIC_KEY_FALSE(mpam_enabled); +#ifdef CONFIG_MPAM_KUNIT_TEST +#define PACKED_FOR_KUNIT __packed +#else +#define PACKED_FOR_KUNIT +#endif + static inline bool mpam_is_enabled(void) { return static_branch_likely(&mpam_enabled); @@ -180,7 +186,13 @@ struct mpam_props { u16 dspri_wd; u16 num_csu_mon; u16 num_mbwu_mon; -}; + +/* + * Kunit tests use memset() to set up feature combinations that should be + * removed, and will false-positive if the compiler introduces padding that + * isn't cleared during sanitisation. + */ +} PACKED_FOR_KUNIT; #define mpam_has_feature(_feat, x) test_bit(_feat, (x)->features) #define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) diff --git a/drivers/resctrl/test_mpam_devices.c b/drivers/resctrl/test_mpam_devices.c index 0cfb41b665c4c..3e8d564a0c647 100644 --- a/drivers/resctrl/test_mpam_devices.c +++ b/drivers/resctrl/test_mpam_devices.c @@ -4,6 +4,324 @@ #include +/* + * This test catches fields that aren't being sanitised - but can't tell you + * which one... 
+ */ +static void test__props_mismatch(struct kunit *test) +{ + struct mpam_props parent = { 0 }; + struct mpam_props child; + + memset(&child, 0xff, sizeof(child)); + __props_mismatch(&parent, &child, false); + + memset(&child, 0, sizeof(child)); + KUNIT_EXPECT_EQ(test, memcmp(&parent, &child, sizeof(child)), 0); + + memset(&child, 0xff, sizeof(child)); + __props_mismatch(&parent, &child, true); + + KUNIT_EXPECT_EQ(test, memcmp(&parent, &child, sizeof(child)), 0); +} + +static struct list_head fake_classes_list; +static struct mpam_class fake_class = { 0 }; +static struct mpam_component fake_comp1 = { 0 }; +static struct mpam_component fake_comp2 = { 0 }; +static struct mpam_vmsc fake_vmsc1 = { 0 }; +static struct mpam_vmsc fake_vmsc2 = { 0 }; +static struct mpam_msc fake_msc1 = { 0 }; +static struct mpam_msc fake_msc2 = { 0 }; +static struct mpam_msc_ris fake_ris1 = { 0 }; +static struct mpam_msc_ris fake_ris2 = { 0 }; +static struct platform_device fake_pdev = { 0 }; + +static inline void reset_fake_hierarchy(void) +{ + INIT_LIST_HEAD(&fake_classes_list); + + memset(&fake_class, 0, sizeof(fake_class)); + fake_class.level = 3; + fake_class.type = MPAM_CLASS_CACHE; + INIT_LIST_HEAD_RCU(&fake_class.components); + INIT_LIST_HEAD(&fake_class.classes_list); + + memset(&fake_comp1, 0, sizeof(fake_comp1)); + memset(&fake_comp2, 0, sizeof(fake_comp2)); + fake_comp1.comp_id = 1; + fake_comp2.comp_id = 2; + INIT_LIST_HEAD(&fake_comp1.vmsc); + INIT_LIST_HEAD(&fake_comp1.class_list); + INIT_LIST_HEAD(&fake_comp2.vmsc); + INIT_LIST_HEAD(&fake_comp2.class_list); + + memset(&fake_vmsc1, 0, sizeof(fake_vmsc1)); + memset(&fake_vmsc2, 0, sizeof(fake_vmsc2)); + INIT_LIST_HEAD(&fake_vmsc1.ris); + INIT_LIST_HEAD(&fake_vmsc1.comp_list); + fake_vmsc1.msc = &fake_msc1; + INIT_LIST_HEAD(&fake_vmsc2.ris); + INIT_LIST_HEAD(&fake_vmsc2.comp_list); + fake_vmsc2.msc = &fake_msc2; + + memset(&fake_ris1, 0, sizeof(fake_ris1)); + memset(&fake_ris2, 0, sizeof(fake_ris2)); + fake_ris1.ris_idx = 1; 
+ INIT_LIST_HEAD(&fake_ris1.msc_list); + fake_ris2.ris_idx = 2; + INIT_LIST_HEAD(&fake_ris2.msc_list); + + fake_msc1.pdev = &fake_pdev; + fake_msc2.pdev = &fake_pdev; + + list_add(&fake_class.classes_list, &fake_classes_list); +} + +static void test_mpam_enable_merge_features(struct kunit *test) +{ + reset_fake_hierarchy(); + + mutex_lock(&mpam_list_lock); + + /* One Class+Comp, two RIS in one vMSC with common features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = NULL; + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc1; + list_add(&fake_ris2.vmsc_list, &fake_vmsc1.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cpbm_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + + reset_fake_hierarchy(); + + /* One Class+Comp, two RIS in one vMSC with non-overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = NULL; + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc1; + list_add(&fake_ris2.vmsc_list, &fake_vmsc1.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmin, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cmax_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + /* Multiple RIS within one MSC controlling the same 
resource can be mismatched */ + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_class.props)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_vmsc1.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + KUNIT_EXPECT_EQ(test, fake_vmsc1.props.cmax_wd, 4); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 4); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cpbm_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with non-overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + 
mpam_set_feature(mpam_feat_cmax_cmin, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cmax_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * Multiple RIS in different MSC can't control the same resource, + * mismatched features can not be supported. + */ + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 0); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 0); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with incompatible overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + mpam_set_feature(mpam_feat_mbw_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_mbw_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 5; + fake_ris2.props.cpbm_wd = 3; + fake_ris1.props.mbw_pbm_bits = 5; + fake_ris2.props.mbw_pbm_bits = 3; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * Multiple RIS in different MSC can't control the same resource, + * mismatched features can not be supported. 
+ */ + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_mbw_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 0); + KUNIT_EXPECT_EQ(test, fake_class.props.mbw_pbm_bits, 0); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with overlapping features that need tweaking */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_mbw_min, &fake_ris1.props); + mpam_set_feature(mpam_feat_mbw_min, &fake_ris2.props); + mpam_set_feature(mpam_feat_cmax_cmax, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmax, &fake_ris2.props); + fake_ris1.props.bwa_wd = 5; + fake_ris2.props.bwa_wd = 3; + fake_ris1.props.cmax_wd = 5; + fake_ris2.props.cmax_wd = 3; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * RIS with different control properties need to be sanitised so the + * class has the common set of properties. 
+ */ + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_min, &fake_class.props)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cmax_cmax, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.bwa_wd, 3); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 3); + + reset_fake_hierarchy(); + + /* One Class Two Comp with overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = &fake_class; + list_add(&fake_comp2.class_list, &fake_class.components); + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp2; + list_add(&fake_vmsc2.comp_list, &fake_comp2.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cpbm_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + + reset_fake_hierarchy(); + + /* One Class Two Comp with non-overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = &fake_class; + list_add(&fake_comp2.class_list, &fake_class.components); + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp2; + list_add(&fake_vmsc2.comp_list, &fake_comp2.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmin, &fake_ris2.props); + 
fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cmax_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * Multiple components can't control the same resource, mismatched features can + * not be supported. + */ + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 0); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 0); + + mutex_unlock(&mpam_list_lock); +} + static void test_mpam_reset_msc_bitmap(struct kunit *test) { char __iomem *buf = kunit_kzalloc(test, SZ_16K, GFP_KERNEL); @@ -58,6 +376,8 @@ static void test_mpam_reset_msc_bitmap(struct kunit *test) static struct kunit_case mpam_devices_test_cases[] = { KUNIT_CASE(test_mpam_reset_msc_bitmap), + KUNIT_CASE(test_mpam_enable_merge_features), + KUNIT_CASE(test__props_mismatch), {} }; From 2c5167b9a3f28452b50477dc4d7e76f9a7506f23 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 2 Jul 2018 11:15:31 +0100 Subject: [PATCH 099/247] NVIDIA: SAUCE: arm64: mpam: Context switch the MPAM registers BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM allows traffic in the SoC to be labeled by the OS, these labels are used to apply policy in caches and bandwidth regulators, and to monitor traffic in the SoC. The label is made up of a PARTID and PMG value. The x86 equivalent calls these CLOSID and RMID, but they don't map precisely. MPAM has a CPU system register that is used to hold the PARTID and PMG values that traffic generated by EL0 will use. This can be set per-task by the resctrl file system. (resctrl is the defacto interface for controlling this stuff). Add a helper to switch this. struct task_struct's separate CLOSID and RMID fields are insufficient to implement resctrl using MPAM, as resctrl can change the PARTID (CLOSID) and PMG (sort of like the RMID) separately. 
On x86, the rmid is an independent number, so a race that writes a mismatched closid and rmid into hardware is benign. On arm64, the pmg bits extend the partid. (i.e. partid-5 has a pmg-0 that is not the same as partid-6's pmg-0). In this case, mismatching the values will 'dirty' a pmg value that resctrl believes is clean, and is not tracking with its 'limbo' code. To avoid this, the partid and pmg are always read and written as a pair. Instead of making struct task_struct's closid and rmid fields an endian-unsafe union, add the value to struct thread_info and always use READ_ONCE()/WRITE_ONCE() when accessing this field. Resctrl allows a per-cpu 'default' value to be set, this overrides the values when scheduling a task in the default control-group, which has PARTID 0. The current system register value is kept in a per-cpu variable to avoid writing to the system register if the value isn't going to change. Writes to this register may reset the hardware state for regulating bandwidth. Finally, there is no reason to context switch these registers unless there is a driver changing the values in struct task_struct. Hide the whole thing behind a static key. This also allows the driver to disable MPAM in response to errors reported by hardware. Move the existing static key to belong to the arch code, as in the future the MPAM driver may become a loadable module. All this should depend on whether there is an MPAM driver, hide it behind CONFIG_MPAM. CC: Amit Singh Tomar Signed-off-by: James Morse (cherry picked from commit 448d72b40f439ac2bffd8f9881a1dcb620d495cb https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- arch/arm64/Kconfig | 2 + arch/arm64/include/asm/mpam.h | 62 ++++++++++++++++++++++++++++ arch/arm64/include/asm/thread_info.h | 3 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/mpam.c | 11 +++++ arch/arm64/kernel/process.c | 7 ++++ drivers/resctrl/mpam_devices.c | 2 - drivers/resctrl/mpam_internal.h | 2 + 8 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/include/asm/mpam.h create mode 100644 arch/arm64/kernel/mpam.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 17394c637adbf..34502faac486c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2086,6 +2086,8 @@ config ARM64_MPAM MPAM is exposed to user-space via the resctrl pseudo filesystem. + This option enables the extra context switch code. + endmenu # "ARMv8.4 architectural features" menu "ARMv8.5 architectural features" diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h new file mode 100644 index 0000000000000..9920142619ef4 --- /dev/null +++ b/arch/arm64/include/asm/mpam.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2025 Arm Ltd. */ + +#ifndef __ASM__MPAM_H +#define __ASM__MPAM_H + +#include +#include +#include +#include +#include + +#include +#include +#include + +DECLARE_STATIC_KEY_FALSE(mpam_enabled); +DECLARE_PER_CPU(u64, arm64_mpam_default); +DECLARE_PER_CPU(u64, arm64_mpam_current); + +/* + * The resctrl filesystem writes to the partid/pmg values for threads and CPUs, + * which may race with reads in __mpam_sched_in(). Ensure only one of the old + * or new values are used. Particular care should be taken with the pmg field + * as __mpam_sched_in() may read a partid and pmg that don't match, causing + * this value to be stored with cache allocations, despite being considered + * 'free' by resctrl. + * + * A value in struct thread_info is used instead of struct task_struct as the + * cpu's u64 register format is used, but struct task_struct has two u32'. 
+ */ +static inline u64 mpam_get_regval(struct task_struct *tsk) +{ +#ifdef CONFIG_ARM64_MPAM + return READ_ONCE(task_thread_info(tsk)->mpam_partid_pmg); +#else + return 0; +#endif +} + +static inline void mpam_thread_switch(struct task_struct *tsk) +{ + u64 oldregval; + int cpu = smp_processor_id(); + u64 regval = mpam_get_regval(tsk); + + if (!IS_ENABLED(CONFIG_ARM64_MPAM) || + !static_branch_likely(&mpam_enabled)) + return; + + if (!regval) + regval = READ_ONCE(per_cpu(arm64_mpam_default, cpu)); + + oldregval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + if (oldregval == regval) + return; + + /* Synchronising this write is left until the ERET to EL0 */ + write_sysreg_s(regval, SYS_MPAM0_EL1); + WRITE_ONCE(per_cpu(arm64_mpam_current, cpu), regval); +} +#endif /* __ASM__MPAM_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index f241b8601ebd9..c226dabd50191 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -41,6 +41,9 @@ struct thread_info { #ifdef CONFIG_SHADOW_CALL_STACK void *scs_base; void *scs_sp; +#endif +#ifdef CONFIG_ARM64_MPAM + u64 mpam_partid_pmg; #endif u32 cpu; }; diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 76f32e424065e..15979f3665196 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -67,6 +67,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_ARM64_MPAM) += mpam.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c new file mode 100644 index 0000000000000..4a5a054cb8f1a --- /dev/null +++ b/arch/arm64/kernel/mpam.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Arm Ltd. 
*/ + +#include + +#include +#include + +DEFINE_STATIC_KEY_FALSE(mpam_enabled); +DEFINE_PER_CPU(u64, arm64_mpam_default); +DEFINE_PER_CPU(u64, arm64_mpam_current); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index fba7ca102a8c4..b510c0699313b 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -737,6 +738,12 @@ struct task_struct *__switch_to(struct task_struct *prev, if (prev->thread.sctlr_user != next->thread.sctlr_user) update_sctlr_el1(next->thread.sctlr_user); + /* + * MPAM thread switch happens after the DSB to ensure prev's accesses + * use prev's MPAM settings. + */ + mpam_thread_switch(next); + /* the actual thread switch */ last = cpu_switch_to(prev, next); diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 8c1fe791ef7fc..6739df5ffe7db 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -32,8 +32,6 @@ #include "mpam_internal.h" -DEFINE_STATIC_KEY_FALSE(mpam_enabled); /* This moves to arch code */ - /* * mpam_list_lock protects the SRCU lists when writing. Once the * mpam_enabled key is enabled these lists are read-only, diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 4f25681b56abd..aaf494563e972 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -18,6 +18,8 @@ #include #include +#include + #define MPAM_MSC_MAX_NUM_RIS 16 From 7f0237f8c3220cc0b6e64f675616a616da120a37 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 4 Jul 2025 15:18:06 +0100 Subject: [PATCH 100/247] NVIDIA: SAUCE: arm64: mpam: Re-initialise MPAM regs when CPU comes online BugLink: https://bugs.launchpad.net/bugs/2122432 Now that MPAM0_EL1 is expected to have the correct value, ensure it is reprogrammed based on struct task_struct when a CPU is brought online. 
Signed-off-by: James Morse (cherry picked from commit bf047341ace740f16cc560ccd9ea29b4f98275e0 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/arm64/kernel/cpufeature.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c3ef9b161b812..ae1d1a8cf6ddc 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #include @@ -2527,6 +2528,12 @@ test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) static void cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) { + int cpu = smp_processor_id(); + u64 regval = 0; + + if (IS_ENABLED(CONFIG_MPAM)) + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + /* * Access by the kernel (at EL1) should use the reserved PARTID * which is configured unrestricted. This avoids priority-inversion @@ -2534,6 +2541,8 @@ cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) * been throttled to release the lock. */ write_sysreg_s(0, SYS_MPAM1_EL1); + + write_sysreg_s(regval, SYS_MPAM0_EL1); } static bool From 57ebe6ae3d50048d708f0558304be3ade02f917e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 4 Jul 2025 14:22:30 +0100 Subject: [PATCH 101/247] NVIDIA: SAUCE: arm64: mpam: Advertise the CPUs MPAM limits to the driver BugLink: https://bugs.launchpad.net/bugs/2122432 Requestors need to populate the MPAM fields on the interconnect. For the CPUs these fields are taken from the corresponding MPAMy_ELx register. Each requestor may have a limit on the largest PARTID or PMG value that can be used. The MPAM driver has to determine the system-wide minimum supported PARTID and PMG values. To do this, the driver needs to be told what each requestor's limit is. 
CPUs are special, but this infrastructure is also needed for the SMMU and GIC ITS. Call the helper to tell the MPAM driver what the CPUs can do. The return value can be ignored by the arch code as it runs well before the MPAM driver starts probing. Signed-off-by: James Morse (cherry picked from commit 33c1f50970917ac9f2a8e224d850936374df6173 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/arm64/kernel/mpam.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index 4a5a054cb8f1a..ba85b943c719e 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -3,9 +3,21 @@ #include +#include #include #include DEFINE_STATIC_KEY_FALSE(mpam_enabled); DEFINE_PER_CPU(u64, arm64_mpam_default); DEFINE_PER_CPU(u64, arm64_mpam_current); + +static int __init arm64_mpam_register_cpus(void) +{ + u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr); + u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr); + + return mpam_register_requestor(partid_max, pmg_max); +} +/* Must occur before mpam_msc_driver_init() from subsys_initcall() */ +arch_initcall(arm64_mpam_register_cpus) From d3a34778ed2202384a341cccacc5c59840cef20d Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 8 Mar 2024 15:29:04 +0000 Subject: [PATCH 102/247] NVIDIA: SAUCE: arm64: mpam: Add cpu_pm notifier to restore MPAM sysregs BugLink: https://bugs.launchpad.net/bugs/2122432 The MPAM system registers will be lost if the CPU is reset during PSCI's CPU_SUSPEND. Add a PM notifier to restore them. mpam_thread_switch(current) can't be used as this won't make any changes if the in-memory copy says the register already has the correct value. In reality the system register is UNKNOWN out of reset. 
Signed-off-by: James Morse (cherry picked from commit 460ce285a9bed1e31f4bd0db067e352b975a4a47 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/arm64/kernel/mpam.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index ba85b943c719e..2b594e13c6f68 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -11,12 +12,39 @@ DEFINE_STATIC_KEY_FALSE(mpam_enabled); DEFINE_PER_CPU(u64, arm64_mpam_default); DEFINE_PER_CPU(u64, arm64_mpam_current); +static int mpam_pm_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + u64 regval; + int cpu = smp_processor_id(); + + switch (cmd) { + case CPU_PM_EXIT: + /* + * Don't use mpam_thread_switch() as the system register + * value has changed under our feet. + */ + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + write_sysreg_s(0, SYS_MPAM1_EL1); + write_sysreg_s(regval, SYS_MPAM0_EL1); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block mpam_pm_nb = { + .notifier_call = mpam_pm_notifier, +}; + static int __init arm64_mpam_register_cpus(void) { u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr); u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr); + cpu_pm_register_notifier(&mpam_pm_nb); return mpam_register_requestor(partid_max, pmg_max); } /* Must occur before mpam_msc_driver_init() from subsys_initcall() */ From 0e31bbebac3e7a6e427d8241c0ee19796ddea84d Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 22 May 2025 16:53:44 +0100 Subject: [PATCH 103/247] NVIDIA: SAUCE: arm64: mpam: Add helpers to change a tasks and cpu mpam partid/pmg values BugLink: https://bugs.launchpad.net/bugs/2122432 Care must be taken when 
modifying the PARTID and PMG of a task, as writing these values may race with the task being scheduled in, and reading the modified values. Add helpers to set the task properties, and the CPU default value. These use WRITE_ONCE() that pairs with the READ_ONCE() in mpam_get_regval() to avoid causing torn values. CC: Dave Martin CC: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 84d97a4f895965da93404e0bbdd4009bca6be045 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/arm64/include/asm/mpam.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 9920142619ef4..03a8ad1d2d053 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -5,6 +5,7 @@ #define __ASM__MPAM_H #include +#include #include #include #include @@ -29,6 +30,35 @@ DECLARE_PER_CPU(u64, arm64_mpam_current); * A value in struct thread_info is used instead of struct task_struct as the * cpu's u64 register format is used, but struct task_struct has two u32'. 
*/ +static inline void mpam_set_cpu_defaults(int cpu, u16 partid_d, u16 partid_i, + u8 pmg_d, u8 pmg_i) +{ + u64 default_val; + + default_val = FIELD_PREP(MPAM0_EL1_PARTID_D, partid_d); + default_val |= FIELD_PREP(MPAM0_EL1_PARTID_I, partid_i); + default_val |= FIELD_PREP(MPAM0_EL1_PMG_D, pmg_d); + default_val |= FIELD_PREP(MPAM0_EL1_PMG_I, pmg_i); + + WRITE_ONCE(per_cpu(arm64_mpam_default, cpu), default_val); +} + +static inline void mpam_set_task_partid_pmg(struct task_struct *tsk, + u16 partid_d, u16 partid_i, + u8 pmg_d, u8 pmg_i) +{ +#ifdef CONFIG_ARM64_MPAM + u64 regval; + + regval = FIELD_PREP(MPAM0_EL1_PARTID_D, partid_d); + regval |= FIELD_PREP(MPAM0_EL1_PARTID_I, partid_i); + regval |= FIELD_PREP(MPAM0_EL1_PMG_D, pmg_d); + regval |= FIELD_PREP(MPAM0_EL1_PMG_I, pmg_i); + + WRITE_ONCE(task_thread_info(tsk)->mpam_partid_pmg, regval); +#endif +} + static inline u64 mpam_get_regval(struct task_struct *tsk) { #ifdef CONFIG_ARM64_MPAM From ee97bdf09c726b30f032e8eadef3a1ababe05ffd Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Jul 2021 18:57:05 +0100 Subject: [PATCH 104/247] NVIDIA: SAUCE: cacheinfo: Add helper to find the cache size from cpu+level BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM needs to know the size of a cache associated with a particular CPU. The DT/ACPI agnostic way of doing this is to ask cacheinfo. Add a helper to do this. Signed-off-by: James Morse (cherry picked from commit 1aec3aea578643a06e3c625a42114cc3f7c67f36 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- include/linux/cacheinfo.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 6c8b6a4559312..cfd45a5a46ae4 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -148,6 +148,21 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) return ci ? 
ci->id : -1; } +/** + * get_cpu_cacheinfo_size() - Get the size of the cache. + * @cpu: The cpu that is associated with the cache. + * @level: The level of the cache as seen by @cpu. + * + * cpuhp lock must be held. + * Returns the cache-size on success, or 0 for an error. + */ +static inline unsigned int get_cpu_cacheinfo_size(int cpu, int level) +{ + struct cacheinfo *ci = get_cpu_cacheinfo_level(cpu, level); + + return ci ? ci->size : 0; +} + #if defined(CONFIG_ARM64) || defined(CONFIG_ARM) #define use_arch_cache_info() (true) #else From bc423d07a8146ca71df3940f6f9bccc2a78d35ff Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Jun 2019 17:02:09 +0100 Subject: [PATCH 105/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add boilerplate cpuhp and domain allocation BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl has its own data structures to describe its resources. We can't use these directly as we play tricks with the 'MBA' resource, picking the MPAM controls or monitors that best apply. We may export the same component as both L3 and MBA. Add mpam_resctrl_exports[] as the array of class->resctrl mappings we are exporting, and add the cpuhp hooks that allocated and free the resctrl domain structures. While we're here, plumb in a few other obvious things. CONFIG_ARM_CPU_RESCTRL is used to allow this code to be built even though it can't yet be linked against resctrl. Signed-off-by: James Morse (cherry picked from commit e687c9c43618e95fda5e7d71e1441c907b4de8ef https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/Makefile | 1 + drivers/resctrl/mpam_devices.c | 12 ++ drivers/resctrl/mpam_internal.h | 22 ++ drivers/resctrl/mpam_resctrl.c | 349 ++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 3 + 5 files changed, 387 insertions(+) create mode 100644 drivers/resctrl/mpam_resctrl.c diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile index 898199dcf80d5..40beaf999582c 100644 --- a/drivers/resctrl/Makefile +++ b/drivers/resctrl/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o mpam-y += mpam_devices.o +mpam-$(CONFIG_ARM_CPU_RESCTRL) += mpam_resctrl.o ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 6739df5ffe7db..3c6d58f9a35aa 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1702,6 +1702,9 @@ static int mpam_cpu_online(unsigned int cpu) mpam_reprogram_msc(msc); } + if (mpam_is_enabled()) + mpam_resctrl_online_cpu(cpu); + return 0; } @@ -1758,6 +1761,9 @@ static int mpam_cpu_offline(unsigned int cpu) mpam_reset_msc(msc, false); } + if (mpam_is_enabled()) + mpam_resctrl_offline_cpu(cpu); + return 0; } @@ -2690,6 +2696,12 @@ static void mpam_enable_once(void) mutex_unlock(&mpam_list_lock); cpus_read_unlock(); + if (!err) { + err = mpam_resctrl_setup(); + if (err) + pr_err("Failed to initialise resctrl: %d\n", err); + } + if (err) { mpam_disable_reason = "Failed to enable."; schedule_work(&mpam_broken_work); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index aaf494563e972..60cd776255069 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -338,6 +339,17 @@ struct mpam_msc_ris { struct mpam_garbage garbage; }; +struct mpam_resctrl_dom { + struct mpam_component *ctrl_comp; + struct rdt_ctrl_domain resctrl_ctrl_dom; + struct 
rdt_mon_domain resctrl_mon_dom; +}; + +struct mpam_resctrl_res { + struct mpam_class *class; + struct rdt_resource resctrl_res; +}; + static inline int mpam_alloc_csu_mon(struct mpam_class *class) { struct mpam_props *cprops = &class->props; @@ -392,6 +404,16 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); +#ifdef CONFIG_RESCTRL_FS +int mpam_resctrl_setup(void); +int mpam_resctrl_online_cpu(unsigned int cpu); +int mpam_resctrl_offline_cpu(unsigned int cpu); +#else +static inline int mpam_resctrl_setup(void) { return 0; } +static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; } +static inline int mpam_resctrl_offline_cpu(unsigned int cpu) { return 0; } +#endif /* CONFIG_RESCTRL_FS */ + /* * MPAM MSCs have the following register layout. See: * Arm Memory System Resource Partitioning and Monitoring (MPAM) System diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c new file mode 100644 index 0000000000000..c6b6251d4e423 --- /dev/null +++ b/drivers/resctrl/mpam_resctrl.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mpam_internal.h" + +/* + * The classes we've picked to map to resctrl resources, wrapped + * in with their resctrl structure. + * Class pointer may be NULL. + */ +static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; + +/* The lock for modifying resctrl's domain lists from cpuhp callbacks. 
*/ +static DEFINE_MUTEX(domain_list_lock); + +static bool exposed_alloc_capable; +static bool exposed_mon_capable; + +bool resctrl_arch_alloc_capable(void) +{ + return exposed_alloc_capable; +} + +bool resctrl_arch_mon_capable(void) +{ + return exposed_mon_capable; +} + +/* + * MSC may raise an error interrupt if it sees an out or range partid/pmg, + * and go on to truncate the value. Regardless of what the hardware supports, + * only the system wide safe value is safe to use. + */ +u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) +{ + return mpam_partid_max + 1; +} + +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) +{ + if (l >= RDT_NUM_RESOURCES) + return NULL; + + return &mpam_resctrl_controls[l].resctrl_res; +} + +static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, + enum resctrl_res_level type) +{ + /* TODO: initialise the resctrl resources */ + + return 0; +} + +static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) +{ + struct mpam_class *class = comp->class; + + if (class->type == MPAM_CLASS_CACHE) + return comp->comp_id; + + /* TODO: repaint domain ids to match the L3 domain ids */ + /* + * Otherwise, expose the ID used by the firmware table code. + */ + return comp->comp_id; +} + +static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + + INIT_LIST_HEAD(&hdr->list); + hdr->id = mpam_resctrl_pick_domain_id(cpu, comp); + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +/** + * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU. + * @cpu: The CPU to remove from the domain. + * @hdr: The domain's header. + * + * Removes @cpu from the header mask. If this was the last CPU in the domain, + * the domain header is removed from its parent list and true is returned, + * indicating the parent structure can be freed. + * If there are other CPUs in the domain, returns false. 
+ */ +static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (cpumask_empty(&hdr->cpu_mask)) { + list_del(&hdr->list); + return true; + } + + return false; +} + +static struct mpam_resctrl_dom * +mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) +{ + int err, idx; + struct mpam_resctrl_dom *dom; + struct rdt_mon_domain *mon_d; + struct rdt_ctrl_domain *ctrl_d; + struct mpam_class *class = res->class; + struct mpam_component *comp_iter, *ctrl_comp; + struct rdt_resource *r = &res->resctrl_res; + + lockdep_assert_held(&domain_list_lock); + + ctrl_comp = NULL; + idx = srcu_read_lock(&mpam_srcu); + list_for_each_entry_srcu(comp_iter, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { + ctrl_comp = comp_iter; + break; + } + } + srcu_read_unlock(&mpam_srcu, idx); + + /* cpu with unknown exported component? */ + if (WARN_ON_ONCE(!ctrl_comp)) + return ERR_PTR(-EINVAL); + + dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!dom) + return ERR_PTR(-ENOMEM); + + if (exposed_alloc_capable) { + dom->ctrl_comp = ctrl_comp; + + ctrl_d = &dom->resctrl_ctrl_dom; + mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, &ctrl_d->hdr); + ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; + /* TODO: this list should be sorted */ + list_add_tail(&ctrl_d->hdr.list, &r->ctrl_domains); + err = resctrl_online_ctrl_domain(r, ctrl_d); + if (err) { + dom = ERR_PTR(err); + goto offline_ctrl_domain; + } + } else { + pr_debug("Skipped control domain online - no controls\n"); + } + + if (exposed_mon_capable) { + mon_d = &dom->resctrl_mon_dom; + mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, &mon_d->hdr); + mon_d->hdr.type = RESCTRL_MON_DOMAIN; + /* TODO: this list should be sorted */ + list_add_tail(&mon_d->hdr.list, &r->mon_domains); + err = resctrl_online_mon_domain(r, mon_d); + if (err) { + dom = ERR_PTR(err); + goto 
offline_mon_hdr; + } + } else { + pr_debug("Skipped monitor domain online - no monitors\n"); + } + goto out; + +offline_mon_hdr: + mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); +offline_ctrl_domain: + resctrl_offline_ctrl_domain(r, ctrl_d); +out: + return dom; +} + +/** + * mpam_resctrl_get_domain_from_cpu() - find the mpam domain structure + * @cpu: The CPU that is going online/offline. + * @res: The resctrl resource the domain should belong to. + * + * The component structures must be used to identify the CPU may be marked + * offline in the resctrl structures. However the resctrl domain list is + * used to search as this is also used to determine if resctrl thinks the + * domain is online. + * For platforms with controls, this is easy as each resource has one control + * component. + */ +static struct mpam_resctrl_dom * +mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) +{ + struct rdt_ctrl_domain *d; + struct mpam_resctrl_dom *dom; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &res->resctrl_res.ctrl_domains, hdr.list) { + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + + if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity)) + return dom; + } + + return NULL; +} + +int mpam_resctrl_online_cpu(unsigned int cpu) +{ + int i, err = 0; + struct mpam_resctrl_dom *dom; + struct mpam_resctrl_res *res; + + mutex_lock(&domain_list_lock); + for (i = 0; i < RDT_NUM_RESOURCES; i++) { + res = &mpam_resctrl_controls[i]; + if (!res->class) + continue; // dummy_resource; + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res); + if (!dom) + dom = mpam_resctrl_alloc_domain(cpu, res); + if (IS_ERR(dom)) { + err = PTR_ERR(dom); + break; + } + + cpumask_set_cpu(cpu, &dom->resctrl_ctrl_dom.hdr.cpu_mask); + cpumask_set_cpu(cpu, &dom->resctrl_mon_dom.hdr.cpu_mask); + } + mutex_unlock(&domain_list_lock); + + if (!err) + resctrl_online_cpu(cpu); + + return err; +} + +int mpam_resctrl_offline_cpu(unsigned int cpu) +{ + int i; + 
struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + struct rdt_mon_domain *mon_d; + struct rdt_ctrl_domain *ctrl_d; + bool ctrl_dom_empty, mon_dom_empty; + + resctrl_offline_cpu(cpu); + + mutex_lock(&domain_list_lock); + for (i = 0; i < RDT_NUM_RESOURCES; i++) { + res = &mpam_resctrl_controls[i]; + if (!res->class) + continue; // dummy resource + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res); + if (WARN_ON_ONCE(!dom)) + continue; + + ctrl_dom_empty = true; + if (exposed_alloc_capable) { + ctrl_d = &dom->resctrl_ctrl_dom; + ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + if (ctrl_dom_empty) + resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); + } + + mon_dom_empty = true; + if (exposed_mon_capable) { + mon_d = &dom->resctrl_mon_dom; + mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + if (mon_dom_empty) + resctrl_offline_mon_domain(&res->resctrl_res, mon_d); + } + + if (ctrl_dom_empty && mon_dom_empty) + kfree(dom); + } + mutex_unlock(&domain_list_lock); + + return 0; +} + +int mpam_resctrl_setup(void) +{ + int err = 0; + enum resctrl_res_level i; + struct mpam_resctrl_res *res; + + cpus_read_lock(); + for (i = 0; i < RDT_NUM_RESOURCES; i++) { + res = &mpam_resctrl_controls[i]; + INIT_LIST_HEAD(&res->resctrl_res.ctrl_domains); + INIT_LIST_HEAD(&res->resctrl_res.mon_domains); + res->resctrl_res.rid = i; + } + + /* TODO: pick MPAM classes to map to resctrl resources */ + + /* Initialise the resctrl structures from the classes */ + for (i = 0; i < RDT_NUM_RESOURCES; i++) { + res = &mpam_resctrl_controls[i]; + if (!res->class) + continue; // dummy resource + + err = mpam_resctrl_control_init(res, i); + if (err) { + pr_debug("Failed to initialise rid %u\n", i); + break; + } + } + cpus_read_unlock(); + + if (err || (!exposed_alloc_capable && !exposed_mon_capable)) { + if (err) + pr_debug("Internal error %d - resctrl not supported\n", err); + else + pr_debug("No alloc(%u) or monitor(%u) found - resctrl not 
supported\n", + exposed_alloc_capable, exposed_mon_capable); + err = -EOPNOTSUPP; + } + + if (!err) { + if (!is_power_of_2(mpam_pmg_max + 1)) { + /* + * If not all the partid*pmg values are valid indexes, + * resctrl may allocate pmg that don't exist. This + * should cause an error interrupt. + */ + pr_warn("Number of PMG is not a power of 2! resctrl may misbehave"); + } + + /* TODO: call resctrl_init() */ + } + + return err; +} diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index cb6e6cfbea0bc..165385d8f39cc 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -41,6 +41,9 @@ static inline int acpi_mpam_count_msc(void) { return -EINVAL; } int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, enum mpam_class_types type, u8 class_id, int component_id); +bool resctrl_arch_alloc_capable(void); +bool resctrl_arch_mon_capable(void); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. From 7a94d4a5939a168c5e96c760b629cec126105ca6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 12 Jun 2019 13:51:30 +0100 Subject: [PATCH 106/247] NVIDIA: SAUCE: arm_mpam: resctrl: Pick the caches we will use as resctrl resources BugLink: https://bugs.launchpad.net/bugs/2122432 Systems with MPAM support may have a variety of control types at any point of their system layout. We can only expose certain types of control, and only if they exist at particular locations. Start with the well-know caches. These have to be depth 2 or 3 and support MPAM's cache portion bitmap controls, with a number of portions fewer that resctrl's limit. Signed-off-by: James Morse (cherry picked from commit 0f5e192a841c2dcecdbab3ea887d72a8b3485bba https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 91 +++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index c6b6251d4e423..d66d9658c0f8e 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -60,10 +60,96 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) return &mpam_resctrl_controls[l].resctrl_res; } +static bool cache_has_usable_cpor(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_cpor_part, cprops)) + return false; + + /* TODO: Scaling is not yet supported */ + /* resctrl uses u32 for all bitmap configurations */ + return (class->props.cpbm_wd <= 32); +} + +/* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */ +static void mpam_resctrl_pick_caches(void) +{ + struct mpam_class *class; + struct mpam_resctrl_res *res; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + if (class->type != MPAM_CLASS_CACHE) { + pr_debug("class %u is not a cache\n", class->level); + continue; + } + + if (class->level != 2 && class->level != 3) { + pr_debug("class %u is not L2 or L3\n", class->level); + continue; + } + + if (!cache_has_usable_cpor(class)) { + pr_debug("class %u cache misses CPOR\n", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u Class has missing CPUs\n", class->level); + pr_debug("class %u mask %*pb != %*pb\n", class->level, + cpumask_pr_args(&class->affinity), + cpumask_pr_args(cpu_possible_mask)); + continue; + } + + if (class->level == 2) + res = &mpam_resctrl_controls[RDT_RESOURCE_L2]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + res->class = class; + exposed_alloc_capable = true; + } +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, enum 
resctrl_res_level type) { - /* TODO: initialise the resctrl resources */ + struct mpam_class *class = res->class; + struct rdt_resource *r = &res->resctrl_res; + + switch (res->resctrl_res.rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + r->alloc_capable = true; + r->schema_fmt = RESCTRL_SCHEMA_BITMAP; + r->cache.arch_has_sparse_bitmasks = true; + + /* TODO: Scaling is not yet supported */ + r->cache.cbm_len = class->props.cpbm_wd; + /* mpam_devices will reject empty bitmaps */ + r->cache.min_cbm_bits = 1; + + if (r->rid == RDT_RESOURCE_L2) { + r->name = "L2"; + r->ctrl_scope = RESCTRL_L2_CACHE; + } else { + r->name = "L3"; + r->ctrl_scope = RESCTRL_L3_CACHE; + } + + /* + * Which bits are shared with other ...things... + * Unknown devices use partid-0 which uses all the bitmap + * fields. Until we configured the SMMU and GIC not to do this + * 'all the bits' is the correct answer here. + */ + r->cache.shareable_bits = resctrl_get_default_ctrl(r); + break; + default: + break; + } return 0; } @@ -307,7 +393,8 @@ int mpam_resctrl_setup(void) res->resctrl_res.rid = i; } - /* TODO: pick MPAM classes to map to resctrl resources */ + /* Find some classes to use for controls */ + mpam_resctrl_pick_caches(); /* Initialise the resctrl structures from the classes */ for (i = 0; i < RDT_NUM_RESOURCES; i++) { From e210a4218c0ccdc4fd9b9d1e30eca167536ddb3a Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 4 Mar 2019 15:15:25 +0000 Subject: [PATCH 107/247] NVIDIA: SAUCE: arm_mpam: resctrl: Implement resctrl_arch_reset_all_ctrls() BugLink: https://bugs.launchpad.net/bugs/2122432 We already have a helper for resetting an mpam class and component. Hook it up to resctrl_arch_reset_all_ctrls() and the domain offline path. Signed-off-by: James Morse (cherry picked from commit 847bd9eacee15765db0e23f7fb9e88abd1a123db https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 6 +++--- drivers/resctrl/mpam_internal.h | 7 +++++++ drivers/resctrl/mpam_resctrl.c | 15 +++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 3c6d58f9a35aa..ef6a619ab6f3c 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2717,7 +2717,7 @@ static void mpam_enable_once(void) mpam_partid_max + 1, mpam_pmg_max + 1); } -static void mpam_reset_component_locked(struct mpam_component *comp) +void mpam_reset_component_locked(struct mpam_component *comp) { struct mpam_vmsc *vmsc; @@ -2742,7 +2742,7 @@ static void mpam_reset_component_locked(struct mpam_component *comp) } } -static void mpam_reset_class_locked(struct mpam_class *class) +void mpam_reset_class_locked(struct mpam_class *class) { struct mpam_component *comp; @@ -2754,7 +2754,7 @@ static void mpam_reset_class_locked(struct mpam_class *class) mpam_reset_component_locked(comp); } -static void mpam_reset_class(struct mpam_class *class) +void mpam_reset_class(struct mpam_class *class) { cpus_read_lock(); mpam_reset_class_locked(class); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 60cd776255069..58d7125aea4f0 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -394,6 +394,13 @@ extern u8 mpam_pmg_max; void mpam_enable(struct work_struct *work); void mpam_disable(struct work_struct *work); +/* Reset all the RIS in a class, optionally while holding cpus_read_lock() */ +void mpam_reset_class_locked(struct mpam_class *class); +void mpam_reset_class(struct mpam_class *class); + +/* Reset all the RIS in a component under cpus_read_lock() */ +void mpam_reset_component_locked(struct mpam_component *comp); + int mpam_apply_config(struct mpam_component *comp, u16 partid, struct mpam_config *cfg); diff --git a/drivers/resctrl/mpam_resctrl.c 
b/drivers/resctrl/mpam_resctrl.c index d66d9658c0f8e..5c480155131b6 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -168,6 +168,19 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) +{ + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + if (!mpam_is_enabled()) + return; + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + mpam_reset_class_locked(res->class); +} + static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, struct rdt_domain_hdr *hdr) { @@ -357,6 +370,8 @@ int mpam_resctrl_offline_cpu(unsigned int cpu) ctrl_dom_empty = true; if (exposed_alloc_capable) { + mpam_reset_component_locked(dom->ctrl_comp); + ctrl_d = &dom->resctrl_ctrl_dom; ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); if (ctrl_dom_empty) From ec2b3122520d7e5f6624f8dd9978bc282ae367e2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 4 Mar 2019 14:34:44 +0000 Subject: [PATCH 108/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_get_config() BugLink: https://bugs.launchpad.net/bugs/2122432 Implement resctrl_arch_get_config() by testing the configuration for a CPOR bitmap. For any other configuration type return the default. Signed-off-by: James Morse (cherry picked from commit e6ca0f5e07f1b04bc293ce7d28d36721667d734a https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 47 ++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 5c480155131b6..62a0f685063b4 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -168,6 +168,53 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type type) +{ + u32 partid; + struct mpam_config *cfg; + struct mpam_props *cprops; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + enum mpam_device_features configured_by; + + lockdep_assert_cpus_held(); + + if (!mpam_is_enabled()) + goto err; + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + cprops = &res->class->props; + + partid = resctrl_get_config_index(closid, type); + cfg = &dom->ctrl_comp->cfg[partid]; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + configured_by = mpam_feat_cpor_part; + break; + default: + goto err; + } + + if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r) || + !mpam_has_feature(configured_by, cfg)) + goto err; + + switch (configured_by) { + case mpam_feat_cpor_part: + /* TODO: Scaling is not yet supported */ + return cfg->cpbm; + default: + goto err; + } + +err: + return resctrl_get_default_ctrl(r); +} + void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) { struct mpam_resctrl_res *res; From e0ed1b90f7420ee4338913f2a1d71908c4f3cff3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 21 May 2021 12:19:36 +0100 Subject: [PATCH 109/247] NVIDIA: SAUCE: arm_mpam: resctrl: Implement helpers to update configuration BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl has two helpers for updating the configuration. 
resctrl_arch_update_one() updates a single value, and is used by the software-controller to apply feedback to the bandwidth controls, it has to be called on one of the CPUs in the resctrl:domain. resctrl_arch_update_domains() copies multiple staged configurations, it can be called from anywhere. Both helpers should update any changes to the underlying hardware. Imlpement resctrl_arch_update_domains() to use resctrl_arch_update_one(), which doesn't depend on being called on the right CPU. Signed-off-by: James Morse (cherry picked from commit 1a50fe3f1a7b042546d1655651a2c58abf25b676 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 73 ++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 62a0f685063b4..5296cb54cb6d0 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -215,6 +215,79 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, return resctrl_get_default_ctrl(r); } +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val) +{ + u32 partid; + struct mpam_config cfg; + struct mpam_props *cprops; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + + lockdep_assert_cpus_held(); + lockdep_assert_irqs_enabled(); + + /* + * NOTE: don't check the CPU as mpam_apply_config() doesn't care, + * and resctrl_arch_update_domains() depends on this. 
+ */ + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + cprops = &res->class->props; + + partid = resctrl_get_config_index(closid, t); + if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) { + pr_debug("Not alloc capable or computed PARTID out of range\n"); + return -EINVAL; + } + + /* + * Copy the current config to avoid clearing other resources when the + * same component is exposed multiple times through resctrl. + */ + cfg = dom->ctrl_comp->cfg[partid]; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + /* TODO: Scaling is not yet supported */ + cfg.cpbm = cfg_val; + mpam_set_feature(mpam_feat_cpor_part, &cfg); + break; + default: + return -EINVAL; + } + + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); +} + +/* TODO: this is IPI heavy */ +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) +{ + int err = 0; + enum resctrl_conf_type t; + struct rdt_ctrl_domain *d; + struct resctrl_staged_config *cfg; + + lockdep_assert_cpus_held(); + lockdep_assert_irqs_enabled(); + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + for (t = 0; t < CDP_NUM_TYPES; t++) { + cfg = &d->staged_config[t]; + if (!cfg->have_new_ctrl) + continue; + + err = resctrl_arch_update_one(r, d, closid, t, + cfg->new_ctrl); + if (err) + return err; + } + } + + return err; +} + void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) { struct mpam_resctrl_res *res; From e02e242e38c5d1253b705996d594234afb77cd22 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 22 May 2025 16:57:28 +0100 Subject: [PATCH 110/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add plumbing against arm64 task and cpu hooks BugLink: https://bugs.launchpad.net/bugs/2122432 arm64 provides helpers for changing a tasks and a cpus mpam partid/pmg values. These are used to back a number of resctrl_arch_ functions. Connect them up. 
Signed-off-by: James Morse (cherry picked from commit f11af18bbd869fc360e17a33827dcf87d0a159de https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 58 ++++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 5 +++ 2 files changed, 63 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 5296cb54cb6d0..61c33ee3ff695 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,8 @@ static DEFINE_MUTEX(domain_list_lock); static bool exposed_alloc_capable; static bool exposed_mon_capable; +static bool cdp_enabled; + bool resctrl_arch_alloc_capable(void) { return exposed_alloc_capable; @@ -52,6 +55,61 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) return mpam_partid_max + 1; } +void resctrl_arch_sched_in(struct task_struct *tsk) +{ + lockdep_assert_preemption_disabled(); + + mpam_thread_switch(tsk); +} + +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid) +{ + WARN_ON_ONCE(closid > U16_MAX); + WARN_ON_ONCE(rmid > U8_MAX); + + if (!cdp_enabled) { + mpam_set_cpu_defaults(cpu, closid, closid, rmid, rmid); + } else { + /* + * When CDP is enabled, resctrl halves the closid range and we + * use odd/even partid for one closid. 
+ */ + u32 partid_d = resctrl_get_config_index(closid, CDP_DATA); + u32 partid_i = resctrl_get_config_index(closid, CDP_CODE); + + mpam_set_cpu_defaults(cpu, partid_d, partid_i, rmid, rmid); + } +} + +void resctrl_arch_sync_cpu_closid_rmid(void *info) +{ + struct resctrl_cpu_defaults *r = info; + + lockdep_assert_preemption_disabled(); + + if (r) { + resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(), + r->closid, r->rmid); + } + + resctrl_arch_sched_in(current); +} + +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + WARN_ON_ONCE(closid > U16_MAX); + WARN_ON_ONCE(rmid > U8_MAX); + + if (!cdp_enabled) { + mpam_set_task_partid_pmg(tsk, closid, closid, rmid, rmid); + } else { + u32 partid_d = resctrl_get_config_index(closid, CDP_DATA); + u32 partid_i = resctrl_get_config_index(closid, CDP_CODE); + + mpam_set_task_partid_pmg(tsk, partid_d, partid_i, rmid, rmid); + } +} + struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) { if (l >= RDT_NUM_RESOURCES) diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 165385d8f39cc..91d869a0d46e0 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -44,6 +44,11 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, bool resctrl_arch_alloc_capable(void); bool resctrl_arch_mon_capable(void); +void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); +void resctrl_arch_sched_in(struct task_struct *tsk); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. 
From 3dd5f9b13829d14670697b36cf730d754a6a4488 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 25 Jun 2021 17:19:16 +0100 Subject: [PATCH 111/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add CDP emulation BugLink: https://bugs.launchpad.net/bugs/2122432 Intel RDT's CDP feature allows the cache to use a different control value depending on whether the accesses was for instruction fetch or a data access. MPAM's equivalent feature is the other way up: the CPU assigns a different partid label to traffic depending on whether it was instruction fetch or a data access, which causes the cache to use a different control value based solely on the partid. MPAM can emulate CDP, with the side effect that the alternative partid is seen by all MSC, it can't be enabled per-MSC. Add the resctrl hooks to turn this on or off. Add the helpers that match a closid against a task, which need to be aware that the value written to hardware is not the same as the one resctrl is using. The context switch code needs to match the default resctrl group's value against a variable, as this value changes depending on whether CDP is in use. Awkwardly, the MB controls don't implement CDP. To emulate this, the MPAM equivalent needs programming twice by the resctrl glue, as resctrl expects the bandwidth controls to be applied independently for both data and isntruction-fetch. CC: Dave Martin CC: Ben Horgan CC: Amit Singh Tomar Signed-off-by: James Morse (cherry picked from commit 953acc3efe584c5f5f0a3d43d61e1fec250dec64 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/mpam.h | 11 ++- arch/arm64/kernel/mpam.c | 2 + drivers/resctrl/mpam_resctrl.c | 119 ++++++++++++++++++++++++++++++++- include/linux/arm_mpam.h | 3 + 4 files changed, 133 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 03a8ad1d2d053..05bc63fbdd4bf 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -4,6 +4,7 @@ #ifndef __ASM__MPAM_H #define __ASM__MPAM_H +#include #include #include #include @@ -19,6 +20,14 @@ DECLARE_STATIC_KEY_FALSE(mpam_enabled); DECLARE_PER_CPU(u64, arm64_mpam_default); DECLARE_PER_CPU(u64, arm64_mpam_current); +/* + * The value of the MPAM0_EL1 sysreg when a task is in the default group. + * This is used by the context switch code to use the resctrl CPU property + * instead. The value is modified when CDP is enabled/disabled by mounting + * the resctrl filesystem. + */ +extern u64 arm64_mpam_global_default; + /* * The resctrl filesystem writes to the partid/pmg values for threads and CPUs, * which may race with reads in __mpam_sched_in(). 
Ensure only one of the old @@ -78,7 +87,7 @@ static inline void mpam_thread_switch(struct task_struct *tsk) !static_branch_likely(&mpam_enabled)) return; - if (!regval) + if (regval == READ_ONCE(arm64_mpam_global_default)) regval = READ_ONCE(per_cpu(arm64_mpam_default, cpu)); oldregval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index 2b594e13c6f68..e5cc0d2e0106b 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -12,6 +12,8 @@ DEFINE_STATIC_KEY_FALSE(mpam_enabled); DEFINE_PER_CPU(u64, arm64_mpam_default); DEFINE_PER_CPU(u64, arm64_mpam_current); +u64 arm64_mpam_global_default; + static int mpam_pm_notifier(struct notifier_block *self, unsigned long cmd, void *v) { diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 61c33ee3ff695..55c5dc710ef0a 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -33,6 +33,10 @@ static DEFINE_MUTEX(domain_list_lock); static bool exposed_alloc_capable; static bool exposed_mon_capable; +/* + * MPAM emulates CDP by setting different PARTID in the I/D fields of MPAM0_EL1. + * This applies globally to all traffic the CPU generates. + */ static bool cdp_enabled; bool resctrl_arch_alloc_capable(void) @@ -45,6 +49,71 @@ bool resctrl_arch_mon_capable(void) return exposed_mon_capable; } +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) +{ + switch (rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + return cdp_enabled; + case RDT_RESOURCE_MBA: + default: + /* + * x86's MBA control doesn't support CDP, so user-space doesn't + * expect it. + */ + return false; + } +} + +/** + * resctrl_reset_task_closids() - Reset the PARTID/PMG values for all tasks. + * + * At boot, all existing tasks use partid zero for D and I. + * To enable/disable CDP emulation, all these tasks need relabelling. 
+ */ +static void resctrl_reset_task_closids(void) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + resctrl_arch_set_closid_rmid(t, RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + } + read_unlock(&tasklist_lock); +} + +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level ignored, bool enable) +{ + u64 regval; + u32 partid, partid_i, partid_d; + + cdp_enabled = enable; + + partid = RESCTRL_RESERVED_CLOSID; + + if (enable) { + partid_d = resctrl_get_config_index(partid, CDP_CODE); + partid_i = resctrl_get_config_index(partid, CDP_DATA); + regval = FIELD_PREP(MPAM0_EL1_PARTID_D, partid_d) | + FIELD_PREP(MPAM0_EL1_PARTID_I, partid_i); + } else { + regval = FIELD_PREP(MPAM0_EL1_PARTID_D, partid) | + FIELD_PREP(MPAM0_EL1_PARTID_I, partid); + } + + resctrl_reset_task_closids(); + + WRITE_ONCE(arm64_mpam_global_default, regval); + + return 0; +} + +static bool mpam_resctrl_hide_cdp(enum resctrl_res_level rid) +{ + return cdp_enabled && !resctrl_arch_get_cdp_enabled(rid); +} + /* * MSC may raise an error interrupt if it sees an out or range partid/pmg, * and go on to truncate the value. 
Regardless of what the hardware supports, @@ -110,6 +179,30 @@ void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid) } } +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + u64 regval = mpam_get_regval(tsk); + u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval); + + if (cdp_enabled) + tsk_closid >>= 1; + + return tsk_closid == closid; +} + +/* The task's pmg is not unique, the partid must be considered too */ +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + u64 regval = mpam_get_regval(tsk); + u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval); + u32 tsk_rmid = FIELD_GET(MPAM0_EL1_PMG_D, regval); + + if (cdp_enabled) + tsk_closid >>= 1; + + return (tsk_closid == closid) && (tsk_rmid == rmid); +} + struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) { if (l >= RDT_NUM_RESOURCES) @@ -245,6 +338,14 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); cprops = &res->class->props; + /* + * When CDP is enabled, but the resource doesn't support it, + * the control is cloned across both partids. 
+ * Pick one at random to read: + */ + if (mpam_resctrl_hide_cdp(r->rid)) + type = CDP_DATA; + partid = resctrl_get_config_index(closid, type); cfg = &dom->ctrl_comp->cfg[partid]; @@ -276,6 +377,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { + int err; u32 partid; struct mpam_config cfg; struct mpam_props *cprops; @@ -316,7 +418,22 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, return -EINVAL; } - return mpam_apply_config(dom->ctrl_comp, partid, &cfg); + /* + * When CDP is enabled, but the resource doesn't support it, we need to + * apply the same configuration to the other partid. + */ + if (mpam_resctrl_hide_cdp(r->rid)) { + partid = resctrl_get_config_index(closid, CDP_CODE); + err = mpam_apply_config(dom->ctrl_comp, partid, &cfg); + if (err) + return err; + + partid = resctrl_get_config_index(closid, CDP_DATA); + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); + + } else { + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); + } } /* TODO: this is IPI heavy */ diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 91d869a0d46e0..2fdba28ea9303 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -5,6 +5,7 @@ #define __LINUX_ARM_MPAM_H #include +#include #include #define GLOBAL_AFFINITY ~0 @@ -48,6 +49,8 @@ void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); void resctrl_arch_sched_in(struct task_struct *tsk); +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); /** * mpam_register_requestor() - Register a requestor with the MPAM driver From 
7422c0929eba423da03f94b4a6186ad533f40283 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Jul 2021 18:45:14 +0100 Subject: [PATCH 112/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add rmid index helpers BugLink: https://bugs.launchpad.net/bugs/2122432 Because MPAM's pmg aren't identical to RDT's rmid, resctrl handles some datastructrues by index. This allows x86 to map indexes to RMID, and MPAM to map them to partid-and-pmg. Add the helpers to do this. Signed-off-by: James Morse (cherry picked from commit d2ee8a649bd00060ace2fd8f22d8bf6826c0ef3c https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 28 ++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 3 +++ 2 files changed, 31 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 55c5dc710ef0a..cdd3136ed6853 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -124,6 +124,34 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) return mpam_partid_max + 1; } +u32 resctrl_arch_system_num_rmid_idx(void) +{ + u8 closid_shift = fls(mpam_pmg_max); + u32 num_partid = resctrl_arch_get_num_closid(NULL); + + return num_partid << closid_shift; +} + +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid) +{ + u8 closid_shift = fls(mpam_pmg_max); + + WARN_ON_ONCE(closid_shift > 8); + + return (closid << closid_shift) | rmid; +} + +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) +{ + u8 closid_shift = fls(mpam_pmg_max); + u32 pmg_mask = ~(~0 << closid_shift); + + WARN_ON_ONCE(closid_shift > 8); + + *closid = idx >> closid_shift; + *rmid = idx & pmg_mask; +} + void resctrl_arch_sched_in(struct task_struct *tsk) { lockdep_assert_preemption_disabled(); diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 2fdba28ea9303..1dd0f239cad02 100644 --- 
a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -51,6 +51,9 @@ void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); void resctrl_arch_sched_in(struct task_struct *tsk); bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); +u32 resctrl_arch_system_num_rmid_idx(void); /** * mpam_register_requestor() - Register a requestor with the MPAM driver From 38c7e3153288db3f3ca988fb01a678c952d81fc7 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 22 Aug 2025 10:43:05 +0100 Subject: [PATCH 113/247] NVIDIA: SAUCE: arm_mpam: resctrl: Convert to/from MPAMs bitmaps and fixed-point formats BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM uses bitmaps and fixed-point formats for the hardware controls. Resctrl provides the bandwidth controls as a percentage. Add helpers to convert between these. Signed-off-by: Dave Martin (cherry picked from commit a5930eb0cea18d0e6a325166d871109b9d7dbcc8 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 71 ++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index cdd3136ed6853..5e1883cd52132 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -251,6 +252,76 @@ static bool cache_has_usable_cpor(struct mpam_class *class) return (class->props.cpbm_wd <= 32); } +static u32 mbw_pbm_to_percent(const unsigned long mbw_pbm, + struct mpam_props *cprops) +{ + u32 val = bitmap_weight(&mbw_pbm, (unsigned int)cprops->mbw_pbm_bits); + + if (cprops->mbw_pbm_bits == 0) + return 0; + + val *= MAX_MBA_BW; + val = DIV_ROUND_CLOSEST(val, cprops->mbw_pbm_bits); + + return val; +} + +static u32 percent_to_mbw_pbm(u8 pc, struct mpam_props *cprops) +{ + u32 val = pc; + unsigned long ret = 0; + + if (cprops->mbw_pbm_bits == 0) + return 0; + + val *= cprops->mbw_pbm_bits; + val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); + + /* TODO: pick bits at random to avoid contention */ + bitmap_set(&ret, 0, val); + return ret; +} + +/* + * Each fixed-point hardware value architecturally represents a range + * of values: the full range 0% - 100% is split contiguously into + * (1 << cprops->bwa_wd) equal bands. + * Find the nearest percentage value to the upper bound of the selected band: + */ +static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) +{ + u32 val = mbw_max; + + val >>= 16 - cprops->bwa_wd; + val += 1; + val *= MAX_MBA_BW; + val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd); + + return val; +} + +/* + * Find the band whose upper bound is closest to the specified percentage. 
+ * + * A round-to-nearest policy is followed here as a balanced compromise + * between unexpected under-commit of the resource (where the total of + * a set of resource allocations after conversion is less than the + * expected total, due to rounding of the individual converted + * percentages) and over-commit (where the total of the converted + * allocations is greater than expected). + */ +static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) +{ + u32 val = pc; + + val <<= cprops->bwa_wd; + val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); + val = max(val, 1) - 1; + val <<= 16 - cprops->bwa_wd; + + return val; +} + /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */ static void mpam_resctrl_pick_caches(void) { From a72ff49ec76725e9af5311d517ae17c16bfd4225 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 19 Jul 2024 11:00:12 +0100 Subject: [PATCH 114/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add support for 'MB' resource BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl supports 'MB', as a percentage throttling of traffic somewhere after the L3. This is the control that mba_sc uses, so ideally the class chosen should be as close as possible to the counters used for mba_local. MB's percentage control can be backed either with the fixed point fraction MBW_MAX or the bandwidth portion bitmap. Add a helper to convert to/from percentages. One problem here is the value written is not the same as the value read back. This is deliberately made visible to user-space. Another is the MBW_MAX fixed point fraction can't represent 100%. This is going to confuse user-space, so shift everything up taking all-ones as 100% and zero as the minimum granularity. CC: Zeng Heng Co-developed-by: Dave Martin Signed-off-by: Dave Martin Signed-off-by: James Morse > (cherry picked from commit a2e2d2ea3861ba5e63b00775438a08019044da0b https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 217 ++++++++++++++++++++++++++++++++- 1 file changed, 216 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 5e1883cd52132..a815bc7486a8e 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -252,6 +252,42 @@ static bool cache_has_usable_cpor(struct mpam_class *class) return (class->props.cpbm_wd <= 32); } +static bool mba_class_use_mbw_part(struct mpam_props *cprops) +{ + return (mpam_has_feature(mpam_feat_mbw_part, cprops) && + cprops->mbw_pbm_bits); +} + +static bool mba_class_use_mbw_max(struct mpam_props *cprops) +{ + return (mpam_has_feature(mpam_feat_mbw_max, cprops) && + cprops->bwa_wd); +} + +static bool class_has_usable_mba(struct mpam_props *cprops) +{ + return mba_class_use_mbw_part(cprops) || mba_class_use_mbw_max(cprops); +} + +/* + * Calculate the worst-case percentage change from each implemented step + * in the control. + */ +static u32 get_mba_granularity(struct mpam_props *cprops) +{ + if (mba_class_use_mbw_part(cprops)) { + return DIV_ROUND_UP(MAX_MBA_BW, cprops->mbw_pbm_bits); + } else if (mba_class_use_mbw_max(cprops)) { + /* + * bwa_wd is the number of bits implemented in the 0.xxx + * fixed point fraction. 1 bit is 50%, 2 is 25% etc. 
+ */ + return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd); + } + + return 0; +} + static u32 mbw_pbm_to_percent(const unsigned long mbw_pbm, struct mpam_props *cprops) { @@ -322,6 +358,85 @@ static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) return val; } +/* Find the L3 cache that has affinity with this CPU */ +static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask) +{ + int err; + u32 cache_id = get_cpu_cacheinfo_id(cpu, 3); + + lockdep_assert_cpus_held(); + + err = mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask); + return err; +} + +/* + * topology_matches_l3() - Is the provided class the same shape as L3 + * @victim: The class we'd like to pretend is L3. + * + * resctrl expects all the worlds a Xeon, and all counters are on the + * L3. We play fast and loose with this, mapping counters on other + * classes - provided the CPU->domain mapping is the same kind of shape. + * + * Using cacheinfo directly would make this work even if resctrl can't + * use the L3 - but cacheinfo can't tell us anything about offline CPUs. + * Using the L3 resctrl domain list also depends on CPUs being online. + * Using the mpam_class we picked for L3 so we can use its domain list + * assumes that there are MPAM controls on the L3. + * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id() + * helper. This relies on at least one CPU per L3 cache being online at + * boot. + * + * Walk the two component lists and compare the affinity masks. The topology + * matches if each victim:component has a corresponding L3:component with the + * same affinity mask. These lists/masks are computed from firmware tables so + * don't change at runtime. 
+ */ +static bool topology_matches_l3(struct mpam_class *victim) +{ + int cpu, err; + struct mpam_component *victim_iter; + cpumask_var_t __free(free_cpumask_var) tmp_cpumask; + + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) + return false; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(victim_iter, &victim->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_empty(&victim_iter->affinity)) { + pr_debug("class %u has CPU-less component %u - can't match L3!\n", + victim->level, victim_iter->comp_id); + return false; + } + + cpu = cpumask_any(&victim_iter->affinity); + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + return false; + + cpumask_clear(tmp_cpumask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3's equivalent component to class %u component %u\n", + victim->level, victim_iter->comp_id); + return false; + } + + /* Any differing bits in the affinity mask? */ + if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) { + pr_debug("class %u component %u has Mismatched CPU mask with L3 equivalent\n" + "L3:%*pbl != victim:%*pbl\n", + victim->level, victim_iter->comp_id, + cpumask_pr_args(tmp_cpumask), + cpumask_pr_args(&victim_iter->affinity)); + + return false; + } + } + + return true; +} + /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? 
*/ static void mpam_resctrl_pick_caches(void) { @@ -363,10 +478,60 @@ static void mpam_resctrl_pick_caches(void) } } +static void mpam_resctrl_pick_mba(void) +{ + struct mpam_class *class, *candidate_class = NULL; + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_props *cprops = &class->props; + + if (class->level < 3) { + pr_debug("class %u is before L3\n", class->level); + continue; + } + + if (!class_has_usable_mba(cprops)) { + pr_debug("class %u has no bandwidth control\n", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u has missing CPUs\n", class->level); + continue; + } + + if (!topology_matches_l3(class)) { + pr_debug("class %u topology doesn't match L3\n", class->level); + continue; + } + + /* + * mba_sc reads the mbm_local counter, and waggles the MBA controls. + * mbm_local is implicitly part of the L3, pick a resource to be MBA + * that as close as possible to the L3. + */ + if (!candidate_class || class->level < candidate_class->level) + candidate_class = class; + } + + if (candidate_class) { + pr_debug("selected class %u to back MBA\n", candidate_class->level); + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; + res->class = candidate_class; + exposed_alloc_capable = true; + } +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, enum resctrl_res_level type) { struct mpam_class *class = res->class; + struct mpam_props *cprops = &class->props; struct rdt_resource *r = &res->resctrl_res; switch (res->resctrl_res.rid) { @@ -396,6 +561,20 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, * 'all the bits' is the correct answer here. 
*/ r->cache.shareable_bits = resctrl_get_default_ctrl(r); + break; + case RDT_RESOURCE_MBA: + r->alloc_capable = true; + r->schema_fmt = RESCTRL_SCHEMA_RANGE; + r->ctrl_scope = RESCTRL_L3_CACHE; + + r->membw.delay_linear = true; + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->membw.min_bw = get_mba_granularity(cprops); + r->membw.max_bw = MAX_MBA_BW; + r->membw.bw_gran = get_mba_granularity(cprops); + + r->name = "MB"; + break; default: break; @@ -411,7 +590,17 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) if (class->type == MPAM_CLASS_CACHE) return comp->comp_id; - /* TODO: repaint domain ids to match the L3 domain ids */ + if (topology_matches_l3(class)) { + /* Use the corresponding L3 component ID as the domain ID */ + int id = get_cpu_cacheinfo_id(cpu, 3); + + /* Implies topology_matches_l3() made a mistake */ + if (WARN_ON_ONCE(id == -1)) + return comp->comp_id; + + return id; + } + /* * Otherwise, expose the ID used by the firmware table code. 
*/ @@ -453,6 +642,15 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case RDT_RESOURCE_L3: configured_by = mpam_feat_cpor_part; break; + case RDT_RESOURCE_MBA: + if (mba_class_use_mbw_part(cprops)) { + configured_by = mpam_feat_mbw_part; + break; + } else if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + configured_by = mpam_feat_mbw_max; + break; + } + fallthrough; default: goto err; } @@ -465,6 +663,11 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case mpam_feat_cpor_part: /* TODO: Scaling is not yet supported */ return cfg->cpbm; + case mpam_feat_mbw_part: + /* TODO: Scaling is not yet supported */ + return mbw_pbm_to_percent(cfg->mbw_pbm, cprops); + case mpam_feat_mbw_max: + return mbw_max_to_percent(cfg->mbw_max, cprops); default: goto err; } @@ -513,6 +716,17 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, cfg.cpbm = cfg_val; mpam_set_feature(mpam_feat_cpor_part, &cfg); break; + case RDT_RESOURCE_MBA: + if (mba_class_use_mbw_part(cprops)) { + cfg.mbw_pbm = percent_to_mbw_pbm(cfg_val, cprops); + mpam_set_feature(mpam_feat_mbw_part, &cfg); + break; + } else if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops); + mpam_set_feature(mpam_feat_mbw_max, &cfg); + break; + } + fallthrough; default: return -EINVAL; } @@ -804,6 +1018,7 @@ int mpam_resctrl_setup(void) /* Find some classes to use for controls */ mpam_resctrl_pick_caches(); + mpam_resctrl_pick_mba(); /* Initialise the resctrl structures from the classes */ for (i = 0; i < RDT_NUM_RESOURCES; i++) { From bd87882dc8797585ec966581c9d9a5282df2a176 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 15 Aug 2025 15:44:00 +0100 Subject: [PATCH 115/247] NVIDIA: SAUCE: arm_mpam: resctrl: Reject oversized memory bandwidth portion bitmaps BugLink: https://bugs.launchpad.net/bugs/2122432 The MPAM architecture allows memory bandwidth portion bitmap (BWPBM) controls 
to be up to 4096 bits in size. Currently, the MPAM driver uses the scalar type u32 to represent the bitmap internally, so only the first 32 partitions will be allocatable. This would render some of the hardware memory bandwidth unusable if there are more partitions in the hardware, and there is no evidence yet that any platform will need it. Disable use of BWPBM controls that are larger than 32 bits, for now. Signed-off-by: Dave Martin (cherry picked from commit ce00b17e0ac020f8c173bb0728cdbb449712fa62 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index a815bc7486a8e..8c0234b8d8d45 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -254,8 +254,12 @@ static bool cache_has_usable_cpor(struct mpam_class *class) static bool mba_class_use_mbw_part(struct mpam_props *cprops) { - return (mpam_has_feature(mpam_feat_mbw_part, cprops) && - cprops->mbw_pbm_bits); + if (!mpam_has_feature(mpam_feat_mbw_part, cprops) || + cprops->mbw_pbm_bits < 1) + return false; + + /* u32 is used to represent MBW PBM bitmaps in the driver, for now: */ + return cprops->mbw_pbm_bits <= 32; } static bool mba_class_use_mbw_max(struct mpam_props *cprops) From b8190a6ab6bffd7601bd4cfc59f40210ec5cf005 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 15 Aug 2025 15:44:01 +0100 Subject: [PATCH 116/247] NVIDIA: SAUCE: arm_mpam: resctrl: Fix MB min_bandwidth value exposed to userspace BugLink: https://bugs.launchpad.net/bugs/2122432 Currently, the min_bandwidth value exposed to userspace in resctrlfs for the MB resource does not match the way in which the value will get converted during reading/writing of the schema. 
It is also doubtful that the same rounding convention should be used for bandwidth_gran as for min_bandwidth, since bandwidth_gran (as now documented) is a bound on the worst-case precision of any value, whereas min_bandwidth is the value that corresponds to the single, specific minimum hardware encoding. Introduce an explicit helper to generate the min value directly from the conversion functions, and plumb it in. Signed-off-by: Dave Martin Signed-off-by: James Morse (cherry picked from commit 1d9119373026f092915bf3e51e227d38f42fe7ab https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 8c0234b8d8d45..61386111221d4 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -362,6 +362,20 @@ static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) return val; } +static u32 get_mba_min(struct mpam_props *cprops) +{ + u32 val = 0; + + if (mba_class_use_mbw_part(cprops)) + val = mbw_pbm_to_percent(val, cprops); + else if (mba_class_use_mbw_max(cprops)) + val = mbw_max_to_percent(val, cprops); + else + WARN_ON_ONCE(1); + + return val; +} + /* Find the L3 cache that has affinity with this CPU */ static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask) { @@ -573,7 +587,7 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, r->membw.delay_linear = true; r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; - r->membw.min_bw = get_mba_granularity(cprops); + r->membw.min_bw = get_mba_min(cprops); r->membw.max_bw = MAX_MBA_BW; r->membw.bw_gran = get_mba_granularity(cprops); From 714065ac7c5ab0bec3cc17598eef54105c2d13b0 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 19 Jul 2024 11:32:52 +0100 Subject: [PATCH 117/247] 
NVIDIA: SAUCE: arm_mpam: resctrl: Add kunit test for control format conversions BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl specifies the format of the control schemes, and these don't match the hardware. Some of the conversions are a bit hairy - add some kunit tests. [morse: squashed enough of Dave's fixes in here that it's his patch now!] Signed-off-by: Dave Martin Signed-off-by: James Morse (cherry picked from commit d78dbb631d71eb59ecaf19cbae0db625532546e6 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 4 + drivers/resctrl/test_mpam_resctrl.c | 395 ++++++++++++++++++++++++++++ 2 files changed, 399 insertions(+) create mode 100644 drivers/resctrl/test_mpam_resctrl.c diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 61386111221d4..c18188f2c19e3 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1076,3 +1076,7 @@ int mpam_resctrl_setup(void) return err; } + +#ifdef CONFIG_MPAM_KUNIT_TEST +#include "test_mpam_resctrl.c" +#endif diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c new file mode 100644 index 0000000000000..bcd7268a8d539 --- /dev/null +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. 
+/* This file is intended to be included into mpam_resctrl.c */ + +#include +#include +#include +#include +#include + +struct percent_value_case { + u8 pc; + u8 width; + u16 value; +}; + +/* + * Mysterious inscriptions taken from ARM DDI 0598D.b, + * "Arm Architecture Reference Manual Supplement - Memory System + * Resource Partitioning and Monitoring (MPAM), for A-profile + * architecture", Section 9.8, "About the fixed-point fractional + * format" (exact percentage entries only): + */ +static const struct percent_value_case percent_value_cases[] = { + /* Architectural cases: */ + { 1, 8, 1 }, { 1, 12, 0x27 }, { 1, 16, 0x28e }, + { 25, 8, 0x3f }, { 25, 12, 0x3ff }, { 25, 16, 0x3fff }, + { 35, 8, 0x58 }, { 35, 12, 0x598 }, { 35, 16, 0x5998 }, + { 45, 8, 0x72 }, { 45, 12, 0x732 }, { 45, 16, 0x7332 }, + { 50, 8, 0x7f }, { 50, 12, 0x7ff }, { 50, 16, 0x7fff }, + { 52, 8, 0x84 }, { 52, 12, 0x850 }, { 52, 16, 0x851d }, + { 55, 8, 0x8b }, { 55, 12, 0x8cb }, { 55, 16, 0x8ccb }, + { 58, 8, 0x93 }, { 58, 12, 0x946 }, { 58, 16, 0x9479 }, + { 75, 8, 0xbf }, { 75, 12, 0xbff }, { 75, 16, 0xbfff }, + { 88, 8, 0xe0 }, { 88, 12, 0xe13 }, { 88, 16, 0xe146 }, + { 95, 8, 0xf2 }, { 95, 12, 0xf32 }, { 95, 16, 0xf332 }, + { 100, 8, 0xff }, { 100, 12, 0xfff }, { 100, 16, 0xffff }, + +}; + +static void test_percent_value_desc(const struct percent_value_case *param, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, + "pc=%d, width=%d, value=0x%.*x\n", + param->pc, param->width, + DIV_ROUND_UP(param->width, 4), param->value); +} + +KUNIT_ARRAY_PARAM(test_percent_value, percent_value_cases, + test_percent_value_desc); + +struct percent_value_test_info { + u32 pc; /* result of value-to-percent conversion */ + u32 value; /* result of percent-to-value conversion */ + u32 max_value; /* maximum raw value allowed by test params */ + unsigned int shift; /* promotes raw testcase value to 16 bits */ +}; + +/* + * Convert a reference percentage to a fixed-point MAX value and + * vice-versa, 
based on param (not test->param_value!) + */ +static void __prepare_percent_value_test(struct kunit *test, + struct percent_value_test_info *res, + const struct percent_value_case *param) +{ + struct mpam_props fake_props = { }; + + /* Reject bogus test parameters that would break the tests: */ + KUNIT_ASSERT_GE(test, param->width, 1); + KUNIT_ASSERT_LE(test, param->width, 16); + KUNIT_ASSERT_LT(test, param->value, 1 << param->width); + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = param->width; + + res->shift = 16 - param->width; + res->max_value = GENMASK_U32(param->width - 1, 0); + res->value = percent_to_mbw_max(param->pc, &fake_props); + res->pc = mbw_max_to_percent(param->value << res->shift, &fake_props); +} + +static void test_get_mba_granularity(struct kunit *test) +{ + int ret; + struct mpam_props fake_props = { }; + + /* Use MBW_PBM */ + mpam_set_feature(mpam_feat_mbw_part, &fake_props); + + /* 0 bits means the control is unconfigurable */ + fake_props.mbw_pbm_bits = 0; + KUNIT_EXPECT_FALSE(test, mba_class_use_mbw_part(&fake_props)); + + /* Otherwise, bitmaps that fit in a u32 are supported: */ + fake_props.mbw_pbm_bits = 1; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_part(&fake_props)); + + fake_props.mbw_pbm_bits = 32; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_part(&fake_props)); + + /* But bigger bitmaps aren't: */ + fake_props.mbw_pbm_bits = 33; + KUNIT_EXPECT_FALSE(test, mba_class_use_mbw_part(&fake_props)); + + fake_props.mbw_pbm_bits = 4; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 25); /* DIV_ROUND_UP(100, 4)% = 25% */ + + fake_props.mbw_pbm_bits = 6; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 17); /* DIV_ROUND_UP(100, 6)% = 17% */ + + /* Largest bitmap size that the driver supports, for now: */ + fake_props.mbw_pbm_bits = 32; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 4); /* DIV_ROUND_UP(100, 32)% = 4% */ + + /* Use MBW_MAX */ + 
bitmap_zero(fake_props.features, MPAM_FEATURE_LAST); + fake_props.mbw_pbm_bits = 0; + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + + fake_props.bwa_wd = 0; + KUNIT_EXPECT_FALSE(test, mba_class_use_mbw_max(&fake_props)); + + fake_props.bwa_wd = 1; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props)); + + /* Architectural maximum: */ + fake_props.bwa_wd = 16; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props)); + + /* No usable control... */ + fake_props.bwa_wd = 0; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 0); + + fake_props.bwa_wd = 1; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 50); /* DIV_ROUND_UP(100, 1 << 1)% = 50% */ + + fake_props.bwa_wd = 2; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 25); /* DIV_ROUND_UP(100, 1 << 2)% = 25% */ + + fake_props.bwa_wd = 3; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 13); /* DIV_ROUND_UP(100, 1 << 3)% = 13% */ + + fake_props.bwa_wd = 6; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 2); /* DIV_ROUND_UP(100, 1 << 6)% = 2% */ + + fake_props.bwa_wd = 7; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 7)% = 1% */ + + /* Granularity saturates at 1% */ + fake_props.bwa_wd = 16; /* architectural maximum */ + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 16)% = 1% */ +} + +static void test_mbw_pbm_to_percent(struct kunit *test) +{ + int ret; + struct mpam_props fake_props = {0}; + + mpam_set_feature(mpam_feat_mbw_part, &fake_props); + fake_props.mbw_pbm_bits = 4; + + ret = mbw_pbm_to_percent(0x0, &fake_props); + KUNIT_EXPECT_EQ(test, ret, 0); + + ret = mbw_pbm_to_percent(0x3, &fake_props); + KUNIT_EXPECT_EQ(test, ret, 50); + + ret = mbw_pbm_to_percent(0x7, &fake_props); + KUNIT_EXPECT_EQ(test, ret, 75); + + fake_props.mbw_pbm_bits = 16; + ret = mbw_pbm_to_percent(0xffff, 
&fake_props); + KUNIT_EXPECT_EQ(test, ret, 100); + + fake_props.mbw_pbm_bits = 0; + ret = mbw_pbm_to_percent(0xff, &fake_props); + KUNIT_EXPECT_EQ(test, ret, 0); +} + +static void test_mbw_max_to_percent(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + struct percent_value_test_info res; + + /* + * Since the reference values in percent_value_cases[] all + * correspond to exact percentages, round-to-nearest will + * always give the exact percentage back when the MPAM max + * value has precision of 0.5% or finer. (Always true for the + * reference data, since they all specify 8 bits or more of + * precision. + * + * So, keep it simple and demand an exact match: + */ + __prepare_percent_value_test(test, &res, param); + KUNIT_EXPECT_EQ(test, res.pc, param->pc); +} + +static void test_percent_to_mbw_pbm(struct kunit *test) +{ + unsigned long ret; + struct mpam_props fake_props = {0}; + + mpam_set_feature(mpam_feat_mbw_part, &fake_props); + fake_props.mbw_pbm_bits = 4; + + ret = percent_to_mbw_pbm(100, &fake_props); + KUNIT_EXPECT_EQ(test, bitmap_weight(&ret, fake_props.mbw_pbm_bits), 4); + + ret = percent_to_mbw_pbm(50, &fake_props); + KUNIT_EXPECT_EQ(test, bitmap_weight(&ret, fake_props.mbw_pbm_bits), 2); + + ret = percent_to_mbw_pbm(0, &fake_props); + KUNIT_EXPECT_EQ(test, bitmap_weight(&ret, fake_props.mbw_pbm_bits), 0); + + fake_props.mbw_pbm_bits = 16; + ret = percent_to_mbw_pbm(100, &fake_props); + KUNIT_EXPECT_EQ(test, bitmap_weight(&ret, fake_props.mbw_pbm_bits), 16); +} + +static void test_percent_to_mbw_max(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + struct percent_value_test_info res; + + __prepare_percent_value_test(test, &res, param); + + KUNIT_EXPECT_GE(test, res.value, param->value << res.shift); + KUNIT_EXPECT_LE(test, res.value, (param->value + 1) << res.shift); + KUNIT_EXPECT_LE(test, res.value, res.max_value << res.shift); + + /* No flexibility allowed for 0% and 100%! 
*/ + + if (param->pc == 0) + KUNIT_EXPECT_EQ(test, res.value, 0); + + if (param->pc == 100) + KUNIT_EXPECT_EQ(test, res.value, res.max_value << res.shift); +} + +static const void *test_all_bwa_wd_gen_params(struct kunit *test, const void *prev, + char *desc) +{ + uintptr_t param = (uintptr_t)prev; + + if (param > 15) + return NULL; + + param++; + + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "wd=%u\n", (unsigned int)param); + + return (void *)param; +} + +static unsigned int test_get_bwa_wd(struct kunit *test) +{ + uintptr_t param = (uintptr_t)test->param_value; + + KUNIT_ASSERT_GE(test, param, 1); + KUNIT_ASSERT_LE(test, param, 16); + + return param; +} + +static void test_mbw_max_to_percent_limits(struct kunit *test) +{ + struct mpam_props fake_props = {0}; + u32 max_value; + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = test_get_bwa_wd(test); + max_value = GENMASK(15, 16 - fake_props.bwa_wd); + + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(max_value, &fake_props), + MAX_MBA_BW); + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), + get_mba_min(&fake_props)); + + /* + * Rounding policy dependent 0% sanity-check: + * With round-to-nearest, the minimum mbw_max value really + * should map to 0% if there are at least 200 steps. + * (100 steps may be enough for some other rounding policies.) 
+ */ + if (fake_props.bwa_wd >= 8) + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), 0); + + if (fake_props.bwa_wd < 8 && + mbw_max_to_percent(0, &fake_props) == 0) + kunit_warn(test, "wd=%d: Testsuite/driver Rounding policy mismatch?", + fake_props.bwa_wd); +} + +/* + * Check that converting a percentage to mbw_max and back again (or, as + * appropriate, vice-versa) always restores the original value: + */ +static void test_percent_max_roundtrip_stability(struct kunit *test) +{ + struct mpam_props fake_props = {0}; + unsigned int shift; + u32 pc, max, pc2, max2; + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = test_get_bwa_wd(test); + shift = 16 - fake_props.bwa_wd; + + /* + * Converting a valid value from the coarser scale to the finer + * scale and back again must yield the original value: + */ + if (fake_props.bwa_wd >= 7) { + /* More than 100 steps: only test exact pc values: */ + for (pc = get_mba_min(&fake_props); pc <= MAX_MBA_BW; pc++) { + max = percent_to_mbw_max(pc, &fake_props); + pc2 = mbw_max_to_percent(max, &fake_props); + KUNIT_EXPECT_EQ(test, pc2, pc); + } + } else { + /* Fewer than 100 steps: only test exact mbw_max values: */ + for (max = 0; max < 1 << 16; max += 1 << shift) { + pc = mbw_max_to_percent(max, &fake_props); + max2 = percent_to_mbw_max(pc, &fake_props); + KUNIT_EXPECT_EQ(test, max2, max); + } + } +} + +static void test_percent_to_max_rounding(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + unsigned int num_rounded_up = 0, total = 0; + struct percent_value_test_info res; + + for (param = percent_value_cases, total = 0; + param < &percent_value_cases[ARRAY_SIZE(percent_value_cases)]; + param++, total++) { + __prepare_percent_value_test(test, &res, param); + if (res.value > param->value << res.shift) + num_rounded_up++; + } + + /* + * The MPAM driver applies a round-to-nearest policy, whereas a + * round-down policy seems to have been applied in the + * 
reference table from which the test vectors were selected. + * + * For a large and well-distributed suite of test vectors, + * about half should be rounded up and half down compared with + * the reference table. The actual test vectors are few in + * number and probably not very well distributed however, so + * tolerate a round-up rate of between 1/4 and 3/4 before + * crying foul: + */ + + kunit_info(test, "Round-up rate: %u%% (%u/%u)\n", + DIV_ROUND_CLOSEST(num_rounded_up * 100, total), + num_rounded_up, total); + + KUNIT_EXPECT_GE(test, 4 * num_rounded_up, 1 * total); + KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total); +} + +static struct kunit_case mpam_resctrl_test_cases[] = { + KUNIT_CASE(test_get_mba_granularity), + KUNIT_CASE(test_mbw_pbm_to_percent), + KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params), + KUNIT_CASE(test_percent_to_mbw_pbm), + KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params), + KUNIT_CASE(test_percent_to_max_rounding), + KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability, + test_all_bwa_wd_gen_params), + {} +}; + +static struct kunit_suite mpam_resctrl_test_suite = { + .name = "mpam_resctrl_test_suite", + .test_cases = mpam_resctrl_test_cases, +}; + +kunit_test_suites(&mpam_resctrl_test_suite); From c9d31713b7a2390bc735a5de693edd467e06fd46 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 May 2024 14:55:19 +0100 Subject: [PATCH 118/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add support for csu counters BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl exposes a counter via a file named llc_occupancy. This isn't really a counter as its value goes up and down, this is a snapshot of the cache storage usage monitor. Add some picking code to find a cache as close as possible to the L3 that supports the CSU monitor. If there is an L3, but it doesn't have any controls, force the L3 resource to exist. 
The existing topology_matches_l3() and mpam_resctrl_domain_hdr_init() code will ensure this looks like the L3, even if the class belongs to a later cache. Signed-off-by: James Morse (cherry picked from commit 76786d3c1a36ff52bba2f5e36c7b37ff33580664 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_internal.h | 6 ++ drivers/resctrl/mpam_resctrl.c | 146 ++++++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 58d7125aea4f0..8aed6da8c9963 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -350,6 +350,12 @@ struct mpam_resctrl_res { struct rdt_resource resctrl_res; }; +struct mpam_resctrl_mon { + struct mpam_class *class; + + /* per-class data that resctrl needs will live here */ +}; + static inline int mpam_alloc_csu_mon(struct mpam_class *class) { struct mpam_props *cprops = &class->props; diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index c18188f2c19e3..386edc05f30c2 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -31,6 +31,16 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); +/* + * The classes we've picked to map to resctrl events. + * Resctrl believes all the worlds a Xeon, and these are all on the L3. This + * array lets us find the actual class backing the event counters. e.g. + * the only memory bandwidth counters may be on the memory controller, but to + * make use of them, we pretend they are on L3. + * Class pointer may be NULL. 
+ */ +static struct mpam_resctrl_mon mpam_resctrl_counters[QOS_NUM_EVENTS]; + static bool exposed_alloc_capable; static bool exposed_mon_capable; @@ -273,6 +283,28 @@ static bool class_has_usable_mba(struct mpam_props *cprops) return mba_class_use_mbw_part(cprops) || mba_class_use_mbw_max(cprops); } +static bool cache_has_usable_csu(struct mpam_class *class) +{ + struct mpam_props *cprops; + + if (!class) + return false; + + cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return false; + + /* + * CSU counters settle on the value, so we can get away with + * having only one. + */ + if (!cprops->num_csu_mon) + return false; + + return (mpam_partid_max > 1) || (mpam_pmg_max != 0); +} + /* * Calculate the worst-case percentage change from each implemented step * in the control. @@ -545,6 +577,62 @@ static void mpam_resctrl_pick_mba(void) } } +static void counter_update_class(enum resctrl_event_id evt_id, + struct mpam_class *class) +{ + struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class; + + if (existing_class) { + if (class->level == 3) { + pr_debug("Existing class is L3 - L3 wins\n"); + return; + } else if (existing_class->level < class->level) { + pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n", + existing_class->level, class->level); + return; + } + } + + mpam_resctrl_counters[evt_id].class = class; + exposed_mon_capable = true; +} + +static void mpam_resctrl_pick_counters(void) +{ + struct mpam_class *class; + bool has_csu; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + if (class->level < 3) { + pr_debug("class %u is before L3", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u does not cover all CPUs", class->level); + continue; + } + + has_csu = cache_has_usable_csu(class); + if (has_csu && 
topology_matches_l3(class)) { + pr_debug("class %u has usable CSU, and matches L3 topology", class->level); + + /* CSU counters only make sense on a cache. */ + switch (class->type) { + case MPAM_CLASS_CACHE: + counter_update_class(QOS_L3_OCCUP_EVENT_ID, class); + return; + default: + return; + } + } + } +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, enum resctrl_res_level type) { @@ -625,6 +713,50 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, + enum resctrl_event_id type) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + lockdep_assert_cpus_held(); + + /* There also needs to be an L3 cache present */ + if (get_cpu_cacheinfo_id(smp_processor_id(), 3) == -1) + return; + + /* + * If there are no MPAM resources on L3, force it into existence. + * topology_matches_l3() already ensures this looks like the L3. + * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init(). + */ + if (!res->class) { + pr_warn_once("Faking L3 MSC to enable counters.\n"); + res->class = mpam_resctrl_counters[type].class; + } + + /* Called multiple times!, once per event type */ + if (exposed_mon_capable) { + l3->mon_capable = true; + + /* Setting name is necessary on monitor only platforms */ + l3->name = "L3"; + l3->mon_scope = RESCTRL_L3_CACHE; + + resctrl_enable_mon_event(type); + + /* + * Unfortunately, num_rmid doesn't mean anything for + * mpam, and its exposed to user-space! + * num-rmid is supposed to mean the number of groups + * that can be created, both control or monitor groups. + * For mpam, each control group has its own pmg/rmid + * space. 
+ */ + l3->mon.num_rmid = 1; + } +} + u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { @@ -1023,8 +1155,10 @@ int mpam_resctrl_offline_cpu(unsigned int cpu) int mpam_resctrl_setup(void) { int err = 0; + enum resctrl_event_id j; enum resctrl_res_level i; struct mpam_resctrl_res *res; + struct mpam_resctrl_mon *mon; cpus_read_lock(); for (i = 0; i < RDT_NUM_RESOURCES; i++) { @@ -1050,6 +1184,18 @@ int mpam_resctrl_setup(void) break; } } + + /* Find some classes to use for monitors */ + mpam_resctrl_pick_counters(); + + for (j = 0; j < QOS_NUM_EVENTS; j++) { + mon = &mpam_resctrl_counters[j]; + if (!mon->class) + continue; // dummy resource + + mpam_resctrl_monitor_init(mon, j); + } + cpus_read_unlock(); if (err || (!exposed_alloc_capable && !exposed_mon_capable)) { From 9fdc99aec092614db0b2b0da348a8a968896e983 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 7 Sep 2021 17:21:42 +0100 Subject: [PATCH 119/247] NVIDIA: SAUCE: untested: arm_mpam: resctrl: pick classes for use as mbm counters BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl has two types of counters, NUMA-local and global. MPAM has only bandwidth counters, but the position of the MSC may mean it counts NUMA-local, or global traffic. But the topology information is not available. Apply a heuristic: the L2 or L3 supports bandwidth monitors, these are probably NUMA-local. If the memory controller supports bandwidth monitors, they are probably global. This also allows us to assert that we don't have the same class backing two different resctrl events. Because the class or component backing the event may not be 'the L3', it is necessary for mpam_resctrl_get_domain_from_cpu() to search the monitor domains too. This matters the most for 'monitor only' systems, where 'the L3' control domains may be empty, and the ctrl_comp pointer NULL. 
Signed-off-by: James Morse (cherry picked from commit 5c3659756641c37008abcffa5ae393a4fac60452 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_internal.h | 8 ++ drivers/resctrl/mpam_resctrl.c | 143 ++++++++++++++++++++++++++++++-- 2 files changed, 145 insertions(+), 6 deletions(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 8aed6da8c9963..3559e3a863ebe 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -341,6 +341,14 @@ struct mpam_msc_ris { struct mpam_resctrl_dom { struct mpam_component *ctrl_comp; + + /* + * There is no single mon_comp because different events may be backed + * by different class/components. mon_comp is indexed by the event + * number. + */ + struct mpam_component *mon_comp[QOS_NUM_EVENTS]; + struct rdt_ctrl_domain resctrl_ctrl_dom; struct rdt_mon_domain resctrl_mon_dom; }; diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 386edc05f30c2..6221789a87229 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -50,6 +50,14 @@ static bool exposed_mon_capable; */ static bool cdp_enabled; +/* Whether this num_mbw_mon could result in a free_running system */ +static int __mpam_monitors_free_running(u16 num_mbwu_mon) +{ + if (num_mbwu_mon >= resctrl_arch_system_num_rmid_idx()) + return resctrl_arch_system_num_rmid_idx(); + return 0; +} + bool resctrl_arch_alloc_capable(void) { return exposed_alloc_capable; @@ -305,6 +313,24 @@ static bool cache_has_usable_csu(struct mpam_class *class) return (mpam_partid_max > 1) || (mpam_pmg_max != 0); } +static bool class_has_usable_mbwu(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_mbwu, cprops)) + return false; + + /* + * resctrl expects the bandwidth counters to be free 
running, + * which means we need as many monitors as resctrl has + * control/monitor groups. + */ + if (!__mpam_monitors_free_running(cprops->num_mbwu_mon)) + return false; + + return true; +} + /* * Calculate the worst-case percentage change from each implemented step * in the control. @@ -600,7 +626,7 @@ static void counter_update_class(enum resctrl_event_id evt_id, static void mpam_resctrl_pick_counters(void) { struct mpam_class *class; - bool has_csu; + bool has_csu, has_mbwu; lockdep_assert_cpus_held(); @@ -630,7 +656,37 @@ static void mpam_resctrl_pick_counters(void) return; } } + + has_mbwu = class_has_usable_mbwu(class); + if (has_mbwu && topology_matches_l3(class)) { + pr_debug("class %u has usable MBWU, and matches L3 topology", class->level); + + /* + * MBWU counters may be 'local' or 'total' depending on + * where they are in the topology. Counters on caches + * are assumed to be local. If it's on the memory + * controller, its assumed to be global. + * TODO: check mbm_local matches NUMA boundaries... + */ + switch (class->type) { + case MPAM_CLASS_CACHE: + counter_update_class(QOS_L3_MBM_LOCAL_EVENT_ID, + class); + break; + case MPAM_CLASS_MEMORY: + counter_update_class(QOS_L3_MBM_TOTAL_EVENT_ID, + class); + break; + default: + break; + } + } } + + /* Allocation of MBWU monitors assumes that the class is unique... 
*/ + if (mpam_resctrl_counters[QOS_L3_MBM_LOCAL_EVENT_ID].class) + WARN_ON_ONCE(mpam_resctrl_counters[QOS_L3_MBM_LOCAL_EVENT_ID].class == + mpam_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID].class); } static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, @@ -971,6 +1027,20 @@ static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu, return false; } +static struct mpam_component *find_component(struct mpam_class *victim, int cpu) +{ + struct mpam_component *victim_comp; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(victim_comp, &victim->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &victim_comp->affinity)) + return victim_comp; + } + + return NULL; +} + static struct mpam_resctrl_dom * mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) { @@ -1021,8 +1091,32 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) } if (exposed_mon_capable) { + int i; + struct mpam_component *mon_comp, *any_mon_comp; + + /* + * Even if the monitor domain is backed by a different component, + * the L3 component IDs need to be used... only there may be no + * ctrl_comp for the L3. + * Search each event's class list for a component with overlapping + * CPUs and set up the dom->mon_comp array. 
+ */ + for (i = 0; i < QOS_NUM_EVENTS; i++) { + struct mpam_resctrl_mon *mon; + + mon = &mpam_resctrl_counters[i]; + if (!mon->class) + continue; // dummy resource + + mon_comp = find_component(mon->class, cpu); + dom->mon_comp[i] = mon_comp; + if (mon_comp) + any_mon_comp = mon_comp; + } + WARN_ON_ONCE(!any_mon_comp); + mon_d = &dom->resctrl_mon_dom; - mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, &mon_d->hdr); + mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, &mon_d->hdr); mon_d->hdr.type = RESCTRL_MON_DOMAIN; /* TODO: this list should be sorted */ list_add_tail(&mon_d->hdr.list, &r->mon_domains); @@ -1044,6 +1138,37 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) return dom; } +/* + * We know all the monitors are associated with the L3, even if there are no + * controls and therefore no control component. Find the cache-id for the CPU + * and use that to search for existing resctrl domains. + * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id + * for anything that is not a cache. + */ +static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) +{ + u32 cache_id; + struct rdt_mon_domain *mon_d; + struct mpam_resctrl_dom *dom; + struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + + if (!l3->class) + return NULL; + /* TODO: how does this order with cacheinfo updates under cpuhp? */ + cache_id = get_cpu_cacheinfo_id(cpu, 3); + if (cache_id == ~0) + return NULL; + + list_for_each_entry(mon_d, &l3->resctrl_res.mon_domains, hdr.list) { + dom = container_of(mon_d, struct mpam_resctrl_dom, resctrl_mon_dom); + + if (mon_d->hdr.id == cache_id) + return dom; + } + + return NULL; +} + /** * mpam_resctrl_get_domain_from_cpu() - find the mpam domain structure * @cpu: The CPU that is going online/offline. @@ -1055,23 +1180,29 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) * domain is online. 
* For platforms with controls, this is easy as each resource has one control * component. + * For the monitors, we need to search the list of events... */ static struct mpam_resctrl_dom * mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) { - struct rdt_ctrl_domain *d; struct mpam_resctrl_dom *dom; + struct rdt_ctrl_domain *ctrl_d; + struct rdt_resource *r = &res->resctrl_res; lockdep_assert_cpus_held(); - list_for_each_entry(d, &res->resctrl_res.ctrl_domains, hdr.list) { - dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + list_for_each_entry(ctrl_d, &r->ctrl_domains, hdr.list) { + dom = container_of(ctrl_d, struct mpam_resctrl_dom, resctrl_ctrl_dom); if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity)) return dom; } - return NULL; + if (r->rid != RDT_RESOURCE_L3) + return NULL; + + /* Search the mon domain list too - needed on monitor only platforms. */ + return mpam_resctrl_get_mon_domain_from_cpu(cpu); } int mpam_resctrl_online_cpu(unsigned int cpu) From 7251cf5c84d99ce07d00f7b7b986729346989d61 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 15 Jul 2025 15:39:36 +0100 Subject: [PATCH 120/247] NVIDIA: SAUCE: arm_mpam: resctrl: Pre-allocate free running monitors BugLink: https://bugs.launchpad.net/bugs/2122432 When there are enough monitors, the resctrl mbm local and total files can be exposed. These need all the monitors that resctrl may use to be allocated up front. Add helpers to do this. If a different candidate class is discovered, the old array should be free'd and the allocated monitors returned to the driver. Signed-off-by: James Morse (cherry picked from commit 8e335dfe5b6b44896228d78f53ca85b6f2df106e https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_internal.h | 8 ++- drivers/resctrl/mpam_resctrl.c | 92 +++++++++++++++++++++++++++++++-- 2 files changed, 94 insertions(+), 6 deletions(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 3559e3a863ebe..f93e0d2d79d14 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -361,7 +361,13 @@ struct mpam_resctrl_res { struct mpam_resctrl_mon { struct mpam_class *class; - /* per-class data that resctrl needs will live here */ + /* + * Array of allocated MBWU monitors, indexed by (closid, rmid). + * When ABMC is not in use, this array directly maps (closid, rmid) + * to the allocated monitor. Otherwise this array is sparse, and + * un-assigned (closid, rmid) are -1. + */ + int *mbwu_idx_to_mon; }; static inline int mpam_alloc_csu_mon(struct mpam_class *class) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 6221789a87229..eb3b68a0b0b3c 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -325,10 +325,12 @@ static bool class_has_usable_mbwu(struct mpam_class *class) * which means we need as many monitors as resctrl has * control/monitor groups. 
*/ - if (!__mpam_monitors_free_running(cprops->num_mbwu_mon)) - return false; + if (__mpam_monitors_free_running(cprops->num_mbwu_mon)) { + pr_debug("monitors usable in free-running mode\n"); + return true; + } - return true; + return false; } /* @@ -603,10 +605,58 @@ static void mpam_resctrl_pick_mba(void) } } +static void __free_mbwu_mon(struct mpam_class *class, int *array, + u16 num_mbwu_mon) +{ + for (int i = 0; i < num_mbwu_mon; i++) { + if (array[i] < 0) + continue; + + mpam_free_mbwu_mon(class, array[i]); + array[i] = ~0; + } +} + +static int __alloc_mbwu_mon(struct mpam_class *class, int *array, + u16 num_mbwu_mon) +{ + for (int i = 0; i < num_mbwu_mon; i++) { + int mbwu_mon = mpam_alloc_mbwu_mon(class); + + if (mbwu_mon < 0) { + __free_mbwu_mon(class, array, num_mbwu_mon); + return mbwu_mon; + } + array[i] = mbwu_mon; + } + + return 0; +} + +static int *__alloc_mbwu_array(struct mpam_class *class, u16 num_mbwu_mon) +{ + int err; + size_t array_size = num_mbwu_mon * sizeof(int); + int *array __free(kfree) = kmalloc(array_size, GFP_KERNEL); + + if (!array) + return ERR_PTR(-ENOMEM); + + memset(array, -1, array_size); + + err = __alloc_mbwu_mon(class, array, num_mbwu_mon); + if (err) + return ERR_PTR(err); + return_ptr(array); +} + static void counter_update_class(enum resctrl_event_id evt_id, struct mpam_class *class) { - struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class; + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evt_id]; + struct mpam_class *existing_class = mon->class; + u16 num_mbwu_mon = class->props.num_mbwu_mon; + int *existing_array = mon->mbwu_idx_to_mon; if (existing_class) { if (class->level == 3) { @@ -619,8 +669,40 @@ static void counter_update_class(enum resctrl_event_id evt_id, } } - mpam_resctrl_counters[evt_id].class = class; + pr_debug("Updating event %u to use class %u\n", evt_id, class->level); + mon->class = class; exposed_mon_capable = true; + + if (evt_id == QOS_L3_OCCUP_EVENT_ID) + return; + + /* 
Might not need all the monitors */ + num_mbwu_mon = __mpam_monitors_free_running(num_mbwu_mon); + if (!num_mbwu_mon) { + pr_debug("Not pre-allocating free-running counters\n"); + return; + } + + /* + * This is the pre-allocated free-running monitors path. It always + * allocates one monitor per PARTID * PMG. + */ + WARN_ON_ONCE(num_mbwu_mon != resctrl_arch_system_num_rmid_idx()); + + mon->mbwu_idx_to_mon = __alloc_mbwu_array(class, num_mbwu_mon); + if (IS_ERR(mon->mbwu_idx_to_mon)) { + pr_debug("Failed to allocate MBWU array\n"); + mon->class = existing_class; + mon->mbwu_idx_to_mon = existing_array; + return; + } + + if (existing_array) { + pr_debug("Releasing previous class %u's monitors\n", + existing_class->level); + __free_mbwu_mon(existing_class, existing_array, num_mbwu_mon); + kfree(existing_array); + } } static void mpam_resctrl_pick_counters(void) From 3cdfd20a7f2e134cbd08fafa41c77da5f619d79d Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 15 Oct 2025 12:33:20 +0100 Subject: [PATCH 121/247] NVIDIA: SAUCE: arm_mpam: resctrl: Pre-allocate assignable monitors BugLink: https://bugs.launchpad.net/bugs/2122432 When there are not enough monitors, MPAM is able to emulate ABMC by making a smaller number of monitors assignable. These monitors still need to be allocated from the driver, and mapped to whichever control/monitor group resctrl wants to use them with. Add a second array to hold the monitor values indexed by resctrl's cntr_id. When CDP is in use, two monitors are needed so the available number of counters halves. Platforms witih one monitor will have zero monitors when CDP is in use. Signed-off-by: James Morse (cherry picked from commit 68431461f7081f02e26d91775a5f96f9ac2ed6d3 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_internal.h | 7 +++ drivers/resctrl/mpam_resctrl.c | 104 +++++++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index f93e0d2d79d14..8831f76b99586 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -368,6 +368,13 @@ struct mpam_resctrl_mon { * un-assigned (closid, rmid) are -1. */ int *mbwu_idx_to_mon; + + /* + * Array of assigned MBWU monitors, indexed by idx argument. + * When ABMC is not in use, this array can be NULL. Otherwise + * it maps idx to the allocated monitor. + */ + int *assigned_counters; }; static inline int mpam_alloc_csu_mon(struct mpam_class *class) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index eb3b68a0b0b3c..dad5371430b5f 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -50,6 +50,12 @@ static bool exposed_mon_capable; */ static bool cdp_enabled; +/* + * L3 local/total may come from different classes - what is the number of MBWU + * 'on L3'? + */ +static unsigned int l3_num_allocated_mbwu = ~0; + /* Whether this num_mbw_mon could result in a free_running system */ static int __mpam_monitors_free_running(u16 num_mbwu_mon) { @@ -58,6 +64,15 @@ static int __mpam_monitors_free_running(u16 num_mbwu_mon) return 0; } +/* + * If l3_num_allocated_mbwu is forced below PARTID * PMG, then the counters + * are not free running, and ABMC's user-interface must be used to assign them. 
+ */ +static bool mpam_resctrl_abmc_enabled(void) +{ + return l3_num_allocated_mbwu < resctrl_arch_system_num_rmid_idx(); +} + bool resctrl_arch_alloc_capable(void) { return exposed_alloc_capable; @@ -102,10 +117,27 @@ static void resctrl_reset_task_closids(void) read_unlock(&tasklist_lock); } +static void mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *l3) +{ + l3->mon.num_mbm_cntrs = l3_num_allocated_mbwu; + if (cdp_enabled) + l3->mon.num_mbm_cntrs /= 2; + + if (l3->mon.num_mbm_cntrs) { + l3->mon.mbm_cntr_assignable = mpam_resctrl_abmc_enabled(); + l3->mon.mbm_assign_on_mkdir = mpam_resctrl_abmc_enabled(); + } else { + l3->mon.mbm_cntr_assignable = false; + l3->mon.mbm_assign_on_mkdir = false; + } +} + int resctrl_arch_set_cdp_enabled(enum resctrl_res_level ignored, bool enable) { - u64 regval; + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; u32 partid, partid_i, partid_d; + u64 regval; cdp_enabled = enable; @@ -122,6 +154,7 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level ignored, bool enable) } resctrl_reset_task_closids(); + mpam_resctrl_monitor_sync_abmc_vals(l3); WRITE_ONCE(arm64_mpam_global_default, regval); @@ -330,6 +363,11 @@ static bool class_has_usable_mbwu(struct mpam_class *class) return true; } + if (cprops->num_mbwu_mon) { + pr_debug("monitors usable via ABMC assignment\n"); + return true; + } + return false; } @@ -630,6 +668,8 @@ static int __alloc_mbwu_mon(struct mpam_class *class, int *array, array[i] = mbwu_mon; } + l3_num_allocated_mbwu = min(l3_num_allocated_mbwu, num_mbwu_mon); + return 0; } @@ -771,6 +811,23 @@ static void mpam_resctrl_pick_counters(void) mpam_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID].class); } +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + if (r != &mpam_resctrl_controls[RDT_RESOURCE_L3].resctrl_res) + return false; + + return mpam_resctrl_abmc_enabled(); +} + +int resctrl_arch_mbm_cntr_assign_set(struct 
rdt_resource *r, bool enable) +{ + lockdep_assert_cpus_held(); + + WARN_ON_ONCE(1); + + return 0; +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, enum resctrl_res_level type) { @@ -851,6 +908,41 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +/* + * This must run after all event counters have been picked so that any free + * running counters have already been allocated. + */ +static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + size_t array_size = resctrl_arch_system_num_rmid_idx() * sizeof(int); + int *rmid_array __free(kfree) = kmalloc(array_size, GFP_KERNEL); + struct rdt_resource *l3 = &res->resctrl_res; + struct mpam_class *class = mon->class; + u16 num_mbwu_mon; + + if (mon->mbwu_idx_to_mon) { + pr_debug("monitors free running\n"); + return 0; + } + + if (!rmid_array) { + pr_debug("Failed to allocate RMID array\n"); + return -ENOMEM; + } + memset(rmid_array, -1, array_size); + + num_mbwu_mon = class->props.num_mbwu_mon; + mon->assigned_counters = __alloc_mbwu_array(mon->class, num_mbwu_mon); + if (IS_ERR(mon->assigned_counters)) + return PTR_ERR(mon->assigned_counters); + mon->mbwu_idx_to_mon = no_free_ptr(rmid_array); + + mpam_resctrl_monitor_sync_abmc_vals(l3); + + return 0; +} + static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, enum resctrl_event_id type) { @@ -892,6 +984,16 @@ static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, * space. 
*/ l3->mon.num_rmid = 1; + + switch (type) { + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + mpam_resctrl_monitor_init_abmc(mon); + + return; + default: + return; + } } } From 0f131be32d022abc61b696e35a3ea45b0603282e Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Oct 2025 14:31:11 +0100 Subject: [PATCH 122/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add kunit test for ABMC/CDP interactions BugLink: https://bugs.launchpad.net/bugs/2122432 ABMC exposes a fun corner case where a platform with one monitor can use ABMC for assignable counters - but not when CDP is enabled. Add some tests. Signed-off-by: James Morse (cherry picked from commit 26ef1ecb0d172ecc0d8b921916bd0a43b0c98bc7 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/test_mpam_resctrl.c | 62 +++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c index bcd7268a8d539..e79f77f4ec0f9 100644 --- a/drivers/resctrl/test_mpam_resctrl.c +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -374,6 +374,67 @@ static void test_percent_to_max_rounding(struct kunit *test) KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total); } +static void test_num_assignable_counters(struct kunit *test) +{ + unsigned int orig_l3_num_allocated_mbwu = l3_num_allocated_mbwu; + u32 orig_mpam_partid_max = mpam_partid_max; + u32 orig_mpam_pmg_max = mpam_pmg_max; + bool orig_cdp_enabled = cdp_enabled; + struct rdt_resource fake_l3; + + /* Force there to be some PARTID/PMG */ + mpam_partid_max = 3; + mpam_pmg_max = 1; + + cdp_enabled = false; + + /* ABMC off, CDP off */ + l3_num_allocated_mbwu = resctrl_arch_system_num_rmid_idx(); + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, resctrl_arch_system_num_rmid_idx()); + KUNIT_EXPECT_FALSE(test, 
fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_assign_on_mkdir); + + /* ABMC on, CDP off */ + l3_num_allocated_mbwu = 4; + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, 4); + KUNIT_EXPECT_TRUE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_TRUE(test, fake_l3.mon.mbm_assign_on_mkdir); + + cdp_enabled = true; + + /* ABMC off, CDP on */ + l3_num_allocated_mbwu = resctrl_arch_system_num_rmid_idx(); + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + + /* (value not consumed by resctrl) */ + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, resctrl_arch_system_num_rmid_idx() / 2); + + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_assign_on_mkdir); + + /* ABMC on, CDP on */ + l3_num_allocated_mbwu = 4; + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, 2); + KUNIT_EXPECT_TRUE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_TRUE(test, fake_l3.mon.mbm_assign_on_mkdir); + + /* ABMC 'on', CDP on - but not enough counters */ + l3_num_allocated_mbwu = 1; + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, 0); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_assign_on_mkdir); + + /* Restore global variables that were messed with */ + l3_num_allocated_mbwu = orig_l3_num_allocated_mbwu; + mpam_partid_max = orig_mpam_partid_max; + mpam_pmg_max = orig_mpam_pmg_max; + cdp_enabled = orig_cdp_enabled; +} + static struct kunit_case mpam_resctrl_test_cases[] = { KUNIT_CASE(test_get_mba_granularity), KUNIT_CASE(test_mbw_pbm_to_percent), @@ -384,6 +445,7 @@ static struct kunit_case mpam_resctrl_test_cases[] = { KUNIT_CASE(test_percent_to_max_rounding), KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability, test_all_bwa_wd_gen_params), + KUNIT_CASE(test_num_assignable_counters), {} }; From 
a5fda58c80716ab7409beb8e15f9b9dd799bf642 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 15 Oct 2025 14:33:11 +0100 Subject: [PATCH 123/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_config_cntr() for ABMC use BugLink: https://bugs.launchpad.net/bugs/2122432 ABMC has a helper resctrl_arch_config_cntr() for changing the mapping between 'cntr_id' and a CLOSID/RMID pair. Add the helper. For MPAM this is done by updating the mon->mbwu_idx_to_mon[] array, and as usual CDP means it needs doing in three different ways. Signed-off-by: James Morse (cherry picked from commit 90f079e1ea8006fdcfeefc862c7420eabbd1ba14 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 37 ++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index dad5371430b5f..69f64e346bac7 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -811,6 +811,43 @@ static void mpam_resctrl_pick_counters(void) mpam_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID].class); } +static void __config_cntr(struct mpam_resctrl_mon *mon, u32 cntr_id, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, + bool assign) +{ + u32 mbwu_idx, mon_idx = resctrl_get_config_index(cntr_id, cdp_type); + + closid = resctrl_get_config_index(closid, cdp_type); + mbwu_idx = resctrl_arch_rmid_idx_encode(closid, rmid); + WARN_ON_ONCE(mon_idx > l3_num_allocated_mbwu); + + if (assign) + mon->mbwu_idx_to_mon[mbwu_idx] = mon->assigned_counters[mon_idx]; + else + mon->mbwu_idx_to_mon[mbwu_idx] = -1; +} + +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mon->mbwu_idx_to_mon || 
!mon->assigned_counters) { + pr_debug("monitor arrays not allocated\n"); + return; + } + + if (cdp_enabled) { + __config_cntr(mon, cntr_id, CDP_CODE, closid, rmid, assign); + __config_cntr(mon, cntr_id, CDP_DATA, closid, rmid, assign); + } else { + __config_cntr(mon, cntr_id, CDP_NONE, closid, rmid, assign); + } + + resctrl_arch_reset_rmid(r, d, closid, rmid, evtid); +} + bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { if (r != &mpam_resctrl_controls[RDT_RESOURCE_L3].resctrl_res) From cd3c424f0e7e23f70b58b8f75539d28d7412cb14 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 25 Jun 2021 13:29:39 +0100 Subject: [PATCH 124/247] NVIDIA: SAUCE: arm_mpam: resctrl: Allow resctrl to allocate monitors BugLink: https://bugs.launchpad.net/bugs/2122432 When resctrl wants to read a domain's 'QOS_L3_OCCUP', it needs to allocate a monitor on the corresponding resource. Monitors are allocated by class instead of component. MBM monitors are much more complicated, if there are enough monitors, they will be pre-allocated and free-running. If ABMC is in use instead then 'some' are pre-allocated in a different way, and need assigning. Add helpers to allocate a CSU monitor. These helper return an out of range value for MBM counters. Allocating a montitor context is expected to block until hardware resources become available. This only makes sense for QOS_L3_OCCUP as unallocated MBM counters are losing data. Signed-off-by: James Morse (cherry picked from commit 19809e491b5777af1ecae83e6dbd4c281348c67e https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_internal.h | 14 ++++++- drivers/resctrl/mpam_resctrl.c | 68 +++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 4 ++ 3 files changed, 85 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 8831f76b99586..5da0f9806377b 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -34,6 +34,14 @@ DECLARE_STATIC_KEY_FALSE(mpam_enabled); #define PACKED_FOR_KUNIT #endif +/* + * This 'mon' values must not alias an actual monitor, so must be larger than + * U16_MAX, but not be confused with an errno value, so smaller than + * (u32)-SZ_4K. + * USE_PRE_ALLOCATED is used to avoid confusion with an actual monitor. + */ +#define USE_PRE_ALLOCATED (U16_MAX + 1) + static inline bool mpam_is_enabled(void) { return static_branch_likely(&mpam_enabled); @@ -209,7 +217,11 @@ enum mon_filter_options { }; struct mon_cfg { - u16 mon; + /* + * mon must be large enough to hold out of range values like + * USE_RMID_IDX + */ + u32 mon; u8 pmg; bool match_pmg; bool csu_exclude_clean; diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 69f64e346bac7..57ff27efb8e95 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -21,6 +21,8 @@ #include "mpam_internal.h" +DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters); + /* * The classes we've picked to map to resctrl resources, wrapped * in with their resctrl structure. 
@@ -291,6 +293,72 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) return &mpam_resctrl_controls[l].resctrl_res; } +static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mon->class) + return -EINVAL; + + switch (evtid) { + case QOS_L3_OCCUP_EVENT_ID: + /* With CDP, one monitor gets used for both code/data reads */ + return mpam_alloc_csu_mon(mon->class); + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + return USE_PRE_ALLOCATED; + default: + return -EOPNOTSUPP; + } +} + +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, + enum resctrl_event_id evtid) +{ + DEFINE_WAIT(wait); + int *ret; + + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return ERR_PTR(-ENOMEM); + + do { + prepare_to_wait(&resctrl_mon_ctx_waiters, &wait, + TASK_INTERRUPTIBLE); + *ret = resctrl_arch_mon_ctx_alloc_no_wait(evtid); + if (*ret == -ENOSPC) + schedule(); + } while (*ret == -ENOSPC && !signal_pending(current)); + finish_wait(&resctrl_mon_ctx_waiters, &wait); + + return ret; +} + +static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid, + u32 mon_idx) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mon->class) + return; + + if (evtid == QOS_L3_OCCUP_EVENT_ID) + mpam_free_csu_mon(mon->class, mon_idx); + + wake_up(&resctrl_mon_ctx_waiters); +} + +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, + enum resctrl_event_id evtid, void *arch_mon_ctx) +{ + u32 mon_idx = *(u32 *)arch_mon_ctx; + + kfree(arch_mon_ctx); + arch_mon_ctx = NULL; + + resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 1dd0f239cad02..383364c597b0d 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -55,6 +55,10 @@ u32 
resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); u32 resctrl_arch_system_num_rmid_idx(void); +struct rdt_resource; +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. From 45e5b401b4bc380c046d1781bf6e6fc51fd76c18 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 25 Jun 2021 16:36:58 +0100 Subject: [PATCH 125/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_rmid_read() and resctrl_arch_reset_rmid() BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl uses resctrl_arch_rmid_read() to read counters. CDP emulation means the counter may need reading in three different ways. The same goes for reset. The helpers behind the resctrl_arch_ functions will be re-used for the ABMC equivalent functions. Add the rounding helper for checking monitor values while we're here. Signed-off-by: James Morse (cherry picked from commit 55e9e165b354401b53c9eefb3100fef56386c393 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 154 +++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 5 ++ 2 files changed, 159 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 57ff27efb8e95..ead402eed8c5e 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -359,6 +359,160 @@ void resctrl_arch_mon_ctx_free(struct rdt_resource *r, resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); } +static int +__read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) +{ + struct mon_cfg cfg = { }; + + if (!mpam_is_enabled()) + return -EINVAL; + + /* Shift closid to account for CDP */ + closid = resctrl_get_config_index(closid, cdp_type); + + if (mon_idx == USE_PRE_ALLOCATED) { + int mbwu_idx = resctrl_arch_rmid_idx_encode(closid, rmid); + mon_idx = mon->mbwu_idx_to_mon[mbwu_idx]; + if (mon_idx == -1) { + if (mpam_resctrl_abmc_enabled()) { + /* Report Unassigned */ + return -ENOENT; + } + /* Report Unavailable */ + return -EINVAL; + } + } + + cfg.mon = mon_idx; + cfg.match_pmg = true; + cfg.partid = closid; + cfg.pmg = rmid; + + if (irqs_disabled()) { + /* Check if we can access this domain without an IPI */ + return -EIO; + } + + return mpam_msmon_read(mon_comp, &cfg, mon_type, val); +} + +static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, u32 closid, u32 rmid, u64 *val) +{ + if (cdp_enabled) { + u64 cdp_val = 0; + int err; + + err = __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_CODE, closid, rmid, &cdp_val); + if (err) + return err; + + err = __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_DATA, closid, rmid, &cdp_val); + if (!err) + *val += cdp_val; + return err; + } + + return __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_NONE, closid, 
rmid, val); +} + +/* MBWU when not in ABMC mode, and CSU counters. */ +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id eventid, + u64 *val, void *arch_mon_ctx) +{ + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + u32 mon_idx = *(u32 *)arch_mon_ctx; + enum mpam_device_features mon_type; + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + + resctrl_arch_rmid_read_context_check(); + + if (eventid >= QOS_NUM_EVENTS || !mon->class) + return -EINVAL; + + l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + mon_comp = l3_dom->mon_comp[eventid]; + + switch (eventid) { + case QOS_L3_OCCUP_EVENT_ID: + mon_type = mpam_feat_msmon_csu; + break; + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + mon_type = mpam_feat_msmon_mbwu; + break; + default: + return -EINVAL; + } + + return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx, + closid, rmid, val); +} + +static void __reset_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + int mon_idx, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid) +{ + struct mon_cfg cfg = { }; + + if (!mpam_is_enabled()) + return; + + /* Shift closid to account for CDP */ + closid = resctrl_get_config_index(closid, cdp_type); + + if (mon_idx == USE_PRE_ALLOCATED) { + int mbwu_idx = resctrl_arch_rmid_idx_encode(closid, rmid); + mon_idx = mon->mbwu_idx_to_mon[mbwu_idx]; + } + + if (mon_idx == -1) + return; + cfg.mon = mon_idx; + mpam_msmon_reset_mbwu(mon_comp, &cfg); +} + +static void reset_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + int mon_idx, u32 closid, u32 rmid) +{ + if (cdp_enabled) { + __reset_mon(mon, mon_comp, mon_idx, CDP_CODE, closid, rmid); + __reset_mon(mon, mon_comp, mon_idx, CDP_DATA, closid, rmid); + } else { + __reset_mon(mon, mon_comp, mon_idx, CDP_NONE, closid, rmid); + } +} + +/* Called via IPI. Call with read_cpus_lock() held. 
*/ +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id eventid) +{ + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + + if (!mpam_is_enabled()) + return; + + /* Only MBWU counters are relevant, and for supported event types. */ + if (eventid == QOS_L3_OCCUP_EVENT_ID || !mon->class) + return; + + l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + mon_comp = l3_dom->mon_comp[eventid]; + + reset_mon_cdp_safe(mon, mon_comp, USE_PRE_ALLOCATED, closid, rmid); +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 383364c597b0d..0b5b49502a7f2 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -42,6 +42,11 @@ static inline int acpi_mpam_count_msc(void) { return -EINVAL; } int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, enum mpam_class_types type, u8 class_id, int component_id); +static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) +{ + return val; +} + bool resctrl_arch_alloc_capable(void); bool resctrl_arch_mon_capable(void); From 4cd7c2a2236ded23947b0bec45ee779e123a9047 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 26 Aug 2025 16:05:07 +0100 Subject: [PATCH 126/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_cntr_read() & resctrl_arch_reset_cntr() BugLink: https://bugs.launchpad.net/bugs/2122432 When used in ABMC mode, resctrl uses a different set of helpers to read and reset the counters. Add these. Signed-off-by: James Morse (cherry picked from commit 89bb5a413682a2a53a6b906baa6fb41e4398e287 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 43 ++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index ead402eed8c5e..dedc153343012 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -458,6 +458,28 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, closid, rmid, val); } +/* MBWU counters when in ABMC mode */ +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int mon_idx, + enum resctrl_event_id eventid, u64 *val) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + + if (!mpam_is_enabled()) + return -EINVAL; + + if (eventid == QOS_L3_OCCUP_EVENT_ID || !mon->class) + return -EINVAL; + + l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + mon_comp = l3_dom->mon_comp[eventid]; + + return read_mon_cdp_safe(mon, mon_comp, mpam_feat_msmon_mbwu, mon_idx, + closid, rmid, val); +} + static void __reset_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, int mon_idx, enum resctrl_conf_type cdp_type, u32 closid, u32 rmid) @@ -513,6 +535,27 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, reset_mon_cdp_safe(mon, mon_comp, USE_PRE_ALLOCATED, closid, rmid); } +/* Reset an assigned counter */ +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + + if (!mpam_is_enabled()) + return; + + if (eventid == QOS_L3_OCCUP_EVENT_ID || !mon->class) + return; + + l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + mon_comp = l3_dom->mon_comp[eventid]; + + 
reset_mon_cdp_safe(mon, mon_comp, USE_PRE_ALLOCATED, closid, rmid); +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; From abbefbebe3b0c5e12f8dab85872c718052b4ad0c Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 15 Oct 2025 16:35:14 +0100 Subject: [PATCH 127/247] NVIDIA: SAUCE: untested: arm_mpam: resctrl: Allow monitors to be configured with filters BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM MSCs may have support for filtering reads or writes when monitoring traffic. Resctrl has a configuration bitmap for which kind of accesses should be monitored. Bridge the gap where possible. MPAM only has a read/write bit, so not all the combinations can be supported. Signed-off-by: James Morse (cherry picked from commit 462f362b912b97933e9c3686af8af81dfd291069 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 33 ++++++++++++ drivers/resctrl/mpam_internal.h | 9 ++++ drivers/resctrl/mpam_resctrl.c | 95 ++++++++++++++++++++++++++++++--- 3 files changed, 130 insertions(+), 7 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index ef6a619ab6f3c..f46bf43fb65d3 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1329,6 +1329,39 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, return err; } +void mpam_msmon_reset_all_mbwu(struct mpam_component *comp) +{ + int idx, i; + struct mpam_msc *msc; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + + if (!mpam_is_enabled()) + return; + + idx = srcu_read_lock(&mpam_srcu); + list_for_each_entry_rcu(vmsc, &comp->vmsc, comp_list) { + if (!mpam_has_feature(mpam_feat_msmon_mbwu, &vmsc->props)) + continue; + + msc = vmsc->msc; + list_for_each_entry_rcu(ris, &msc->ris, vmsc_list) { + if (!mpam_has_feature(mpam_feat_msmon_mbwu, 
&ris->props)) + continue; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + continue; + + for (i = 0; i < ris->props.num_mbwu_mon; i++) { + ris->mbwu_state[i].correction = 0; + ris->mbwu_state[i].reset_on_next_read = true; + } + mpam_mon_sel_unlock(msc); + } + } + srcu_read_unlock(&mpam_srcu, idx); +} + void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx) { struct mpam_msc *msc; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 5da0f9806377b..3b60fc74f7696 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -42,6 +42,12 @@ DECLARE_STATIC_KEY_FALSE(mpam_enabled); */ #define USE_PRE_ALLOCATED (U16_MAX + 1) +/* + * Only these event configuration bits are supported. MPAM can't know if + * data is being written back, these will show up as a write. + */ +#define MPAM_RESTRL_EVT_CONFIG_VALID (READS_TO_LOCAL_MEM | NON_TEMP_WRITE_TO_LOCAL_MEM) + static inline bool mpam_is_enabled(void) { return static_branch_likely(&mpam_enabled); @@ -363,6 +369,8 @@ struct mpam_resctrl_dom { struct rdt_ctrl_domain resctrl_ctrl_dom; struct rdt_mon_domain resctrl_mon_dom; + + u32 mbm_local_evt_cfg; }; struct mpam_resctrl_res { @@ -446,6 +454,7 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid, int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, enum mpam_device_features, u64 *val); void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); +void mpam_msmon_reset_all_mbwu(struct mpam_component *comp); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index dedc153343012..146de188762fe 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -361,7 +361,7 @@ void resctrl_arch_mon_ctx_free(struct rdt_resource *r, static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, - enum 
mpam_device_features mon_type, + enum mpam_device_features mon_type, enum mon_filter_options mon_opts, int mon_idx, enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) { @@ -390,6 +390,7 @@ __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, cfg.match_pmg = true; cfg.partid = closid; cfg.pmg = rmid; + cfg.opts = mon_opts; if (irqs_disabled()) { /* Check if we can access this domain without an IPI */ @@ -400,29 +401,41 @@ __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, } static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, - enum mpam_device_features mon_type, + enum mpam_device_features mon_type, enum mon_filter_options mon_opts, int mon_idx, u32 closid, u32 rmid, u64 *val) { if (cdp_enabled) { u64 cdp_val = 0; int err; - err = __read_mon(mon, mon_comp, mon_type, mon_idx, + err = __read_mon(mon, mon_comp, mon_type, mon_opts, mon_idx, CDP_CODE, closid, rmid, &cdp_val); if (err) return err; - err = __read_mon(mon, mon_comp, mon_type, mon_idx, + err = __read_mon(mon, mon_comp, mon_type, mon_opts, mon_idx, CDP_DATA, closid, rmid, &cdp_val); if (!err) *val += cdp_val; return err; } - return __read_mon(mon, mon_comp, mon_type, mon_idx, + return __read_mon(mon, mon_comp, mon_type, mon_idx, mon_opts, CDP_NONE, closid, rmid, val); } +static enum mon_filter_options resctrl_evt_config_to_mpam(u32 local_evt_cfg) +{ + switch (local_evt_cfg) { + case READS_TO_LOCAL_MEM: + return COUNT_READ; + case NON_TEMP_WRITE_TO_LOCAL_MEM: + return COUNT_WRITE; + default: + return COUNT_BOTH; + } +} + /* MBWU when not in ABMC mode, and CSU counters. 
*/ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid, @@ -430,6 +443,7 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, { struct mpam_resctrl_dom *l3_dom; struct mpam_component *mon_comp; + enum mon_filter_options mon_opts; u32 mon_idx = *(u32 *)arch_mon_ctx; enum mpam_device_features mon_type; struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; @@ -441,6 +455,7 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); mon_comp = l3_dom->mon_comp[eventid]; + mon_opts = resctrl_evt_config_to_mpam(l3_dom->mbm_local_evt_cfg); switch (eventid) { case QOS_L3_OCCUP_EVENT_ID: @@ -454,7 +469,7 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, return -EINVAL; } - return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx, + return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_opts, mon_idx, closid, rmid, val); } @@ -464,6 +479,7 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, enum resctrl_event_id eventid, u64 *val) { struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + enum mon_filter_options mon_opts; struct mpam_resctrl_dom *l3_dom; struct mpam_component *mon_comp; @@ -475,9 +491,10 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); mon_comp = l3_dom->mon_comp[eventid]; + mon_opts = resctrl_evt_config_to_mpam(l3_dom->mbm_local_evt_cfg); return read_mon_cdp_safe(mon, mon_comp, mpam_feat_msmon_mbwu, mon_idx, - closid, rmid, val); + mon_opts, closid, rmid, val); } static void __reset_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, @@ -1076,6 +1093,67 @@ static void mpam_resctrl_pick_counters(void) mpam_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID].class); } +bool 
resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +{ + struct mpam_class *class; + struct mpam_props *cprops; + + class = mpam_resctrl_counters[evt].class; + if (!class) + return false; + + cprops = &class->props; + + return mpam_has_feature(mpam_feat_msmon_mbwu_rwbw, cprops); +} + +void resctrl_arch_mon_event_config_read(void *info) +{ + struct mpam_resctrl_dom *dom; + struct resctrl_mon_config_info *mon_info = info; + + dom = container_of(mon_info->d, struct mpam_resctrl_dom, resctrl_mon_dom); + mon_info->mon_config = dom->mbm_local_evt_cfg & MAX_EVT_CONFIG_BITS; +} + +void resctrl_arch_mon_event_config_write(void *info) +{ + struct mpam_resctrl_dom *dom; + struct resctrl_mon_config_info *mon_info = info; + + WARN_ON_ONCE(mon_info->mon_config & ~MPAM_RESTRL_EVT_CONFIG_VALID); + + dom = container_of(mon_info->d, struct mpam_resctrl_dom, resctrl_mon_dom); + dom->mbm_local_evt_cfg = mon_info->mon_config & MPAM_RESTRL_EVT_CONFIG_VALID; +} + +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + int i; + struct mpam_resctrl_dom *dom; + struct mpam_resctrl_mon *mon; + struct mpam_component *mon_comp; + + dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + dom->mbm_local_evt_cfg = MPAM_RESTRL_EVT_CONFIG_VALID; + + /* + * Monitors may be backed by different classes of MSC, all + * possible components need to be reset... 
+ */ + for (i = 0; i < QOS_NUM_EVENTS; i++) { + mon = &mpam_resctrl_counters[i]; + if (!mon->class) + continue; // dummy resource + + mon_comp = dom->mon_comp[i]; + if (!mon_comp) + continue; + + mpam_msmon_reset_all_mbwu(mon_comp); + } +} + static void __config_cntr(struct mpam_resctrl_mon *mon, u32 cntr_id, enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, bool assign) @@ -1291,6 +1369,7 @@ static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, case QOS_L3_MBM_LOCAL_EVENT_ID: case QOS_L3_MBM_TOTAL_EVENT_ID: mpam_resctrl_monitor_init_abmc(mon); + l3->mon.mbm_cfg_mask = MPAM_RESTRL_EVT_CONFIG_VALID; return; default: @@ -1601,6 +1680,8 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) } WARN_ON_ONCE(!any_mon_comp); + dom->mbm_local_evt_cfg = MPAM_RESTRL_EVT_CONFIG_VALID; + mon_d = &dom->resctrl_mon_dom; mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, &mon_d->hdr); mon_d->hdr.type = RESCTRL_MON_DOMAIN; From 94a2c654e9c4a015cc7d60585c9e9082ddba3770 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 14 Jul 2021 15:34:16 +0100 Subject: [PATCH 128/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add empty definitions for fine-grained enables BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl has individual hooks to separately enable and disable the closid/partid and rmid/pmg context switching code. For MPAM this is all the same thing, as the value in struct task_struct is used to cache the value that should be written to hardware. arm64's context switching code is enabled once MPAM is usable, but doesn't touch the hardware unless the value has changed. Resctrl doesn't need to ask. Add empty definitions for these hooks. Signed-off-by: James Morse (cherry picked from commit 6987dc9761651ec64f7d7e24f1348381c0e15fe1 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- include/linux/arm_mpam.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 0b5b49502a7f2..c05d5d5557e8c 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -64,6 +64,15 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); +/* + * The CPU configuration for MPAM is cheap to write, and is only written if it + * has changed. No need for fine grained enables. + */ +static inline void resctrl_arch_enable_mon(void) { } +static inline void resctrl_arch_disable_mon(void) { } +static inline void resctrl_arch_enable_alloc(void) { } +static inline void resctrl_arch_disable_alloc(void) { } + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. From cb5d2d6c782c7687b5d4e918b56f4a881b3b40ef Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 14 Jul 2021 15:40:17 +0100 Subject: [PATCH 129/247] NVIDIA: SAUCE: arm64: mpam: Select ARCH_HAS_CPU_RESCTRL BugLink: https://bugs.launchpad.net/bugs/2122432 Enough MPAM support is present to enable ARCH_HAS_CPU_RESCTRL. Let it rip^Wlink! ARCH_HAS_CPU_RESCTRL indicates resctrl can be enabled. It is enabled by the arch code simply because it has 'arch' in its name. This removes ARM_CPU_RESCTRL as a mimic of X86_CPU_RESCTRL and defines a dummy ARM64_MPAM_DRIVER to hold the bits and pieces relevant to the MPAM driver. While here, move the ACPI dependency to the driver's Kconfig file. Signed-off-by: James Morse (cherry picked from commit a423179873ede78076d49ecdba128fc2eee1a3e0 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R.
Ochs Acked-by: Noah Wager --- arch/arm64/Kconfig | 4 ++-- arch/arm64/include/asm/resctrl.h | 2 ++ drivers/resctrl/Kconfig | 9 ++++++++- drivers/resctrl/Makefile | 2 +- 4 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 arch/arm64/include/asm/resctrl.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 34502faac486c..da0aaf0d5a635 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2063,8 +2063,8 @@ config ARM64_TLB_RANGE config ARM64_MPAM bool "Enable support for MPAM" - select ARM64_MPAM_DRIVER if EXPERT # does nothing yet - select ACPI_MPAM if ACPI + select ARM64_MPAM_DRIVER + select ARCH_HAS_CPU_RESCTRL help Memory System Resource Partitioning and Monitoring (MPAM) is an optional extension to the Arm architecture that allows each diff --git a/arch/arm64/include/asm/resctrl.h b/arch/arm64/include/asm/resctrl.h new file mode 100644 index 0000000000000..b506e95cf6e37 --- /dev/null +++ b/arch/arm64/include/asm/resctrl.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index a2e9a61304617..15163aabcdc93 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -1,6 +1,7 @@ menuconfig ARM64_MPAM_DRIVER bool "MPAM driver" - depends on ARM64 && ARM64_MPAM && EXPERT + depends on ARM64 && ARM64_MPAM + select ACPI_MPAM if ACPI help MPAM driver for System IP, e,g. caches and memory controllers. @@ -20,3 +21,9 @@ config MPAM_KUNIT_TEST If unsure, say N. 
endif + +config ARM64_MPAM_RESCTRL_FS + bool + default y if ARM64_MPAM_DRIVER && RESCTRL_FS + select RESCTRL_RMID_DEPENDS_ON_CLOSID + select RESCTRL_ASSIGN_FIXED diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile index 40beaf999582c..4f6d0e81f9b8f 100644 --- a/drivers/resctrl/Makefile +++ b/drivers/resctrl/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o mpam-y += mpam_devices.o -mpam-$(CONFIG_ARM_CPU_RESCTRL) += mpam_resctrl.o +mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS) += mpam_resctrl.o ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG From a8a8b17e1d5bf59d5822fdce9cc5467c9f8f15a4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 12 Jun 2024 13:55:55 +0100 Subject: [PATCH 130/247] NVIDIA: SAUCE: fs/resctrl: Don't touch rmid_ptrs[] in free_rmid() when there are no monitors BugLink: https://bugs.launchpad.net/bugs/2122432 On platforms with no monitors the rmid_ptrs[] array is not allocated. The rmid on these platforms is likely to be '0' for all control groups, which may lead to free_rmid() being called on rmid 0. Dave points out that the index == (0,0) check to skip freeing of a non-existent rmid is not sufficient on MPAM because the provided closid may be non-zero. The index can't be used to spot this case. Instead, check if there are any resctrl monitors enabled. This avoids a null pointer dereference in free_rmid() when control groups are freed. It isn't possible to hit this on x86 platforms. This patch to be replaced by one from Dave. Reported-by: Dave Martin Tested-by: Shaopeng Tan Tested-by: Shanker Donthineni # arm64 Signed-off-by: James Morse (cherry picked from commit 45108503cd8740ec7deb1280f9e98c41806f53d9 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R.
Ochs Acked-by: Noah Wager --- fs/resctrl/monitor.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 4076336fbba6d..cd681f8a2dc24 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -324,6 +324,10 @@ void free_rmid(u32 closid, u32 rmid) lockdep_assert_held(&rdtgroup_mutex); + /* rmid_ptrs[] not allocated if there are no monitors */ + if (!resctrl_arch_mon_capable()) + return; + /* * Do not allow the default rmid to be free'd. Comparing by index * allows architectures that ignore the closid parameter to avoid an From 30a6b9f6ae6de66ca114080d85befa7af966a9db Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 10 Jun 2024 17:20:48 +0100 Subject: [PATCH 131/247] NVIDIA: SAUCE: fs/resctrl: Avoid a race with dom_data_exit() and closid_num_dirty_rmid[] BugLink: https://bugs.launchpad.net/bugs/2122432 On MPAM systems if an error occurs the architecture code will call resctrl_exit(). This calls dom_data_exit() which takes the rdtgroup_mutex and kfree()s closid_num_dirty_rmid[]. It is possible that another syscall tries to access that same array in the meantime, but is blocked on the mutex. Once dom_data_exit() completes, that syscall will see a NULL pointer. Pull the IS_ENABLED() Kconfig checks into a helper and additionally check that the array has been allocated. This will cause callers to fallback to the regular CLOSID allocation strategy. Signed-off-by: James Morse (cherry picked from commit bf8d1f6d5f76b91a57e77a5eb5a33ae975abeec5 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R.
Ochs Acked-by: Noah Wager --- fs/resctrl/monitor.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index cd681f8a2dc24..09c4b821408d8 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -113,6 +113,20 @@ static inline struct rmid_entry *__rmid_entry(u32 idx) return entry; } +static bool __has_closid_num_dirty_rmid_array(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + return false; + + /* + * Avoid a race with dom_data_exit() freeing the array under + * rdtgroup_mutex. + */ + return closid_num_dirty_rmid; +} + static void limbo_release_entry(struct rmid_entry *entry) { lockdep_assert_held(&rdtgroup_mutex); @@ -120,7 +134,7 @@ static void limbo_release_entry(struct rmid_entry *entry) rmid_limbo_count--; list_add_tail(&entry->list, &rmid_free_lru); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + if (__has_closid_num_dirty_rmid_array()) closid_num_dirty_rmid[entry->closid]--; } @@ -240,7 +254,7 @@ int resctrl_find_cleanest_closid(void) lockdep_assert_held(&rdtgroup_mutex); - if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + if (!__has_closid_num_dirty_rmid_array()) return -EIO; for (i = 0; i < closids_supported(); i++) { @@ -313,7 +327,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) } rmid_limbo_count++; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + if (__has_closid_num_dirty_rmid_array()) closid_num_dirty_rmid[entry->closid]++; } From 8b01e71e3d680f7c7a5d742cd8e19cf7044bfd29 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 10 Jun 2024 17:41:58 +0100 Subject: [PATCH 132/247] NVIDIA: SAUCE: fs/resctrl: Avoid a race with dom_data_exit() and rmid_ptrs[] BugLink: https://bugs.launchpad.net/bugs/2122432 On MPAM systems if an error occurs the architecture code will call resctrl_exit(). This calls dom_data_exit() which takes the rdtgroup_mutex and kfree()s rmid_ptrs[].
It is possible that another syscall tries to access that same array in the meantime, but is blocked on the mutex. Once dom_data_exit() completes, that syscall will see a NULL pointer. Make __rmid_entry() return NULL in this case. Neither __check_limbo() nor free_rmid() return an error, and can silently stop their work if this occurs. dom_data_init() has only just allocated the array and still holds the lock, so __rmid_entry() should never return NULL here. Signed-off-by: James Morse (cherry picked from commit 73ab158cafae3b4dd5a16dd1154922811a51253f https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- fs/resctrl/monitor.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 09c4b821408d8..392376e5a2751 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -98,12 +98,17 @@ unsigned int resctrl_rmid_realloc_limit; * * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code * must accept an attempt to read every index. + * + * Returns NULL if the rmid_ptrs[] array is not allocated. 
*/ static inline struct rmid_entry *__rmid_entry(u32 idx) { struct rmid_entry *entry; u32 closid, rmid; + if (!rmid_ptrs) + return NULL; + entry = &rmid_ptrs[idx]; resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); @@ -173,6 +178,8 @@ void __check_limbo(struct rdt_mon_domain *d, bool force_free) break; entry = __rmid_entry(idx); + if (!entry) + break; if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, QOS_L3_OCCUP_EVENT_ID, &val, arch_mon_ctx)) { @@ -353,6 +360,8 @@ void free_rmid(u32 closid, u32 rmid) return; entry = __rmid_entry(idx); + if (!entry) + return; if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) add_rmid_to_limbo(entry); @@ -917,6 +926,7 @@ static int dom_data_init(struct rdt_resource *r) idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID); entry = __rmid_entry(idx); + WARN_ON_ONCE(!entry); list_del(&entry->list); out_unlock: From 9bbfe09d38a166157b321a9366680343c3812c39 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 5 Dec 2023 16:18:37 +0000 Subject: [PATCH 133/247] NVIDIA: SAUCE: perf/arm-cmn: Stop claiming all the resources BugLink: https://bugs.launchpad.net/bugs/2122432 Carl reports that when both the MPAM driver and CMN driver are built into the kernel, they fight over who can claim the resources associated with their registers. This prevents the second of these two drivers from probing. Currently the CMN PMU driver claims all the CMN registers. The MPAM registers are grouped together in a small number of pages, whereas the PMU registers that the CMN PMU driver uses appear throughout the CMN register space. Having the CMN driver claim all the resources is the wrong thing to do, and claiming individual registers here and there is not worthwhile. Instead, stop the CMN driver from claiming any resources as its registers are not grouped together. 
Reported-by: Carl Worth Tested-by: Carl Worth CC: Ilkka Koskinen Signed-off-by: James Morse (cherry picked from commit 3d864c2cc58ae9b79a3fe4565f478cfcfa0f6568 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/perf/arm-cmn.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 23245352a3fc0..fa2a5867659f2 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -2547,6 +2547,7 @@ static int arm_cmn_probe(struct platform_device *pdev) struct arm_cmn *cmn; const char *name; static atomic_t id; + struct resource *cfg; int err, rootnode, this_id; cmn = devm_kzalloc(&pdev->dev, sizeof(*cmn), GFP_KERNEL); @@ -2562,7 +2563,16 @@ static int arm_cmn_probe(struct platform_device *pdev) rootnode = arm_cmn600_acpi_probe(pdev, cmn); } else { rootnode = 0; - cmn->base = devm_platform_ioremap_resource(pdev, 0); + + /* + * Avoid requesting resources as the PMUs registers are + * scattered through CMN, and may appear either side of + * registers for other 'devices'. (e.g. the MPAM MSC controls). + */ + cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!cfg) + return -EINVAL; + cmn->base = devm_ioremap(&pdev->dev, cfg->start, resource_size(cfg)); if (IS_ERR(cmn->base)) return PTR_ERR(cmn->base); if (cmn->part == PART_CMN600) From b71b5d39711a51a91e77180e75cee389fe238f53 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 19 Aug 2021 15:06:55 +0100 Subject: [PATCH 134/247] NVIDIA: SAUCE: arm_mpam: resctrl: Call resctrl_init() on platforms that can support resctrl BugLink: https://bugs.launchpad.net/bugs/2122432 Now that MPAM links against resctrl, call resctrl_init() to register the filesystem and setup resctrl's structures.
Signed-off-by: James Morse (cherry picked from commit 0a8aafc76188c5b81ffeba1a35111742c8fa754d https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 146de188762fe..80f7ef15db83e 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1915,7 +1915,7 @@ int mpam_resctrl_setup(void) pr_warn("Number of PMG is not a power of 2! resctrl may misbehave"); } - /* TODO: call resctrl_init() */ + err = resctrl_init(); } return err; From 83dd14b97184fdb7055d63ae10579bce7e9b030c Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 2 Nov 2021 12:45:26 +0000 Subject: [PATCH 135/247] NVIDIA: SAUCE: arm_mpam: resctrl: Call resctrl_exit() in the event of errors BugLink: https://bugs.launchpad.net/bugs/2122432 All of MPAMs errors indicate a software bug, e.g. an out-of-bounds partid has been generated. When this happens, the mpam driver is disabled. If resctrl_init() succeeded, also call resctrl_exit() to remove resctrl. mpam_devices.c calls mpam_resctrl_teardown_class() when a class becomes incomplete, and can no longer be used by resctrl. If resctrl was using this class, then resctrl_exit() is called. This in turn removes the kernfs hierarchy from the filesystem and free()s memory that was allocated by resctrl. Signed-off-by: James Morse (cherry picked from commit 2ec400c93754dc78c52165a8bef0c4ac9e3cbb8f https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 30 +++++++++- drivers/resctrl/mpam_internal.h | 4 ++ drivers/resctrl/mpam_resctrl.c | 97 +++++++++++++++++++++++++++++++++ 3 files changed, 128 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index f46bf43fb65d3..941d8e47efae9 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -76,6 +76,14 @@ static DECLARE_WORK(mpam_broken_work, &mpam_disable); /* When mpam is disabled, the printed reason to aid debugging */ static char *mpam_disable_reason; +/* + * Whether has been setup. Used by cpuhp in preference to mpam_is_enabled() + * the disable call after an error interrupt makes mpam_is_enabled() false before + * the cpuhp callbacks are made. + * Reads/writes should hold mpam_cpuhp_state_lock, (or be cpuhp callbacks). + */ +static bool mpam_resctrl_enabled; + /* * An MSC is a physical container for controls and monitors, each identified by * their RIS index. These share a base-address, interrupts and some MMIO @@ -428,6 +436,12 @@ static void mpam_ris_destroy(struct mpam_msc_ris *ris) lockdep_assert_held(&mpam_list_lock); + /* + * Once a RIS has been removed from a class, it can no longer be used + * by resctrl, even though the class has yet to be removed. + */ + mpam_resctrl_teardown_class(class); + /* * It is assumed affinities don't overlap. If they do the class becomes * unusable immediately. 
@@ -1735,7 +1749,7 @@ static int mpam_cpu_online(unsigned int cpu) mpam_reprogram_msc(msc); } - if (mpam_is_enabled()) + if (mpam_resctrl_enabled) mpam_resctrl_online_cpu(cpu); return 0; @@ -1794,7 +1808,7 @@ static int mpam_cpu_offline(unsigned int cpu) mpam_reset_msc(msc, false); } - if (mpam_is_enabled()) + if (mpam_resctrl_enabled) mpam_resctrl_offline_cpu(cpu); return 0; @@ -2742,6 +2756,7 @@ static void mpam_enable_once(void) } static_branch_enable(&mpam_enabled); + mpam_resctrl_enabled = true; mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, "mpam:online"); @@ -2802,17 +2817,26 @@ void mpam_reset_class(struct mpam_class *class) void mpam_disable(struct work_struct *ignored) { int idx; + bool do_resctrl_exit; struct mpam_class *class; struct mpam_msc *msc, *tmp; + if (mpam_is_enabled()) + static_branch_disable(&mpam_enabled); + mutex_lock(&mpam_cpuhp_state_lock); if (mpam_cpuhp_state) { cpuhp_remove_state(mpam_cpuhp_state); mpam_cpuhp_state = 0; } + + /* mpam_cpu_offline() tells resctrl all the CPUs are offline. 
*/ + do_resctrl_exit = mpam_resctrl_enabled; + mpam_resctrl_enabled = false; mutex_unlock(&mpam_cpuhp_state_lock); - static_branch_disable(&mpam_enabled); + if (do_resctrl_exit) + mpam_resctrl_exit(); mpam_unregister_irqs(); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 3b60fc74f7696..94fa10d9c4b41 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -461,12 +461,16 @@ int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, #ifdef CONFIG_RESCTRL_FS int mpam_resctrl_setup(void); +void mpam_resctrl_exit(void); int mpam_resctrl_online_cpu(unsigned int cpu); int mpam_resctrl_offline_cpu(unsigned int cpu); +void mpam_resctrl_teardown_class(struct mpam_class *class); #else static inline int mpam_resctrl_setup(void) { return 0; } +static inline void mpam_resctrl_exit(void) { } static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; } static inline int mpam_resctrl_offline_cpu(unsigned int cpu) { return 0; } +static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { } #endif /* CONFIG_RESCTRL_FS */ /* diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 80f7ef15db83e..3e16e5c84ad36 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -52,6 +52,12 @@ static bool exposed_mon_capable; */ static bool cdp_enabled; +/* + * If resctrl_init() succeeded, resctrl_exit() can be used to remove support + * for the filesystem in the event of an error. + */ +static bool resctrl_enabled; + /* * L3 local/total may come from different classes - what is the number of MBWU * 'on L3'? 
@@ -297,6 +303,9 @@ static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid) { struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + if (!mpam_is_enabled()) + return -EINVAL; + if (!mon->class) return -EINVAL; @@ -339,6 +348,9 @@ static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid, { struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + if (!mpam_is_enabled()) + return; + if (!mon->class) return; @@ -450,6 +462,9 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, resctrl_arch_rmid_read_context_check(); + if (!mpam_is_enabled()) + return -EINVAL; + if (eventid >= QOS_NUM_EVENTS || !mon->class) return -EINVAL; @@ -1112,6 +1127,11 @@ void resctrl_arch_mon_event_config_read(void *info) struct mpam_resctrl_dom *dom; struct resctrl_mon_config_info *mon_info = info; + if (!mpam_is_enabled()) { + mon_info->mon_config = 0; + return; + } + dom = container_of(mon_info->d, struct mpam_resctrl_dom, resctrl_mon_dom); mon_info->mon_config = dom->mbm_local_evt_cfg & MAX_EVT_CONFIG_BITS; } @@ -1124,6 +1144,12 @@ void resctrl_arch_mon_event_config_write(void *info) WARN_ON_ONCE(mon_info->mon_config & ~MPAM_RESTRL_EVT_CONFIG_VALID); dom = container_of(mon_info->d, struct mpam_resctrl_dom, resctrl_mon_dom); + + if (!mpam_is_enabled()) { + dom->mbm_local_evt_cfg = 0; + return; + } + dom->mbm_local_evt_cfg = mon_info->mon_config & MPAM_RESTRL_EVT_CONFIG_VALID; } @@ -1135,6 +1161,10 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * struct mpam_component *mon_comp; dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + if (!mpam_is_enabled()) { + dom->mbm_local_evt_cfg = 0; + return; + } dom->mbm_local_evt_cfg = MPAM_RESTRL_EVT_CONFIG_VALID; /* @@ -1460,6 +1490,9 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, lockdep_assert_cpus_held(); lockdep_assert_irqs_enabled(); + if (!mpam_is_enabled()) + return -EINVAL; + /* * 
NOTE: don't check the CPU as mpam_apply_config() doesn't care, * and resctrl_arch_update_domains() depends on this. @@ -1531,6 +1564,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) lockdep_assert_cpus_held(); lockdep_assert_irqs_enabled(); + if (!mpam_is_enabled()) + return -EINVAL; + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { for (t = 0; t < CDP_NUM_TYPES; t++) { cfg = &d->staged_config[t]; @@ -1916,11 +1952,72 @@ int mpam_resctrl_setup(void) } err = resctrl_init(); + if (!err) + WRITE_ONCE(resctrl_enabled, true); } return err; } +void mpam_resctrl_exit(void) +{ + if (!READ_ONCE(resctrl_enabled)) + return; + + WRITE_ONCE(resctrl_enabled, false); + resctrl_exit(); +} + +static void mpam_resctrl_teardown_mon(struct mpam_resctrl_mon *mon, struct mpam_class *class) +{ + u32 num_mbwu_mon = l3_num_allocated_mbwu; + + if (!mon->mbwu_idx_to_mon) + return; + + if (mon->assigned_counters) { + __free_mbwu_mon(class, mon->assigned_counters, num_mbwu_mon); + mon->assigned_counters = NULL; + kfree(mon->mbwu_idx_to_mon); + } else { + __free_mbwu_mon(class, mon->mbwu_idx_to_mon, num_mbwu_mon); + } + mon->mbwu_idx_to_mon = NULL; +} + +/* + * The driver is detaching an MSC from this class, if resctrl was using it, + * pull on resctrl_exit(). 
+ */ +void mpam_resctrl_teardown_class(struct mpam_class *class) +{ + int i; + struct mpam_resctrl_res *res; + struct mpam_resctrl_mon *mon; + + might_sleep(); + + for (i = 0; i < RDT_NUM_RESOURCES; i++) { + res = &mpam_resctrl_controls[i]; + if (res->class == class) { + mpam_resctrl_exit(); + res->class = NULL; + break; + } + } + for (i = 0; i < QOS_NUM_EVENTS; i++) { + mon = &mpam_resctrl_counters[i]; + if (mon->class == class) { + mpam_resctrl_exit(); + mon->class = NULL; + + mpam_resctrl_teardown_mon(mon, class); + + break; + } + } +} + #ifdef CONFIG_MPAM_KUNIT_TEST #include "test_mpam_resctrl.c" #endif From 0724a732f53fcab202c9617aa9f38577304a6f73 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Jul 2021 19:01:23 +0100 Subject: [PATCH 136/247] NVIDIA: SAUCE: arm_mpam: resctrl: Update the rmid reallocation limit BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl's limbo code needs to be told when the data left in a cache is small enough for the partid+pmg value to be re-allocated. x86 uses the cache size divided by the number of rmid users the cache may have. Do the same, but for the smallest cache, and with the number of partid-and-pmg users. Querying the cache size can't happen until after cacheinfo_sysfs_init() has run, so mpam_resctrl_setup() must wait until then. Signed-off-by: James Morse (cherry picked from commit ada71431a70aaf209a47164601f770751fcdda41 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 54 ++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 3e16e5c84ad36..37f52343f32cf 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -58,6 +59,13 @@ static bool cdp_enabled; */ static bool resctrl_enabled; +/* + * mpam_resctrl_pick_caches() needs to know the size of the caches. cacheinfo + * populates this from a device_initcall(). mpam_resctrl_setup() must wait. + */ +static bool cacheinfo_ready; +static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); + /* * L3 local/total may come from different classes - what is the number of MBWU * 'on L3'? @@ -588,6 +596,24 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, reset_mon_cdp_safe(mon, mon_comp, USE_PRE_ALLOCATED, closid, rmid); } +/* + * The rmid realloc threshold should be for the smallest cache exposed to + * resctrl. 
+ */ +static void update_rmid_limits(unsigned int size) +{ + u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx(); + + if (WARN_ON_ONCE(!size)) + return; + + if (resctrl_rmid_realloc_limit && size > resctrl_rmid_realloc_limit) + return; + + resctrl_rmid_realloc_limit = size; + resctrl_rmid_realloc_threshold = size / num_unique_pmg; +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; @@ -856,6 +882,8 @@ static void mpam_resctrl_pick_caches(void) struct mpam_class *class; struct mpam_resctrl_res *res; + lockdep_assert_cpus_held(); + guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(class, &mpam_classes, classes_list, srcu_read_lock_held(&mpam_srcu)) { @@ -1045,6 +1073,7 @@ static void counter_update_class(enum resctrl_event_id evt_id, static void mpam_resctrl_pick_counters(void) { struct mpam_class *class; + unsigned int cache_size; bool has_csu, has_mbwu; lockdep_assert_cpus_held(); @@ -1052,6 +1081,8 @@ static void mpam_resctrl_pick_counters(void) guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(class, &mpam_classes, classes_list, srcu_read_lock_held(&mpam_srcu)) { + struct mpam_props *cprops = &class->props; + if (class->level < 3) { pr_debug("class %u is before L3", class->level); continue; @@ -1069,6 +1100,18 @@ static void mpam_resctrl_pick_counters(void) /* CSU counters only make sense on a cache. */ switch (class->type) { case MPAM_CLASS_CACHE: + /* Assume cache levels are the same size for all CPUs... 
*/ + cache_size = get_cpu_cacheinfo_size(smp_processor_id(), + class->level); + if (!cache_size) { + pr_debug("Could not read cache size for class %u\n", + class->level); + continue; + } + + if (mpam_has_feature(mpam_feat_msmon_csu, cprops)) + update_rmid_limits(cache_size); + counter_update_class(QOS_L3_OCCUP_EVENT_ID, class); return; default: @@ -1894,6 +1937,8 @@ int mpam_resctrl_setup(void) struct mpam_resctrl_res *res; struct mpam_resctrl_mon *mon; + wait_event(wait_cacheinfo_ready, cacheinfo_ready); + cpus_read_lock(); for (i = 0; i < RDT_NUM_RESOURCES; i++) { res = &mpam_resctrl_controls[i]; @@ -2018,6 +2063,15 @@ void mpam_resctrl_teardown_class(struct mpam_class *class) } } +static int __init __cacheinfo_ready(void) +{ + cacheinfo_ready = true; + wake_up(&wait_cacheinfo_ready); + + return 0; +} +device_initcall_sync(__cacheinfo_ready); + #ifdef CONFIG_MPAM_KUNIT_TEST #include "test_mpam_resctrl.c" #endif From 183f700e08bf0f6ecd664772d2e34c6c2a17c913 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 23 Jul 2025 12:13:52 +0100 Subject: [PATCH 137/247] NVIDIA: SAUCE: arm_mpam: resctrl: Sort the order of the domain lists BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl documents that the domains appear in numeric order in the schemata file. This means a little more work is needed when bringing a domain online. Add the support for this, using resctrl_find_domain() to find the point to insert in the list. Signed-off-by: James Morse (cherry picked from commit 5dbc45a51de72184bf73bdd3c8628c3619fd2bdc https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 37f52343f32cf..0ea76b7783b6b 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1685,6 +1685,21 @@ static struct mpam_component *find_component(struct mpam_class *victim, int cpu) return NULL; } +static void mpam_resctrl_domain_insert(struct list_head *list, + struct rdt_domain_hdr *new) +{ + struct rdt_domain_hdr *err; + struct list_head *pos = NULL; + + lockdep_assert_held(&domain_list_lock); + + err = resctrl_find_domain(list, new->id, &pos); + if (WARN_ON_ONCE(err)) + return; + + list_add_tail_rcu(&new->list, pos); +} + static struct mpam_resctrl_dom * mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) { @@ -1723,8 +1738,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) ctrl_d = &dom->resctrl_ctrl_dom; mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, &ctrl_d->hdr); ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; - /* TODO: this list should be sorted */ - list_add_tail(&ctrl_d->hdr.list, &r->ctrl_domains); + mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); err = resctrl_online_ctrl_domain(r, ctrl_d); if (err) { dom = ERR_PTR(err); @@ -1764,8 +1778,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) mon_d = &dom->resctrl_mon_dom; mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, &mon_d->hdr); mon_d->hdr.type = RESCTRL_MON_DOMAIN; - /* TODO: this list should be sorted */ - list_add_tail(&mon_d->hdr.list, &r->mon_domains); + mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); err = resctrl_online_mon_domain(r, mon_d); if (err) { dom = ERR_PTR(err); From 5547ddb42d0caf33e979d704fb813617943a1ce5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 20 Feb 2024 16:43:33 +0000 Subject: [PATCH 138/247] NVIDIA: SAUCE: arm_mpam: Generate a 
configuration for min controls BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM supports a minimum and maximum control for memory bandwidth. The purpose of the minimum control is to give priority to tasks that are below their minimum value. Resctrl only provides one value for the bandwidth configuration, which is used for the maximum. The minimum control is always programmed to zero on hardware that supports it. Generate a minimum bandwidth value that is 5% lower than the value provided by resctrl. This means tasks that are not receiving their target bandwidth can be prioritised by the hardware. CC: Zeng Heng Signed-off-by: James Morse (cherry picked from commit 8263cccc5ad53523c6a070199a0f49c9d789ed37 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 68 +++++++++++++++++++++++++++-- drivers/resctrl/mpam_internal.h | 2 + drivers/resctrl/test_mpam_devices.c | 66 ++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 4 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 941d8e47efae9..3c0d6da1e6ba5 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -797,6 +797,13 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) mpam_set_feature(mpam_feat_mbw_part, props); props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features); + + /* + * The BWA_WD field can represent 0-63, but the control fields it + * describes have a maximum of 16 bits. 
+ */ + props->bwa_wd = min(props->bwa_wd, 16); + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) mpam_set_feature(mpam_feat_mbw_max, props); @@ -1481,7 +1488,7 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, if (mpam_has_feature(mpam_feat_mbw_min, rprops) && mpam_has_feature(mpam_feat_mbw_min, cfg)) - mpam_write_partsel_reg(msc, MBW_MIN, 0); + mpam_write_partsel_reg(msc, MBW_MIN, cfg->mbw_min); if (mpam_has_feature(mpam_feat_mbw_max, rprops) && mpam_has_feature(mpam_feat_mbw_max, cfg)) @@ -2916,24 +2923,77 @@ static bool mpam_update_config(struct mpam_config *cfg, maybe_update_config(cfg, mpam_feat_cpor_part, newcfg, cpbm, has_changes); maybe_update_config(cfg, mpam_feat_mbw_part, newcfg, mbw_pbm, has_changes); maybe_update_config(cfg, mpam_feat_mbw_max, newcfg, mbw_max, has_changes); + maybe_update_config(cfg, mpam_feat_mbw_min, newcfg, mbw_min, has_changes); return has_changes; } +static void mpam_extend_config(struct mpam_class *class, struct mpam_config *cfg) +{ + struct mpam_props *cprops = &class->props; + u16 min, min_hw_granule, delta; + u16 max_hw_value, res0_bits; + + /* + * MAX and MIN should be set together. If only one is provided, + * generate a configuration for the other. If only one control + * type is supported, the other value will be ignored. + * + * Resctrl can only configure the MAX. + */ + if (mpam_has_feature(mpam_feat_mbw_max, cfg) && + !mpam_has_feature(mpam_feat_mbw_min, cfg)) { + /* + * Calculate the values the 'min' control can hold. + * e.g. on a platform with bwa_wd = 8, min_hw_granule is 0x00ff + * because those bits are RES0. Configurations of this value + * are effectively zero. But configurations need to saturate + * at min_hw_granule on systems with mismatched bwa_wd, where + * the 'less than 0' values are implemented on some MSC, but + * not others. 
+ */ + res0_bits = 16 - cprops->bwa_wd; + max_hw_value = ((1 << cprops->bwa_wd) - 1) << res0_bits; + min_hw_granule = ~max_hw_value; + + delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1; + if (cfg->mbw_max > delta) + min = cfg->mbw_max - delta; + else + min = 0; + + cfg->mbw_min = max(min, min_hw_granule); + mpam_set_feature(mpam_feat_mbw_min, cfg); + } +} + int mpam_apply_config(struct mpam_component *comp, u16 partid, - struct mpam_config *cfg) + struct mpam_config *user_cfg) { struct mpam_write_config_arg arg; struct mpam_msc_ris *ris; + struct mpam_config cfg; struct mpam_vmsc *vmsc; struct mpam_msc *msc; lockdep_assert_cpus_held(); + /* Don't pass in the current config! */ - WARN_ON_ONCE(&comp->cfg[partid] == cfg); + WARN_ON_ONCE(&comp->cfg[partid] == user_cfg); + + /* + * Copy the config to avoid writing back the 'extended' version to + * the caller. + * This avoids mpam_devices.c setting a mbm_min that mpam_resctrl.c + * is unaware of ... when it then changes mbm_max to be lower than + * mbm_min. 
+ */ + cfg = *user_cfg; + + mpam_extend_config(comp->class, &cfg); - if (!mpam_update_config(&comp->cfg[partid], cfg)) + if (!mpam_update_config(&comp->cfg[partid], &cfg)) return 0; arg.comp = comp; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 94fa10d9c4b41..c8045544b6faa 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -286,6 +286,7 @@ struct mpam_config { u32 cpbm; u32 mbw_pbm; u16 mbw_max; + u16 mbw_min; bool reset_cpbm; bool reset_mbw_pbm; @@ -629,6 +630,7 @@ static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { } * MPAMCFG_MBW_MAX - MPAM memory maximum bandwidth partitioning configuration * register */ +#define MPAMCFG_MBW_MAX_MAX_NR_BITS 16 #define MPAMCFG_MBW_MAX_MAX GENMASK(15, 0) #define MPAMCFG_MBW_MAX_HARDLIM BIT(31) diff --git a/drivers/resctrl/test_mpam_devices.c b/drivers/resctrl/test_mpam_devices.c index 3e8d564a0c647..55d0278c19941 100644 --- a/drivers/resctrl/test_mpam_devices.c +++ b/drivers/resctrl/test_mpam_devices.c @@ -322,6 +322,71 @@ static void test_mpam_enable_merge_features(struct kunit *test) mutex_unlock(&mpam_list_lock); } +static void test_mpam_extend_config(struct kunit *test) +{ + struct mpam_config fake_cfg = { 0 }; + struct mpam_class fake_class = { 0 }; + + /* Configurations with both are not modified */ + fake_class.props.bwa_wd = 16; + fake_cfg.mbw_max = 0xfeef; + fake_cfg.mbw_min = 0xfeef; + bitmap_zero(fake_cfg.features, MPAM_FEATURE_LAST); + mpam_set_feature(mpam_feat_mbw_max, &fake_cfg); + mpam_set_feature(mpam_feat_mbw_min, &fake_cfg); + mpam_extend_config(&fake_class, &fake_cfg); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_max, &fake_cfg)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_min, &fake_cfg)); + KUNIT_EXPECT_EQ(test, fake_cfg.mbw_max, 0xfeef); + KUNIT_EXPECT_EQ(test, fake_cfg.mbw_min, 0xfeef); + + /* When a min is missing, it is generated */ + fake_class.props.bwa_wd = 16; + fake_cfg.mbw_max 
= 0xfeef; + fake_cfg.mbw_min = 0; + bitmap_zero(fake_cfg.features, MPAM_FEATURE_LAST); + mpam_set_feature(mpam_feat_mbw_max, &fake_cfg); + mpam_extend_config(&fake_class, &fake_cfg); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_max, &fake_cfg)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_min, &fake_cfg)); + KUNIT_EXPECT_EQ(test, fake_cfg.mbw_max, 0xfeef); + KUNIT_EXPECT_EQ(test, fake_cfg.mbw_min, 0xf224); + + fake_class.props.bwa_wd = 8; + fake_cfg.mbw_max = 0xfeef; + fake_cfg.mbw_min = 0; + bitmap_zero(fake_cfg.features, MPAM_FEATURE_LAST); + mpam_set_feature(mpam_feat_mbw_max, &fake_cfg); + mpam_extend_config(&fake_class, &fake_cfg); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_max, &fake_cfg)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_min, &fake_cfg)); + KUNIT_EXPECT_EQ(test, fake_cfg.mbw_max, 0xfeef); + KUNIT_EXPECT_EQ(test, fake_cfg.mbw_min, 0xf224); + + /* 5% below the minimum granule, is still the minimum granule */ + fake_class.props.bwa_wd = 12; + fake_cfg.mbw_max = 0xf; + fake_cfg.mbw_min = 0; + bitmap_zero(fake_cfg.features, MPAM_FEATURE_LAST); + mpam_set_feature(mpam_feat_mbw_max, &fake_cfg); + mpam_extend_config(&fake_class, &fake_cfg); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_max, &fake_cfg)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_min, &fake_cfg)); + KUNIT_EXPECT_EQ(test, fake_cfg.mbw_max, 0xf); + KUNIT_EXPECT_EQ(test, fake_cfg.mbw_min, 0xf); + + fake_class.props.bwa_wd = 16; + fake_cfg.mbw_max = 0x4; + fake_cfg.mbw_min = 0; + bitmap_zero(fake_cfg.features, MPAM_FEATURE_LAST); + mpam_set_feature(mpam_feat_mbw_max, &fake_cfg); + mpam_extend_config(&fake_class, &fake_cfg); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_max, &fake_cfg)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_min, &fake_cfg)); + KUNIT_EXPECT_EQ(test, fake_cfg.mbw_max, 0x4); + KUNIT_EXPECT_EQ(test, fake_cfg.mbw_min, 0x0); +} + static void test_mpam_reset_msc_bitmap(struct 
kunit *test) { char __iomem *buf = kunit_kzalloc(test, SZ_16K, GFP_KERNEL); @@ -378,6 +443,7 @@ static struct kunit_case mpam_devices_test_cases[] = { KUNIT_CASE(test_mpam_reset_msc_bitmap), KUNIT_CASE(test_mpam_enable_merge_features), KUNIT_CASE(test__props_mismatch), + KUNIT_CASE(test_mpam_extend_config), {} }; From dd240ab987d1a23bbe89abf1d3e10f1083bb3d9e Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 12 Mar 2024 14:36:34 +0000 Subject: [PATCH 139/247] NVIDIA: SAUCE: arm_mpam: Add quirk framework BugLink: https://bugs.launchpad.net/bugs/2122432 The MPAM specification includes the MPAMF_IIDR, which serves to uniquely identify the MSC implementation through a combination of implementer details, product ID, variant, and revision. Certain hardware issues/errata can be resolved using software workarounds. Introduce a quirk framework to allow workarounds to be enabled based on the MPAMF_IIDR value. [ morse: Stash the IIDR so this doesn't need an IPI, enable quirks only once, move the description to the callback so it can be pr_once()d, add an enum of workarounds for popular errata. Add macros for making lists of product/revision/vendor half readable ] Signed-off-by: Shanker Donthineni Signed-off-by: James Morse (cherry picked from commit 50a3fcf9c5b60c8f14206b6b93010d5153e29091 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 26 ++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 27 +++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 3c0d6da1e6ba5..9cf9f6a565088 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -706,6 +706,25 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, return ERR_PTR(-ENOENT); } +static const struct mpam_quirk mpam_quirks[] = { + { NULL }, /* Sentinel */ +}; + +static void mpam_enable_quirks(struct mpam_msc *msc) +{ + const struct mpam_quirk *quirk; + + for (quirk = &mpam_quirks[0]; quirk->iidr_mask; quirk++) { + if (quirk->iidr != (msc->iidr & quirk->iidr_mask)) + continue; + + if (quirk->init) + quirk->init(msc, quirk); + else + mpam_set_quirk(quirk->workaround, msc); + } +} + /* * IHI009A.a has this nugget: "If a monitor does not support automatic behaviour * of NRDY, software can use this bit for any purpose" - so hardware might not @@ -938,8 +957,11 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) /* Grab an IDR value to find out how many RIS there are */ mutex_lock(&msc->part_sel_lock); idr = mpam_msc_read_idr(msc); + msc->iidr = mpam_read_partsel_reg(msc, IIDR); mutex_unlock(&msc->part_sel_lock); + mpam_enable_quirks(msc); + msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr); /* Use these values so partid/pmg always starts with a valid value */ @@ -2327,6 +2349,7 @@ static void __props_mismatch(struct mpam_props *parent, * nobble the class feature, as we can't configure all the resources. * e.g. The L3 cache is composed of two resources with 13 and 17 portion * bitmaps respectively. + * Quirks on an MSC will apply to all MSC in that class. 
*/ static void __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) @@ -2340,6 +2363,9 @@ __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n", (long)cprops->features, (long)vprops->features); + /* Merge quirks */ + class->quirks |= vmsc->msc->quirks; + /* Take the safe value for any common features */ __props_mismatch(cprops, vprops, false); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index c8045544b6faa..79fe28ef2c9fb 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -96,6 +96,8 @@ struct mpam_msc { u8 pmg_max; unsigned long ris_idxs; u32 ris_max; + u32 iidr; + u16 quirks; /* * error_irq_lock is taken when registering/unregistering the error @@ -215,6 +217,30 @@ struct mpam_props { #define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) #define mpam_clear_feature(_feat, x) clear_bit(_feat, (x)->features) +/* Workaround bits for msc->quirks */ +enum mpam_device_quirks { + MPAM_QUIRK_LAST, +}; + +#define mpam_has_quirk(_quirk, x) ((1 << (_quirk) & (x)->quirks)) +#define mpam_set_quirk(_quirk, x) ((x)->quirks |= (1 << (_quirk))) + +struct mpam_quirk { + void (*init)(struct mpam_msc *msc, const struct mpam_quirk *quirk); + + u32 iidr; + u32 iidr_mask; + + enum mpam_device_quirks workaround; +}; + +#define IIDR_PROD(x) ((x) << MPAMF_IIDR_PRODUCTID_SHIFT) +#define IIDR_VAR(x) ((x) << MPAMF_IIDR_VARIANT_SHIFT) +#define IIDR_REV(x) ((x) << MPAMF_IIDR_REVISON_SHIFT) +#define IIDR_IMP(x) ((x) << MPAMF_IIDR_IMPLEMENTER_SHIFT) + +#define IIDR_MATCH_ONE (IIDR_PROD(0xfff) | IIDR_VAR(0xf) | IIDR_REV(0xf) | IIDR_IMP(0xfff)) + /* The values for MSMON_CFG_MBWU_FLT.RWBW */ enum mon_filter_options { COUNT_BOTH = 0, @@ -267,6 +293,7 @@ struct mpam_class { struct mpam_props props; u32 nrdy_usec; + u16 quirks; u8 level; enum mpam_class_types type; From fe49cb6247e742405764e1613394c58a7bb972a5 Mon Sep 17 
00:00:00 2001 From: Shanker Donthineni Date: Tue, 12 Mar 2024 15:52:43 +0000 Subject: [PATCH 140/247] NVIDIA: SAUCE: arm_mpam: Add workaround for T241-MPAM-1 BugLink: https://bugs.launchpad.net/bugs/2122432 The MPAM bandwidth partitioning controls will not be correctly configured, and hardware will retain default configuration register values, meaning generally that bandwidth will remain unprovisioned. To address the issue, follow the below steps after updating the MBW_MIN and/or MBW_MAX registers. - Perform 64b reads from all 12 bridge MPAM shadow registers at offsets (0x360048 + slice*0x10000 + partid*8). These registers are read-only. - Continue iterating until all 12 shadow register values match in a loop. pr_warn_once if the values fail to match within the loop count 1000. - Perform 64b writes with the value 0x0 to the two spare registers at offsets 0x1b0000 and 0x1c0000. In the hardware, writes to the MPAMCFG_MBW_MAX MPAMCFG_MBW_MIN registers are transformed into broadcast writes to the 12 shadow registers. The final two writes to the spare registers cause a final rank of downstream micro-architectural MPAM registers to be updated from the shadow copies. The intervening loop to read the 12 shadow registers helps avoid a race condition where writes to the spare registers occur before all shadow registers have been updated. [ morse: Merged the min/max update into a single mpam_quirk_post_config_change() helper. Stashed the t241_id in the msc instead of carrying the physical address around. Test the msc quirk bit instead of a static key. ] Signed-off-by: Shanker Donthineni Signed-off-by: James Morse (cherry picked from commit 12faaa68b875e7bb73ab9894cdbddbd2d76907de https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- Documentation/arch/arm64/silicon-errata.rst | 2 + drivers/resctrl/mpam_devices.c | 87 +++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 4 + 3 files changed, 93 insertions(+) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index a7ec57060f64f..4e86b85fe3d63 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -246,6 +246,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 GICv3/4.x | T241-FABRIC-4 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-1 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 9cf9f6a565088..aed005088449e 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -32,6 +32,16 @@ #include "mpam_internal.h" +/* Values for the T241 errata workaround */ +#define T241_CHIPS_MAX 4 +#define T241_CHIP_NSLICES 12 +#define T241_SPARE_REG0_OFF 0x1b0000 +#define T241_SPARE_REG1_OFF 0x1c0000 +#define T241_CHIP_ID(phys) FIELD_GET(GENMASK_ULL(44, 43), phys) +#define T241_SHADOW_REG_OFF(sidx, pid) (0x360048 + (sidx) * 0x10000 + (pid) * 8) +#define SMCCC_SOC_ID_T241 0x036b0241 +static void __iomem *t241_scratch_regs[T241_CHIPS_MAX]; + /* * mpam_list_lock protects the SRCU lists when writing. 
Once the * mpam_enabled key is enabled these lists are read-only, @@ -706,7 +716,44 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, return ERR_PTR(-ENOENT); } +static void mpam_enable_quirk_nvidia_t241(struct mpam_msc *msc, + const struct mpam_quirk *quirk) +{ + s32 soc_id = arm_smccc_get_soc_id_version(); + struct resource *r; + phys_addr_t phys; + + /* + * A mapping to a device other than the MSC is needed, check + * SOC_ID is NVIDIA T241 chip (036b:0241) + */ + if (soc_id < 0 || soc_id != SMCCC_SOC_ID_T241) + return; + + r = platform_get_resource(msc->pdev, IORESOURCE_MEM, 0); + if (!r) + return; + + /* Find the internal registers base addr from the CHIP ID */ + msc->t241_id = T241_CHIP_ID(r->start); + phys = FIELD_PREP(GENMASK_ULL(45, 44), msc->t241_id) | 0x19000000ULL; + + t241_scratch_regs[msc->t241_id] = ioremap(phys, SZ_8M); + if (WARN_ON_ONCE(!t241_scratch_regs[msc->t241_id])) + return; + + mpam_set_quirk(quirk->workaround, msc); + pr_info_once("Enabled workaround for NVIDIA T241 erratum T241-MPAM-1\n"); +} + static const struct mpam_quirk mpam_quirks[] = { + { + /* NVIDIA t241 erratum T241-MPAM-1 */ + .init = mpam_enable_quirk_nvidia_t241, + .iidr = IIDR_PROD(0x241) | IIDR_VAR(0) | IIDR_REV(0) | IIDR_IMP(0x36b), + .iidr_mask = IIDR_MATCH_ONE, + .workaround = T241_SCRUB_SHADOW_REGS, + }, { NULL }, /* Sentinel */ }; @@ -1464,6 +1511,44 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) __mpam_write_reg(msc, reg, bm); } +static void mpam_apply_t241_erratum(struct mpam_msc_ris *ris, u16 partid) +{ + int sidx, i, lcount = 1000; + void __iomem *regs; + u64 val0, val; + + regs = t241_scratch_regs[ris->vmsc->msc->t241_id]; + + for (i = 0; i < lcount; i++) { + /* Read the shadow register at index 0 */ + val0 = readq_relaxed(regs + T241_SHADOW_REG_OFF(0, partid)); + + /* Check if all the shadow registers have the same value */ + for (sidx = 1; sidx < T241_CHIP_NSLICES; sidx++) { + val = readq_relaxed(regs + + 
T241_SHADOW_REG_OFF(sidx, partid)); + if (val != val0) + break; + } + if (sidx == T241_CHIP_NSLICES) + break; + } + + if (i == lcount) + pr_warn_once("t241: inconsistent values in shadow regs"); + + /* Write a value zero to spare registers to take effect of MBW conf */ + writeq_relaxed(0, regs + T241_SPARE_REG0_OFF); + writeq_relaxed(0, regs + T241_SPARE_REG1_OFF); +} + +static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid, + struct mpam_config *cfg) +{ + if (mpam_has_quirk(T241_SCRUB_SHADOW_REGS, ris->vmsc->msc)) + mpam_apply_t241_erratum(ris, partid); +} + /* Called via IPI. Call while holding an SRCU reference */ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) @@ -1545,6 +1630,8 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_write_partsel_reg(msc, PRI, pri_val); } + mpam_quirk_post_config_change(ris, partid, cfg); + mutex_unlock(&msc->part_sel_lock); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 79fe28ef2c9fb..83b5788235d0b 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -135,6 +135,9 @@ struct mpam_msc { void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; + /* Values only used on some platforms for quirks */ + u32 t241_id; + struct mpam_garbage garbage; }; @@ -219,6 +222,7 @@ struct mpam_props { /* Workaround bits for msc->quirks */ enum mpam_device_quirks { + T241_SCRUB_SHADOW_REGS, MPAM_QUIRK_LAST, }; From 7c873f77b5c4fa16e1cfc477dc03f07abb8c19c8 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Thu, 14 Mar 2024 13:47:52 +0000 Subject: [PATCH 141/247] NVIDIA: SAUCE: arm_mpam: Add workaround for T241-MPAM-4 BugLink: https://bugs.launchpad.net/bugs/2122432 In the T241 implementation of memory-bandwidth partitioning, in the absence of contention for bandwidth, the minimum bandwidth setting can affect the amount of achieved bandwidth. 
Specifically, the achieved bandwidth in the absence of contention can settle to any value between the values of MPAMCFG_MBW_MIN and MPAMCFG_MBW_MAX. Also, if MPAMCFG_MBW_MIN is set to zero (below 0.78125%), once a core enters a throttled state, it will never leave that state. The first issue is not a concern if the MPAM software allows programming MPAMCFG_MBW_MIN through the sysfs interface. This patch ensures MBW_MIN=1 (0.78125%) is programmed whenever MPAMCFG_MBW_MIN=0 is programmed. In the scenario where resctrl doesn't support the MBW_MIN interface via sysfs, to achieve bandwidth closer to MBW_MAX in the absence of contention, software should configure a relatively narrow gap between MBW_MIN and MBW_MAX. The recommendation is to use a 5% gap to mitigate the problem. [ morse: Added as second quirk, adapted to use the new intermediate values in mpam_extend_config() ] Signed-off-by: Shanker Donthineni Signed-off-by: James Morse (cherry picked from commit 10bd74f26abf1b193cfc939c0d5a6f06a6b88a13 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- Documentation/arch/arm64/silicon-errata.rst | 2 + drivers/resctrl/mpam_devices.c | 63 ++++++++++++++++----- drivers/resctrl/mpam_internal.h | 1 + 3 files changed, 52 insertions(+), 14 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 4e86b85fe3d63..b18bc704d4a1e 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -248,6 +248,8 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 MPAM | T241-MPAM-1 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-4 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index aed005088449e..cfc9603eda1fa 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -754,6 +754,12 @@ static const struct mpam_quirk mpam_quirks[] = { .iidr_mask = IIDR_MATCH_ONE, .workaround = T241_SCRUB_SHADOW_REGS, }, + { + /* NVIDIA t241 erratum T241-MPAM-4 */ + .iidr = IIDR_PROD(0x241) | IIDR_VAR(0) | IIDR_REV(0) | IIDR_IMP(0x36b), + .iidr_mask = IIDR_MATCH_ONE, + .workaround = T241_FORCE_MBW_MIN_TO_ONE, + }, { NULL }, /* Sentinel */ }; @@ -1740,6 +1746,22 @@ static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST); } +/* + * This is not part of mpam_init_reset_cfg() as high level callers have the + * class, and low level callers a ris. + */ +static void mpam_wa_t241_force_mbw_min_to_one(struct mpam_config *cfg, + struct mpam_props *props) +{ + u16 max_hw_value, min_hw_granule, res0_bits; + + res0_bits = 16 - props->bwa_wd; + max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits; + min_hw_granule = ~max_hw_value; + + cfg->mbw_min = min_hw_granule + 1; +} + /* * Called via smp_call_on_cpu() to prevent migration, while still being * pre-emptible. Caller must hold mpam_srcu. 
@@ -1749,11 +1771,14 @@ static int mpam_reset_ris(void *arg) struct mpam_config reset_cfg; struct mpam_msc_ris *ris = arg; struct reprogram_ris reprogram_arg; + struct mpam_msc *msc = ris->vmsc->msc; if (ris->in_reset_state) return 0; mpam_init_reset_cfg(&reset_cfg); + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) + mpam_wa_t241_force_mbw_min_to_one(&reset_cfg, &ris->props); reprogram_arg.ris = ris; reprogram_arg.cfg = &reset_cfg; @@ -2744,14 +2769,19 @@ static void __destroy_component_cfg(struct mpam_component *comp) static void mpam_reset_component_cfg(struct mpam_component *comp) { int i; + struct mpam_class *class = comp->class; mpam_assert_partid_sizes_fixed(); if (!comp->cfg) return; - for (i = 0; i < mpam_partid_max + 1; i++) + for (i = 0; i < mpam_partid_max + 1; i++) { mpam_init_reset_cfg(&comp->cfg[i]); + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class)) + mpam_wa_t241_force_mbw_min_to_one(&comp->cfg[i], + &class->props); + } } static int __allocate_component_cfg(struct mpam_component *comp) @@ -3047,6 +3077,18 @@ static void mpam_extend_config(struct mpam_class *class, struct mpam_config *cfg u16 min, min_hw_granule, delta; u16 max_hw_value, res0_bits; + /* + * Calculate the values the 'min' control can hold. + * e.g. on a platform with bwa_wd = 8, min_hw_granule is 0x00ff because + * those bits are RES0. Configurations of this value are effectively + * zero. But configurations need to saturate at min_hw_granule on + * systems with mismatched bwa_wd, where the 'less than 0' values are + * implemented on some MSC, but not others. + */ + res0_bits = 16 - cprops->bwa_wd; + max_hw_value = ((1 << cprops->bwa_wd) - 1) << res0_bits; + min_hw_granule = ~max_hw_value; + /* * MAX and MIN should be set together. If only one is provided, * generate a configuration for the other. 
If only one control @@ -3056,19 +3098,6 @@ static void mpam_extend_config(struct mpam_class *class, struct mpam_config *cfg */ if (mpam_has_feature(mpam_feat_mbw_max, cfg) && !mpam_has_feature(mpam_feat_mbw_min, cfg)) { - /* - * Calculate the values the 'min' control can hold. - * e.g. on a platform with bwa_wd = 8, min_hw_granule is 0x00ff - * because those bits are RES0. Configurations of this value - * are effectively zero. But configurations need to saturate - * at min_hw_granule on systems with mismatched bwa_wd, where - * the 'less than 0' values are implemented on some MSC, but - * not others. - */ - res0_bits = 16 - cprops->bwa_wd; - max_hw_value = ((1 << cprops->bwa_wd) - 1) << res0_bits; - min_hw_granule = ~max_hw_value; - delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1; if (cfg->mbw_max > delta) min = cfg->mbw_max - delta; @@ -3078,6 +3107,12 @@ static void mpam_extend_config(struct mpam_class *class, struct mpam_config *cfg cfg->mbw_min = max(min, min_hw_granule); mpam_set_feature(mpam_feat_mbw_min, cfg); } + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class) && + cfg->mbw_min <= min_hw_granule) { + cfg->mbw_min = min_hw_granule + 1; + mpam_set_feature(mpam_feat_mbw_min, cfg); + } } int mpam_apply_config(struct mpam_component *comp, u16 partid, diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 83b5788235d0b..bf9b6af9acd5c 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -223,6 +223,7 @@ struct mpam_props { /* Workaround bits for msc->quirks */ enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, + T241_FORCE_MBW_MIN_TO_ONE, MPAM_QUIRK_LAST, }; From 40cfabc34a4ab23b01f1cabb5bf4c043f2727b59 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 9 Jul 2024 12:35:50 -0500 Subject: [PATCH 142/247] NVIDIA: SAUCE: arm_mpam: Add workaround for T241-MPAM-6 BugLink: https://bugs.launchpad.net/bugs/2122432 The registers MSMON_MBWU_L and MSMON_MBWU return the number of requests rather than 
the number of bytes transferred. Bandwidth resource monitoring is performed at the last level cache, where each request arrive in 64Byte granularity. The current implementation returns the number of transactions received at the last level cache but does not provide the value in bytes. Scaling by 64 gives an accurate byte count to match the MPAM specification for the MSMON_MBWU and MSMON_MBWU_L registers. This patch fixes the issue by reporting the actual number of bytes instead of the number of transactions from __ris_msmon_read(). Signed-off-by: Shanker Donthineni Signed-off-by: James Morse (cherry picked from commit 2b078dcf83ba6b559ce9d296197a970d26dd0d0e https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- Documentation/arch/arm64/silicon-errata.rst | 2 ++ drivers/resctrl/mpam_devices.c | 17 +++++++++++++++-- drivers/resctrl/mpam_internal.h | 1 + 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index b18bc704d4a1e..e810b2a8f40eb 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -250,6 +250,8 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 MPAM | T241-MPAM-4 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-6 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index cfc9603eda1fa..16bacc29b8b1f 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -760,6 +760,12 @@ static const struct mpam_quirk mpam_quirks[] = { .iidr_mask = IIDR_MATCH_ONE, .workaround = T241_FORCE_MBW_MIN_TO_ONE, }, + { + /* NVIDIA t241 erratum T241-MPAM-6 */ + .iidr = IIDR_PROD(0x241) | IIDR_VAR(0) | IIDR_REV(0) | IIDR_IMP(0x36b), + .iidr_mask = IIDR_MATCH_ONE, + .workaround = T241_MBW_COUNTER_SCALE_64, + }, { NULL }, /* Sentinel */ }; @@ -1302,14 +1308,21 @@ static void __ris_msmon_read(void *arg) now = FIELD_GET(MSMON___VALUE, now); } + if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc)) + now *= 64; + if (nrdy) break; mbwu_state = &ris->mbwu_state[ctx->mon]; /* Add any pre-overflow value to the mbwu_state->val */ - if (mbwu_state->prev_val > now) - overflow_val = mpam_msmon_overflow_val(m->type) - mbwu_state->prev_val; + if (mbwu_state->prev_val > now) { + overflow_val = mpam_msmon_overflow_val(m->type); + if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc)) + overflow_val *= 64; + overflow_val -= mbwu_state->prev_val; + } mbwu_state->prev_val = now; mbwu_state->correction += overflow_val; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index bf9b6af9acd5c..d593c72bea8db 100644 --- a/drivers/resctrl/mpam_internal.h +++ 
b/drivers/resctrl/mpam_internal.h @@ -224,6 +224,7 @@ struct mpam_props { enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, T241_FORCE_MBW_MIN_TO_ONE, + T241_MBW_COUNTER_SCALE_64, MPAM_QUIRK_LAST, }; From 868e1e479f63777396ba7e788f08b93566f90666 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 26 Jun 2024 18:00:37 +0100 Subject: [PATCH 143/247] NVIDIA: SAUCE: arm_mpam: Quirk CMN-650's CSU NRDY behaviour BugLink: https://bugs.launchpad.net/bugs/2122432 CMN-650 is afflicted with an erratum where the CSU NRDY bit never clears. This tells us the monitor never finishes scanning the cache. The erratum document says to wait the maximum time, then ignore the field. Add a flag to indicate whether this is the final attempt to read the counter, and when this quirk is applied, ignore the NRDY field. This means accesses to this counter will always retry, even if the counter was previously programmed to the same values. The counter value is not expected to be stable, it drifts up and down with each allocation and eviction. The CSU register provides the value for a point in time. Signed-off-by: James Morse (cherry picked from commit b1ebe89c8e35dfd01d176b2d8913aa60d97f6195 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- Documentation/arch/arm64/silicon-errata.rst | 3 +++ drivers/resctrl/mpam_devices.c | 12 ++++++++++++ drivers/resctrl/mpam_internal.h | 1 + 3 files changed, 16 insertions(+) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index e810b2a8f40eb..3667650036fba 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -213,6 +213,9 @@ stable kernels. 
| ARM | GIC-700 | #2941627 | ARM64_ERRATUM_2941627 | +----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ +| ARM | CMN-650 | #3642720 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_845719 | +----------------+-----------------+-----------------+-----------------------------+ | Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_843419 | diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 16bacc29b8b1f..94ee8745809f1 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -766,6 +766,12 @@ static const struct mpam_quirk mpam_quirks[] = { .iidr_mask = IIDR_MATCH_ONE, .workaround = T241_MBW_COUNTER_SCALE_64, }, + { + /* ARM CMN-650 CSU erratum 3642720 */ + .iidr = IIDR_PROD(0) | IIDR_VAR(0) | IIDR_REV(0) | IIDR_IMP(0x43b), + .iidr_mask = IIDR_MATCH_ONE, + .workaround = IGNORE_CSU_NRDY, + }, { NULL }, /* Sentinel */ }; @@ -1071,6 +1077,7 @@ struct mon_read { enum mpam_device_features type; u64 *val; int err; + bool waited_timeout; }; static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris) @@ -1288,6 +1295,10 @@ static void __ris_msmon_read(void *arg) if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) nrdy = now & MSMON___NRDY; now = FIELD_GET(MSMON___VALUE, now); + + if (mpam_has_quirk(IGNORE_CSU_NRDY, msc) && m->waited_timeout) + nrdy = false; + break; case mpam_feat_msmon_mbwu_31counter: case mpam_feat_msmon_mbwu_44counter: @@ -1429,6 +1440,7 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, .ctx = ctx, .type = type, .val = val, + .waited_timeout = true, }; *val = 0; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index d593c72bea8db..72ddbc9054d80 
100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -225,6 +225,7 @@ enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, T241_FORCE_MBW_MIN_TO_ONE, T241_MBW_COUNTER_SCALE_64, + IGNORE_CSU_NRDY, MPAM_QUIRK_LAST, }; From 33ad6962d20c35fc4abbb66ca377b5956d4b7450 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 22 Dec 2022 17:01:52 +0000 Subject: [PATCH 144/247] NVIDIA: SAUCE: debugfs: Add helpers for creating cpumask entries in debugfs BugLink: https://bugs.launchpad.net/bugs/2122432 debugfs has handy helpers to make a bool, integer or string available through debugfs. Add helpers to do the same for cpumasks. These are read only. CC: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 411842a48495d025d509446507b2f4faa1eecad5 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- fs/debugfs/file.c | 64 +++++++++++++++++++++++++++++++++++++++++ include/linux/debugfs.h | 6 ++++ 2 files changed, 70 insertions(+) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 3ec3324c20603..849688b3fce0a 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1140,6 +1140,70 @@ void debugfs_create_str(const char *name, umode_t mode, &fops_str_ro, &fops_str_wo); } +static ssize_t debugfs_read_file_cpumask(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct dentry *dentry = F_DENTRY(file); + struct cpumask *cpumask; + char *kernel_buf; + ssize_t ret; + int len; + + ret = debugfs_file_get(dentry); + if (unlikely(ret)) + return ret; + + /* How long is a piece of string? 
*/ + kernel_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!kernel_buf) { + debugfs_file_put(dentry); + return -ENOMEM; + } + + cpumask = (struct cpumask *)file->private_data; + len = scnprintf(kernel_buf, PAGE_SIZE, + "%*pb\n", cpumask_pr_args(cpumask)); + debugfs_file_put(dentry); + if (len + 1 >= PAGE_SIZE) { + kfree(kernel_buf); + return -EIO; + } + + ret = simple_read_from_buffer(user_buf, count, ppos, kernel_buf, len); + kfree(kernel_buf); + + return ret; +} + +static const struct file_operations fops_cpumask_ro = { + .read = debugfs_read_file_cpumask, + .open = simple_open, + .llseek = default_llseek, +}; + +/** + * debugfs_create_cpumask - create a read-only debugfs file that is used to read a cpumask + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. 
+ */ +void debugfs_create_cpumask(const char *name, umode_t mode, + struct dentry *parent, struct cpumask *value) +{ + /* Only read-only is supported */ + WARN_ON_ONCE(mode & S_IWUGO); + + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_cpumask_ro, + &fops_cpumask_ro, &fops_cpumask_ro); +} + static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 7cecda29447e3..855cc18833403 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -202,6 +202,8 @@ void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, bool *value); void debugfs_create_str(const char *name, umode_t mode, struct dentry *parent, char **value); +void debugfs_create_cpumask(const char *name, umode_t mode, + struct dentry *parent, struct cpumask *value); struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, @@ -409,6 +411,10 @@ static inline void debugfs_create_str(const char *name, umode_t mode, char **value) { } +static inline void debugfs_create_cpumask(const char *name, umode_t mode, + struct dentry *parent, struct cpumask *value) +{ } + static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) From 3da2010fe90ab936c5f02f2a8c8988ce5fcd478d Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 1 Sep 2021 15:13:12 +0100 Subject: [PATCH 145/247] NVIDIA: SAUCE: arm_mpam: Add debugfs entries to show the MSC/RIS the driver discovered BugLink: https://bugs.launchpad.net/bugs/2122432 Not all of MPAM is visible through the resctrl user-space interface. To make it easy to debug why certain devices were not exposed through resctrl, allow the properties of the devices to be read through debugfs. This adds an mpam directory to debugfs, and exposes the devices as well as the hierarchy that was built. 
Signed-off-by: James Morse (cherry picked from commit 9299a330a0f7729144664ddb643982461e0b175f https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 136 +++++++++++++++++++++++++++++--- drivers/resctrl/mpam_internal.h | 9 +++ 2 files changed, 136 insertions(+), 9 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 94ee8745809f1..8f02fd750732d 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -86,6 +86,8 @@ static DECLARE_WORK(mpam_broken_work, &mpam_disable); /* When mpam is disabled, the printed reason to aid debugging */ static char *mpam_disable_reason; +static struct dentry *mpam_debugfs; + /* * Whether has been setup. Used by cpuhp in preference to mpam_is_enabled() * the disable call after an error interrupt makes mpam_is_enabled() false before @@ -403,6 +405,8 @@ static void mpam_class_destroy(struct mpam_class *class) { lockdep_assert_held(&mpam_list_lock); + debugfs_remove_recursive(class->debugfs); + class->debugfs = NULL; list_del_rcu(&class->classes_list); add_to_garbage(class); } @@ -417,6 +421,8 @@ static void mpam_comp_destroy(struct mpam_component *comp) __destroy_component_cfg(comp); + debugfs_remove_recursive(comp->debugfs); + comp->debugfs = NULL; list_del_rcu(&comp->class_list); add_to_garbage(comp); @@ -430,6 +436,8 @@ static void mpam_vmsc_destroy(struct mpam_vmsc *vmsc) lockdep_assert_held(&mpam_list_lock); + debugfs_remove_recursive(vmsc->debugfs); + vmsc->debugfs = NULL; list_del_rcu(&vmsc->comp_list); add_to_garbage(vmsc); @@ -459,6 +467,8 @@ static void mpam_ris_destroy(struct mpam_msc_ris *ris) cpumask_andnot(&comp->affinity, &comp->affinity, &ris->affinity); cpumask_andnot(&class->affinity, &class->affinity, &ris->affinity); clear_bit(ris->ris_idx, &msc->ris_idxs); + debugfs_remove_recursive(ris->debugfs); + 
ris->debugfs = NULL; list_del_rcu(&ris->vmsc_list); list_del_rcu(&ris->msc_list); add_to_garbage(ris); @@ -840,32 +850,32 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) /* Cache Capacity Partitioning */ if (FIELD_GET(MPAMF_IDR_HAS_CCAP_PART, ris->idr)) { - u32 ccap_features = mpam_read_partsel_reg(msc, CCAP_IDR); + ris->ccap_idr = mpam_read_partsel_reg(msc, CCAP_IDR); - props->cmax_wd = FIELD_GET(MPAMF_CCAP_IDR_CMAX_WD, ccap_features); + props->cmax_wd = FIELD_GET(MPAMF_CCAP_IDR_CMAX_WD, ris->ccap_idr); if (props->cmax_wd && - FIELD_GET(MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM, ccap_features)) + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM, ris->ccap_idr)) mpam_set_feature(mpam_feat_cmax_softlim, props); if (props->cmax_wd && - !FIELD_GET(MPAMF_CCAP_IDR_NO_CMAX, ccap_features)) + !FIELD_GET(MPAMF_CCAP_IDR_NO_CMAX, ris->ccap_idr)) mpam_set_feature(mpam_feat_cmax_cmax, props); if (props->cmax_wd && - FIELD_GET(MPAMF_CCAP_IDR_HAS_CMIN, ccap_features)) + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMIN, ris->ccap_idr)) mpam_set_feature(mpam_feat_cmax_cmin, props); - props->cassoc_wd = FIELD_GET(MPAMF_CCAP_IDR_CASSOC_WD, ccap_features); + props->cassoc_wd = FIELD_GET(MPAMF_CCAP_IDR_CASSOC_WD, ris->ccap_idr); if (props->cassoc_wd && - FIELD_GET(MPAMF_CCAP_IDR_HAS_CASSOC, ccap_features)) + FIELD_GET(MPAMF_CCAP_IDR_HAS_CASSOC, ris->ccap_idr)) mpam_set_feature(mpam_feat_cmax_cassoc, props); } /* Cache Portion partitioning */ if (FIELD_GET(MPAMF_IDR_HAS_CPOR_PART, ris->idr)) { - u32 cpor_features = mpam_read_partsel_reg(msc, CPOR_IDR); + ris->cpor_idr = mpam_read_partsel_reg(msc, CPOR_IDR); - props->cpbm_wd = FIELD_GET(MPAMF_CPOR_IDR_CPBM_WD, cpor_features); + props->cpbm_wd = FIELD_GET(MPAMF_CPOR_IDR_CPBM_WD, ris->cpor_idr); if (props->cpbm_wd) mpam_set_feature(mpam_feat_cpor_part, props); } @@ -2201,6 +2211,9 @@ static void mpam_msc_destroy(struct mpam_msc *msc) list_del_rcu(&msc->all_msc_list); platform_set_drvdata(pdev, NULL); + debugfs_remove_recursive(msc->debugfs); + msc->debugfs = 
NULL; + add_to_garbage(msc); } @@ -2222,6 +2235,7 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) { int err; u32 tmp; + char name[20]; struct mpam_msc *msc; struct resource *msc_res; struct device *dev = &pdev->dev; @@ -2276,6 +2290,10 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) list_add_rcu(&msc->all_msc_list, &mpam_all_msc); platform_set_drvdata(pdev, msc); + snprintf(name, sizeof(name), "msc.%u", msc->id); + msc->debugfs = debugfs_create_dir(name, mpam_debugfs); + debugfs_create_x32("max_nrdy_usec", 0400, msc->debugfs, &msc->nrdy_usec); + return msc; } @@ -2881,6 +2899,102 @@ static int mpam_allocate_config(void) return 0; } +static void mpam_debugfs_setup_ris(struct mpam_msc_ris *ris) +{ + char name[40]; + struct dentry *d; + struct mpam_props *rprops = &ris->props; + + snprintf(name, sizeof(name), "ris.%u", ris->ris_idx); + d = debugfs_create_dir(name, ris->vmsc->msc->debugfs); + debugfs_create_x64("mpamf_idr", 0400, d, &ris->idr); + debugfs_create_x32("mpamf_cpor_idr", 0400, d, &ris->cpor_idr); + debugfs_create_x32("mpamf_ccap_idr", 0400, d, &ris->ccap_idr); + debugfs_create_ulong("features", 0400, d, &rprops->features[0]); + debugfs_create_x16("cpbm_wd", 0400, d, &rprops->cpbm_wd); + debugfs_create_x16("mbw_pbm_bits", 0400, d, &rprops->mbw_pbm_bits); + debugfs_create_x16("num_csu_mon", 0400, d, &rprops->num_csu_mon); + debugfs_create_x16("num_mbwu_mon", 0400, d, &rprops->num_mbwu_mon); + debugfs_create_cpumask("affinity", 0400, d, &ris->affinity); + ris->debugfs = d; +} + +static void mpam_debugfs_setup_vmsc(struct mpam_component *comp, + struct mpam_vmsc *vmsc) +{ + u8 ris_idx; + char name[40]; + char path[40]; + struct dentry *d; + struct mpam_msc_ris *ris; + int msc_id = vmsc->msc->id; + + snprintf(name, sizeof(name), "vmsc.%u", msc_id); + d = debugfs_create_dir(name, comp->debugfs); + debugfs_create_ulong("features", 0400, d, &vmsc->props.features[0]); + vmsc->debugfs = d; + + 
list_for_each_entry_rcu(ris, &vmsc->ris, vmsc_list) { + ris_idx = ris->ris_idx; + + snprintf(name, sizeof(name), "msc.%u_ris.%u", msc_id, + ris_idx); + snprintf(path, sizeof(path), "../../../msc.%u/ris.%u", + msc_id, ris_idx); + debugfs_create_symlink(name, d, path); + } +} + +static void mpam_debugfs_setup_comp(struct mpam_class *class, + struct mpam_component *comp) +{ + char name[40]; + struct dentry *d; + struct mpam_vmsc *vmsc; + + snprintf(name, sizeof(name), "comp.%u", comp->comp_id); + d = debugfs_create_dir(name, class->debugfs); + comp->debugfs = d; + + list_for_each_entry_rcu(vmsc, &comp->vmsc, comp_list) + mpam_debugfs_setup_vmsc(comp, vmsc); +} + +static void mpam_debugfs_setup(void) +{ + char name[40]; + struct dentry *d; + struct mpam_msc *msc; + struct mpam_class *class; + struct mpam_msc_ris *ris; + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(msc, &mpam_all_msc, all_msc_list) { + d = msc->debugfs; + debugfs_create_u32("fw_id", 0400, d, &msc->pdev->id); + debugfs_create_x32("iface", 0400, d, &msc->iface); + debugfs_create_x32("mpamf_iidr", 0400, d, &msc->iidr); + list_for_each_entry(ris, &msc->ris, msc_list) + mpam_debugfs_setup_ris(ris); + } + + list_for_each_entry_rcu(class, &mpam_classes, classes_list) { + snprintf(name, sizeof(name), "class.%u", class->level); + d = debugfs_create_dir(name, mpam_debugfs); + debugfs_create_ulong("features", 0400, d, &class->props.features[0]); + debugfs_create_x32("nrdy_usec", 0400, d, &class->nrdy_usec); + debugfs_create_x16("quirks", 0400, d, &class->quirks); + debugfs_create_x8("level", 0400, d, &class->level); + debugfs_create_cpumask("affinity", 0400, d, &class->affinity); + class->debugfs = d; + + list_for_each_entry_rcu(comp, &class->components, class_list) + mpam_debugfs_setup_comp(class, comp); + } +} + static void mpam_enable_once(void) { int err; @@ -2914,6 +3028,8 @@ static void mpam_enable_once(void) pr_err("Failed to allocate configuration 
arrays.\n"); break; } + + mpam_debugfs_setup(); } while (0); mutex_unlock(&mpam_list_lock); cpus_read_unlock(); @@ -3236,6 +3352,8 @@ static int __init mpam_msc_driver_init(void) if (acpi_disabled) mpam_dt_create_foundling_msc(); + mpam_debugfs = debugfs_create_dir("mpam", NULL); + return platform_driver_register(&mpam_msc_driver); } /* Must occur after arm64_mpam_register_cpus() from arch_initcall() */ diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 72ddbc9054d80..a3ea649cf3bb2 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -135,6 +136,8 @@ struct mpam_msc { void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; + struct dentry *debugfs; + /* Values only used on some platforms for quirks */ u32 t241_id; @@ -310,6 +313,7 @@ struct mpam_class { struct ida ida_csu_mon; struct ida ida_mbwu_mon; + struct dentry *debugfs; struct mpam_garbage garbage; }; @@ -348,6 +352,7 @@ struct mpam_component { /* parent: */ struct mpam_class *class; + struct dentry *debugfs; struct mpam_garbage garbage; }; @@ -366,12 +371,15 @@ struct mpam_vmsc { /* parent: */ struct mpam_component *comp; + struct dentry *debugfs; struct mpam_garbage garbage; }; struct mpam_msc_ris { u8 ris_idx; u64 idr; + u32 cpor_idr; + u32 ccap_idr; struct mpam_props props; bool in_reset_state; @@ -389,6 +397,7 @@ struct mpam_msc_ris { /* msmon mbwu configuration is preserved over reset */ struct msmon_mbwu_state *mbwu_state; + struct dentry *debugfs; struct mpam_garbage garbage; }; From e2c3517afeca4a8e527301af327c1beb826e4c08 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 29 Jul 2024 17:05:31 +0100 Subject: [PATCH 146/247] NVIDIA: SAUCE: arm_mpam: Add force-disable debugfs trigger BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM has an error interrupt that can be triggered by an MSC when corrupt or out of range values are seen. 
The hardware only needs to raise an error interrupt if the error was detected, it is also permissible for the hardware to just use the corrupt or out of range value. All the reasons to raise an error indicate a software bug. When the error interrupt is triggered, the MPAM driver attempts to reset all the CPUs back to PARTID-0 and reset PARTID-0 to be unrestricted. This is done to ensure important tasks aren't accidentally given the performance of unimportant tasks. This teardown path in the driver is hard to trigger. Add a debugfs file to poke this manually. It is expected you have to reboot to make MPAM work again after this. Signed-off-by: James Morse (cherry picked from commit 7391f4e9a513081f3110895237a4ef7467bd7222 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 8f02fd750732d..d51f5d0d3d866 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2995,6 +2995,33 @@ static void mpam_debugfs_setup(void) } } +static int mpam_force_disable_show(struct seq_file *s, void *data) +{ + seq_puts(s, "Write 1 to this file to trigger an MPAM error.\n"); + return 0; +} + +static ssize_t mpam_force_disable_write(struct file *file, + const char __user *userbuf, size_t count, + loff_t *ppos) +{ + u32 user_val; + int err; + + err = kstrtou32_from_user(userbuf, count, 10, &user_val); + if (err) + return err; + + if (user_val == 1) { + mpam_disable_reason = "debugfs trigger"; + mpam_disable(NULL); + } + + return count; +} + +DEFINE_SHOW_STORE_ATTRIBUTE(mpam_force_disable); + static void mpam_enable_once(void) { int err; @@ -3034,6 +3061,9 @@ static void mpam_enable_once(void) mutex_unlock(&mpam_list_lock); cpus_read_unlock(); + 
debugfs_create_file("force_disable", 0600, mpam_debugfs, NULL, + &mpam_force_disable_fops); + if (!err) { err = mpam_resctrl_setup(); if (err) From 1f913ef5ef59dac725c57bde39dc37a3b9ab647b Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 18 Jul 2025 12:02:57 +0100 Subject: [PATCH 147/247] NVIDIA: SAUCE: arm_mpam: Expose the number of NRDY retries in debugfs BugLink: https://bugs.launchpad.net/bugs/2122432 It's really popular to tie NRDY high, and then act surprised when the OS never reads the counters, because they aren't ready. The spec obliges hardware to clear this bit automatically before the firmware advertised timeout. To make it easier to find errant hardware, count the number of retries and expose that number in debugfs. Signed-off-by: James Morse (cherry picked from commit d99dbcc84db16d93a9efb093213440c2ad12aba1 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 2 ++ drivers/resctrl/mpam_internal.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index d51f5d0d3d866..99ae4dd17d578 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1358,6 +1358,7 @@ static void __ris_msmon_read(void *arg) mpam_mon_sel_unlock(msc); if (nrdy) { + msc->nrdy_retry_count++; m->err = -EBUSY; return; } @@ -2976,6 +2977,7 @@ static void mpam_debugfs_setup(void) debugfs_create_u32("fw_id", 0400, d, &msc->pdev->id); debugfs_create_x32("iface", 0400, d, &msc->iface); debugfs_create_x32("mpamf_iidr", 0400, d, &msc->iidr); + debugfs_create_x64("nrdy_retry_count", 0400, d, &msc->nrdy_retry_count); list_for_each_entry(ris, &msc->ris, msc_list) mpam_debugfs_setup_ris(ris); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index a3ea649cf3bb2..f890d1381af69 100644 --- a/drivers/resctrl/mpam_internal.h +++ 
b/drivers/resctrl/mpam_internal.h @@ -79,6 +79,7 @@ struct mpam_msc { /* Not modified after mpam_is_enabled() becomes true */ enum mpam_msc_iface iface; u32 nrdy_usec; + u64 nrdy_retry_count; cpumask_t accessibility; bool has_extd_esr; From eec2a8f0c957620eeb3c69aac221f1de0bb4c2b5 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 15 Aug 2025 15:43:56 +0100 Subject: [PATCH 148/247] NVIDIA: SAUCE: arm_mpam: Add resctrl_arch_round_bw() BugLink: https://bugs.launchpad.net/bugs/2122432 Add the required hook to pre-round a userspace memory bandwidth allocation percentage value to a value acceptable to the driver backend. For MPAM, no rounding is needed because the driver has all the information necessary for rounding the value when resctrl_arch_update_one() is called. So, just "round" the value to itself here. Signed-off-by: Dave Martin Signed-off-by: James Morse (cherry picked from commit 3011c8306e6a968dd6843c4d594006d4c560dee2 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- include/linux/arm_mpam.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index c05d5d5557e8c..810f894025fb8 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -5,6 +5,7 @@ #define __LINUX_ARM_MPAM_H #include +#include #include #include @@ -42,6 +43,19 @@ static inline int acpi_mpam_count_msc(void) { return -EINVAL; } int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, enum mpam_class_types type, u8 class_id, int component_id); +struct resctrl_schema; + +struct rdt_resource; +static inline u32 resctrl_arch_round_bw(u32 val, + const struct rdt_resource *r __always_unused) +{ + /* + * Do nothing: for MPAM, resctrl_arch_update_one() has the necessary + * context to round the incoming value correctly. 
+ */ + return val; +} + static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) { return val; From 3b2f167bcca499e4877c02732e263e2332d5f068 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 15 Aug 2025 15:43:55 +0100 Subject: [PATCH 149/247] NVIDIA: SAUCE: fs/resctrl,x86/resctrl: Factor mba rounding to be per-arch BugLink: https://bugs.launchpad.net/bugs/2122432 The control value parser for the MB resource currently coerces the memory bandwidth percentage value from userspace to be an exact multiple of the bw_gran parameter. On MPAM systems, this results in somewhat worse-than-worst-case rounding, since bw_gran is in general only an approximation to the actual hardware granularity, and the hardware bandwidth allocation control value is not natively a percentage. Allow the arch to provide its own conversion that is appropriate for the hardware, and move the existing conversion to x86. This will avoid accumulated error from rounding the value twice on MPAM systems. Clarify the documentation, but avoid overly exact promises. Clamping to bw_min and bw_max still feels generic: leave it in the core code, for now. No functional change. Signed-off-by: Dave Martin Signed-off-by: James Morse (cherry picked from commit b290abeedaef657fb50e2b11c204ac177cb338cd https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- Documentation/filesystems/resctrl.rst | 7 +++---- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 6 ++++++ fs/resctrl/ctrlmondata.c | 2 +- include/linux/resctrl.h | 2 ++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 006d23af66e19..b9f6aa44fc4d7 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -144,12 +144,11 @@ with respect to allocation: user can request. 
"bandwidth_gran": - The granularity in which the memory bandwidth + The approximate granularity in which the memory bandwidth percentage is allocated. The allocated b/w percentage is rounded off to the next - control step available on the hardware. The - available bandwidth control steps are: - min_bandwidth + N * bandwidth_gran. + control step available on the hardware. The available + steps are at least as small as this value. "delay_linear": Indicates if the delay scale is linear or diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 1189c0df4ad76..cf9b30b5df3ce 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -16,9 +16,15 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include "internal.h" +u32 resctrl_arch_round_bw(u32 val, const struct rdt_resource *r) +{ + return roundup(val, (unsigned long)r->membw.bw_gran); +} + int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 0d0ef54fc4de1..6810a026fc7fc 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -69,7 +69,7 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) return false; } - *data = roundup(bw, (unsigned long)r->membw.bw_gran); + *data = resctrl_arch_round_bw(bw, r); return true; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index a7d92718b653f..e1f9c46ea4661 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -485,6 +485,8 @@ bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); */ int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); +u32 resctrl_arch_round_bw(u32 val, const struct rdt_resource *r); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. 
From 4c3fcbb24818652488954cfaba9e4b74407d720a Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 29 Sep 2025 14:29:42 +0100 Subject: [PATCH 150/247] NVIDIA: SAUCE: arm_mpam: Split the locking around the mon_sel registers BugLink: https://bugs.launchpad.net/bugs/2122432 The MSC MON_SEL register needs to be accessed from hardirq for the overflow interrupt, and when taking an IPI to access these registers on platforms where MSC are not accessible from every CPU. This makes an irqsave spinlock the obvious lock to protect these registers. On systems with SCMI mailboxes it must be able to sleep, meaning a mutex must be used. The SCMI platforms can't support an overflow interrupt. Clearly these two can't exist for one MSC at the same time. Split the existing helper into a raw spinlock and a mutex, named inner and outer. The outer lock must be taken in a pre-emptible context before the inner lock can be taken. On systems with SCMI mailboxes where the MON_SEL accesses must sleep - the inner lock will fail to be taken if the caller is unable to sleep. This will allow callers to fail without having to explicitly check the interface type of each MSC. Signed-off-by: James Morse (cherry picked from commit 6becd89512fd69089045ca7654fdf70261c973e6 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R.
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 58 +++++++++++++++++++++-------- drivers/resctrl/mpam_internal.h | 66 ++++++++++++++++++++++++--------- 2 files changed, 91 insertions(+), 33 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 99ae4dd17d578..8a86660f73787 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -815,7 +815,7 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) bool can_set, can_clear; struct mpam_msc *msc = ris->vmsc->msc; - if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + if (WARN_ON_ONCE(!mpam_mon_sel_inner_lock(msc))) return false; mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, 0) | @@ -829,7 +829,7 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) _mpam_write_monsel_reg(msc, mon_reg, 0); now = _mpam_read_monsel_reg(msc, mon_reg); can_clear = !(now & MSMON___NRDY); - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); return (!can_set || !can_clear); } @@ -953,7 +953,9 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) mpam_set_feature(mpam_feat_msmon_csu_xcl, props); /* Is NRDY hardware managed? */ + mpam_mon_sel_outer_lock(msc); hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, CSU); + mpam_mon_sel_outer_unlock(msc); if (hw_managed) mpam_set_feature(mpam_feat_msmon_csu_hw_nrdy, props); } @@ -986,7 +988,9 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) } /* Is NRDY hardware managed? 
*/ + mpam_mon_sel_outer_lock(msc); hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); + mpam_mon_sel_outer_unlock(msc); if (hw_managed) mpam_set_feature(mpam_feat_msmon_mbwu_hw_nrdy, props); @@ -1270,7 +1274,7 @@ static void __ris_msmon_read(void *arg) struct mpam_msc *msc = m->ris->vmsc->msc; u32 mon_sel, ctl_val, flt_val, cur_ctl, cur_flt; - if (!mpam_mon_sel_lock(msc)) { + if (!mpam_mon_sel_inner_lock(msc)) { m->err = -EIO; return; } @@ -1355,7 +1359,7 @@ static void __ris_msmon_read(void *arg) m->err = -EINVAL; break; } - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); if (nrdy) { msc->nrdy_retry_count++; @@ -1377,6 +1381,7 @@ static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) struct mpam_msc *msc = vmsc->msc; struct mpam_msc_ris *ris; + mpam_mon_sel_outer_lock(msc); list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, srcu_read_lock_held(&mpam_srcu)) { arg->ris = ris; @@ -1395,6 +1400,7 @@ static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) if (err) any_err = err; } + mpam_mon_sel_outer_unlock(msc); } return any_err; @@ -1477,19 +1483,21 @@ void mpam_msmon_reset_all_mbwu(struct mpam_component *comp) continue; msc = vmsc->msc; + mpam_mon_sel_outer_lock(msc); list_for_each_entry_rcu(ris, &msc->ris, vmsc_list) { if (!mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) continue; - if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + if (WARN_ON_ONCE(!mpam_mon_sel_inner_lock(msc))) continue; for (i = 0; i < ris->props.num_mbwu_mon; i++) { ris->mbwu_state[i].correction = 0; ris->mbwu_state[i].reset_on_next_read = true; } - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } + mpam_mon_sel_outer_unlock(msc); } srcu_read_unlock(&mpam_srcu, idx); } @@ -1510,18 +1518,20 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx) continue; msc = vmsc->msc; + mpam_mon_sel_outer_lock(msc); list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, srcu_read_lock_held(&mpam_srcu)) { if 
(!mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) continue; - if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + if (WARN_ON_ONCE(!mpam_mon_sel_inner_lock(msc))) continue; ris->mbwu_state[ctx->mon].correction = 0; ris->mbwu_state[ctx->mon].reset_on_next_read = true; - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } + mpam_mon_sel_outer_unlock(msc); } } @@ -1708,8 +1718,11 @@ static int mpam_restore_mbwu_state(void *_ris) int i; struct mon_read mwbu_arg; struct mpam_msc_ris *ris = _ris; + struct mpam_msc *msc = ris->vmsc->msc; struct mpam_class *class = ris->vmsc->comp->class; + mpam_mon_sel_outer_lock(msc); + for (i = 0; i < ris->props.num_mbwu_mon; i++) { if (ris->mbwu_state[i].enabled) { mwbu_arg.ris = ris; @@ -1720,10 +1733,12 @@ static int mpam_restore_mbwu_state(void *_ris) } } + mpam_mon_sel_outer_unlock(msc); + return 0; } -/* Call with MSC lock and held */ +/* Call with MSC lock and outer mon_sel lock held */ static int mpam_save_mbwu_state(void *arg) { int i; @@ -1738,7 +1753,7 @@ static int mpam_save_mbwu_state(void *arg) mbwu_state = &ris->mbwu_state[i]; cfg = &mbwu_state->cfg; - if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + if (WARN_ON_ONCE(!mpam_mon_sel_inner_lock(msc))) return -EIO; mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, i) | @@ -1763,7 +1778,7 @@ static int mpam_save_mbwu_state(void *arg) cfg->partid = FIELD_GET(MSMON_CFG_x_FLT_PARTID, cur_flt); mbwu_state->correction += val; mbwu_state->enabled = FIELD_GET(MSMON_CFG_x_CTL_EN, cur_ctl); - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } return 0; @@ -1852,6 +1867,7 @@ static void mpam_reset_msc(struct mpam_msc *msc, bool online) { struct mpam_msc_ris *ris; + mpam_mon_sel_outer_lock(msc); list_for_each_entry_srcu(ris, &msc->ris, msc_list, srcu_read_lock_held(&mpam_srcu)) { mpam_touch_msc(msc, &mpam_reset_ris, ris); @@ -1864,6 +1880,7 @@ static void mpam_reset_msc(struct mpam_msc *msc, bool online) if (mpam_is_enabled() && !online) mpam_touch_msc(msc, 
&mpam_save_mbwu_state, ris); } + mpam_mon_sel_outer_unlock(msc); } static void mpam_reprogram_msc(struct mpam_msc *msc) @@ -2802,11 +2819,13 @@ static void __destroy_component_cfg(struct mpam_component *comp) list_for_each_entry(vmsc, &comp->vmsc, comp_list) { msc = vmsc->msc; - if (mpam_mon_sel_lock(msc)) { + mpam_mon_sel_outer_lock(msc); + if (mpam_mon_sel_inner_lock(msc)) { list_for_each_entry(ris, &vmsc->ris, vmsc_list) add_to_garbage(ris->mbwu_state); - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } + mpam_mon_sel_outer_unlock(msc); } } @@ -2850,6 +2869,7 @@ static int __allocate_component_cfg(struct mpam_component *comp) mpam_reset_component_cfg(comp); list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + int err = 0; struct mpam_msc *msc; struct mpam_msc_ris *ris; struct msmon_mbwu_state *mbwu_state; @@ -2858,6 +2878,7 @@ static int __allocate_component_cfg(struct mpam_component *comp) continue; msc = vmsc->msc; + mpam_mon_sel_outer_lock(msc); list_for_each_entry(ris, &vmsc->ris, vmsc_list) { if (!ris->props.num_mbwu_mon) continue; @@ -2867,16 +2888,21 @@ static int __allocate_component_cfg(struct mpam_component *comp) GFP_KERNEL); if (!mbwu_state) { __destroy_component_cfg(comp); - return -ENOMEM; + err = -ENOMEM; + break; } init_garbage(&mbwu_state[0].garbage); - if (mpam_mon_sel_lock(msc)) { + if (mpam_mon_sel_inner_lock(msc)) { ris->mbwu_state = mbwu_state; - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } } + mpam_mon_sel_outer_unlock(msc); + + if (err) + return err; } return 0; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index f890d1381af69..fc00efefed642 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -123,16 +123,20 @@ struct mpam_msc { /* * mon_sel_lock protects access to the MSC hardware registers that are * affected by MPAMCFG_MON_SEL, and the mbwu_state. 
- * Access to mon_sel is needed from both process and interrupt contexts, - * but is complicated by firmware-backed platforms that can't make any - * access unless they can sleep. - * Always use the mpam_mon_sel_lock() helpers. - * Accesses to mon_sel need to be able to fail if they occur in the wrong - * context. + * Both the 'inner' and 'outer' must be taken. + * For real MMIO MSC, the outer lock is unnecessary - but keeps the + * code common with: + * Firmware backed MSC need to sleep when accessing the MSC, which + * means some code-paths will always fail. For these MSC the outer + * lock is providing the protection, and the inner lock fails to + * be taken if the task is unable to sleep. + * * If needed, take msc->probe_lock first. */ - raw_spinlock_t _mon_sel_lock; - unsigned long _mon_sel_flags; + struct mutex outer_mon_sel_lock; + bool outer_lock_held; + raw_spinlock_t inner_mon_sel_lock; + unsigned long inner_mon_sel_flags; void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; @@ -145,28 +149,56 @@ struct mpam_msc { struct mpam_garbage garbage; }; -/* Returning false here means accesses to mon_sel must fail and report an error. */ -static inline bool __must_check mpam_mon_sel_lock(struct mpam_msc *msc) +static inline bool __must_check mpam_mon_sel_inner_lock(struct mpam_msc *msc) { - WARN_ON_ONCE(msc->iface != MPAM_IFACE_MMIO); + /* + * The outer lock may be taken by a CPU that then issues an IPI to run + * a helper that takes the inner lock. lockdep can't help us here. 
+ */ + WARN_ON_ONCE(!READ_ONCE(msc->outer_lock_held)); + + if (msc->iface == MPAM_IFACE_MMIO) { + raw_spin_lock_irqsave(&msc->inner_mon_sel_lock, msc->inner_mon_sel_flags); + return true; + } + + /* Accesses must fail if we are not pre-emptible */ + return !!preemptible(); +} - raw_spin_lock_irqsave(&msc->_mon_sel_lock, msc->_mon_sel_flags); - return true; +static inline void mpam_mon_sel_inner_unlock(struct mpam_msc *msc) +{ + WARN_ON_ONCE(!READ_ONCE(msc->outer_lock_held)); + + if (msc->iface == MPAM_IFACE_MMIO) + raw_spin_unlock_irqrestore(&msc->inner_mon_sel_lock, msc->inner_mon_sel_flags); +} + +static inline void mpam_mon_sel_outer_lock(struct mpam_msc *msc) +{ + mutex_lock(&msc->outer_mon_sel_lock); + msc->outer_lock_held = true; } -static inline void mpam_mon_sel_unlock(struct mpam_msc *msc) +static inline void mpam_mon_sel_outer_unlock(struct mpam_msc *msc) { - raw_spin_unlock_irqrestore(&msc->_mon_sel_lock, msc->_mon_sel_flags); + msc->outer_lock_held = false; + mutex_unlock(&msc->outer_mon_sel_lock); } static inline void mpam_mon_sel_lock_held(struct mpam_msc *msc) { - lockdep_assert_held_once(&msc->_mon_sel_lock); + WARN_ON_ONCE(!READ_ONCE(msc->outer_lock_held)); + if (msc->iface == MPAM_IFACE_MMIO) + lockdep_assert_held_once(&msc->inner_mon_sel_lock); + else + lockdep_assert_preemption_enabled(); } static inline void mpam_mon_sel_lock_init(struct mpam_msc *msc) { - raw_spin_lock_init(&msc->_mon_sel_lock); + raw_spin_lock_init(&msc->inner_mon_sel_lock); + mutex_init(&msc->outer_mon_sel_lock); } /* Bits for mpam features bitmaps */ From 775849db1bf1ab27755063eead69ca6aeb287358 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Wed, 20 Aug 2025 14:15:21 +0100 Subject: [PATCH 151/247] NVIDIA: SAUCE: arm_mpam: Relax num_rmids parameter advertised to userspace BugLink: https://bugs.launchpad.net/bugs/2122432 On MPAM systems, monitoring groups are identified in the hardware by a (PARTID, PMG) pair. 
Two monitoring group identifiers are the same only if the PARTIDs and PMGs both match. This means that the number of monitoring groups that can be created in each control group is the same as the number of distinct PMG values supported by the hardware. The number of monitoring groups that exist in other control groups at the same time makes no difference to this. Currently, the MPAM driver takes the cautious approach and always num_rmids = 1. Relax this limit, by advertising the number of distinct PMG values supported by the hardware. Code/Data Prioritization (CDP) makes no difference, since although this doubles the number of (PARTID, PMG) pairs available to a control group, each monitoring group now consumes two pairs instead of one. Suggested-by: Shaopeng Tan Signed-off-by: Dave Martin Signed-off-by: James Morse (cherry picked from commit ef05d54918ccc6ddd6f6d9d64fbaa2478ba96b44 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 0ea76b7783b6b..d3241e6666ca3 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1431,12 +1431,17 @@ static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, /* * Unfortunately, num_rmid doesn't mean anything for * mpam, and its exposed to user-space! - * num-rmid is supposed to mean the number of groups - * that can be created, both control or monitor groups. - * For mpam, each control group has its own pmg/rmid - * space. + * + * num-rmid is supposed to mean the minimum number of + * monitoring groups that can exist simultaneously, including + * the default monitoring group for each control group. 
+ * + * For mpam, each control group has its own pmg/rmid space, so + * it is not appropriate to advertise the whole rmid_idx space + * here. But the pmgs corresponding to the parent control + * group can be allocated freely: */ - l3->mon.num_rmid = 1; + l3->mon.num_rmid = mpam_pmg_max + 1; switch (type) { case QOS_L3_MBM_LOCAL_EVENT_ID: From 2310f8417f2af2b92addbc5908f384c24ebb392e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 29 Oct 2021 16:13:51 +0100 Subject: [PATCH 152/247] NVIDIA: SAUCE: arm_mpam: Allow the maximum partid to be overridden from the command line BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM's bandwidth monitors are only available via resctrl if there are enough monitors for each combination of partid and pmg to have one. As it is unlikely anyone built that many monitors, allow the maximum partid the system will use to be set from the kernel command-line. With this, it should be possible for bandwidth monitors to be enabled by reducing the number of partid in use. Signed-off-by: James Morse (cherry picked from commit 1653c7f5b6078fb596d33a23771fc26ce5654f6e https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R.
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 38 ++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 8a86660f73787..831eeb2ba9175 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,8 @@ static DEFINE_MUTEX(mpam_cpuhp_state_lock); u16 mpam_partid_max; u8 mpam_pmg_max; static bool partid_max_init, partid_max_published; +static u16 mpam_cmdline_partid_max; +static bool mpam_cmdline_partid_max_overridden; static DEFINE_SPINLOCK(partid_max_lock); /* @@ -275,6 +278,9 @@ int mpam_register_requestor(u16 partid_max, u8 pmg_max) return -EBUSY; } + if (mpam_cmdline_partid_max_overridden) + mpam_partid_max = min(mpam_cmdline_partid_max, mpam_partid_max); + return 0; } EXPORT_SYMBOL(mpam_register_requestor); @@ -3417,6 +3423,38 @@ static int __init mpam_msc_driver_init(void) /* Must occur after arm64_mpam_register_cpus() from arch_initcall() */ subsys_initcall(mpam_msc_driver_init); +static int mpam_cmdline_partid_max_set(const char *arg, + const struct kernel_param *kp) +{ + int ret; + + spin_lock(&partid_max_lock); + ret = kstrtou16(arg, 10, &mpam_cmdline_partid_max); + if (!ret) + mpam_cmdline_partid_max_overridden = true; + spin_unlock(&partid_max_lock); + + return 0; +} +static int mpam_cmdline_partid_max_get(char *buffer, + const struct kernel_param *kp) +{ + u16 val = 0xffff; + + spin_lock(&partid_max_lock); + if (mpam_cmdline_partid_max_overridden) + val = mpam_cmdline_partid_max; + spin_unlock(&partid_max_lock); + + return sprintf(buffer, "%u\n", val); +} +static const struct kernel_param_ops mpam_cmdline_partid_max_ops = { + .set = mpam_cmdline_partid_max_set, + .get = mpam_cmdline_partid_max_get, +}; +module_param_cb(partid_max, &mpam_cmdline_partid_max_ops, NULL, 0644); +MODULE_PARM_DESC(partid_max, "Override for reducing the 
number of PARTID."); + #ifdef CONFIG_MPAM_KUNIT_TEST #include "test_mpam_devices.c" #endif From 18b6584e18c7c2246187699b9ff8dcff077decfe Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 1 Jul 2025 17:03:13 +0100 Subject: [PATCH 153/247] NVIDIA: SAUCE: arm_mpam: Allow MSC to be forced to have an unknown location BugLink: https://bugs.launchpad.net/bugs/2122432 The MPAM driver discovers which MSC control which system resources from firmware tables. The MPAM resctrl picking code then attempts to export platforms that are Xeon shaped via resctrl. Occasionally, the presence of one or more MSC prevents the platform being described as Xeon shaped, and exposed via resctrl. For example with CPU-less NUMA nodes. The additional node doesn't have an L3, so can't have domain-ids exposed for the 'MB' memory bandwidth controls. In this example, some users would prefer to control bandwidth on just the CPU nodes, instead of having nothing at all. Allow users an amount of wiggle room by allowing MSC to be forced to be treated as unknown. This effectively disables parts of the MPAM functionality. Unknown MSC are not disabled. They are still probed and contribute to the system wide properties. Suggested-by: Dave Martin Signed-off-by: James Morse (cherry picked from commit ffc605a48a7be279dc1ca264ec45b3df7ed118eb https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R.
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 64 +++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 2 ++ 2 files changed, 66 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 831eeb2ba9175..a57d53cbc6dc5 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #include "mpam_internal.h" @@ -698,6 +700,9 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, { int err; + if (mpam_force_unknown_msc_test(msc)) + type = MPAM_CLASS_UNKNOWN; + mutex_lock(&mpam_list_lock); err = mpam_ris_create_locked(msc, ris_idx, type, class_id, component_id); @@ -3455,6 +3460,65 @@ static const struct kernel_param_ops mpam_cmdline_partid_max_ops = { module_param_cb(partid_max, &mpam_cmdline_partid_max_ops, NULL, 0644); MODULE_PARM_DESC(partid_max, "Override for reducing the number of PARTID."); +static DEFINE_XARRAY(mpam_force_unkown_msc); + +static void mpam_force_unknown_msc_add(u32 msc_id, gfp_t gfp) +{ + xa_store(&mpam_force_unkown_msc, msc_id, xa_mk_value(msc_id), gfp); +} + +bool mpam_force_unknown_msc_test(struct mpam_msc *msc) +{ + return !!xa_load(&mpam_force_unkown_msc, msc->pdev->id); +} + +static int mpam_force_unknown_msc_set(const char *_str, + const struct kernel_param *kp) +{ + int err; + u32 val; + char *tok, *iter; + char *str __free(kfree) = kstrdup(_str, GFP_KERNEL); + + iter = str; + do { + tok = strsep(&iter, ","); + err = kstrtou32(tok, 10, &val); + if (err) { + pr_err("Failed to parse commandline: %d\n", err); + break; + } + mpam_force_unknown_msc_add(val, GFP_KERNEL); + } while (iter); + + return 0; +} +static int mpam_force_unknown_msc_get(char *buffer, + const struct kernel_param *kp) +{ + unsigned long index, count = 0; + int result = 0; + void *entry; + + xa_for_each(&mpam_force_unkown_msc, index, entry) { + if (count) 
+ result += sprintf(buffer + result, ","); + + result += sprintf(buffer + result, "%lu", index); + count += 1; + } + + result += sprintf(buffer + result, "\n"); + + return result; +} +static const struct kernel_param_ops mpam_force_unknown_msc_ops = { + .set = mpam_force_unknown_msc_set, + .get = mpam_force_unknown_msc_get, +}; +subsys_param_cb(force_unknown_msc, &mpam_force_unknown_msc_ops, NULL, 0644); +MODULE_PARM_DESC(force_unknown_msc, "Disabling a set of probed MSC."); + #ifdef CONFIG_MPAM_KUNIT_TEST #include "test_mpam_devices.c" #endif diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index fc00efefed642..216db7892ef85 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -536,6 +536,8 @@ void mpam_msmon_reset_all_mbwu(struct mpam_component *comp); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); +bool mpam_force_unknown_msc_test(struct mpam_msc *msc); + #ifdef CONFIG_RESCTRL_FS int mpam_resctrl_setup(void); void mpam_resctrl_exit(void); From 9fefecf6a5c5249a6a2a2f264281f103a5a6f08e Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 15 Sep 2022 18:00:40 +0100 Subject: [PATCH 154/247] NVIDIA: SAUCE: fs/resctrl: Add this_is_not_abi mount option BugLink: https://bugs.launchpad.net/bugs/2122432 Some later things in the MPAM tree enable behaviour that resctrl doesn't have upstream. To make it clear to people using the out-of-tree code that they shouldn't be relying on this in user-space, add a mount option to enable this stuff. Signed-off-by: James Morse (cherry picked from commit c006e8a9ef1e740ccf2752e688a7834b7b817b89 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- fs/resctrl/internal.h | 3 +++ fs/resctrl/rdtgroup.c | 59 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index cf1fd82dc5a99..45f86943ddf24 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -42,6 +42,7 @@ struct rdt_fs_context { bool enable_cdpl3; bool enable_mba_mbps; bool enable_debug; + bool enable_abi_playground; }; static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) @@ -296,6 +297,8 @@ struct mbm_state { u32 prev_bw; }; +DECLARE_STATIC_KEY_FALSE(resctrl_abi_playground); + extern struct mutex rdtgroup_mutex; static inline const char *rdt_kn_name(const struct kernfs_node *kn) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index ce4e716e6404a..9d7a65df0ab49 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -86,6 +86,9 @@ enum resctrl_event_id mba_mbps_default_event; static bool resctrl_debug; +/* Enable wacky behaviour that is not supported upstream. 
*/ +DEFINE_STATIC_KEY_FALSE(resctrl_abi_playground); + void rdt_last_cmd_clear(void) { lockdep_assert_held(&rdtgroup_mutex); @@ -2712,6 +2715,42 @@ static void schemata_list_destroy(void) } } +static void hack_file_mode(const char *name, u16 mode) +{ + struct rftype *rfts, *rft; + int len; + + mutex_lock(&rdtgroup_mutex); + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + rft->mode = mode; + } + + mutex_unlock(&rdtgroup_mutex); +} + +static void enable_abi_playground(void) +{ + static_key_enable(&resctrl_abi_playground.key); + + /* Make the tasks file read only */ + if (IS_ENABLED(CONFIG_CGROUP_RESCTRL)) + hack_file_mode("tasks", 0444); +} + +static void disable_abi_playground(void) +{ + static_key_disable(&resctrl_abi_playground.key); + + /* Make the tasks file read/write only */ + if (IS_ENABLED(CONFIG_CGROUP_RESCTRL)) + hack_file_mode("tasks", 0644); +} + static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); @@ -2720,6 +2759,9 @@ static int rdt_get_tree(struct fs_context *fc) struct rdt_resource *r; int ret; + if (ctx->enable_abi_playground) + enable_abi_playground(); + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* @@ -2834,6 +2876,7 @@ enum rdt_param { Opt_cdpl2, Opt_mba_mbps, Opt_debug, + Opt_not_abi_playground, nr__rdt_params }; @@ -2842,6 +2885,13 @@ static const struct fs_parameter_spec rdt_fs_parameters[] = { fsparam_flag("cdpl2", Opt_cdpl2), fsparam_flag("mba_MBps", Opt_mba_mbps), fsparam_flag("debug", Opt_debug), + + /* + * Some of MPAM's out of tree code exposes things through resctrl + * that need much more discussion before they are considered for + * mainline. Add a mount option that can be used to hide these crimes. 
+ */ + fsparam_flag("this_is_not_abi", Opt_not_abi_playground), {} }; @@ -2872,6 +2922,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_debug: ctx->enable_debug = true; return 0; + case Opt_not_abi_playground: + ctx->enable_abi_playground = true; + return 0; } return -EINVAL; @@ -3115,6 +3168,9 @@ static void rdt_kill_sb(struct super_block *sb) kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); cpus_read_unlock(); + + if (static_branch_unlikely(&resctrl_abi_playground)) + disable_abi_playground(); } static struct file_system_type rdt_fs_type = { @@ -4121,6 +4177,9 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) if (resctrl_debug) seq_puts(seq, ",debug"); + if (static_branch_unlikely(&resctrl_abi_playground)) + seq_puts(seq, ",this_is_not_abi"); + return 0; } From caa9d3f8609386545ff84155a2016bbdb59191fa Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 17 Sep 2021 13:19:13 +0100 Subject: [PATCH 155/247] NVIDIA: SAUCE: iommu/arm-smmu-v3: Register SMMU capabilities with MPAM BugLink: https://bugs.launchpad.net/bugs/2122432 Traffic in the system can be tagged with a PARTID and PMG. Different requestors can support a different number of bits for these fields. Before MPAM can be used, the MPAM driver has to discover the minimum number of bits supported by any requestor, which affects the range of PARTID and PMG that can be used. Detect whether the SMMU supports MPAM, if it does provide the MPAM driver with the maximum PARTID and PMG values. Tested-by: Amit Singh Tomar Signed-off-by: James Morse (cherry picked from commit 7c7e4d2f36a916c5bba40329cfc762e9e6433aed https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 28 +++++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 6 +++++ 2 files changed, 34 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 3046f496a8422..c52eaaa2a30df 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -4309,6 +4310,29 @@ static void arm_smmu_get_httu(struct arm_smmu_device *smmu, u32 reg) hw_features, fw_features); } +static void arm_smmu_mpam_register_smmu(struct arm_smmu_device *smmu) +{ + u16 partid_max; + u8 pmg_max; + u32 reg; + + if (!IS_ENABLED(CONFIG_ARM64_MPAM)) + return; + + if (!(smmu->features & ARM_SMMU_FEAT_MPAM)) + return; + + reg = readl_relaxed(smmu->base + ARM_SMMU_MPAMIDR); + if (!reg) + return; + + partid_max = FIELD_GET(SMMU_MPAMIDR_PARTID_MAX, reg); + pmg_max = FIELD_GET(SMMU_MPAMIDR_PMG_MAX, reg); + + if (mpam_register_requestor(partid_max, pmg_max)) + smmu->features &= ~ARM_SMMU_FEAT_MPAM; +} + static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) { u32 reg; @@ -4462,6 +4486,8 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) smmu->features |= ARM_SMMU_FEAT_RANGE_INV; if (FIELD_GET(IDR3_FWB, reg)) smmu->features |= ARM_SMMU_FEAT_S2FWB; + if (FIELD_GET(IDR3_MPAM, reg)) + smmu->features |= ARM_SMMU_FEAT_MPAM; if (FIELD_GET(IDR3_BBM, reg) == 2) smmu->features |= ARM_SMMU_FEAT_BBML2; @@ -4529,6 +4555,8 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) if (arm_smmu_sva_supported(smmu)) smmu->features |= ARM_SMMU_FEAT_SVA; + arm_smmu_mpam_register_smmu(smmu); + dev_info(smmu->dev, "ias %lu-bit, oas %lu-bit (features 0x%08x)\n", smmu->ias, smmu->oas, smmu->features); return 0; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 
ae23aacc38402..c5860f487f752 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -59,6 +59,7 @@ struct arm_vsmmu; #define IDR1_SIDSIZE GENMASK(5, 0) #define ARM_SMMU_IDR3 0xc +#define IDR3_MPAM (1 << 7) #define IDR3_FWB (1 << 8) #define IDR3_RIL (1 << 10) #define IDR3_BBM GENMASK(12, 11) @@ -170,6 +171,10 @@ struct arm_vsmmu; #define ARM_SMMU_PRIQ_IRQ_CFG1 0xd8 #define ARM_SMMU_PRIQ_IRQ_CFG2 0xdc +#define ARM_SMMU_MPAMIDR 0x130 +#define SMMU_MPAMIDR_PARTID_MAX GENMASK(15, 0) +#define SMMU_MPAMIDR_PMG_MAX GENMASK(23, 16) + #define ARM_SMMU_REG_SZ 0xe00 /* Common MSI config fields */ @@ -768,6 +773,7 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_HD (1 << 22) #define ARM_SMMU_FEAT_S2FWB (1 << 23) #define ARM_SMMU_FEAT_BBML2 (1 << 24) +#define ARM_SMMU_FEAT_MPAM (1 << 25) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) From 1ac17fce494e5c5be9bab6829c422b1a48084c73 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 14 Sep 2021 17:57:42 +0100 Subject: [PATCH 156/247] NVIDIA: SAUCE: iommu/arm-smmu-v3: Add mpam helpers to query and set state BugLink: https://bugs.launchpad.net/bugs/2122432 To allow an iommu_group to be moved between resctrl groups as if it were a CPU thread, the mpam driver needs to be able to set the partid and pmg for the iommu_group. Use the properties in the STE, as these only apply to one stream. The MPAM driver also needs to know the maximum partid and pmg values that the SMMU can generate. This allows it to determine the system-wide common supported range of values. Add a helper to return this id register. Tested-by: Amit Singh Tomar Signed-off-by: James Morse (cherry picked from commit 46a241f45ca9b71abf900f31a0c89fcbf24c44c4 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 92 +++++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 ++ drivers/iommu/iommu.c | 6 ++ include/linux/iommu.h | 7 ++ 4 files changed, 112 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index c52eaaa2a30df..b6466a261ffb6 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3681,6 +3681,96 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } +static int arm_smmu_group_set_mpam(struct iommu_group *group, u16 partid, + u8 pmg) +{ + int i; + u32 sid; + unsigned long flags; + struct arm_smmu_ste *step; + struct iommu_domain *domain; + struct arm_smmu_device *smmu; + struct arm_smmu_master *master; + struct arm_smmu_cmdq_batch cmds; + struct arm_smmu_domain *smmu_domain; + struct arm_smmu_cmdq_ent cmd = { + .opcode = CMDQ_OP_CFGI_STE, + .cfgi = { + .leaf = true, + }, + }; + struct arm_smmu_master_domain *master_domain; + + domain = iommu_get_domain_for_group(group); + smmu_domain = to_smmu_domain(domain); + if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_MPAM)) + return -EIO; + smmu = smmu_domain->smmu; + + arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd); + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + master = master_domain->master; + + for (i = 0; i < master->num_streams; i++) { + sid = master->streams[i].id; + step = arm_smmu_get_step_for_sid(smmu, sid); + + /* These need locking if the VMSPtr is ever used */ + step->data[4] = FIELD_PREP(STRTAB_STE_4_PARTID, partid); + step->data[5] = FIELD_PREP(STRTAB_STE_5_PMG, pmg); + + cmd.cfgi.sid = sid; + arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); + } + + master->partid = partid; + master->pmg = pmg; + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + + arm_smmu_cmdq_batch_submit(smmu, &cmds); 
+ + return 0; +} + +static int arm_smmu_group_get_mpam(struct iommu_group *group, u16 *partid, + u8 *pmg) +{ + int err = -EINVAL; + unsigned long flags; + struct iommu_domain *domain; + struct arm_smmu_master *master; + struct arm_smmu_domain *smmu_domain; + struct arm_smmu_master_domain *master_domain; + + domain = iommu_get_domain_for_group(group); + smmu_domain = to_smmu_domain(domain); + if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_MPAM)) + return -EIO; + + if (!partid && !pmg) + return 0; + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + master = master_domain->master; + if (master) { + if (partid) + *partid = master->partid; + if (pmg) + *pmg = master->pmg; + err = 0; + } + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + + return err; +} + static const struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, @@ -3693,6 +3783,8 @@ static const struct iommu_ops arm_smmu_ops = { .device_group = arm_smmu_device_group, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, + .get_group_qos_params = arm_smmu_group_get_mpam, + .set_group_qos_params = arm_smmu_group_set_mpam, .page_response = arm_smmu_page_response, .def_domain_type = arm_smmu_def_domain_type, .get_viommu_size = arm_smmu_get_viommu_size, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index c5860f487f752..09a9c77d9140f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -276,6 +276,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_1_MEV (1UL << 19) #define STRTAB_STE_1_S2FWB (1UL << 25) #define STRTAB_STE_1_S1STALLD (1UL << 27) +#define STRTAB_STE_1_S1MPAM (1UL << 26) #define STRTAB_STE_1_EATS GENMASK_ULL(29, 28) #define STRTAB_STE_1_EATS_ABT 0UL @@ -306,6 +307,10 @@ 
static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4) +#define STRTAB_STE_4_PARTID GENMASK_ULL(31, 16) + +#define STRTAB_STE_5_PMG GENMASK_ULL(7, 0) + /* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */ #define STRTAB_STE_0_NESTING_ALLOWED \ cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \ @@ -858,6 +863,8 @@ struct arm_smmu_master { bool stall_enabled; unsigned int ssid_bits; unsigned int iopf_refcount; + u16 partid; + u8 pmg; }; /* SMMU private data for an IOMMU domain */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 59244c744eabd..8d8d9b68cc463 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2207,6 +2207,12 @@ struct iommu_domain *iommu_get_domain_for_dev(struct device *dev) } EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev); +struct iommu_domain *iommu_get_domain_for_group(struct iommu_group *group) +{ + return group->domain; +} +EXPORT_SYMBOL_GPL(iommu_get_domain_for_group); + /* * For IOMMU_DOMAIN_DMA implementations which already provide their own * guarantees that the group and its default domain are valid and correct. 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c30d12e16473d..773d1ba12e67a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -703,6 +703,12 @@ struct iommu_ops { struct iommu_domain *parent_domain, const struct iommu_user_data *user_data); + /* Per group IOMMU features */ + int (*get_group_qos_params)(struct iommu_group *group, u16 *partition, + u8 *perf_mon_grp); + int (*set_group_qos_params)(struct iommu_group *group, u16 partition, + u8 perf_mon_grp); + const struct iommu_domain_ops *default_domain_ops; struct module *owner; struct iommu_domain *identity_domain; @@ -909,6 +915,7 @@ extern int iommu_attach_device(struct iommu_domain *domain, extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); +extern struct iommu_domain *iommu_get_domain_for_group(struct iommu_group *group); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); From e19b8b417f9e446270e917faea9c8d437e943c77 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 31 Jul 2023 12:07:30 +0100 Subject: [PATCH 157/247] NVIDIA: SAUCE: iommu: Add helpers to get and set the QoS state BugLink: https://bugs.launchpad.net/bugs/2122432 To allow an iommu_group to be moved between resctrl groups as if it were a CPU thread, the mpam driver needs to be able to set the partid and pmg for the iommu_group. Add helpers that call the iommu driver's get/set methods for these parameters. Signed-off-by: James Morse (cherry picked from commit 5ee2d478b62586acf816699e3cc479f49b682a58 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/iommu/iommu.c | 76 +++++++++++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 15 +++++++++ 2 files changed, 91 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 8d8d9b68cc463..4ac1c7f220fee 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3861,3 +3861,79 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) return ret; } #endif /* CONFIG_IRQ_MSI_IOMMU */ + +/* + * iommu_group_set_qos_params() - Set the QoS parameters for a group + * @group: the iommu group. + * @partition: the partition label all traffic from the group should use. + * @perf_mon_grp: the performance label all traffic from the group should use. + * + * Return: 0 on success, or an error. + */ +int iommu_group_set_qos_params(struct iommu_group *group, + u16 partition, u8 perf_mon_grp) +{ + const struct iommu_ops *ops; + struct group_device *device; + int ret; + + mutex_lock(&group->mutex); + device = list_first_entry_or_null(&group->devices, typeof(*device), + list); + if (!device) { + ret = -ENODEV; + goto out_unlock; + } + + ops = dev_iommu_ops(device->dev); + if (!ops->set_group_qos_params) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = ops->set_group_qos_params(group, partition, perf_mon_grp); + +out_unlock: + mutex_unlock(&group->mutex); + + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommu_group_set_qos_params, "IOMMUFD_INTERNAL"); + +/* + * iommu_group_get_qos_params() - Get the QoS parameters for a group + * @group: the iommu group. + * @partition: the partition label all traffic from the group uses. + * @perf_mon_grp: the performance label all traffic from the group uses. + * + * Return: 0 on success, or an error. 
+ */ +int iommu_group_get_qos_params(struct iommu_group *group, + u16 *partition, u8 *perf_mon_grp) +{ + const struct iommu_ops *ops; + struct group_device *device; + int ret; + + mutex_lock(&group->mutex); + device = list_first_entry_or_null(&group->devices, typeof(*device), + list); + if (!device) { + ret = -ENODEV; + goto out_unlock; + } + + ops = dev_iommu_ops(device->dev); + if (!ops->get_group_qos_params) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = ops->get_group_qos_params(group, partition, perf_mon_grp); + +out_unlock: + mutex_unlock(&group->mutex); + + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommu_group_get_qos_params, "IOMMUFD_INTERNAL"); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 773d1ba12e67a..a3eefd82760ff 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1191,6 +1191,10 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); ioasid_t iommu_alloc_global_pasid(struct device *dev); void iommu_free_global_pasid(ioasid_t pasid); +int iommu_group_set_qos_params(struct iommu_group *group, + u16 partition, u8 perf_mon_grp); +int iommu_group_get_qos_params(struct iommu_group *group, + u16 *partition, u8 *perf_mon_grp); #else /* CONFIG_IOMMU_API */ struct iommu_ops {}; @@ -1514,6 +1518,17 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) } static inline void iommu_free_global_pasid(ioasid_t pasid) {} +static inline int iommu_group_set_qos_params(struct iommu_group *group, + u16 partition, u8 perf_mon_grp) +{ + return -ENODEV; +} + +static inline int iommu_group_get_qos_params(struct iommu_group *group, + u16 *partition, u8 *perf_mon_grp) +{ + return -ENODEV; +} #endif /* CONFIG_IOMMU_API */ #ifdef CONFIG_IRQ_MSI_IOMMU From 67c4db163f0fb2efd66f5ac816162e46674b3ba9 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Sep 2021 16:19:43 +0100 Subject: [PATCH 158/247] NVIDIA: SAUCE: iommu: Add helpers to retrieve iommu_groups by id or kobject BugLink: 
https://bugs.launchpad.net/bugs/2122432 ARM SMMU with MPAM support are able to mark streams of traffic with the QoS labels MPAM uses. The user-space interface for MPAM is the resctrl filesystem, which allows threads to be moved between groups, its natural to do the same for iommu_groups. The resctrl interface lists threads, so will also need to list iommu_groups, it will be necessary to walk the list of iommu_groups. To ensure this matches what user-space sees via sysfs, it is best to walk the kobjects. When making a change, resctrl will only have the id of a group. To avoid walking the list of kobjects in this case, add iommu_group_get_by_id(). Signed-off-by: James Morse (cherry picked from commit 43349f9d05cdd700ffe08a4d5ee2dbcaa9a86f84 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/iommu/iommu.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 12 ++++++++++++ 2 files changed, 46 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 4ac1c7f220fee..669492f2f9f76 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1087,6 +1087,40 @@ struct iommu_group *iommu_group_alloc(void) } EXPORT_SYMBOL_GPL(iommu_group_alloc); +struct iommu_group *iommu_group_get_from_kobj(struct kobject *group_kobj) +{ + struct iommu_group *group; + + if (!iommu_group_kset || !group_kobj) + return NULL; + + group = container_of(group_kobj, struct iommu_group, kobj); + + kobject_get(group->devices_kobj); + kobject_put(&group->kobj); + + return group; +} + +struct iommu_group *iommu_group_get_by_id(int id) +{ + struct kobject *group_kobj; + const char *name; + + if (!iommu_group_kset) + return NULL; + + name = kasprintf(GFP_KERNEL, "%d", id); + if (!name) + return NULL; + + group_kobj = kset_find_obj(iommu_group_kset, name); + kfree(name); + + return iommu_group_get_from_kobj(group_kobj); +} 
+EXPORT_SYMBOL_GPL(iommu_group_get_by_id); + /** * iommu_group_get_iommudata - retrieve iommu_data registered for a group * @group: the group diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a3eefd82760ff..2aaa62dacdd0f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -909,6 +909,8 @@ static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) { return iommu_paging_domain_alloc_flags(dev, 0); } +struct iommu_group *iommu_group_get_from_kobj(struct kobject *group_kobj); +extern struct iommu_group *iommu_group_get_by_id(int id); extern void iommu_domain_free(struct iommu_domain *domain); extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); @@ -1222,6 +1224,16 @@ static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) return ERR_PTR(-ENODEV); } +static inline struct iommu_group *iommu_group_get_from_kobj(struct kobject *group_kobj) +{ + return NULL; +} + +static inline struct iommu_group *iommu_group_get_by_id(int id) +{ + return NULL; +} + static inline void iommu_domain_free(struct iommu_domain *domain) { } From 6f483549b73c37cca1e3ee3118888d00d20ca113 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 31 Jul 2023 13:10:25 +0100 Subject: [PATCH 159/247] NVIDIA: SAUCE: iommu: Add helper to retrieve iommu kset BugLink: https://bugs.launchpad.net/bugs/2122432 To walk the list of iommu groups visible in sysfs, resctrl needs access to iommu_group_kset. Expose it. Signed-off-by: James Morse (cherry picked from commit d133049deda711d40675b0fd3e93677d81046b01 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/iommu/iommu.c | 5 +++++ include/linux/iommu.h | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 669492f2f9f76..db770b73e3a8f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1121,6 +1121,11 @@ struct iommu_group *iommu_group_get_by_id(int id) } EXPORT_SYMBOL_GPL(iommu_group_get_by_id); +struct kset *iommu_get_group_kset(void) +{ + return kset_get(iommu_group_kset); +} + /** * iommu_group_get_iommudata - retrieve iommu_data registered for a group * @group: the group diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2aaa62dacdd0f..b74228f9f1ce0 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -968,6 +968,7 @@ extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); extern int iommu_group_id(struct iommu_group *group); +struct kset *iommu_get_group_kset(void); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); int iommu_set_pgtable_quirks(struct iommu_domain *domain, @@ -1392,6 +1393,11 @@ static inline int iommu_group_id(struct iommu_group *group) return -ENODEV; } +static inline struct kset *iommu_get_group_kset(void) +{ + return NULL; +} + static inline int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks) { From d7da71eb1abcc675df175609235b550bba3b9e9f Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Sep 2021 16:45:41 +0100 Subject: [PATCH 160/247] NVIDIA: SAUCE: kobject: Add kset_get_next_obj() to allow a kset to be walked BugLink: https://bugs.launchpad.net/bugs/2122432 To expose iommu_groups via the resctrl filesystem, the resctrl driver needs to be able to walk the list of iommu_groups. These are exposed via sysfs as a kset. Add kset_get_next_obj() to allow resctrl to walk the kobjects in the kset. 
Signed-off-by: James Morse (cherry picked from commit ac5ef3495532d2cc9994a2fa8750b7b1b591cff1 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- include/linux/kobject.h | 2 ++ lib/kobject.c | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/linux/kobject.h b/include/linux/kobject.h index c8219505a79f9..514e4cf1f0f54 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -200,6 +200,8 @@ static inline const struct kobj_type *get_ktype(const struct kobject *kobj) struct kobject *kset_find_obj(struct kset *, const char *); +struct kobject *kset_get_next_obj(struct kset *kset, struct kobject *prev); + /* The global /sys/kernel/ kobject for people to chain off of */ extern struct kobject *kernel_kobj; /* The global /sys/kernel/mm/ kobject for people to chain off of */ diff --git a/lib/kobject.c b/lib/kobject.c index abe5f5b856ceb..b1fdd6ad60f1b 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -920,6 +920,27 @@ struct kobject *kset_find_obj(struct kset *kset, const char *name) } EXPORT_SYMBOL_GPL(kset_find_obj); +struct kobject *kset_get_next_obj(struct kset *kset, struct kobject *prev) +{ + struct kobject *k; + + spin_lock(&kset->list_lock); + + if (!prev) + k = list_first_entry_or_null(&kset->list, typeof(*k), entry); + else + k = list_next_entry(prev, entry); + + if (list_entry_is_head(k, &kset->list, entry)) + k = NULL; + + kobject_get(k); + spin_unlock(&kset->list_lock); + kobject_put(prev); + + return k; +} + static void kset_release(struct kobject *kobj) { struct kset *kset = container_of(kobj, struct kset, kobj); From 1228dea0bda7487d2a7b7dc3b4a7bb36ed4b71a4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 31 Jul 2023 09:51:04 +0100 Subject: [PATCH 161/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add iommu helpers to get/set the partid and pmg BugLink: https://bugs.launchpad.net/bugs/2122432 SMMU that 
support MPAM can be configured to use a particular partid and pmg for a stream. The assignment of an iommu_group and its corresponding streams should be done via resctrl. Add helpers similar to setting a closid/rmid on a task. We need the same shifting if the CPUs are using CDP. The SMMU only takes one partid, conceptually its always making data accesses. Signed-off-by: James Morse (cherry picked from commit 2393c3cf73cf642b67533ade39094fb9dd9d053c https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/Kconfig | 1 + drivers/resctrl/mpam_resctrl.c | 53 ++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index 15163aabcdc93..b6d15ca5b6495 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -27,3 +27,4 @@ config ARM64_MPAM_RESCTRL_FS default y if ARM64_MPAM_DRIVER && RESCTRL_FS select RESCTRL_RMID_DEPENDS_ON_CLOSID select RESCTRL_ASSIGN_FIXED + select RESCTRL_IOMMU if ARM_SMMU_V3 diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index d3241e6666ca3..7be05de8a22a2 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -299,6 +300,58 @@ bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid) return (tsk_closid == closid) && (tsk_rmid == rmid); } +int resctrl_arch_set_iommu_closid_rmid(struct iommu_group *group, u32 closid, + u32 rmid) +{ + u16 partid; + + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU)) + return 0; + + if (cdp_enabled) + partid = closid << 1; + else + partid = closid; + + return iommu_group_set_qos_params(group, partid, rmid); +} + +bool resctrl_arch_match_iommu_closid(struct iommu_group *group, u32 closid) +{ + u16 partid; + int err = iommu_group_get_qos_params(group, &partid, NULL); + + 
if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU)) + return false; + + if (err) + return false; + + if (cdp_enabled) + partid >>= 1; + + return (partid == closid); +} + +bool resctrl_arch_match_iommu_closid_rmid(struct iommu_group *group, + u32 closid, u32 rmid) +{ + u8 pmg; + u16 partid; + int err = iommu_group_get_qos_params(group, &partid, &pmg); + + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU)) + return false; + + if (err) + return false; + + if (cdp_enabled) + partid >>= 1; + + return (partid == closid) && (rmid == pmg); +} + struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) { if (l >= RDT_NUM_RESOURCES) From c83ad8462ee4f8221876d3943a997aa482bf7a95 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Sep 2021 17:11:58 +0100 Subject: [PATCH 162/247] NVIDIA: SAUCE: fs/resctrl: Add support for assigning iommu_groups to resctrl groups BugLink: https://bugs.launchpad.net/bugs/2122432 Arm's MPAM has support for assigning devices behind an IOMMU to a control or monitor group. This can be used for device-passthrough for a VM, or user-space drivers using VFIO to ensure the device is either in the same control group as the CPU threads. Alternatively, the iommu_group may be assigned to a different control group with preferential schema values. Extend the resctrl tasks file to include iommu_groups. These appear as 'iommu_group:0', where 0 is the group number that can be found from /sys/kernel/iommu_groups/. iommu_groups can be moved between resctrl groups by writing this string in the same way as tasks are moved. No state is preserved by resctrl, an iommu_group that disappears will no longer be listed as being part of a resctrl group. A new iommu_group will appear in the default group. Add helpers to list and move iommu_groups. Architecture specific helpers are used to apply the closid/rmid to the iommu_group due to the way MPAM emulates CDP. 
Tested-by: Amit Singh Tomar Signed-off-by: James Morse (cherry picked from commit 98b622c413ee64b8e05f93f0ff5f8cf85776afba https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/acpi/arm64/mpam.c | 91 +++++++++++++++++++++++++++++++++-- fs/resctrl/Kconfig | 6 +++ fs/resctrl/rdtgroup.c | 99 ++++++++++++++++++++++++++++++++++++++- include/linux/resctrl.h | 28 +++++++++++ 4 files changed, 220 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c index 51c6f5fd4a5e0..845aedf61993d 100644 --- a/drivers/acpi/arm64/mpam.c +++ b/drivers/acpi/arm64/mpam.c @@ -95,17 +95,51 @@ static void acpi_mpam_parse_irqs(struct platform_device *pdev, res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "error"); } -static int acpi_mpam_parse_resource(struct mpam_msc *msc, +#define UUID_MPAM_INTERCONNECT_TABLE "fe2bd645-033b-49e6-9479-2e0b8b21d1cd" + +struct acpi_mpam_interconnect_descriptor_table { + u8 type_uuid[16]; + u32 num_descriptors; +}; + +struct acpi_mpam_interconnect_descriptor { + u32 source_id; + u32 destination_id; + u8 link_type; + u8 reserved[3]; +}; + +static int acpi_mpam_parse_resource(struct acpi_mpam_msc_node *tbl_msc, + struct mpam_msc *msc, struct acpi_mpam_resource_node *res) { + struct acpi_mpam_interconnect_descriptor_table *tbl_int_tbl; + struct acpi_mpam_interconnect_descriptor *tbl_int; + guid_t int_tbl_uuid, spec_uuid; int level, nid; u32 cache_id; + off_t offset; + /* + * Class IDs are somewhat arbitrary, but need to be co-ordinated. + * 0-N are caches, + * 64, 65: Interconnect, but ideally these would appear between the + * classes the controls are adjacent to. + * 128: SMMU, + * 192-192+level: Memory Side Caches, nothing checks that N is a + * small number. + * 255: Memory Controllers + * + * ACPI devices would need a class id allocated based on the _HID. 
+ * + * Classes that the mpam driver can't currently plumb into resctrl + * are registered as UNKNOWN. + */ switch (res->locator_type) { case ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE: cache_id = res->locator.cache_locator.cache_reference; level = find_acpi_cache_level_from_id(cache_id); - if (level <= 0) { + if (level <= 0 || level >= 64) { pr_err_once("Bad level (%d) for cache with id %u\n", level, cache_id); return -EINVAL; } @@ -120,6 +154,57 @@ static int acpi_mpam_parse_resource(struct mpam_msc *msc, } return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY, 255, nid); + case ACPI_MPAM_LOCATION_TYPE_SMMU: + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_UNKNOWN, + 128, res->locator.smmu_locator.smmu_interface); + case ACPI_MPAM_LOCATION_TYPE_MEMORY_CACHE: + cache_id = res->locator.mem_cache_locator.reference; + level = res->locator.mem_cache_locator.level; + if (192 + level >= 255) { + pr_err_once("Bad level (%u) for memory side cache with reference %u\n", + level, cache_id); + return -EINVAL; + } + + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE, + 192 + level, cache_id); + + case ACPI_MPAM_LOCATION_TYPE_INTERCONNECT: + /* Find the descriptor table, and check it lands in the parent msc */ + offset = res->locator.interconnect_ifc_locator.inter_connect_desc_tbl_off; + if (offset >= tbl_msc->length) { + pr_err_once("Bad offset (%lu) for interconnect descriptor on msc %u\n", + offset, tbl_msc->identifier); + return -EINVAL; + } + tbl_int_tbl = ACPI_ADD_PTR(struct acpi_mpam_interconnect_descriptor_table, + tbl_msc, offset); + guid_parse(UUID_MPAM_INTERCONNECT_TABLE, &spec_uuid); + import_guid(&int_tbl_uuid, tbl_int_tbl->type_uuid); + if (guid_equal(&spec_uuid, &int_tbl_uuid)) { + pr_err_once("Bad UUID for interconnect descriptor on msc %u\n", + tbl_msc->identifier); + return -EINVAL; + } + + offset += sizeof(*tbl_int_tbl); + offset += tbl_int_tbl->num_descriptors * sizeof(*tbl_int); + if (offset >= tbl_msc->length) { + pr_err_once("Bad 
num_descriptors (%u) for interconnect descriptor on msc %u\n", + tbl_int_tbl->num_descriptors, tbl_msc->identifier); + return -EINVAL; + } + + tbl_int = ACPI_ADD_PTR(struct acpi_mpam_interconnect_descriptor, + tbl_int_tbl, sizeof(*tbl_int_tbl)); + cache_id = tbl_int->source_id; + + /* Unknown link type? */ + if (tbl_int->link_type != 0 && tbl_int->link_type == 1) + return 0; + + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_UNKNOWN, + 64 + tbl_int->link_type, cache_id); default: /* These get discovered later and are treated as unknown */ return 0; @@ -150,7 +235,7 @@ int acpi_mpam_parse_resources(struct mpam_msc *msc, return -EINVAL; } - err = acpi_mpam_parse_resource(msc, resource); + err = acpi_mpam_parse_resource(tbl_msc, msc, resource); if (err) return err; diff --git a/fs/resctrl/Kconfig b/fs/resctrl/Kconfig index 21671301bd8a4..145d837c190a3 100644 --- a/fs/resctrl/Kconfig +++ b/fs/resctrl/Kconfig @@ -37,3 +37,9 @@ config RESCTRL_RMID_DEPENDS_ON_CLOSID Enabled by the architecture when the RMID values depend on the CLOSID. This causes the CLOSID allocator to search for CLOSID with clean RMID. + +config RESCTRL_IOMMU + bool + help + Enabled by the architecture when some IOMMU are able to be configured + with CLOSID/RMID. 
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 9d7a65df0ab49..e7a48e373711a 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -763,10 +764,65 @@ static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, return ret; } +static int rdtgroup_move_iommu(int iommu_group_id, struct rdtgroup *rdtgrp, + struct kernfs_open_file *of) +{ + const struct cred *cred = current_cred(); + struct iommu_group *iommu_group; + int err; + + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID)) { + rdt_last_cmd_printf("No permission to move iommu_group %d\n", + iommu_group_id); + return -EPERM; + } + + iommu_group = iommu_group_get_by_id(iommu_group_id); + if (!iommu_group) { + rdt_last_cmd_printf("No matching iommu_group %d\n", + iommu_group_id); + return -ESRCH; + } + + if (rdtgrp->type == RDTMON_GROUP && + !resctrl_arch_match_iommu_closid(iommu_group, + rdtgrp->mon.parent->closid)) { + rdt_last_cmd_puts("Can't move iommu_group to different control group\n"); + err = -EINVAL; + } else { + err = resctrl_arch_set_iommu_closid_rmid(iommu_group, + rdtgrp->closid, + rdtgrp->mon.rmid); + } + + iommu_group_put(iommu_group); + + return err; +} + +static bool string_is_iommu_group(char *buf, int *val) +{ + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU) || + !static_branch_unlikely(&resctrl_abi_playground)) + return false; + + if (strlen(buf) <= strlen("iommu_group:")) + return false; + + if (strncmp(buf, "iommu_group:", strlen("iommu_group:"))) + return false; + + buf += strlen("iommu_group:"); + + return !kstrtoint(buf, 0, val); +} + static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdtgroup *rdtgrp; + int iommu_group_id; + bool is_iommu; char *pid_str; int ret = 0; pid_t pid; @@ -788,7 +844,10 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, while (buf && buf[0] != '\0' && buf[0] != '\n') { pid_str = 
strim(strsep(&buf, ",")); - if (kstrtoint(pid_str, 0, &pid)) { + is_iommu = string_is_iommu_group(pid_str, &iommu_group_id); + if (is_iommu) + ret = rdtgroup_move_iommu(iommu_group_id, rdtgrp, of); + else if (kstrtoint(pid_str, 0, &pid)) { rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); ret = -EINVAL; break; @@ -813,6 +872,42 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, return ret ?: nbytes; } +static bool iommu_matches_rdtgroup(struct iommu_group *group, struct rdtgroup *r) +{ + if (r->type == RDTCTRL_GROUP) + return resctrl_arch_match_iommu_closid(group, r->closid); + + return resctrl_arch_match_iommu_closid_rmid(group, r->closid, + r->mon.rmid); +} + +static void show_rdt_iommu(struct rdtgroup *r, struct seq_file *s) +{ + struct kset *iommu_groups; + struct iommu_group *group; + struct kobject *group_kobj = NULL; + + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU) || + !static_branch_unlikely(&resctrl_abi_playground)) + return; + + iommu_groups = iommu_get_group_kset(); + + while ((group_kobj = kset_get_next_obj(iommu_groups, group_kobj))) { + /* iommu_group_get_from_kobj() wants to drop a reference */ + kobject_get(group_kobj); + + group = iommu_group_get_from_kobj(group_kobj); + if (!group) + continue; + + if (iommu_matches_rdtgroup(group, r)) + seq_printf(s, "iommu_group:%s\n", group_kobj->name); + } + + kset_put(iommu_groups); +} + static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) { struct task_struct *p, *t; @@ -827,6 +922,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) } } rcu_read_unlock(); + + show_rdt_iommu(r, s); } static int rdtgroup_tasks_show(struct kernfs_open_file *of, diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index e1f9c46ea4661..7e00e28b885a1 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -3,6 +3,7 @@ #define _RESCTRL_H #include +#include #include #include #include @@ -662,6 +663,7 @@ extern unsigned int resctrl_rmid_realloc_limit; 
int resctrl_init(void); void resctrl_exit(void); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK u64 resctrl_arch_get_prefetch_disable_bits(void); int resctrl_arch_pseudo_lock_fn(void *_plr); @@ -675,4 +677,30 @@ static inline int resctrl_arch_measure_cycles_lat_fn(void *_plr) { return 0; } static inline int resctrl_arch_measure_l2_residency(void *_plr) { return 0; } static inline int resctrl_arch_measure_l3_residency(void *_plr) { return 0; } #endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ + +/* When supported, the architecture must implement these */ +#ifdef CONFIG_RESCTRL_IOMMU +int resctrl_arch_set_iommu_closid_rmid(struct iommu_group *group, u32 closid, + u32 rmid); +bool resctrl_arch_match_iommu_closid(struct iommu_group *group, u32 closid); +bool resctrl_arch_match_iommu_closid_rmid(struct iommu_group *group, u32 closid, + u32 rmid); +#else +static inline int resctrl_arch_set_iommu_closid_rmid(struct iommu_group *group, + u32 closid, u32 rmid) +{ + return -EOPNOTSUPP; +} +static inline bool resctrl_arch_match_iommu_closid(struct iommu_group *group, + u32 closid) +{ + return false; +} +static inline bool +resctrl_arch_match_iommu_closid_rmid(struct iommu_group *group, + u32 closid, u32 rmid) +{ + return false; +} +#endif /* CONFIG_RESCTRL_IOMMU */ #endif /* _RESCTRL_H */ From f9f94b6b3d637140f4f6b287cc4af426e5cf792f Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 21 May 2024 11:26:16 +0100 Subject: [PATCH 163/247] NVIDIA: SAUCE: firmware: arm_scmi: add MPAM-FB SCMI protocol stub BugLink: https://bugs.launchpad.net/bugs/2122432 The Arm MPAM Firmware-backed (Fb) Profile describes an SCMI based protocol to access "Memory System Components" (MSCs) in an "Memory System Resource Partitioning And Monitoring" (MPAM) enabled system. Although this SCMI protocol follows the usual protocol properties, it will not be described in the SCMI specifications. 
Also since ACPI based systems will need to use this MPAM-fb profile, we do not follow the usual way of describing each protocol function as a function in the SCMI framework system. Instead there is one generic transport function, that takes a preformatted buffer and transfers this to the MSC agent. Signed-off-by: Andre Przywara Signed-off-by: James Morse (cherry picked from commit 0f353a5efef50bf98ca01fc9900edb1c68576439 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/firmware/arm_scmi/Makefile | 2 +- drivers/firmware/arm_scmi/driver.c | 2 + drivers/firmware/arm_scmi/mpam.c | 67 +++++++++++++++++++++++++++ drivers/firmware/arm_scmi/protocols.h | 1 + drivers/resctrl/mpam_devices.c | 2 +- include/linux/scmi_protocol.h | 12 +++++ 6 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 drivers/firmware/arm_scmi/mpam.c diff --git a/drivers/firmware/arm_scmi/Makefile b/drivers/firmware/arm_scmi/Makefile index 780cd62b2f78a..caa61f16d12fc 100644 --- a/drivers/firmware/arm_scmi/Makefile +++ b/drivers/firmware/arm_scmi/Makefile @@ -8,7 +8,7 @@ scmi-driver-$(CONFIG_ARM_SCMI_RAW_MODE_SUPPORT) += raw_mode.o scmi-transport-$(CONFIG_ARM_SCMI_HAVE_SHMEM) = shmem.o scmi-transport-$(CONFIG_ARM_SCMI_HAVE_MSG) += msg.o scmi-protocols-y := base.o clock.o perf.o power.o reset.o sensors.o system.o voltage.o powercap.o -scmi-protocols-y += pinctrl.o +scmi-protocols-y += pinctrl.o mpam.o scmi-module-objs := $(scmi-driver-y) $(scmi-protocols-y) $(scmi-transport-y) obj-$(CONFIG_ARM_SCMI_PROTOCOL) += transports/ diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c index a8f2247feab9d..f85077887225d 100644 --- a/drivers/firmware/arm_scmi/driver.c +++ b/drivers/firmware/arm_scmi/driver.c @@ -3443,6 +3443,7 @@ static int __init scmi_driver_init(void) scmi_system_register(); scmi_powercap_register(); scmi_pinctrl_register(); + 
scmi_mpam_register(); return platform_driver_register(&scmi_driver); } @@ -3461,6 +3462,7 @@ static void __exit scmi_driver_exit(void) scmi_system_unregister(); scmi_powercap_unregister(); scmi_pinctrl_unregister(); + scmi_mpam_unregister(); platform_driver_unregister(&scmi_driver); diff --git a/drivers/firmware/arm_scmi/mpam.c b/drivers/firmware/arm_scmi/mpam.c new file mode 100644 index 0000000000000..21a7d197ab60a --- /dev/null +++ b/drivers/firmware/arm_scmi/mpam.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * System Control and Management Interface (SCMI) MPAM Protocol + * + * Copyright (C) 2024 ARM Ltd. + */ + +#include "common.h" +#include + +#define SCMI_PROTOCOL_SUPPORTED_VERSION 0x10000 + +static int scmi_mpam_transfer_buf(const struct scmi_protocol_handle *ph, + u8 msg_id, void *msg_buf, size_t msg_len, + u32 *ret_val) +{ + int ret; + struct scmi_xfer *t; + + ret = ph->xops->xfer_get_init(ph, msg_id, msg_len, + ret_val ? sizeof(*ret_val) : 0, &t); + if (ret) + return ret; + + memcpy(t->tx.buf, msg_buf, msg_len); + + ret = ph->xops->do_xfer(ph, t); + if (!ret && ret_val) { + u32 value; + + memcpy(&value, t->rx.buf, sizeof(value)); + *ret_val = le32_to_cpu((__le32)value); + } + + ph->xops->xfer_put(ph, t); + + return ret; +} + +static const struct scmi_mpam_proto_ops mpam_proto_ops = { + .mpam_transfer_buf = scmi_mpam_transfer_buf, +}; + +static int scmi_mpam_protocol_init(const struct scmi_protocol_handle *ph) +{ + int ret; + u32 version; + + ret = ph->xops->version_get(ph, &version); + if (ret) + return ret; + + dev_dbg(ph->dev, "SCMI MPAM Version %d.%d\n", + PROTOCOL_REV_MAJOR(version), PROTOCOL_REV_MINOR(version)); + + return 0; +} + +static const struct scmi_protocol scmi_mpam = { + .id = SCMI_PROTOCOL_MPAM, + .owner = THIS_MODULE, + .instance_init = &scmi_mpam_protocol_init, + .ops = &mpam_proto_ops, + .supported_version = SCMI_PROTOCOL_SUPPORTED_VERSION, +}; + +DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(mpam, scmi_mpam) diff --git 
a/drivers/firmware/arm_scmi/protocols.h b/drivers/firmware/arm_scmi/protocols.h index d62c4469d1fd9..ad6fcfcfdd8d8 100644 --- a/drivers/firmware/arm_scmi/protocols.h +++ b/drivers/firmware/arm_scmi/protocols.h @@ -379,5 +379,6 @@ DECLARE_SCMI_REGISTER_UNREGISTER(sensors); DECLARE_SCMI_REGISTER_UNREGISTER(voltage); DECLARE_SCMI_REGISTER_UNREGISTER(system); DECLARE_SCMI_REGISTER_UNREGISTER(powercap); +DECLARE_SCMI_REGISTER_UNREGISTER(mpam); #endif /* _SCMI_PROTOCOLS_H */ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index a57d53cbc6dc5..ba8196fe43c53 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2266,7 +2266,6 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) u32 tmp; char name[20]; struct mpam_msc *msc; - struct resource *msc_res; struct device *dev = &pdev->dev; lockdep_assert_held(&mpam_list_lock); @@ -2305,6 +2304,7 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) if (msc->iface == MPAM_IFACE_MMIO) { void __iomem *io; + struct resource *msc_res; io = devm_platform_get_and_ioremap_resource(pdev, 0, &msc_res); diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 688466a0e8162..33989a689e6fc 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -877,6 +877,17 @@ struct scmi_notify_ops { struct notifier_block *nb); }; +/** + * struct scmi_mpam_proto_ops - operations provided by SCMI MPAM Protocol + * + * @mpam_transfer_buf: transfer an SCMI MPAM message to the agent + */ +struct scmi_mpam_proto_ops { + int (*mpam_transfer_buf)(const struct scmi_protocol_handle *ph, + u8 msg_id, void *msg_buf, size_t msg_len, + u32 *ret_val); +}; + /** * struct scmi_handle - Handle returned to ARM SCMI clients for usage. 
* @@ -926,6 +937,7 @@ enum scmi_std_protocol { SCMI_PROTOCOL_VOLTAGE = 0x17, SCMI_PROTOCOL_POWERCAP = 0x18, SCMI_PROTOCOL_PINCTRL = 0x19, + SCMI_PROTOCOL_MPAM = 0x1a, }; enum scmi_system_events { From 87a0d80952ef641a4d846771cf81d9f4c6d3e659 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 23 Sep 2025 17:20:22 +0100 Subject: [PATCH 164/247] NVIDIA: SAUCE: arm_mpam: add MPAM-FB MSC firmware access support BugLink: https://bugs.launchpad.net/bugs/2122432 The Arm MPAM Firmware-backed (Fb) Profile document[1] describes an alternative way of accessing the "Memory System Components" (MSC) in an MPAM enabled system. Normally the MSCs are MMIO mapped, but in some implementations this might not be possible (MSC located outside of the local socket, MSC mapped secure-only) or desirable (direct MMIO access too slow or needs to be mediated through a control processor). MPAM-fb standardises a protocol to abstract MSC accesses, building on the SCMI protocol. Add functions that do an MSC read or write access by redirecting the request through a firmware interface. This can either be through any supported SCMI transport, described via devicetree nodes, or via an ACPI PCC shared memory and mailbox combination. Signed-off-by: Andre Przywara Signed-off-by: James Morse (cherry picked from commit 2813c2c356c721ffe6b36529d336f8ea561f2753 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/Makefile | 2 +- drivers/resctrl/mpam_devices.c | 64 +++++++++- drivers/resctrl/mpam_fb.c | 207 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_fb.h | 26 ++++ drivers/resctrl/mpam_internal.h | 7 ++ include/linux/arm_mpam.h | 1 + 6 files changed, 300 insertions(+), 7 deletions(-) create mode 100644 drivers/resctrl/mpam_fb.c create mode 100644 drivers/resctrl/mpam_fb.h diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile index 4f6d0e81f9b8f..097c036724e97 100644 --- a/drivers/resctrl/Makefile +++ b/drivers/resctrl/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o -mpam-y += mpam_devices.o +mpam-y += mpam_devices.o mpam_fb.o mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS) += mpam_resctrl.o ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index ba8196fe43c53..fd331f4adc2f2 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -33,7 +33,10 @@ #include #include +#include + #include "mpam_internal.h" +#include "mpam_fb.h" /* Values for the T241 errata workaround */ #define T241_CHIPS_MAX 4 @@ -159,6 +162,16 @@ static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg) { WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + if (msc->iface == MPAM_IFACE_SCMI) { + u32 ret; + + mpam_fb_send_read_request(&msc->mpam_fb_chan, + msc->mpam_fb_msc_id, reg, &ret); + return ret; + } + + WARN_ON_ONCE(reg + sizeof(u32) > msc->mapped_hwpage_sz); + return readl_relaxed(msc->mapped_hwpage + reg); } @@ -172,10 +185,15 @@ static inline u32 _mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg) static void __mpam_write_reg(struct mpam_msc *msc, u16 reg, u32 val) { - WARN_ON_ONCE(reg + sizeof(u32) >= msc->mapped_hwpage_sz); WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); - writel_relaxed(val, msc->mapped_hwpage + reg); + if (msc->iface == MPAM_IFACE_SCMI) { + 
mpam_fb_send_write_request(&msc->mpam_fb_chan, + msc->mpam_fb_msc_id, reg, val); + } else { + WARN_ON_ONCE(reg + sizeof(u32) >= msc->mapped_hwpage_sz); + writel_relaxed(val, msc->mapped_hwpage + reg); + } } static inline void _mpam_write_partsel_reg(struct mpam_msc *msc, u16 reg, u32 val) @@ -2246,6 +2264,11 @@ static void mpam_msc_destroy(struct mpam_msc *msc) add_to_garbage(msc); } +static void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg) +{ + /* TODO: wake up tasks blocked on this MSC's PCC channel */ +} + static void mpam_msc_drv_remove(struct platform_device *pdev) { struct mpam_msc *msc = platform_get_drvdata(pdev); @@ -2263,9 +2286,9 @@ static void mpam_msc_drv_remove(struct platform_device *pdev) static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) { int err; - u32 tmp; char name[20]; struct mpam_msc *msc; + struct of_phandle_args of_args; struct device *dev = &pdev->dev; lockdep_assert_held(&mpam_list_lock); @@ -2297,10 +2320,16 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) if (err) return ERR_PTR(err); - if (device_property_read_u32(&pdev->dev, "pcc-channel", &tmp)) - msc->iface = MPAM_IFACE_MMIO; - else + if (!device_property_read_u32(&pdev->dev, "pcc-channel", + &msc->pcc_subspace_id)) { msc->iface = MPAM_IFACE_PCC; + } else if (!of_parse_phandle_with_fixed_args(pdev->dev.of_node, + "mpam-fb", 1, 0, + &of_args)) { + msc->iface = MPAM_IFACE_SCMI; + } else { + msc->iface = MPAM_IFACE_MMIO; + } if (msc->iface == MPAM_IFACE_MMIO) { void __iomem *io; @@ -2314,6 +2343,29 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) } msc->mapped_hwpage_sz = msc_res->end - msc_res->start; msc->mapped_hwpage = io; + } else if (msc->iface == MPAM_IFACE_PCC) { + msc->pcc_cl.dev = &pdev->dev; + msc->pcc_cl.rx_callback = mpam_pcc_rx_callback; + msc->pcc_cl.tx_block = false; + msc->pcc_cl.tx_tout = 1000; /* 1s */ + msc->pcc_cl.knows_txdone = false; + + msc->pcc_chan = 
pcc_mbox_request_channel(&msc->pcc_cl, + msc->pcc_subspace_id); + if (IS_ERR(msc->pcc_chan)) { + pr_err("Failed to request MSC PCC channel\n"); + return (void *)msc->pcc_chan; + } + } else if (msc->iface == MPAM_IFACE_SCMI) { + err = mpam_fb_connect_channel(of_args.np, + &msc->mpam_fb_chan); + if (err < 0) + return ERR_PTR(err); + + if (of_args.args_count > 0) + msc->mpam_fb_msc_id = of_args.args[0]; + else + msc->mpam_fb_msc_id = 0; } list_add_rcu(&msc->all_msc_list, &mpam_all_msc); diff --git a/drivers/resctrl/mpam_fb.c b/drivers/resctrl/mpam_fb.c new file mode 100644 index 0000000000000..af87a9e934cd0 --- /dev/null +++ b/drivers/resctrl/mpam_fb.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2024 Arm Ltd. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mpam_fb.h" + +#define MPAM_MSC_ATTRIBUTES 0x3 +#define MPAM_MSC_READ 0x4 +#define MPAM_MSC_WRITE 0x5 + +static const struct scmi_mpam_proto_ops *mpam_scmi_ops; + +static DEFINE_MUTEX(scmi_agent_list_mutex); +static LIST_HEAD(smci_agent_list); + +struct scmi_mpam_agent { + struct list_head list; + struct device_node *of_node; + struct scmi_protocol_handle *ph_handle; +}; + +#define SCMI_BUF_LENGTH_IDX 4 +#define SCMI_BUF_HEADER_IDX 5 +#define SCMI_BUF_PAYLOAD_IDX 6 +#define SCMI_READ_MSG_SIZE 9 +#define SCMI_WRITE_MSG_SIZE 10 + +static int mpam_fb_build_read_message(int msc_id, int reg, u32 *msg_buf) +{ + memset(msg_buf, 0, SCMI_READ_MSG_SIZE * sizeof(u32)); + + msg_buf[SCMI_BUF_LENGTH_IDX] = SCMI_READ_MSG_SIZE * sizeof(u32); + msg_buf[SCMI_BUF_HEADER_IDX] = MPAM_MSC_READ | (0x1a << 10); + msg_buf[SCMI_BUF_PAYLOAD_IDX + 0] = msc_id; + msg_buf[SCMI_BUF_PAYLOAD_IDX + 2] = reg; + + return SCMI_READ_MSG_SIZE * sizeof(u32); +} + +static int mpam_fb_build_write_message(int msc_id, int reg, u32 val, + u32 *msg_buf) +{ + memset(msg_buf, 0, SCMI_WRITE_MSG_SIZE * 
sizeof(u32)); + + msg_buf[SCMI_BUF_LENGTH_IDX] = SCMI_WRITE_MSG_SIZE * sizeof(u32); + msg_buf[SCMI_BUF_HEADER_IDX] = MPAM_MSC_WRITE | (0x1a << 10); + msg_buf[SCMI_BUF_PAYLOAD_IDX + 0] = msc_id; + msg_buf[SCMI_BUF_PAYLOAD_IDX + 2] = reg; + msg_buf[SCMI_BUF_PAYLOAD_IDX + 3] = val; + + return SCMI_WRITE_MSG_SIZE * sizeof(u32); +} + +static struct scmi_protocol_handle *scmi_agent_get_ph(const struct device_node *np) +{ + struct scmi_mpam_agent *agent; + struct scmi_protocol_handle *ph = NULL; + + mutex_lock(&scmi_agent_list_mutex); + + list_for_each_entry(agent, &smci_agent_list, list) { + if (np == agent->of_node) { + ph = agent->ph_handle; + break; + } + } + + mutex_unlock(&scmi_agent_list_mutex); + + return ph; +} + +int mpam_fb_connect_channel(const struct device_node *of_node, + struct mpam_fb_channel *chan) +{ + int msc_id = 0; + + chan->ph_handle = scmi_agent_get_ph(of_node); + if (!chan->ph_handle) + return -EPROBE_DEFER; + + chan->use_scmi = true; + + return msc_id; +} + +int mpam_fb_send_read_request(struct mpam_fb_channel *chan, int msc_id, + u16 reg, u32 *result) +{ + u32 msg_buf[12]; + int msg_len; + + msg_len = mpam_fb_build_read_message(msc_id, reg, msg_buf); + + if (chan->use_scmi) { + /* The SCMI layer adds the shared memory header itself. */ + msg_len -= SCMI_BUF_PAYLOAD_IDX * sizeof(u32); + + mpam_scmi_ops->mpam_transfer_buf(chan->ph_handle, + MPAM_MSC_READ, + msg_buf + SCMI_BUF_PAYLOAD_IDX, + msg_len, result); + + return 0; + } + + if (msg_len < chan->pcc_shmem_size) + return -EINVAL; + + memcpy(chan->pcc_shmem, msg_buf, msg_len); + mbox_send_message(chan->pcc_mbox, NULL); + + return 0; +} + +int mpam_fb_send_write_request(struct mpam_fb_channel *chan, int msc_id, + u16 reg, u32 value) +{ + u32 msg_buf[12]; + int msg_len; + + msg_len = mpam_fb_build_write_message(msc_id, reg, value, msg_buf); + if (msg_len < 0) + return msg_len; + + if (chan->use_scmi) { + /* The SCMI layer adds the shared memory header itself. 
*/ + msg_len -= SCMI_BUF_PAYLOAD_IDX * sizeof(u32); + + mpam_scmi_ops->mpam_transfer_buf(chan->ph_handle, + MPAM_MSC_WRITE, + msg_buf + SCMI_BUF_PAYLOAD_IDX, + msg_len, NULL); + + return 0; + } + + if (msg_len < chan->pcc_shmem_size) + return -EINVAL; + + memcpy(chan->pcc_shmem, msg_buf, msg_len); + mbox_send_message(chan->pcc_mbox, NULL); + + return 0; +} + +static int scmi_mpam_probe(struct scmi_device *sdev) +{ + const struct scmi_handle *handle = sdev->handle; + struct scmi_protocol_handle *ph; + struct scmi_mpam_agent *agent; + + if (!handle) + return -ENODEV; + + mpam_scmi_ops = handle->devm_protocol_get(sdev, SCMI_PROTOCOL_MPAM, &ph); + if (IS_ERR(mpam_scmi_ops)) + return PTR_ERR(mpam_scmi_ops); + + agent = devm_kzalloc(&sdev->dev, sizeof(*agent), GFP_KERNEL); + if (!agent) + return -ENOMEM; + + agent->of_node = sdev->dev.of_node; + agent->ph_handle = ph; + + mutex_lock(&scmi_agent_list_mutex); + list_add(&agent->list, &smci_agent_list); + mutex_unlock(&scmi_agent_list_mutex); + + return 0; +} + +static void scmi_mpam_remove(struct scmi_device *sdev) +{ +} + +static const struct scmi_device_id scmi_id_table[] = { + { SCMI_PROTOCOL_MPAM, "mpam" }, + {}, +}; +MODULE_DEVICE_TABLE(scmi, scmi_id_table); + +static struct scmi_driver scmi_mpam_driver = { + .name = "scmi-mpam-driver", + .probe = scmi_mpam_probe, + .remove = scmi_mpam_remove, + .id_table = scmi_id_table, +}; +module_scmi_driver(scmi_mpam_driver); diff --git a/drivers/resctrl/mpam_fb.h b/drivers/resctrl/mpam_fb.h new file mode 100644 index 0000000000000..723e9c5a5e1e3 --- /dev/null +++ b/drivers/resctrl/mpam_fb.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (C) 2024 Arm Ltd. 
+ +#ifndef MPAM_FB_H_ +#define MPAM_FB_H_ + +#include +#include +#include + +struct mpam_fb_channel { + bool use_scmi; + struct scmi_protocol_handle *ph_handle; + void __iomem *pcc_shmem; + size_t pcc_shmem_size; + struct mbox_chan *pcc_mbox; +}; + +int mpam_fb_connect_channel(const struct device_node *of_node, + struct mpam_fb_channel *chan); +int mpam_fb_send_read_request(struct mpam_fb_channel *chan, int msc_id, + u16 reg, u32 *result); +int mpam_fb_send_write_request(struct mpam_fb_channel *chan, int msc_id, + u16 reg, u32 value); + +#endif diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 216db7892ef85..768393d5f7cf7 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -22,6 +22,8 @@ #include +#include "mpam_fb.h" + #define MPAM_MSC_MAX_NUM_RIS 16 @@ -78,6 +80,11 @@ struct mpam_msc { /* Not modified after mpam_is_enabled() becomes true */ enum mpam_msc_iface iface; + u32 pcc_subspace_id; + struct mbox_client pcc_cl; + struct pcc_mbox_chan *pcc_chan; + struct mpam_fb_channel mpam_fb_chan; + int mpam_fb_msc_id; /* in its own name space */ u32 nrdy_usec; u64 nrdy_retry_count; cpumask_t accessibility; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 810f894025fb8..a2c1812ce0406 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -16,6 +16,7 @@ struct mpam_msc; enum mpam_msc_iface { MPAM_IFACE_MMIO, /* a real MPAM MSC */ MPAM_IFACE_PCC, /* a fake MPAM MSC */ + MPAM_IFACE_SCMI, /* through a firmware interface */ }; enum mpam_class_types { From 14ea1ccbc9b6050ab75622130b14c8a37e731dd3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 28 Jun 2024 14:04:53 +0100 Subject: [PATCH 165/247] NVIDIA: SAUCE: arm_mpam: Allow duplicate PCC subspace_ids BugLink: https://bugs.launchpad.net/bugs/2122432 Carl reports that some platforms use the same PCC channel for multiple MSCs, which leads to the driver not probing. 
Add a list that is searched each time a new channel is allocated. CC: Carl Worth Signed-off-by: James Morse (cherry picked from commit 12faf26e0cf2547819ff5bbb47779c634a7de1ab https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 110 +++++++++++++++++++++++++++++--- drivers/resctrl/mpam_internal.h | 3 +- 2 files changed, 103 insertions(+), 10 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index fd331f4adc2f2..b371c456a6812 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -521,6 +521,102 @@ static void mpam_free_garbage(void) } } +static LIST_HEAD(mpam_pcc_channels); + +struct mpam_pcc_chan { + struct list_head pcc_channels_list; + + u32 refs; + u32 subspace_id; + struct pcc_mbox_chan *channel; + struct mbox_client pcc_cl; + + struct mpam_garbage garbage; +}; + +static struct pcc_mbox_chan *mpam_pcc_alloc(u8 subspace_id, gfp_t gfp) +{ + struct mpam_pcc_chan *chan; + + lockdep_assert_held(&mpam_list_lock); + + chan = kzalloc(sizeof(*chan), gfp); + if (!chan) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD_RCU(&chan->pcc_channels_list); + chan->refs = 1; + chan->subspace_id = subspace_id; + /* + * TODO is the device important - these subspace_id can be re-used, so + * there is no one device to put here ... 
+ */ + chan->pcc_cl.rx_callback = mpam_pcc_rx_callback; + chan->pcc_cl.tx_block = false; + chan->pcc_cl.tx_tout = 1000; /* 1s */ + chan->pcc_cl.knows_txdone = false; + + chan->channel = pcc_mbox_request_channel(&chan->pcc_cl, subspace_id); + if (IS_ERR(chan->channel)) { + kfree(chan); + return NULL; + } + + init_garbage(&chan->garbage); + list_add(&chan->pcc_channels_list, &mpam_pcc_channels); + return chan->channel; +} + +static struct pcc_mbox_chan *mpam_pcc_get(u8 subspace_id, bool alloc, gfp_t gfp) +{ + bool found = false; + struct mpam_pcc_chan *chan; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(chan, &mpam_pcc_channels, pcc_channels_list) { + if (chan->subspace_id == subspace_id) { + found = true; + break; + } + } + + if (found) { + chan->refs++; + return chan->channel; + } + + if (!alloc) + return ERR_PTR(-ENOENT); + + return mpam_pcc_alloc(subspace_id, gfp); +} + +static void mpam_pcc_put(u8 subspace_id) +{ + bool found = false; + struct mpam_pcc_chan *chan; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(chan, &mpam_pcc_channels, pcc_channels_list) { + if (chan->subspace_id == subspace_id) { + found = true; + break; + } + } + + if (!found) + return; + + chan->refs--; + if (!chan->refs) { + list_del(&chan->pcc_channels_list); + pcc_mbox_free_channel(chan->channel); + add_to_garbage(chan); + } +} + /* Called recursively to walk the list of caches from a particular CPU */ static void __mpam_get_cpumask_from_cache_id(int cpu, struct device_node *cache_node, unsigned long cache_id, @@ -2261,10 +2357,13 @@ static void mpam_msc_destroy(struct mpam_msc *msc) debugfs_remove_recursive(msc->debugfs); msc->debugfs = NULL; + if (msc->iface == MPAM_IFACE_PCC) + mpam_pcc_put(msc->pcc_subspace_id); + add_to_garbage(msc); } -static void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg) +void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg) { /* TODO: wake up tasks blocked on this MSC's PCC channel */ } @@ -2344,14 +2443,7 
@@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) msc->mapped_hwpage_sz = msc_res->end - msc_res->start; msc->mapped_hwpage = io; } else if (msc->iface == MPAM_IFACE_PCC) { - msc->pcc_cl.dev = &pdev->dev; - msc->pcc_cl.rx_callback = mpam_pcc_rx_callback; - msc->pcc_cl.tx_block = false; - msc->pcc_cl.tx_tout = 1000; /* 1s */ - msc->pcc_cl.knows_txdone = false; - - msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl, - msc->pcc_subspace_id); + msc->pcc_chan = mpam_pcc_get(msc->pcc_subspace_id, true, GFP_KERNEL); if (IS_ERR(msc->pcc_chan)) { pr_err("Failed to request MSC PCC channel\n"); return (void *)msc->pcc_chan; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 768393d5f7cf7..17288eb0f3d13 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -81,7 +81,6 @@ struct mpam_msc { /* Not modified after mpam_is_enabled() becomes true */ enum mpam_msc_iface iface; u32 pcc_subspace_id; - struct mbox_client pcc_cl; struct pcc_mbox_chan *pcc_chan; struct mpam_fb_channel mpam_fb_chan; int mpam_fb_msc_id; /* in its own name space */ @@ -545,6 +544,8 @@ int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, bool mpam_force_unknown_msc_test(struct mpam_msc *msc); +void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg); + #ifdef CONFIG_RESCTRL_FS int mpam_resctrl_setup(void); void mpam_resctrl_exit(void); From eb1088a67a48b75586a9235a4e5aa1bf66708e71 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 2 Jul 2025 17:20:39 +0100 Subject: [PATCH 166/247] NVIDIA: SAUCE: untested: mpam: Convert pcc_channels list to XArray and cleanup BugLink: https://bugs.launchpad.net/bugs/2122432 Squash this into the previous patch once it has been tested... ... does anyone have a PCC platform that can take this for a spin? 
Signed-off-by: James Morse (cherry picked from commit 4edb3917c0bacddf62fcd54ab0b5cdba6c2cedf3 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 51 +++++++++++++--------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index b371c456a6812..704e12f6685ec 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -521,11 +521,9 @@ static void mpam_free_garbage(void) } } -static LIST_HEAD(mpam_pcc_channels); +static DEFINE_XARRAY(mpam_pcc_channels); struct mpam_pcc_chan { - struct list_head pcc_channels_list; - u32 refs; u32 subspace_id; struct pcc_mbox_chan *channel; @@ -534,17 +532,15 @@ struct mpam_pcc_chan { struct mpam_garbage garbage; }; -static struct pcc_mbox_chan *mpam_pcc_alloc(u8 subspace_id, gfp_t gfp) +static struct mpam_pcc_chan *__mpam_pcc_alloc(u8 subspace_id, gfp_t gfp) { - struct mpam_pcc_chan *chan; + struct mpam_pcc_chan *chan __free(kfree) = kzalloc(sizeof(*chan), gfp); lockdep_assert_held(&mpam_list_lock); - chan = kzalloc(sizeof(*chan), gfp); if (!chan) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD_RCU(&chan->pcc_channels_list); chan->refs = 1; chan->subspace_id = subspace_id; /* @@ -557,31 +553,29 @@ static struct pcc_mbox_chan *mpam_pcc_alloc(u8 subspace_id, gfp_t gfp) chan->pcc_cl.knows_txdone = false; chan->channel = pcc_mbox_request_channel(&chan->pcc_cl, subspace_id); - if (IS_ERR(chan->channel)) { - kfree(chan); - return NULL; - } + if (IS_ERR(chan->channel)) + return ERR_CAST(chan->channel); init_garbage(&chan->garbage); - list_add(&chan->pcc_channels_list, &mpam_pcc_channels); - return chan->channel; + xa_store(&mpam_pcc_channels, subspace_id, chan, gfp); + + return_ptr(chan); +} + +static struct pcc_mbox_chan *mpam_pcc_alloc(u8 subspace_id, gfp_t gfp) +{ + struct mpam_pcc_chan 
*chan = __mpam_pcc_alloc(subspace_id, gfp); + return IS_ERR(chan) ? ERR_CAST(chan) : chan->channel; } static struct pcc_mbox_chan *mpam_pcc_get(u8 subspace_id, bool alloc, gfp_t gfp) { - bool found = false; struct mpam_pcc_chan *chan; lockdep_assert_held(&mpam_list_lock); - list_for_each_entry(chan, &mpam_pcc_channels, pcc_channels_list) { - if (chan->subspace_id == subspace_id) { - found = true; - break; - } - } - - if (found) { + chan = xa_load(&mpam_pcc_channels, subspace_id); + if (chan) { chan->refs++; return chan->channel; } @@ -594,24 +588,17 @@ static struct pcc_mbox_chan *mpam_pcc_get(u8 subspace_id, bool alloc, gfp_t gfp) static void mpam_pcc_put(u8 subspace_id) { - bool found = false; struct mpam_pcc_chan *chan; lockdep_assert_held(&mpam_list_lock); - list_for_each_entry(chan, &mpam_pcc_channels, pcc_channels_list) { - if (chan->subspace_id == subspace_id) { - found = true; - break; - } - } - - if (!found) + chan = xa_load(&mpam_pcc_channels, subspace_id); + if (!chan) return; chan->refs--; if (!chan->refs) { - list_del(&chan->pcc_channels_list); + xa_erase(&mpam_pcc_channels, subspace_id); pcc_mbox_free_channel(chan->channel); add_to_garbage(chan); } From f55f41d061457cbbfd859a0e1b6d0f7fbe454e0a Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 15 Mar 2024 16:46:12 +0000 Subject: [PATCH 167/247] NVIDIA: SAUCE: x86/resctrl: Add stub to allow other architecture to disable monitor overflow BugLink: https://bugs.launchpad.net/bugs/2122432 Resctrl has an overflow handler that runs on each domain every second to ensure that any overflow of the hardware counter is accounted for. MPAM can have counters as large as 63 bits, in which case there is no need to check for overflow. To allow other architectures to disable this, add a helper that reports whether counters can overflow. 
Signed-off-by: James Morse (cherry picked from commit 935ddfc61145dc5d12df9487676a60341376c0f0 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/x86/include/asm/resctrl.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 575f8408a9e7c..40a74a0617345 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -191,6 +191,11 @@ static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx) { } +static inline bool resctrl_arch_mon_can_overflow(void) +{ + return true; +} + void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else From 0cb5a7cfc9ed080fe7e389a24fdf89d4611ba4f8 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 15 Mar 2024 17:32:53 +0000 Subject: [PATCH 168/247] NVIDIA: SAUCE: arm_mpam: resctrl: Determine if any exposed counter can overflow BugLink: https://bugs.launchpad.net/bugs/2122432 Resctrl has an overflow handler that runs on each domain every second to ensure that any overflow of the hardware counter is accounted for. MPAM can have counters as large as 63 bits, in which case there is no need to check for overflow. To allow the overflow handler to be disabled, determine if an overflow can happen. If a class is not implemented, or has the 63bit counter, it can't overflow. Signed-off-by: James Morse (cherry picked from commit 5cbe15bd6c1d393cf1ffe2b259a3be54a5345e1e https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 21 +++++++++++++++++++++ include/linux/arm_mpam.h | 1 + 2 files changed, 22 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 7be05de8a22a2..10908a1fd055a 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -432,6 +432,27 @@ void resctrl_arch_mon_ctx_free(struct rdt_resource *r, resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); } +static bool __resctrl_arch_mon_can_overflow(enum resctrl_event_id eventid) +{ + struct mpam_props *cprops; + struct mpam_class *class = mpam_resctrl_counters[eventid].class; + + if (!class) + return false; + + /* No need to worry about a 63 bit counter overflowing */ + cprops = &class->props; + return !mpam_has_feature(mpam_feat_msmon_mbwu_63counter, cprops); +} + +bool resctrl_arch_mon_can_overflow(void) +{ + if (__resctrl_arch_mon_can_overflow(QOS_L3_MBM_LOCAL_EVENT_ID)) + return true; + + return __resctrl_arch_mon_can_overflow(QOS_L3_MBM_TOTAL_EVENT_ID); +} + static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, enum mpam_device_features mon_type, enum mon_filter_options mon_opts, diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index a2c1812ce0406..7c46dbd56f69a 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -64,6 +64,7 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) bool resctrl_arch_alloc_capable(void); bool resctrl_arch_mon_capable(void); +bool resctrl_arch_mon_can_overflow(void); void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); From b8370970eed87100d5fe0b7451fd9bf60a7b0194 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 15 Mar 2024 17:36:02 +0000 Subject: [PATCH 169/247] NVIDIA: SAUCE: fs/restrl: Allow the overflow handler to be disabled BugLink: https://bugs.launchpad.net/bugs/2122432 Resctrl 
has an overflow handler that runs on each domain every second to ensure that any overflow of the hardware counter is accounted for. MPAM can have counters as large as 63 bits, in which case there is no need to check for overflow. Call the new arch helpers to determine this. Signed-off-by: James Morse (cherry picked from commit 7120f602ee83c2fb75517b137efd41b83777ba76 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- fs/resctrl/monitor.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 392376e5a2751..5005e912bf0de 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -863,8 +863,10 @@ void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ /* * When a domain comes online there is no guarantee the filesystem is * mounted. If not, there is no need to catch counter overflow. + * Some architecture may have ~64bit counters, and can ignore overflow. */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) + if (!resctrl_mounted || !resctrl_arch_mon_capable() || + !resctrl_arch_mon_can_overflow()) return; cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); dom->mbm_work_cpu = cpu; From 5bb1b2949458976dc4191a3dd968bcaa15634062 Mon Sep 17 00:00:00 2001 From: Rex Nie Date: Mon, 11 Mar 2024 16:18:39 +0800 Subject: [PATCH 170/247] NVIDIA: SAUCE: fs/resctrl: Uniform data type of component_id/domid/id/cache_id BugLink: https://bugs.launchpad.net/bugs/2122432 This patch uniform data type of component_id/domid/id/cache_id to u32 to avoid type confusion. According to ACPI for mpam, cache id is used as locator for cache MSC. Reference to RD_PPTT_CACHE_ID definition from edk2-platforms, u32 is enough for cache_id. ( \ (((PackageId) & 0xF) << 20) | (((ClusterId) & 0xFF) << 12) | \ (((CoreId) & 0xFF) << 4) | ((CacheType) & 0xF) \ ) refs: 1. 
ACPI for mpam: https://developer.arm.com/documentation/den0065/latest/ 2. RD_PPTT_CACHE_ID from edk2-platforms: https://github.com/tianocore/edk2-platforms/blob/master/Platform/ARM/SgiPkg/Include/SgiAcpiHeader.h#L202 Signed-off-by: Rex Nie Signed-off-by: James Morse (cherry picked from commit fb0bcda86e18dded432e3dbe684ea494f9ea71ab https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 8 ++++---- include/linux/arm_mpam.h | 2 +- include/linux/resctrl.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 704e12f6685ec..82640520fa583 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -343,7 +343,7 @@ static struct mpam_vmsc *mpam_vmsc_find(struct mpam_component *comp, } static struct mpam_component * -mpam_component_alloc(struct mpam_class *class, int id) +mpam_component_alloc(struct mpam_class *class, u32 id) { struct mpam_component *comp; @@ -366,7 +366,7 @@ mpam_component_alloc(struct mpam_class *class, int id) } static struct mpam_component * -mpam_component_find(struct mpam_class *class, int id) +mpam_component_find(struct mpam_class *class, u32 id) { struct mpam_component *comp; @@ -735,7 +735,7 @@ static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, static int mpam_ris_create_locked(struct mpam_msc *msc, u8 ris_idx, enum mpam_class_types type, u8 class_id, - int component_id) + u32 component_id) { int err; struct mpam_vmsc *vmsc; @@ -797,7 +797,7 @@ static int mpam_ris_create_locked(struct mpam_msc *msc, u8 ris_idx, } int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, - enum mpam_class_types type, u8 class_id, int component_id) + enum mpam_class_types type, u8 class_id, u32 component_id) { int err; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 
7c46dbd56f69a..380bafed9043c 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -42,7 +42,7 @@ static inline int acpi_mpam_count_msc(void) { return -EINVAL; } #endif int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, - enum mpam_class_types type, u8 class_id, int component_id); + enum mpam_class_types type, u8 class_id, u32 component_id); struct resctrl_schema; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 7e00e28b885a1..15b86b899f695 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -136,7 +136,7 @@ enum resctrl_domain_type { */ struct rdt_domain_hdr { struct list_head list; - int id; + u32 id; enum resctrl_domain_type type; struct cpumask cpu_mask; }; From 7a36a5b23abd9463139abb324a7fc696e6d1c70f Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 27 Aug 2024 15:24:08 +0100 Subject: [PATCH 171/247] NVIDIA: SAUCE: arm_mpam: Allow cmax/cmin to be configured BugLink: https://bugs.launchpad.net/bugs/2122432 mpam_reprogram_ris_partid() always resets the CMAX/CMIN controls to their 'unrestricted' value. This prevents the controls from being configured. Add fields in struct mpam_config, and program these values when they are set in the features bitmask. Signed-off-by: James Morse (cherry picked from commit 72fd49e3e0392832f9840f83769c38e4ab61e23f https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 23 +++++++++++++++++++---- drivers/resctrl/mpam_internal.h | 4 ++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 82640520fa583..89bdae8bd86d6 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1769,11 +1769,25 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_has_feature(mpam_feat_mbw_prop, cfg)) mpam_write_partsel_reg(msc, MBW_PROP, 0); - if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) - mpam_write_partsel_reg(msc, CMAX, cmax); + if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) { + if (mpam_has_feature(mpam_feat_cmax_cmax, cfg)) { + u32 cmax_val = cfg->cmax; - if (mpam_has_feature(mpam_feat_cmax_cmin, rprops)) - mpam_write_partsel_reg(msc, CMIN, 0); + if (cfg->cmax_softlim) + cmax_val |= MPAMCFG_CMAX_SOFTLIM; + mpam_write_partsel_reg(msc, CMAX, cmax_val); + } else { + mpam_write_partsel_reg(msc, CMAX, cmax); + } + } + + if (mpam_has_feature(mpam_feat_cmax_cmin, rprops)) { + if (mpam_has_feature(mpam_feat_cmax_cmin, cfg)) { + mpam_write_partsel_reg(msc, CMIN, cfg->cmin); + } else { + mpam_write_partsel_reg(msc, CMIN, 0); + } + } if (mpam_has_feature(mpam_feat_cmax_cassoc, rprops)) mpam_write_partsel_reg(msc, CASSOC, MPAMCFG_CASSOC_CASSOC); @@ -3405,6 +3419,7 @@ static bool mpam_update_config(struct mpam_config *cfg, bool has_changes = false; maybe_update_config(cfg, mpam_feat_cpor_part, newcfg, cpbm, has_changes); + maybe_update_config(cfg, mpam_feat_cmax_cmax, newcfg, cmax, has_changes); maybe_update_config(cfg, mpam_feat_mbw_part, newcfg, mbw_pbm, has_changes); maybe_update_config(cfg, mpam_feat_mbw_max, newcfg, mbw_max, has_changes); maybe_update_config(cfg, mpam_feat_mbw_min, newcfg, mbw_min, has_changes); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 17288eb0f3d13..7c7160d6042ad 100644 --- 
a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -364,6 +364,10 @@ struct mpam_config { u32 mbw_pbm; u16 mbw_max; u16 mbw_min; + u16 cmax; + u16 cmin; + + bool cmax_softlim; bool reset_cpbm; bool reset_mbw_pbm; From 866683427cb8e7abf814b4036bf345fa0a9a6916 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 11:37:26 +0000 Subject: [PATCH 172/247] NVIDIA: SAUCE: arm_mpam: Rename mbw conversion to 'fract16' for code re-use BugLink: https://bugs.launchpad.net/bugs/2122432 Functions like mbw_max_to_percent() convert a value into MPAMs 16 bit fixed point fraction format. These are not only used for memory bandwidth, but cache capcity controls too. Rename these functions to convert to/from a 'fract16', and add helpers for the specific mbw_max/cmax controls. Signed-off-by: James Morse (cherry picked from commit a852dfa8ee9ed70c8397a9d89206eebcdd2f368e https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 24 +++++++++++++++++------- drivers/resctrl/test_mpam_resctrl.c | 4 ++-- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 10908a1fd055a..d244fa7651075 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -823,14 +823,14 @@ static u32 percent_to_mbw_pbm(u8 pc, struct mpam_props *cprops) * (1 << cprops->bwa_wd) equal bands. 
* Find the nearest percentage value to the upper bound of the selected band: */ -static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) +static u32 fract16_to_percent(u16 fract, u8 wd) { - u32 val = mbw_max; + u32 val = fract; - val >>= 16 - cprops->bwa_wd; + val >>= 16 - wd; val += 1; val *= MAX_MBA_BW; - val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd); + val = DIV_ROUND_CLOSEST(val, 1 << wd); return val; } @@ -845,18 +845,28 @@ static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) * percentages) and over-commit (where the total of the converted * allocations is greater than expected). */ -static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) +static u16 percent_to_fract16(u8 pc, u8 wd) { u32 val = pc; - val <<= cprops->bwa_wd; + val <<= wd; val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); val = max(val, 1) - 1; - val <<= 16 - cprops->bwa_wd; + val <<= 16 - wd; return val; } +static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) +{ + return fract16_to_percent(mbw_max, cprops->bwa_wd); +} + +static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) +{ + return percent_to_fract16(pc, cprops->bwa_wd); +} + static u32 get_mba_min(struct mpam_props *cprops) { u32 val = 0; diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c index e79f77f4ec0f9..3ece77289931e 100644 --- a/drivers/resctrl/test_mpam_resctrl.c +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -189,7 +189,7 @@ static void test_mbw_pbm_to_percent(struct kunit *test) KUNIT_EXPECT_EQ(test, ret, 0); } -static void test_mbw_max_to_percent(struct kunit *test) +static void test_fract16_to_percent(struct kunit *test) { const struct percent_value_case *param = test->param_value; struct percent_value_test_info res; @@ -438,7 +438,7 @@ static void test_num_assignable_counters(struct kunit *test) static struct kunit_case mpam_resctrl_test_cases[] = { KUNIT_CASE(test_get_mba_granularity), KUNIT_CASE(test_mbw_pbm_to_percent), - 
KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_fract16_to_percent, test_percent_value_gen_params), KUNIT_CASE(test_percent_to_mbw_pbm), KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params), KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params), From 923ecb64355c819d4f227a62167498cb15095db0 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 18 Nov 2024 18:45:50 +0000 Subject: [PATCH 173/247] NVIDIA: SAUCE: fs/resctrl: Group all the MBA specific properties in a separate struct BugLink: https://bugs.launchpad.net/bugs/2122432 struct resctrl_membw combines parameters that are related to the control value, and parameters that are specific to the MBA resource. To allow the control value parsing and management code to be re-used for other resources, it needs to be separated from the MBA resource. Add struct resctrl_mba that holds all the parameters that are specific to the MBA resource. Signed-off-by: James Morse (cherry picked from commit 2187592a6e6508e2a342cfd18a472304c70e4538 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- arch/x86/kernel/cpu/resctrl/core.c | 18 +++++++++--------- drivers/resctrl/mpam_resctrl.c | 4 ++-- fs/resctrl/ctrlmondata.c | 3 ++- fs/resctrl/rdtgroup.c | 18 +++++++++--------- include/linux/resctrl.h | 26 +++++++++++++++++--------- 5 files changed, 39 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 06ca5a30140c2..6781cbe84d987 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -193,21 +193,21 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r) hw_res->num_closid = edx.split.cos_max + 1; max_delay = eax.split.max_delay + 1; r->membw.max_bw = MAX_MBA_BW; - r->membw.arch_needs_linear = true; + r->mba.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { - r->membw.delay_linear = true; + r->mba.delay_linear = true; r->membw.min_bw = MAX_MBA_BW - max_delay; r->membw.bw_gran = MAX_MBA_BW - max_delay; } else { if (!rdt_get_mb_table(r)) return false; - r->membw.arch_needs_linear = false; + r->mba.arch_needs_linear = false; } if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) - r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; + r->mba.throttle_mode = THREAD_THROTTLE_PER_THREAD; else - r->membw.throttle_mode = THREAD_THROTTLE_MAX; + r->mba.throttle_mode = THREAD_THROTTLE_MAX; r->alloc_capable = true; @@ -230,14 +230,14 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) r->membw.max_bw = 1 << eax; /* AMD does not use delay */ - r->membw.delay_linear = false; - r->membw.arch_needs_linear = false; + r->mba.delay_linear = false; + r->mba.arch_needs_linear = false; /* * AMD does not use memory delay throttle model to control * the allocation like Intel does. 
*/ - r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->mba.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = 0; r->membw.bw_gran = 1; @@ -301,7 +301,7 @@ static void mba_wrmsr_amd(struct msr_param *m) */ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { - if (r->membw.delay_linear) + if (r->mba.delay_linear) return MAX_MBA_BW - bw; pr_warn_once("Non Linear delay-bw map not supported but queried\n"); diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index d244fa7651075..ace72011f54d7 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1405,8 +1405,8 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, r->schema_fmt = RESCTRL_SCHEMA_RANGE; r->ctrl_scope = RESCTRL_L3_CACHE; - r->membw.delay_linear = true; - r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->mba.delay_linear = true; + r->mba.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = get_mba_min(cprops); r->membw.max_bw = MAX_MBA_BW; r->membw.bw_gran = get_mba_granularity(cprops); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 6810a026fc7fc..f7325f4a1acd7 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -46,7 +46,8 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) /* * Only linear delay values is supported for current Intel SKUs. 
*/ - if (!r->membw.delay_linear && r->membw.arch_needs_linear) { + if (r->rid == RDT_RESOURCE_MBA && + !r->mba.delay_linear && r->mba.arch_needs_linear) { rdt_last_cmd_puts("No support for non-linear MB domains\n"); return false; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index e7a48e373711a..5b0956a7ef47b 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1274,7 +1274,7 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); struct rdt_resource *r = s->res; - seq_printf(seq, "%u\n", r->membw.delay_linear); + seq_printf(seq, "%u\n", r->mba.delay_linear); return 0; } @@ -1292,7 +1292,7 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); struct rdt_resource *r = s->res; - switch (r->membw.throttle_mode) { + switch (r->mba.throttle_mode) { case THREAD_THROTTLE_PER_THREAD: seq_puts(seq, "per-thread\n"); return 0; @@ -1627,7 +1627,7 @@ bool is_mba_sc(struct rdt_resource *r) if (r->rid != RDT_RESOURCE_MBA) return false; - return r->membw.mba_sc; + return r->mba.mba_sc; } /* @@ -2223,13 +2223,13 @@ static void thread_throttle_mode_init(void) r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); if (r_mba->alloc_capable && - r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_mba->membw.throttle_mode; + r_mba->mba.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_mba->mba.throttle_mode; r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); if (r_smba->alloc_capable && - r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_smba->membw.throttle_mode; + r_smba->mba.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_smba->mba.throttle_mode; if (throttle_mode == THREAD_THROTTLE_UNDEFINED) return; @@ -2512,7 +2512,7 @@ mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, static inline bool is_mba_linear(void) { 
- return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; + return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->mba.delay_linear; } static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -2570,7 +2570,7 @@ static int set_mba_sc(bool mba_sc) if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) return -EINVAL; - r->membw.mba_sc = mba_sc; + r->mba.mba_sc = mba_sc; rdtgroup_default.mba_mbps_event = mba_mbps_default_event; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 15b86b899f695..4180623da9c00 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -235,22 +235,28 @@ enum membw_throttle_mode { * @min_bw: Minimum memory bandwidth percentage user can request * @max_bw: Maximum memory bandwidth value, used as the reset value * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @arch_needs_linear: True if we can't configure non-linear resources - * @throttle_mode: Bandwidth throttling mode when threads request - * different memory bandwidths - * @mba_sc: True if MBA software controller(mba_sc) is enabled - * @mb_map: Mapping of memory B/W percentage to memory B/W delay */ struct resctrl_membw { u32 min_bw; u32 max_bw; u32 bw_gran; - u32 delay_linear; - bool arch_needs_linear; - enum membw_throttle_mode throttle_mode; +}; + +/** + * struct resctrl_mba - Resource properties that are specific to the MBA resource + * @mba_sc: True if MBA software controller(mba_sc) is enabled + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + * @delay_linear: True if control is in linear scale + * @arch_needs_linear: True if we can't configure non-linear resources + * @throttle_mode: Mode when threads request different control values + */ +struct resctrl_mba { bool mba_sc; u32 *mb_map; + bool delay_linear; + bool arch_needs_linear; + enum membw_throttle_mode throttle_mode; + }; struct resctrl_schema; @@ -301,6 
+307,7 @@ struct resctrl_mon { * @mon: Monitoring related data. * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource + * @mba: Properties of the MBA resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. * @cdp_capable: Is the CDP feature available on this resource @@ -314,6 +321,7 @@ struct rdt_resource { struct resctrl_cache cache; struct resctrl_membw membw; struct resctrl_mon mon; + struct resctrl_mba mba; struct list_head ctrl_domains; struct list_head mon_domains; char *name; From ad82e0041a748ab8ef00a82342f279badd7c9f17 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 10 Sep 2024 11:33:53 +0100 Subject: [PATCH 174/247] NVIDIA: SAUCE: fs/resctrl: Abstract duplicate domain test to a helper BugLink: https://bugs.launchpad.net/bugs/2122432 parse_cbm() and parse_bw() both test the staged config for an existing entry. These would indicate user-space has provided a schema with a duplicate domain entry. e.g: | L3:0=ffff;1=f00f;0=f00f If new parsers are added this duplicate domain test has to be duplicated. Move it to the caller. Signed-off-by: James Morse (cherry picked from commit ef00c0fb556566630ade5166b6f7cff772e5e977 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- fs/resctrl/ctrlmondata.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index f7325f4a1acd7..3d2fe9493d461 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -82,12 +82,6 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_resource *r = s->res; u32 bw_val; - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; @@ -96,6 +90,7 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, return 0; } + cfg = &d->staged_config[s->conf_type]; cfg->new_ctrl = bw_val; cfg->have_new_ctrl = true; @@ -162,12 +157,6 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_resource *r = s->res; u32 cbm_val; - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - /* * Cannot set up more than one pseudo-locked region in a cache * hierarchy. 
@@ -205,6 +194,7 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, } } + cfg = &d->staged_config[s->conf_type]; cfg->new_ctrl = cbm_val; cfg->have_new_ctrl = true; @@ -262,12 +252,17 @@ static int parse_line(char *line, struct resctrl_schema *s, dom = strim(dom); list_for_each_entry(d, &r->ctrl_domains, hdr.list) { if (d->hdr.id == dom_id) { + cfg = &d->staged_config[t]; + if (cfg->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); + return -EINVAL; + } + data.buf = dom; data.rdtgrp = rdtgrp; if (parse_ctrlval(&data, s, d)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - cfg = &d->staged_config[t]; /* * In pseudo-locking setup mode and just * parsed a valid CBM that should be From 37bd1e68ffcf6da29490cbc8be3cd22ae105925a Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 15:02:03 +0000 Subject: [PATCH 175/247] NVIDIA: SAUCE: fs/resctrl: Move MBA supported check to parse_line() instead of parse_bw() BugLink: https://bugs.launchpad.net/bugs/2122432 MBA is only supported on platforms where the delay inserted by the control is linear. Resctrl checks the two properties provided by the arch code match each time it parses part of a new control value. This doesn't need to be done so frequently, and obscures changes to parse_bw() to abstract it for use with other control types. Move this check to the parse_line() caller so it only happens once. Signed-off-by: James Morse (cherry picked from commit 24f77499324e859031b686ae592465eb459e41b1 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- fs/resctrl/ctrlmondata.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 3d2fe9493d461..f70ea58135ee2 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -43,15 +43,6 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) int ret; u32 bw; - /* - * Only linear delay values is supported for current Intel SKUs. - */ - if (r->rid == RDT_RESOURCE_MBA && - !r->mba.delay_linear && r->mba.arch_needs_linear) { - rdt_last_cmd_puts("No support for non-linear MB domains\n"); - return false; - } - ret = kstrtou32(buf, 10, &bw); if (ret) { rdt_last_cmd_printf("Invalid MB value %s\n", buf); @@ -240,6 +231,15 @@ static int parse_line(char *line, struct resctrl_schema *s, return -EINVAL; } + /* + * Only linear delay values is supported for current Intel SKUs. + */ + if (r->rid == RDT_RESOURCE_MBA && + !r->mba.delay_linear && r->mba.arch_needs_linear) { + rdt_last_cmd_puts("No support for non-linear MB domains\n"); + return -EINVAL; + } + next: if (!line || line[0] == '\0') return 0; From 5ad6d9e04b3c0646b5dac916684eea215d988e93 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 15:55:45 +0000 Subject: [PATCH 176/247] NVIDIA: SAUCE: fs/resctrl: Rename resctrl_get_default_ctrl() to include resource BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl_get_default_ctrl() is called by both the architecture code and filesystem code to return the default value for a control. This depends on the schema format. parse_bw() doesn't bother checking the bounds it is given if the resource is in use by mba_sc. This is because the values parsed from user-space are not the same as those the control should take. To make this disparity easier to work with, a second different copy of the schema format is needed, which would need a version of resctrl_get_default_ctrl(). 
This would let the resctrl change the schema format presented to user-space, provided it converts it to match what the architecture code expects. Rename resctrl_get_default_ctrl() to make it clear it returns the resource default. Signed-off-by: James Morse (cherry picked from commit 55030f526cf5a4d204bf017d26339bb59a9cab7c https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/x86/kernel/cpu/resctrl/core.c | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- drivers/resctrl/mpam_resctrl.c | 4 ++-- fs/resctrl/rdtgroup.c | 4 ++-- include/linux/resctrl.h | 13 ++++++++----- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 6781cbe84d987..5fc91bfdbd2fd 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -354,7 +354,7 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) * For Memory Allocation: Set b/w requested to 100% */ for (i = 0; i < hw_res->num_closid; i++, dc++) - *dc = resctrl_get_default_ctrl(r); + *dc = resctrl_get_resource_default_ctrl(r); } static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 8850264684405..8a017f1111028 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -253,7 +253,7 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) hw_dom = resctrl_to_arch_ctrl_dom(d); for (i = 0; i < hw_res->num_closid; i++) - hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r); + hw_dom->ctrl_val[i] = resctrl_get_resource_default_ctrl(r); msr_param.dom = d; smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1); } diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index ace72011f54d7..091d7a01769e8 100644 
--- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1398,7 +1398,7 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, * fields. Until we configured the SMMU and GIC not to do this * 'all the bits' is the correct answer here. */ - r->cache.shareable_bits = resctrl_get_default_ctrl(r); + r->cache.shareable_bits = resctrl_get_resource_default_ctrl(r); break; case RDT_RESOURCE_MBA: r->alloc_capable = true; @@ -1606,7 +1606,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, } err: - return resctrl_get_default_ctrl(r); + return resctrl_get_resource_default_ctrl(r); } int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 5b0956a7ef47b..f42d079949a47 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1100,7 +1100,7 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of, struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); struct rdt_resource *r = s->res; - seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); + seq_printf(seq, "%x\n", resctrl_get_resource_default_ctrl(r)); return 0; } @@ -3663,7 +3663,7 @@ static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) } cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = resctrl_get_default_ctrl(r); + cfg->new_ctrl = resctrl_get_resource_default_ctrl(r); cfg->have_new_ctrl = true; } } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 4180623da9c00..2867a8a364a1a 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -309,7 +309,10 @@ struct resctrl_mon { * @mon_domains: RCU list of all monitor domains for this resource * @mba: Properties of the MBA resource * @name: Name to use in "schemata" file. - * @schema_fmt: Which format string and parser is used for this schema. + * @schema_fmt: Which format control parameters should be in for this resource. 
+ * @evt_list: List of monitoring events + * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth + * monitoring events can be configured. * @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { @@ -388,11 +391,11 @@ struct resctrl_mon_config_info { void resctrl_arch_sync_cpu_closid_rmid(void *info); /** - * resctrl_get_default_ctrl() - Return the default control value for this - * resource. - * @r: The resource whose default control type is queried. + * resctrl_get_resource_default_ctrl() - Return the default control value for + * this resource. + * @r: The resource whose default control value is queried. */ -static inline u32 resctrl_get_default_ctrl(struct rdt_resource *r) +static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) { switch (r->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: From 7aa4e822cc37186a968465d4ad66698a9a7f86c9 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 12:21:25 +0000 Subject: [PATCH 177/247] NVIDIA: SAUCE: fs/resctrl: Add a schema format to the schema, allowing it to be different BugLink: https://bugs.launchpad.net/bugs/2122432 parse_bw() doesn't bother checking the bounds it is given if the resource is in use by mba_sc. This is because the values parsed from user-space are not the same as those the control should take. To make this disparity easier to work with, a second different copy of the schema format is needed, which would need a version of resctrl_get_default_ctrl(). This would let the resctrl change the schema format presented to user-space, provided it converts it to match what the architecture code expects. Add a second schema format for use with mba_sc. The membw properties are copied and the schema version is used. When mba_sc is enabled the schema copy of these properties is modified. 
Signed-off-by: James Morse (cherry picked from commit 3e066eaa16feb66603fef0add0fdbc2af5bea63e https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 4 ++-- fs/resctrl/ctrlmondata.c | 14 ++++++------ fs/resctrl/rdtgroup.c | 26 +++++++++++++++++------ include/linux/arm_mpam.h | 4 +--- include/linux/resctrl.h | 24 ++++++++++++++++++++- 5 files changed, 52 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index cf9b30b5df3ce..a7828c31c118c 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -20,9 +20,9 @@ #include "internal.h" -u32 resctrl_arch_round_bw(u32 val, const struct rdt_resource *r) +u32 resctrl_arch_round_bw(u32 val, const struct resctrl_schema *s) { - return roundup(val, (unsigned long)r->membw.bw_gran); + return roundup(val, (unsigned long)s->membw.bw_gran); } int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index f70ea58135ee2..1b0f375df03d9 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -38,7 +38,7 @@ typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, * hardware. The allocated bandwidth percentage is rounded to the next * control step available on the hardware. */ -static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) +static bool bw_validate(char *buf, u32 *data, struct resctrl_schema *s) { int ret; u32 bw; @@ -50,18 +50,18 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) } /* Nothing else to do if software controller is enabled. 
*/ - if (is_mba_sc(r)) { + if (is_mba_sc(s->res)) { *data = bw; return true; } - if (bw < r->membw.min_bw || bw > r->membw.max_bw) { + if (bw < s->membw.min_bw || bw > s->membw.max_bw) { rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", - bw, r->membw.min_bw, r->membw.max_bw); + bw, s->membw.min_bw, s->membw.max_bw); return false; } - *data = resctrl_arch_round_bw(bw, r); + *data = resctrl_arch_round_bw(bw, s); return true; } @@ -73,7 +73,7 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_resource *r = s->res; u32 bw_val; - if (!bw_validate(data->buf, &bw_val, r)) + if (!bw_validate(data->buf, &bw_val, s)) return -EINVAL; if (is_mba_sc(r)) { @@ -213,7 +213,7 @@ static int parse_line(char *line, struct resctrl_schema *s, /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - switch (r->schema_fmt) { + switch (s->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: parse_ctrlval = &parse_cbm; break; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index f42d079949a47..4c3c4a608edcf 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1098,9 +1098,8 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - seq_printf(seq, "%x\n", resctrl_get_resource_default_ctrl(r)); + seq_printf(seq, "%x\n", resctrl_get_schema_default_ctrl(s)); return 0; } @@ -1224,9 +1223,8 @@ static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - seq_printf(seq, "%u\n", r->membw.min_bw); + seq_printf(seq, "%u\n", s->membw.min_bw); return 0; } @@ -1262,9 +1260,8 @@ static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - 
seq_printf(seq, "%u\n", r->membw.bw_gran); + seq_printf(seq, "%u\n", s->membw.bw_gran); return 0; } @@ -2759,7 +2756,22 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type if (cl > max_name_width) max_name_width = cl; - switch (r->schema_fmt) { + s->schema_fmt = r->schema_fmt; + s->membw = r->membw; + + /* + * When mba_sc() is enabled the format used by user space is different + * to that expected by hardware. The conversion is done by + * update_mba_bw(). + */ + if (is_mba_sc(r)) { + s->schema_fmt = RESCTRL_SCHEMA_RANGE; + s->membw.min_bw = 0; + s->membw.max_bw = MBA_MAX_MBPS; + s->membw.bw_gran = 1; + } + + switch (s->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: s->fmt_str = "%d=%x"; break; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 380bafed9043c..06827f240cf9e 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -45,10 +45,8 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, enum mpam_class_types type, u8 class_id, u32 component_id); struct resctrl_schema; - -struct rdt_resource; static inline u32 resctrl_arch_round_bw(u32 val, - const struct rdt_resource *r __always_unused) + const struct resctrl_schema *s __always_unused) { /* * Do nothing: for MPAM, resctrl_arch_update_one() has the necessary diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 2867a8a364a1a..2e9c548282ce5 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -345,9 +345,12 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l); * @list: Member of resctrl_schema_all. * @name: The name to use in the "schemata" file. * @fmt_str: Format string to show domain value. + * @schema_fmt: Which format string and parser is used for this schema. * @conf_type: Whether this schema is specific to code/data. * @res: The resource structure exported by the architecture to describe * the hardware that is configured by this schema. 
+ * @membw The properties of the schema which may be different to the format + * that was specified by the resource, * @num_closid: The number of closid that can be used with this schema. When * features like CDP are enabled, this will be lower than the * hardware supports for the resource. @@ -356,8 +359,10 @@ struct resctrl_schema { struct list_head list; char name[8]; const char *fmt_str; + enum resctrl_schema_fmt schema_fmt; enum resctrl_conf_type conf_type; struct rdt_resource *res; + struct resctrl_membw membw; u32 num_closid; }; @@ -407,6 +412,23 @@ static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) return WARN_ON_ONCE(1); } +/** + * resctrl_get_schema_default_ctrl() - Return the default control value for + * this schema. + * @s: The schema whose default control value is queried. + */ +static inline u32 resctrl_get_schema_default_ctrl(struct resctrl_schema *s) +{ + switch (s->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + return resctrl_get_resource_default_ctrl(s->res); + case RESCTRL_SCHEMA_RANGE: + return s->membw.max_bw; + } + + return WARN_ON_ONCE(1); +} + /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); @@ -497,7 +519,7 @@ bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); */ int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); -u32 resctrl_arch_round_bw(u32 val, const struct rdt_resource *r); +u32 resctrl_arch_round_bw(u32 val, const struct resctrl_schema *s); /* * Update the ctrl_val and apply this config right now. 
From 2877e6f51a897f7bb8993cb0091c77e1896af4c3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 27 Sep 2024 17:59:15 +0100 Subject: [PATCH 178/247] NVIDIA: SAUCE: fs/resctrl: Use schema format to check the resource is a bitmap BugLink: https://bugs.launchpad.net/bugs/2122432 rdtgroup_cbm_to_size() uses a WARN_ON_ONCE() to assert that the resource it has been passed is one of the L2 or L3 cache. This is to avoid using uninitialised bitmap properties. Updating this list for every resource that is configured by a bitmap doesn't scale. Instead change the WARN_ON_ONCE() to use the schema format the arch code requested for the resource. Signed-off-by: James Morse (cherry picked from commit b17ce95b490d79604112587139d854fba384d34f https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- fs/resctrl/rdtgroup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 4c3c4a608edcf..4c853d6b1bf73 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1601,7 +1601,7 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct cacheinfo *ci; int num_b; - if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) + if (WARN_ON_ONCE(r->schema_fmt != RESCTRL_SCHEMA_BITMAP)) return size; num_b = bitmap_weight(&cbm, r->cache.cbm_len); @@ -1688,11 +1688,11 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, ctrl = resctrl_arch_get_config(r, d, closid, type); - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) - size = ctrl; - else + + if (schema->schema_fmt == RESCTRL_SCHEMA_BITMAP) size = rdtgroup_cbm_to_size(r, d, ctrl); + else + size = ctrl; } seq_printf(s, "%d=%u", d->hdr.id, size); sep = true; From 50648e80a9da1e1f3aadd4a9845da37d736d40b0 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:15:54 +0000 
Subject: [PATCH 179/247] NVIDIA: SAUCE: fs/resctrl: Add specific schema types for 'range' BugLink: https://bugs.launchpad.net/bugs/2122432 Resctrl allows the architecture code to specify the schema format for a control. Controls can either take a bitmap, or some kind of number. If user-space doesn't know what a control is by its name, it could be told the schema format. 'Some kind of number' isn't useful as the difference between a percentage and a value in MB/s affects how these would be programmed, even if resctrl's parsing code doesn't need to care. Add the types resctrl already has in addition to 'range'. This allows architectures to move over before 'range' is removed. These new schema formats are parsed the same, but will additionally affect which files are visible. Schema formats with a double underscore should not be considered portable between architectures, and are likely to be described to user-space as 'platform defined'. AMDs MBA resource is configured with an absolute bandwidth measured in multiples of one eighth of a GB per second. resctrl needs to be aware of this platform defined format to ensure the existing 'MB' files continue to be shown. Signed-off-by: James Morse (cherry picked from commit 50eed7f84b62a3a4f539c579035ab72ec150cf19 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- fs/resctrl/ctrlmondata.c | 3 +++ fs/resctrl/rdtgroup.c | 3 +++ include/linux/resctrl.h | 12 ++++++++++++ 3 files changed, 18 insertions(+) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 1b0f375df03d9..ac1c622c94c43 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -218,6 +218,9 @@ static int parse_line(char *line, struct resctrl_schema *s, parse_ctrlval = &parse_cbm; break; case RESCTRL_SCHEMA_RANGE: + case RESCTRL_SCHEMA_PERCENT: + case RESCTRL_SCHEMA_MBPS: + case RESCTRL_SCHEMA__AMD_MBA: parse_ctrlval = &parse_bw; break; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 4c853d6b1bf73..055fd575cadae 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2776,6 +2776,9 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type s->fmt_str = "%d=%x"; break; case RESCTRL_SCHEMA_RANGE: + case RESCTRL_SCHEMA_PERCENT: + case RESCTRL_SCHEMA_MBPS: + case RESCTRL_SCHEMA__AMD_MBA: s->fmt_str = "%d=%u"; break; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 2e9c548282ce5..f4f15d21abc51 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -271,10 +271,16 @@ enum resctrl_scope { * enum resctrl_schema_fmt - The format user-space provides for a schema. * @RESCTRL_SCHEMA_BITMAP: The schema is a bitmap in hex. * @RESCTRL_SCHEMA_RANGE: The schema is a decimal number. + * @RESCTRL_SCHEMA_PERCENT: The schema is a percentage. + * @RESCTRL_SCHEMA_MBPS: The schema ia a MBps value. + * @RESCTRL_SCHEMA__AMD_MBA: The schema value is MBA for AMD platforms. 
*/ enum resctrl_schema_fmt { RESCTRL_SCHEMA_BITMAP, RESCTRL_SCHEMA_RANGE, + RESCTRL_SCHEMA_PERCENT, + RESCTRL_SCHEMA_MBPS, + RESCTRL_SCHEMA__AMD_MBA, }; /** @@ -406,6 +412,9 @@ static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) case RESCTRL_SCHEMA_BITMAP: return BIT_MASK(r->cache.cbm_len) - 1; case RESCTRL_SCHEMA_RANGE: + case RESCTRL_SCHEMA_PERCENT: + case RESCTRL_SCHEMA_MBPS: + case RESCTRL_SCHEMA__AMD_MBA: return r->membw.max_bw; } @@ -423,6 +432,9 @@ static inline u32 resctrl_get_schema_default_ctrl(struct resctrl_schema *s) case RESCTRL_SCHEMA_BITMAP: return resctrl_get_resource_default_ctrl(s->res); case RESCTRL_SCHEMA_RANGE: + case RESCTRL_SCHEMA_PERCENT: + case RESCTRL_SCHEMA_MBPS: + case RESCTRL_SCHEMA__AMD_MBA: return s->membw.max_bw; } From 2a03fd37a3dff4dcbb97e811a39c4ea751a20a01 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:19:37 +0000 Subject: [PATCH 180/247] NVIDIA: SAUCE: x86/resctrl: Move over to specifying MBA control formats BugLink: https://bugs.launchpad.net/bugs/2122432 Resctrl specifies the schema format for MB and SMBA in rdt_resources_all[]. Intel platforms take a percentage for MB, AMD platforms take an absolute value which isn't MB/s. Currently these are both treated as a 'range'. Adding support for additional types of control shows that user-space needs to be told what the control formats are. Today users of resctrl must already know if their platform is Intel or AMD to know how the MB resource will behave. The MPAM support exposes new control types that take a 'percentage'. The Intel MB resource is also configured by a percentage, so should be able to expose this to user-space. Remove the static configuration for schema_fmt in rdt_resources_all[] and specify it with the other control properties in __get_mem_config_intel() or __get_mem_config_amd(). 
Signed-off-by: James Morse (cherry picked from commit 20f0c13f4ffd01cb6fc239248afa05d602f9e8d4 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 5fc91bfdbd2fd..42fcc9d7ff7a2 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -88,7 +88,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { .name = "MB", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), - .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, [RDT_RESOURCE_SMBA] = @@ -97,7 +96,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { .name = "SMBA", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), - .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, }; @@ -192,6 +190,7 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r) cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; max_delay = eax.split.max_delay + 1; + r->schema_fmt = RESCTRL_SCHEMA_PERCENT; r->membw.max_bw = MAX_MBA_BW; r->mba.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { @@ -227,6 +226,7 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx); hw_res->num_closid = edx + 1; + r->schema_fmt = RESCTRL_SCHEMA__AMD_MBA; r->membw.max_bw = 1 << eax; /* AMD does not use delay */ From e445e16ac9f83a6b379f99a88eb0f76985e7827b Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:30:08 +0000 Subject: [PATCH 181/247] NVIDIA: SAUCE: arm_mpam: resctrl: Convert MB resource to use percentage BugLink: https://bugs.launchpad.net/bugs/2122432 MPAMs bandwidth controls are both exposed to resctrl 
as if they take a percentage. Update the schema format so that user-space can be told this is a percentage, and files that describe this control format are exposed. (e.g. min_percent) Existing variation in this area is covered by requiring user-space to know if it is running on an Intel or AMD platform. Exposing the schema format directly will avoid modifying user-space to know it is running on an MPAM or RISCV platform. MPAM can also expose bitmap controls for memory bandwidth, which may become important for use-cases in the future. These are currently converted to a percentage to fit the existing definition of the MB resource. Signed-off-by: James Morse (cherry picked from commit ea03ef359eb04c8c0f557f589578bb4777b8e2b5 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 091d7a01769e8..25ab2b3ec2e4a 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1402,7 +1402,7 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, break; case RDT_RESOURCE_MBA: r->alloc_capable = true; - r->schema_fmt = RESCTRL_SCHEMA_RANGE; + r->schema_fmt = RESCTRL_SCHEMA_PERCENT; r->ctrl_scope = RESCTRL_L3_CACHE; r->mba.delay_linear = true; From 4947ce05782a88c8e2b52ac57cc14c1c5f0b286c Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:32:43 +0000 Subject: [PATCH 182/247] NVIDIA: SAUCE: fs/resctrl: Remove 'range' schema format BugLink: https://bugs.launchpad.net/bugs/2122432 Resctrl previously had a 'range' schema format that took some kind of number. This has since been split into percentage, MB/s and an AMD platform specific scheme. As range is no longer used, remove it. The last user is mba_sc which should be described as taking MB/s. 
Signed-off-by: James Morse (cherry picked from commit 93fda1d6632174fefddfe5e712110dd1e2947c95 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- fs/resctrl/ctrlmondata.c | 1 - fs/resctrl/rdtgroup.c | 3 +-- include/linux/resctrl.h | 4 ---- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index ac1c622c94c43..c3688cbe0ff5c 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -217,7 +217,6 @@ static int parse_line(char *line, struct resctrl_schema *s, case RESCTRL_SCHEMA_BITMAP: parse_ctrlval = &parse_cbm; break; - case RESCTRL_SCHEMA_RANGE: case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 055fd575cadae..df1b937bc4c92 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2765,7 +2765,7 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type * update_mba_bw(). */ if (is_mba_sc(r)) { - s->schema_fmt = RESCTRL_SCHEMA_RANGE; + s->schema_fmt = RESCTRL_SCHEMA_MBPS; s->membw.min_bw = 0; s->membw.max_bw = MBA_MAX_MBPS; s->membw.bw_gran = 1; @@ -2775,7 +2775,6 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type case RESCTRL_SCHEMA_BITMAP: s->fmt_str = "%d=%x"; break; - case RESCTRL_SCHEMA_RANGE: case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index f4f15d21abc51..d5a80db59603e 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -270,14 +270,12 @@ enum resctrl_scope { /** * enum resctrl_schema_fmt - The format user-space provides for a schema. * @RESCTRL_SCHEMA_BITMAP: The schema is a bitmap in hex. - * @RESCTRL_SCHEMA_RANGE: The schema is a decimal number. 
* @RESCTRL_SCHEMA_PERCENT: The schema is a percentage. * @RESCTRL_SCHEMA_MBPS: The schema ia a MBps value. * @RESCTRL_SCHEMA__AMD_MBA: The schema value is MBA for AMD platforms. */ enum resctrl_schema_fmt { RESCTRL_SCHEMA_BITMAP, - RESCTRL_SCHEMA_RANGE, RESCTRL_SCHEMA_PERCENT, RESCTRL_SCHEMA_MBPS, RESCTRL_SCHEMA__AMD_MBA, @@ -411,7 +409,6 @@ static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) switch (r->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: return BIT_MASK(r->cache.cbm_len) - 1; - case RESCTRL_SCHEMA_RANGE: case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: @@ -431,7 +428,6 @@ static inline u32 resctrl_get_schema_default_ctrl(struct resctrl_schema *s) switch (s->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: return resctrl_get_resource_default_ctrl(s->res); - case RESCTRL_SCHEMA_RANGE: case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: From cf4738d6049fd08b17806a39a6370c71ce8195e2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:49:06 +0000 Subject: [PATCH 183/247] NVIDIA: SAUCE: fs/resctrl: Add additional files for percentage and bitmap controls BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM has cache capacity controls that effectively take a percentage. Resctrl supports percentages, but the collection of files that are exposed to describe this control belong to the MB resource. To find the minimum granularity of the percentage cache capacity controls, user-space is expected to read the bandwidth_gran file, and know this has nothing to do with bandwidth. The only problem here is the name of the file. Add duplicates of these properties with percentage and bitmap in the name. These will be exposed based on the schema format. The existing files must remain tied to the specific resources so that they remain visible to user-space. Using the same helpers ensures the values will always be the same regardless of the file used. 
These files are not exposed until the new RFTYPE schema flags are set on a resource 'fflags'. Signed-off-by: James Morse (cherry picked from commit 673bcb00d2371a2876e164da55d642fdf7657b8d https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- fs/resctrl/internal.h | 7 ++++++- fs/resctrl/rdtgroup.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 45f86943ddf24..24f340f5f4de0 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -236,14 +236,19 @@ struct rdtgroup { #define RFTYPE_TOP BIT(6) +/* files that are specific to a type of resource, e.g. throttle_mode */ #define RFTYPE_RES_CACHE BIT(8) - #define RFTYPE_RES_MB BIT(9) #define RFTYPE_DEBUG BIT(10) #define RFTYPE_ASSIGN_CONFIG BIT(11) +/* files that are specific to a type of control, e.g. percent_min */ +#define RFTYPE_SCHEMA_BITMAP BIT(11) +#define RFTYPE_SCHEMA_PERCENT BIT(12) +#define RFTYPE_SCHEMA_MBPS BIT(13) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index df1b937bc4c92..dd1bd01ceda7b 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1991,6 +1991,13 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .seq_show = resctrl_num_mbm_cntrs_show, }, + { + .name = "bitmap_mask", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_default_ctrl_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_SCHEMA_BITMAP, + }, { .name = "min_cbm_bits", .mode = 0444, @@ -1998,6 +2005,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_min_cbm_bits_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, + { + .name = "bitmaps_min_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = 
rdt_min_cbm_bits_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_SCHEMA_BITMAP, + }, { .name = "shareable_bits", .mode = 0444, @@ -2019,6 +2033,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_min_bw_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, + { + .name = "percent_min", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_bw_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_SCHEMA_PERCENT, + }, { .name = "bandwidth_gran", .mode = 0444, @@ -2026,6 +2047,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_bw_gran_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, + { + .name = "percent_gran", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bw_gran_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_SCHEMA_PERCENT, + }, { .name = "delay_linear", .mode = 0444, From cd6af831fa87bca0c340fdfc46a00676b2b8eca3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 16:55:39 +0000 Subject: [PATCH 184/247] NVIDIA: SAUCE: fs/resctrl: Add fflags_from_schema() for files based on schema format BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM has cache capacity controls that effectively take a percentage. Resctrl supports percentages, but the collection of files that are exposed to describe this control belong to the MB resource. New files have been added that are selected based on the schema format. Apply the flags to enable these files based on the schema format. Add a new fflags_from_schema() that is used for controls. Signed-off-by: James Morse (cherry picked from commit a837ccc258380d6aeef86df709cc0484b60a4acf https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- fs/resctrl/rdtgroup.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index dd1bd01ceda7b..d2eb483ed4648 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2457,7 +2457,35 @@ static unsigned long fflags_from_resource(struct rdt_resource *r) return RFTYPE_RES_MB; } - return WARN_ON_ONCE(1); + return 0; +} + +static u32 fflags_from_schema(struct resctrl_schema *s) +{ + struct rdt_resource *r = s->res; + u32 fflags = 0; + + /* Some resources are configured purely from their rid */ + fflags |= fflags_from_resource(r); + if (fflags) + return fflags; + + switch (s->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + fflags |= RFTYPE_SCHEMA_BITMAP; + break; + case RESCTRL_SCHEMA_PERCENT: + fflags |= RFTYPE_SCHEMA_PERCENT; + break; + case RESCTRL_SCHEMA_MBPS: + fflags |= RFTYPE_SCHEMA_MBPS; + break; + case RESCTRL_SCHEMA__AMD_MBA: + /* No standard files are exposed */ + break; + } + + return fflags; } static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) @@ -2480,7 +2508,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) /* loop over enabled controls, these are all alloc_capable */ list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; + fflags = fflags_from_schema(s) | RFTYPE_CTRL_INFO; ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); if (ret) goto out_destroy; From a5fd50b0d580fcb41674553a0a8f96b448f01809 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 10 Sep 2024 18:13:37 +0100 Subject: [PATCH 185/247] NVIDIA: SAUCE: fs/resctrl: Expose the schema format to user-space BugLink: https://bugs.launchpad.net/bugs/2122432 If more schemas are added to resctrl, user-space needs to know how to configure them. To allow user-space to configure schema it doesn't know about, it would be helpful to tell user-space the format, e.g. percentage. 
Add a file under info that describes the schema format. Percentages and 'mbps' are implicitly decimal, bitmaps are expected to be in hex. Signed-off-by: James Morse (cherry picked from commit b457019d995b2849e683aef0fd89066e64c679a4 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- fs/resctrl/rdtgroup.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index d2eb483ed4648..48ecde1ca9fd2 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1762,6 +1762,30 @@ static int mbm_local_bytes_config_show(struct kernfs_open_file *of, return 0; } +static int resctrl_schema_format_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + + switch (s->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + seq_puts(seq, "bitmap\n"); + break; + case RESCTRL_SCHEMA_PERCENT: + seq_puts(seq, "percentage\n"); + break; + case RESCTRL_SCHEMA_MBPS: + seq_puts(seq, "mbps\n"); + break; + /* The way these schema behave isn't discoverable from resctrl */ + case RESCTRL_SCHEMA__AMD_MBA: + seq_puts(seq, "platform\n"); + break; + } + + return 0; +} + static void mbm_config_write_domain(struct rdt_resource *r, struct rdt_mon_domain *d, u32 evtid, u32 val) { @@ -2192,6 +2216,14 @@ static struct rftype res_common_files[] = { .seq_show = rdtgroup_closid_show, .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, }, + { + .name = "schema_format", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_schema_format_show, + .fflags = RFTYPE_CTRL_INFO, + }, + }; static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) From 63dab86b308e158a1eb2ebb2e2752c6d4c007a59 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 12:35:13 +0000 Subject: [PATCH 186/247] NVIDIA: SAUCE: fs/resctrl: Add L2 and L3 
'MAX' resource schema BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM can have both cache portion and cache capacity controls on any cache that supports MPAM. Cache portion bitmaps can be exposed via resctrl if they are implemented on L2 or L3. The cache capacity controls can not be used to isolate portions, which is implicit in the L2 or L3 bitmap provided by user-space. These controls need to be configured with something more like a percentage. Add the resource enum entries for these two resources. No additional resctrl code is needed because the architecture code will specify this resource takes a 'percentage', re-using the support previously used only for the MB resource. Signed-off-by: James Morse (cherry picked from commit b601bbf375b016c417db4ec0e8bd6ae58b9057aa https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- include/linux/resctrl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index d5a80db59603e..055f27045b4da 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -54,6 +54,8 @@ enum resctrl_res_level { RDT_RESOURCE_L2, RDT_RESOURCE_MBA, RDT_RESOURCE_SMBA, + RDT_RESOURCE_L3_MAX, + RDT_RESOURCE_L2_MAX, /* Must be the last */ RDT_NUM_RESOURCES, From e9aa35023abba04a15a392ee2ffd799dac4f86f8 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 11:51:03 +0000 Subject: [PATCH 187/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add the glue code to convert to/from cmax BugLink: https://bugs.launchpad.net/bugs/2122432 MPAM's maximum cache-capacity controls take a fixed point fraction format. Instead of dumping this on user-space, convert it to a percentage. User-space using resctrl already knows how to handle percentages. 
Signed-off-by: James Morse (cherry picked from commit 183d4c43260089e6b51518e50427d0f04a6af875 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 69 ++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 7 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 25ab2b3ec2e4a..5f8354a3b60a1 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -700,6 +700,13 @@ static bool cache_has_usable_cpor(struct mpam_class *class) return (class->props.cpbm_wd <= 32); } +static bool cache_has_usable_cmax(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + return mpam_has_feature(mpam_feat_cmax_cmax, cprops); +} + static bool mba_class_use_mbw_part(struct mpam_props *cprops) { if (!mpam_has_feature(mpam_feat_mbw_part, cprops) || @@ -867,6 +874,11 @@ static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) return percent_to_fract16(pc, cprops->bwa_wd); } +static u16 percent_to_cmax(u8 pc, struct mpam_props *cprops) +{ + return percent_to_fract16(pc, cprops->cmax_wd); +} + static u32 get_mba_min(struct mpam_props *cprops) { u32 val = 0; @@ -963,6 +975,7 @@ static bool topology_matches_l3(struct mpam_class *victim) /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? 
*/ static void mpam_resctrl_pick_caches(void) { + bool has_cpor, has_cmax; struct mpam_class *class; struct mpam_resctrl_res *res; @@ -981,7 +994,9 @@ static void mpam_resctrl_pick_caches(void) continue; } - if (!cache_has_usable_cpor(class)) { + has_cpor = cache_has_usable_cpor(class); + has_cmax = cache_has_usable_cmax(class); + if (!has_cpor && !has_cmax) { pr_debug("class %u cache misses CPOR\n", class->level); continue; } @@ -994,12 +1009,24 @@ static void mpam_resctrl_pick_caches(void) continue; } - if (class->level == 2) - res = &mpam_resctrl_controls[RDT_RESOURCE_L2]; - else - res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; - res->class = class; - exposed_alloc_capable = true; + if (has_cpor) { + pr_debug("pick_caches: Class has CPOR\n"); + if (class->level == 2) + res = &mpam_resctrl_controls[RDT_RESOURCE_L2]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + res->class = class; + exposed_alloc_capable = true; + } + if (has_cmax) { + pr_debug("pick_caches: Class has CMAX\n"); + if (class->level == 2) + res = &mpam_resctrl_controls[RDT_RESOURCE_L2_MAX]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3_MAX]; + res->class = class; + exposed_alloc_capable = true; + } } } @@ -1399,6 +1426,23 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, * 'all the bits' is the correct answer here. 
*/ r->cache.shareable_bits = resctrl_get_resource_default_ctrl(r); + break; + case RDT_RESOURCE_L2_MAX: + case RDT_RESOURCE_L3_MAX: + r->alloc_capable = true; + r->schema_fmt = RESCTRL_SCHEMA_PERCENT; + r->membw.min_bw = max(100 / (1 << cprops->cmax_wd), 1); + r->membw.bw_gran = max(100 / (1 << cprops->cmax_wd), 1); + r->membw.max_bw = 100; + + if (r->rid == RDT_RESOURCE_L2_MAX) { + r->name = "L2_MAX"; + r->ctrl_scope = RESCTRL_L2_CACHE; + } else { + r->name = "L3_MAX"; + r->ctrl_scope = RESCTRL_L3_CACHE; + } + break; case RDT_RESOURCE_MBA: r->alloc_capable = true; @@ -1575,6 +1619,10 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case RDT_RESOURCE_L3: configured_by = mpam_feat_cpor_part; break; + case RDT_RESOURCE_L2_MAX: + case RDT_RESOURCE_L3_MAX: + configured_by = mpam_feat_cmax_cmax; + break; case RDT_RESOURCE_MBA: if (mba_class_use_mbw_part(cprops)) { configured_by = mpam_feat_mbw_part; @@ -1596,6 +1644,8 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case mpam_feat_cpor_part: /* TODO: Scaling is not yet supported */ return cfg->cpbm; + case mpam_feat_cmax_cmax: + return fract16_to_percent(cfg->cmax, cprops->cmax_wd); case mpam_feat_mbw_part: /* TODO: Scaling is not yet supported */ return mbw_pbm_to_percent(cfg->mbw_pbm, cprops); @@ -1652,6 +1702,11 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, cfg.cpbm = cfg_val; mpam_set_feature(mpam_feat_cpor_part, &cfg); break; + case RDT_RESOURCE_L2_MAX: + case RDT_RESOURCE_L3_MAX: + cfg.cmax = percent_to_cmax(cfg_val, cprops); + mpam_set_feature(mpam_feat_cmax_cmax, &cfg); + break; case RDT_RESOURCE_MBA: if (mba_class_use_mbw_part(cprops)) { cfg.mbw_pbm = percent_to_mbw_pbm(cfg_val, cprops); From 7098ca730209c57b09dca25fb880e90bd2364bfd Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 16 Jul 2025 14:42:38 +0100 Subject: [PATCH 188/247] NVIDIA: SAUCE: mm,memory_hotplug: Add lockdep assertion helper BugLink: 
https://bugs.launchpad.net/bugs/2122432 The cpu hotplug lock has a helper lockdep_assert_cpus_held() that makes it easy to annotate functions that must be called with the cpu hotplug lock held. Do the same for memory. Signed-off-by: James Morse (cherry picked from commit f40d4b8451b3d9e197166ff33104bd63f93709d0 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- include/linux/memory_hotplug.h | 6 ++++++ mm/memory_hotplug.c | 11 +++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 23f038a162319..acc5ac1e92491 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -334,4 +334,10 @@ extern int arch_create_linear_mapping(int nid, u64 start, u64 size, void arch_remove_linear_mapping(u64 start, u64 size); #endif /* CONFIG_MEMORY_HOTPLUG */ +#if defined(CONFIG_LOCKDEP) && defined(CONFIG_MEMORY_HOTPLUG) +void lockdep_assert_mems_held(void); +#else +static inline void lockdep_assert_mems_held(void) { } +#endif + #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 74318c7877156..89ec5ed8c488b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -218,6 +218,17 @@ void put_online_mems(void) percpu_up_read(&mem_hotplug_lock); } +#ifdef CONFIG_LOCKDEP +void lockdep_assert_mems_held(void) +{ + /* See lockdep_assert_cpus_held() */ + if (system_state < SYSTEM_RUNNING) + return; + + percpu_rwsem_assert_held(&mem_hotplug_lock); +} +#endif + bool movable_node_enabled = false; static int mhp_default_online_type = -1; From 84c398975de103b2a305cef4b1f31598c15a4576 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 8 Jul 2025 17:02:24 +0100 Subject: [PATCH 189/247] NVIDIA: SAUCE: fs/resctrl: Take memory hotplug lock whenever taking CPU hotplug lock BugLink: https://bugs.launchpad.net/bugs/2122432 resctrl takes the 
read side CPU hotplug lock whenever it is working with the list of domains. This prevents a CPU being brought online and the list being modified while resctrl is walking the list, or picking CPUs from the CPU masks. If resctrl domains for CPU-less NUMA nodes are to be supported, this would not be enough to prevent the domain list form being modified as a NUMA node can come online with only memory. Take the memory hotplug lock whenever the CPU hotplug lock is taken. Signed-off-by: James Morse (cherry picked from commit f5a082989a5f40b9b95515d68b230f8125648fdb https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- fs/resctrl/monitor.c | 5 +++++ fs/resctrl/pseudo_lock.c | 3 +++ fs/resctrl/rdtgroup.c | 17 +++++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 5005e912bf0de..b1633bf6e6d4d 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -18,6 +18,7 @@ #define pr_fmt(fmt) "resctrl: " fmt #include +#include #include #include #include @@ -763,6 +764,7 @@ void cqm_handle_limbo(struct work_struct *work) struct rdt_mon_domain *d; cpus_read_lock(); + get_online_mems(); mutex_lock(&rdtgroup_mutex); d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); @@ -777,6 +779,7 @@ void cqm_handle_limbo(struct work_struct *work) } mutex_unlock(&rdtgroup_mutex); + put_online_mems(); cpus_read_unlock(); } @@ -810,6 +813,7 @@ void mbm_handle_overflow(struct work_struct *work) struct rdt_resource *r; cpus_read_lock(); + get_online_mems(); mutex_lock(&rdtgroup_mutex); /* @@ -843,6 +847,7 @@ void mbm_handle_overflow(struct work_struct *work) out_unlock: mutex_unlock(&rdtgroup_mutex); + put_online_mems(); cpus_read_unlock(); } diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index 87bbc2605de12..4086e61df3e1c 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -16,6 
+16,7 @@ #include #include #include +#include #include #include #include @@ -694,6 +695,7 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) int ret = -1; cpus_read_lock(); + get_online_mems(); mutex_lock(&rdtgroup_mutex); if (rdtgrp->flags & RDT_DELETED) { @@ -741,6 +743,7 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) out: mutex_unlock(&rdtgroup_mutex); + put_online_mems(); cpus_read_unlock(); return ret; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 48ecde1ca9fd2..e2451c93123fd 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1155,6 +1156,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, u32 ctrl_val; cpus_read_lock(); + get_online_mems(); mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { @@ -1215,6 +1217,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, } seq_putc(seq, '\n'); mutex_unlock(&rdtgroup_mutex); + put_online_mems(); cpus_read_unlock(); return 0; } @@ -1719,6 +1722,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid bool sep = false; cpus_read_lock(); + get_online_mems(); mutex_lock(&rdtgroup_mutex); list_for_each_entry(dom, &r->mon_domains, hdr.list) { @@ -1737,6 +1741,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid seq_puts(s, "\n"); mutex_unlock(&rdtgroup_mutex); + put_online_mems(); cpus_read_unlock(); return 0; @@ -1881,6 +1886,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, return -EINVAL; cpus_read_lock(); + get_online_mems(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1890,6 +1896,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + 
put_online_mems(); cpus_read_unlock(); return ret ?: nbytes; @@ -1907,6 +1914,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, return -EINVAL; cpus_read_lock(); + get_online_mems(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1916,6 +1924,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + put_online_mems(); cpus_read_unlock(); return ret ?: nbytes; @@ -2727,6 +2736,7 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) rdtgroup_kn_get(rdtgrp, kn); cpus_read_lock(); + get_online_mems(); mutex_lock(&rdtgroup_mutex); /* Was this group deleted while we waited? */ @@ -2744,6 +2754,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) return; mutex_unlock(&rdtgroup_mutex); + put_online_mems(); cpus_read_unlock(); rdtgroup_kn_put(rdtgrp, kn); @@ -2962,6 +2973,7 @@ static int rdt_get_tree(struct fs_context *fc) enable_abi_playground(); cpus_read_lock(); + get_online_mems(); mutex_lock(&rdtgroup_mutex); /* * resctrl file system can only be mounted once. 
@@ -3066,6 +3078,7 @@ static int rdt_get_tree(struct fs_context *fc) out: rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); + put_online_mems(); cpus_read_unlock(); return ret; } @@ -3350,6 +3363,7 @@ static void rdt_kill_sb(struct super_block *sb) struct rdt_resource *r; cpus_read_lock(); + get_online_mems(); mutex_lock(&rdtgroup_mutex); rdt_disable_ctx(); @@ -3366,6 +3380,7 @@ static void rdt_kill_sb(struct super_block *sb) resctrl_mounted = false; kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); + put_online_mems(); cpus_read_unlock(); if (static_branch_unlikely(&resctrl_abi_playground)) @@ -4756,12 +4771,14 @@ static bool resctrl_online_domains_exist(void) void resctrl_exit(void) { cpus_read_lock(); + get_online_mems(); WARN_ON_ONCE(resctrl_online_domains_exist()); mutex_lock(&rdtgroup_mutex); resctrl_fs_teardown(); mutex_unlock(&rdtgroup_mutex); + put_online_mems(); cpus_read_unlock(); debugfs_remove_recursive(debugfs_resctrl); From 7ec7dcfb7c1ed8969670d5afb7dc71e9a3a450af Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 3 Jul 2025 15:55:48 +0100 Subject: [PATCH 190/247] NVIDIA: SAUCE: fs/resctrl: Add mount option for mb_uses_numa_nid and arch stubs BugLink: https://bugs.launchpad.net/bugs/2122432 Resctrl expects the domain IDs for the 'MB' resource to be the corresponding L3 cache-ids. This is a problem for platforms where the memory bandwidth controls are implemented somewhere other than the L3 cache, and exist on a platform with CPU-less NUMA nodes. Such platforms can't currently be exposed via resctrl as not all the memory bandwidth can be controlled. Add a mount option to allow user-space to opt-in to the domain IDs for the MB resource to be the NUMA nid instead. Signed-off-by: James Morse (cherry picked from commit ae8929caac02dccdc932666c1d8c906dda541bf1 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- arch/x86/include/asm/resctrl.h | 9 +++++++++ fs/resctrl/internal.h | 1 + fs/resctrl/rdtgroup.c | 26 ++++++++++++++++++++++---- include/linux/arm_mpam.h | 10 ++++++++++ 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 40a74a0617345..279aba8e97bf5 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -198,6 +198,15 @@ static inline bool resctrl_arch_mon_can_overflow(void) void resctrl_cpu_detect(struct cpuinfo_x86 *c); +static inline bool resctrl_arch_get_mb_uses_numa_nid(void) +{ + return false; +} + +static inline int resctrl_arch_set_mb_uses_numa_nid(bool enabled) +{ + return -EOPNOTSUPP; +} #else static inline void resctrl_arch_sched_in(struct task_struct *tsk) {} diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 24f340f5f4de0..f5f74342af317 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -42,6 +42,7 @@ struct rdt_fs_context { bool enable_cdpl3; bool enable_mba_mbps; bool enable_debug; + bool mb_uses_numa_nid; bool enable_abi_playground; }; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index e2451c93123fd..3c9981f545017 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2768,6 +2768,7 @@ static void rdt_disable_ctx(void) { resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); + resctrl_arch_set_mb_uses_numa_nid(false); set_mba_sc(false); resctrl_debug = false; @@ -2798,8 +2799,17 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx) if (ctx->enable_debug) resctrl_debug = true; + if (ctx->mb_uses_numa_nid) { + ret = resctrl_arch_set_mb_uses_numa_nid(true); + if (ret) + goto out_debug; + } + return 0; +out_debug: + resctrl_debug = false; + set_mba_sc(false); out_cdpl3: resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); out_cdpl2: @@ -3088,15 +3098,17 @@ enum rdt_param { Opt_cdpl2, Opt_mba_mbps, 
Opt_debug, + Opt_mb_uses_numa_nid, Opt_not_abi_playground, nr__rdt_params }; static const struct fs_parameter_spec rdt_fs_parameters[] = { - fsparam_flag("cdp", Opt_cdp), - fsparam_flag("cdpl2", Opt_cdpl2), - fsparam_flag("mba_MBps", Opt_mba_mbps), - fsparam_flag("debug", Opt_debug), + fsparam_flag("cdp", Opt_cdp), + fsparam_flag("cdpl2", Opt_cdpl2), + fsparam_flag("mba_MBps", Opt_mba_mbps), + fsparam_flag("debug", Opt_debug), + fsparam_flag("mb_uses_numa_nid", Opt_mb_uses_numa_nid), /* * Some of MPAM's out of tree code exposes things through resctrl @@ -3134,6 +3146,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_debug: ctx->enable_debug = true; return 0; + case Opt_mb_uses_numa_nid: + ctx->mb_uses_numa_nid = true; + return 0; case Opt_not_abi_playground: ctx->enable_abi_playground = true; return 0; @@ -4391,6 +4406,9 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) if (resctrl_debug) seq_puts(seq, ",debug"); + if (resctrl_arch_get_mb_uses_numa_nid()) + seq_puts(seq, ",mb_uses_numa_nid"); + if (static_branch_unlikely(&resctrl_abi_playground)) seq_puts(seq, ",this_is_not_abi"); diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 06827f240cf9e..b43494e734ded 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -78,6 +78,16 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); +static inline bool resctrl_arch_get_mb_uses_numa_nid(void) +{ + return false; +} + +static inline bool resctrl_arch_set_mb_uses_numa_nid(bool enabled) +{ + return false; +} + /* * The CPU configuration for MPAM is cheap to write, and is only written if it * has changed. No need for fine grained enables. 
From e2a3a8cccb6adcc71da9b83e84d4db181d3520e4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 17 Oct 2025 11:27:06 +0100 Subject: [PATCH 191/247] NVIDIA: SAUCE: Fix unused variable warning BugLink: https://bugs.launchpad.net/bugs/2122432 idx is not used. Remove it to avoid build warning. The author is James but he doesn't add his Signed-off-by. (backported from commit c9b4fabe0b1b4805186d4326d47547993a02d191 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) [fenghuay: Change subject to a meaningfull one. Add commit message.] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 5f8354a3b60a1..a773433767361 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1847,7 +1847,7 @@ static void mpam_resctrl_domain_insert(struct list_head *list, static struct mpam_resctrl_dom * mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) { - int err, idx; + int err; struct mpam_resctrl_dom *dom; struct rdt_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; From eba8ea7a28322ee03911f509331b69e90b96cbee Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 8 Jul 2025 14:06:31 +0100 Subject: [PATCH 192/247] NVIDIA: SAUCE: arm_mpam: resctrl: Pick whether MB can use NUMA nid instead of cache-id BugLink: https://bugs.launchpad.net/bugs/2122432 The MB domain ids are the L3 cache-id. This is unfortunate if the memory bandwidth controls are implemented for CPU-less NUMA nodes as there is no L3 whose cache-id can be used to expose these controls to resctrl. When picking the class to use as MB, note whether it is possible for the NUMA nid to be used as the domain-id. By default the MB resource will use the cache-id. 
Signed-off-by: James Morse (cherry picked from commit c2506e7fdb9e9de624af635f5060a1fe56a6bb80 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 57 +++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index a773433767361..e0ed713a52d0e 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -54,6 +54,14 @@ static bool exposed_mon_capable; */ static bool cdp_enabled; +/* + * To support CPU-less NUMA nodes, user-space needs to opt in to the MB + * domain IDs being the NUMA nid instead of the corresponding CPU's L3 + * cache-id. + */ +static bool mb_uses_numa_nid; +static bool mb_numa_nid_possible; +static bool mb_l3_cache_id_possible; /* * If resctrl_init() succeeded, resctrl_exit() can be used to remove support * for the filesystem in the event of an error. @@ -972,6 +980,15 @@ static bool topology_matches_l3(struct mpam_class *victim) return true; } +static bool topology_matches_numa(struct mpam_class *victim) +{ + /* + * For now, check this is a memory class, in which case component + * id are already NUMA nid. + */ + return (victim->type == MPAM_CLASS_MEMORY); +} + /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? 
*/ static void mpam_resctrl_pick_caches(void) { @@ -1041,6 +1058,8 @@ static void mpam_resctrl_pick_mba(void) list_for_each_entry_srcu(class, &mpam_classes, classes_list, srcu_read_lock_held(&mpam_srcu)) { struct mpam_props *cprops = &class->props; + bool l3_cache_id_possible = false; + bool numa_nid_possible = false; if (class->level < 3) { pr_debug("class %u is before L3\n", class->level); @@ -1057,8 +1076,18 @@ static void mpam_resctrl_pick_mba(void) continue; } - if (!topology_matches_l3(class)) { - pr_debug("class %u topology doesn't match L3\n", class->level); + if (topology_matches_numa(class)) { + pr_debug("class %u topology matches NUMA domains\n", class->level); + numa_nid_possible = true; + } + + if (topology_matches_l3(class)) { + pr_debug("class %u topology matches L3\n", class->level); + l3_cache_id_possible = true; + } + + if (!l3_cache_id_possible && !numa_nid_possible) { + pr_debug("class %u has no matching topology for MB\n", class->level); continue; } @@ -1067,8 +1096,17 @@ static void mpam_resctrl_pick_mba(void) * mbm_local is implicitly part of the L3, pick a resource to be MBA * that as close as possible to the L3. */ - if (!candidate_class || class->level < candidate_class->level) - candidate_class = class; + if (!candidate_class || class->level < candidate_class->level) { + /* + * Refuse to pick a closer class if it would prevent cache-id + * being used as domain-id by default. 
+ */ + if (!candidate_class || l3_cache_id_possible) { + candidate_class = class; + mb_l3_cache_id_possible = l3_cache_id_possible; + mb_numa_nid_possible = numa_nid_possible; + } + } } if (candidate_class) { @@ -1445,7 +1483,10 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, break; case RDT_RESOURCE_MBA: - r->alloc_capable = true; + /* Domain ID is the L3 cache-id by default */ + if (mb_l3_cache_id_possible) + r->alloc_capable = true; + r->schema_fmt = RESCTRL_SCHEMA_PERCENT; r->ctrl_scope = RESCTRL_L3_CACHE; @@ -1467,8 +1508,14 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res, static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) { + bool is_mb; struct mpam_class *class = comp->class; + is_mb = (mpam_resctrl_controls[RDT_RESOURCE_MBA].class == class); + + if (is_mb && mb_uses_numa_nid && topology_matches_numa(class)) + return comp->comp_id; + if (class->type == MPAM_CLASS_CACHE) return comp->comp_id; From 514992c8a5ea6d74e162c392ecec7bb2cd482a6b Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 3 Jul 2025 17:19:47 +0100 Subject: [PATCH 193/247] NVIDIA: SAUCE: arm_mpam: resctrl: Change domain_hdr online/offline to work with a set of CPUs BugLink: https://bugs.launchpad.net/bugs/2122432 mpam_resctrl_offline_domain_hdr() expects to take a single CPU that is going offline. Once all CPUs are offline, the domain header is removed from its parent list, and the structure can be freed. This doesn't work for NUMA nodes. Change the CPU passed to mpam_resctrl_offline_domain_hdr() and mpam_resctrl_domain_hdr_init to be a cpumask. This allows a single CPU to be passed for CPUs going offline, and cpu_possible_mask to be passed for a NUMA node going offline. Signed-off-by: James Morse (cherry picked from commit 093483e5bca0aef546208b32eedf59f3aac665ff https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 61 ++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index e0ed713a52d0e..c32a49fea2a74 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1830,30 +1830,46 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) mpam_reset_class_locked(res->class); } -static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, +/** + * mpam_resctrl_domain_hdr_init() - Bring a subset of a domain online. + * @onlined_cpus: The set of CPUs that are online from the domain's + * perspective. + * @comp: The mpam component being brought online. + * @hdr: The header representing the domain. + * + * Adds @onlined_cpus to @hdr's cpu_mask, and sets the @hdr id. + * For NUMA nodes, @onlined_cpus will be cpu_possible_mask. + */ +static void mpam_resctrl_domain_hdr_init(const struct cpumask *onlined_cpus, + struct mpam_component *comp, struct rdt_domain_hdr *hdr) { + int cpu = cpumask_any(onlined_cpus); + lockdep_assert_cpus_held(); INIT_LIST_HEAD(&hdr->list); hdr->id = mpam_resctrl_pick_domain_id(cpu, comp); - cpumask_set_cpu(cpu, &hdr->cpu_mask); + cpumask_and(&hdr->cpu_mask, &hdr->cpu_mask, onlined_cpus); } /** - * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU. - * @cpu: The CPU to remove from the domain. + * mpam_resctrl_offline_domain_hdr() - Take a subset of a domain offline. + * @offlined_cpus: The set of CPUs that are offline from the domain's + * perspective. * @hdr: The domain's header. * - * Removes @cpu from the header mask. If this was the last CPU in the domain, + * Removes @offlined_cpus from @hdr's cpu_mask. If the list is empty, * the domain header is removed from its parent list and true is returned, * indicating the parent structure can be freed. * If there are other CPUs in the domain, returns false. 
+ * + * For NUMA nodes, @offlined_cpus will be cpu_possible_mask. */ -static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu, +static bool mpam_resctrl_offline_domain_hdr(const struct cpumask *offlined_cpus, struct rdt_domain_hdr *hdr) { - cpumask_clear_cpu(cpu, &hdr->cpu_mask); + cpumask_andnot(&hdr->cpu_mask, &hdr->cpu_mask, offlined_cpus); if (cpumask_empty(&hdr->cpu_mask)) { list_del(&hdr->list); return true; @@ -1862,14 +1878,18 @@ static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu, return false; } -static struct mpam_component *find_component(struct mpam_class *victim, int cpu) +static struct mpam_component *find_component(struct mpam_class *victim, + const struct cpumask *onlined_cpus) { struct mpam_component *victim_comp; guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(victim_comp, &victim->components, class_list, srcu_read_lock_held(&mpam_srcu)) { - if (cpumask_test_cpu(cpu, &victim_comp->affinity)) + struct cpumask tmp; + + cpumask_andnot(&tmp, onlined_cpus, &victim_comp->affinity); + if (cpumask_empty(&tmp)) return victim_comp; } @@ -1892,12 +1912,14 @@ static void mpam_resctrl_domain_insert(struct list_head *list, } static struct mpam_resctrl_dom * -mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) +mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, + struct mpam_resctrl_res *res) { int err; struct mpam_resctrl_dom *dom; struct rdt_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; + int cpu = cpumask_any(onlined_cpus); struct mpam_class *class = res->class; struct mpam_component *comp_iter, *ctrl_comp; struct rdt_resource *r = &res->resctrl_res; @@ -1927,7 +1949,8 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) dom->ctrl_comp = ctrl_comp; ctrl_d = &dom->resctrl_ctrl_dom; - mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, &ctrl_d->hdr); + + mpam_resctrl_domain_hdr_init(onlined_cpus, ctrl_comp, &ctrl_d->hdr); ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; 
mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); err = resctrl_online_ctrl_domain(r, ctrl_d); @@ -1957,7 +1980,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) if (!mon->class) continue; // dummy resource - mon_comp = find_component(mon->class, cpu); + mon_comp = find_component(mon->class, onlined_cpus); dom->mon_comp[i] = mon_comp; if (mon_comp) any_mon_comp = mon_comp; @@ -1967,7 +1990,8 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) dom->mbm_local_evt_cfg = MPAM_RESTRL_EVT_CONFIG_VALID; mon_d = &dom->resctrl_mon_dom; - mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, &mon_d->hdr); + mpam_resctrl_domain_hdr_init(onlined_cpus, any_mon_comp, + &mon_d->hdr); mon_d->hdr.type = RESCTRL_MON_DOMAIN; mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); err = resctrl_online_mon_domain(r, mon_d); @@ -1981,7 +2005,8 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) goto out; offline_mon_hdr: - mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + mpam_resctrl_offline_domain_hdr(onlined_cpus, &ctrl_d->hdr); + offline_ctrl_domain: resctrl_offline_ctrl_domain(r, ctrl_d); out: @@ -2069,7 +2094,7 @@ int mpam_resctrl_online_cpu(unsigned int cpu) dom = mpam_resctrl_get_domain_from_cpu(cpu, res); if (!dom) - dom = mpam_resctrl_alloc_domain(cpu, res); + dom = mpam_resctrl_alloc_domain(cpumask_of(cpu), res); if (IS_ERR(dom)) { err = PTR_ERR(dom); break; @@ -2112,7 +2137,8 @@ int mpam_resctrl_offline_cpu(unsigned int cpu) mpam_reset_component_locked(dom->ctrl_comp); ctrl_d = &dom->resctrl_ctrl_dom; - ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpumask_of(cpu), + &ctrl_d->hdr); if (ctrl_dom_empty) resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); } @@ -2120,7 +2146,8 @@ int mpam_resctrl_offline_cpu(unsigned int cpu) mon_dom_empty = true; if (exposed_mon_capable) { mon_d = &dom->resctrl_mon_dom; - 
mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpumask_of(cpu), + &mon_d->hdr); if (mon_dom_empty) resctrl_offline_mon_domain(&res->resctrl_res, mon_d); } From fa77901c260397bb801752b9b18fdac139333673 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 8 Jul 2025 14:03:05 +0100 Subject: [PATCH 194/247] NVIDIA: SAUCE: untested: arm_mpam: resctrl: Split mpam_resctrl_alloc_domain() to have CPU and node BugLink: https://bugs.launchpad.net/bugs/2122432 mpam_resctrl_alloc_domain() brings a domain with CPUs online. To allow for domains that don't have any CPUs, split it into a CPU and NUMA node version. Signed-off-by: James Morse (cherry picked from commit 817d04bd296871b61dd70f68d160b85837dfe9a8 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 82 +++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 22 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index c32a49fea2a74..cc7c69e2c3a7b 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -1912,36 +1913,19 @@ static void mpam_resctrl_domain_insert(struct list_head *list, } static struct mpam_resctrl_dom * -mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, +mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, int nid, + struct mpam_component *ctrl_comp, struct mpam_resctrl_res *res) { int err; struct mpam_resctrl_dom *dom; struct rdt_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; - int cpu = cpumask_any(onlined_cpus); - struct mpam_class *class = res->class; - struct mpam_component *comp_iter, *ctrl_comp; struct rdt_resource *r = &res->resctrl_res; lockdep_assert_held(&domain_list_lock); - ctrl_comp = NULL; - idx = 
srcu_read_lock(&mpam_srcu); - list_for_each_entry_srcu(comp_iter, &class->components, class_list, - srcu_read_lock_held(&mpam_srcu)) { - if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { - ctrl_comp = comp_iter; - break; - } - } - srcu_read_unlock(&mpam_srcu, idx); - - /* cpu with unknown exported component? */ - if (WARN_ON_ONCE(!ctrl_comp)) - return ERR_PTR(-EINVAL); - - dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); + dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, nid); if (!dom) return ERR_PTR(-ENOMEM); @@ -1949,7 +1933,6 @@ mpam_resctrl_alloc_domain(const struct cpumask *onlined_cpus, dom->ctrl_comp = ctrl_comp; ctrl_d = &dom->resctrl_ctrl_dom; - mpam_resctrl_domain_hdr_init(onlined_cpus, ctrl_comp, &ctrl_d->hdr); ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); @@ -2057,6 +2040,61 @@ static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) * component. * For the monitors, we need to search the list of events... */ +static struct mpam_resctrl_dom * +mpam_resctrl_alloc_domain_cpu(int cpu, struct mpam_resctrl_res *res) +{ + struct mpam_component *comp_iter, *ctrl_comp; + struct mpam_class *class = res->class; + int idx; + + ctrl_comp = NULL; + idx = srcu_read_lock(&mpam_srcu); + list_for_each_entry_srcu(comp_iter, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { + ctrl_comp = comp_iter; + break; + } + } + srcu_read_unlock(&mpam_srcu, idx); + + /* cpu with unknown exported component? 
*/ + if (WARN_ON_ONCE(!ctrl_comp)) + return ERR_PTR(-EINVAL); + + return mpam_resctrl_alloc_domain(cpumask_of(cpu), cpu_to_node(cpu), + ctrl_comp, res); +} + +static struct mpam_resctrl_dom * +mpam_resctrl_alloc_domain_nid(int nid, struct mpam_resctrl_res *res) +{ + struct mpam_component *comp_iter, *ctrl_comp; + struct mpam_class *class = res->class; + int idx; + + /* Only the memory class uses comp_id as nid */ + if (class->type != MPAM_CLASS_MEMORY) + return ERR_PTR(-EINVAL); + + ctrl_comp = NULL; + idx = srcu_read_lock(&mpam_srcu); + list_for_each_entry_srcu(comp_iter, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (comp_iter->comp_id == nid) { + ctrl_comp = comp_iter; + break; + } + } + srcu_read_unlock(&mpam_srcu, idx); + + /* cpu with unknown exported component? */ + if (WARN_ON_ONCE(!ctrl_comp)) + return ERR_PTR(-EINVAL); + + return mpam_resctrl_alloc_domain(cpu_possible_mask, nid, ctrl_comp, res); +} + static struct mpam_resctrl_dom * mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) { @@ -2094,7 +2132,7 @@ int mpam_resctrl_online_cpu(unsigned int cpu) dom = mpam_resctrl_get_domain_from_cpu(cpu, res); if (!dom) - dom = mpam_resctrl_alloc_domain(cpumask_of(cpu), res); + dom = mpam_resctrl_alloc_domain_cpu(cpu, res); if (IS_ERR(dom)) { err = PTR_ERR(dom); break; From 1ade50196e818a2dc23690a8d58812f1db8a1988 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 8 Jul 2025 14:15:27 +0100 Subject: [PATCH 195/247] NVIDIA: SAUCE: arm_mpam: resctrl: Add NUMA node notifier for domain online/offline BugLink: https://bugs.launchpad.net/bugs/2122432 To expose resctrl resources that contain CPU-less NUMA domains, resctrl needs to be told when a CPU-less NUMA domain comes online. This can't be done with the cpuhp callbacks. Add a memory hotplug notifier, and use this to create and destroy resctrl domains. 
Signed-off-by: James Morse (cherry picked from commit caf4034229d8df2c306658c2ddbe3c1ab73df109 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 108 +++++++++++++++++++++++++++++++++ include/linux/memory.h | 1 + 2 files changed, 109 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index cc7c69e2c3a7b..da079b343c1bd 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -2118,6 +2119,26 @@ mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) return mpam_resctrl_get_mon_domain_from_cpu(cpu); } +static struct mpam_resctrl_dom * +mpam_get_domain_from_nid(int nid, struct mpam_resctrl_res *res) +{ + struct rdt_ctrl_domain *d; + struct mpam_resctrl_dom *dom; + + list_for_each_entry(d, &res->resctrl_res.ctrl_domains, hdr.list) { + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + + /* Only the memory class uses comp_id as nid */ + if (dom->ctrl_comp->class->type != MPAM_CLASS_MEMORY) + continue; + + if (dom->ctrl_comp->comp_id == nid) + return dom; + } + + return NULL; +} + int mpam_resctrl_online_cpu(unsigned int cpu) { int i, err = 0; @@ -2198,6 +2219,88 @@ int mpam_resctrl_offline_cpu(unsigned int cpu) return 0; } +static int mpam_resctrl_online_node(unsigned int nid) +{ + struct mpam_resctrl_dom *dom; + struct mpam_resctrl_res *res; + + /* Domain IDs as NUMA nid is only defined for MBA */ + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; + if (!res->class) + return 0; // dummy_resource; + + dom = mpam_get_domain_from_nid(nid, res); + if (!dom) + dom = mpam_resctrl_alloc_domain_nid(nid, res); + if (IS_ERR(dom)) + return PTR_ERR(dom); + + return 0; +} + +static int mpam_resctrl_offline_node(unsigned int nid) +{ + struct mpam_resctrl_res 
*res; + struct mpam_resctrl_dom *dom; + struct rdt_mon_domain *mon_d; + struct rdt_ctrl_domain *ctrl_d; + + /* Domain IDs as NUMA nid is only defined for MBA */ + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; + if (!res->class) + return 0; // dummy_resource; + + dom = mpam_get_domain_from_nid(nid, res); + if (WARN_ON_ONCE(!dom)) + return 0; + + ctrl_d = &dom->resctrl_ctrl_dom; + resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); + if (!mpam_resctrl_offline_domain_hdr(cpu_possible_mask, &ctrl_d->hdr)) + return 0; + + // TODO: skip monitor domains if there are no monitors for this resource + mon_d = &dom->resctrl_mon_dom; + resctrl_offline_mon_domain(&res->resctrl_res, mon_d); + if (!mpam_resctrl_offline_domain_hdr(cpu_possible_mask, &mon_d->hdr)) + return 0; + + kfree(dom); + + return 0; +} + +static int mpam_resctrl_node_notifier(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct node_notify *nn = arg; + + if (nn->nid < 0 || !mb_uses_numa_nid) + return NOTIFY_OK; + + /* + * Ignore nid that have CPUs. Resctrl needs to see the cpu offline + * call for each CPU to update the CPUs in control groups. Moving + * the overflow handler isn't an issue as only L3 can be mon_capable, + * and NUMA nid used as domain-id are only an option for MBA. 
+ */ + if (!cpumask_empty(cpumask_of_node(nn->nid))) + return NOTIFY_OK; + + switch (action) { + case NODE_ADDED_FIRST_MEMORY: + mpam_resctrl_online_node(nn->nid); + break; + case NODE_REMOVED_LAST_MEMORY: + mpam_resctrl_offline_node(nn->nid); + break; + default: + /* don't care */ + } + + return NOTIFY_OK; +} + int mpam_resctrl_setup(void) { int err = 0; @@ -2244,6 +2347,11 @@ int mpam_resctrl_setup(void) mpam_resctrl_monitor_init(mon, j); } + if (mb_numa_nid_possible) { + hotplug_node_notifier(mpam_resctrl_node_notifier, + RESCTRL_CALLBACK_PRI); + } + cpus_read_unlock(); if (err || (!exposed_alloc_capable && !exposed_mon_capable)) { diff --git a/include/linux/memory.h b/include/linux/memory.h index 40eb70ccb09d5..2a770e7c6ab1e 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -126,6 +126,7 @@ struct mem_section; #define CPUSET_CALLBACK_PRI 10 #define MEMTIER_HOTPLUG_PRI 100 #define KSM_CALLBACK_PRI 100 +#define RESCTRL_CALLBACK_PRI 100 #ifndef CONFIG_MEMORY_HOTPLUG static inline void memory_dev_init(void) From 80e24c06ebf851aca9d08f2981a257d96f32c4bc Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 8 Jul 2025 14:18:35 +0100 Subject: [PATCH 196/247] NVIDIA: SAUCE: untested: arm_mpam: resctrl: Allow resctrl to enable NUMA nid as MB domain-id BugLink: https://bugs.launchpad.net/bugs/2122432 Enable resctrl's use of NUMA nid as the domain-id for the MB resource. Changing this state involves changing the IDs of all the domains visible to resctrl. Writing to this list means preventing CPU and memory hotplug. Signed-off-by: James Morse (cherry picked from commit a795ac909c6c050daaf095abc9043217ddf5e746 https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_resctrl.c | 49 ++++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 11 ++------ 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index da079b343c1bd..8e87afa90656a 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1573,6 +1574,54 @@ static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) return 0; } +bool resctrl_arch_get_mb_uses_numa_nid(void) +{ + return mb_uses_numa_nid; +} + +int resctrl_arch_set_mb_uses_numa_nid(bool enabled) +{ + struct rdt_resource *r; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + struct rdt_ctrl_domain *ctrl_d; + + lockdep_assert_cpus_held(); + lockdep_assert_mems_held(); + + if (!mb_numa_nid_possible) + return -EOPNOTSUPP; + + if (mb_uses_numa_nid == enabled) + return 0; + + /* Domain IDs as NUMA nid is only defined for MBA */ + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; + if (!res->class) + return -EOPNOTSUPP; + r = &res->resctrl_res; + + /* repaint the domain IDs */ + mb_uses_numa_nid = enabled; + list_for_each_entry(ctrl_d, &r->ctrl_domains, hdr.list) { + int cpu = cpumask_any(&ctrl_d->hdr.cpu_mask); + + dom = container_of(ctrl_d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + ctrl_d->hdr.id = mpam_resctrl_pick_domain_id(cpu, dom->ctrl_comp); + } + + /* monitor domains are unaffected and should continue to use the L3 */ + + if (!enabled && mb_l3_cache_id_possible) + r->alloc_capable = true; + else if (enabled && mb_numa_nid_possible) + r->alloc_capable = true; + else + r->alloc_capable = false; + + return 0; +} + static void mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, enum resctrl_event_id type) { diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index b43494e734ded..aa7d6e1854741 100644 --- a/include/linux/arm_mpam.h 
+++ b/include/linux/arm_mpam.h @@ -78,15 +78,8 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); -static inline bool resctrl_arch_get_mb_uses_numa_nid(void) -{ - return false; -} - -static inline bool resctrl_arch_set_mb_uses_numa_nid(bool enabled) -{ - return false; -} +bool resctrl_arch_get_mb_uses_numa_nid(void); +int resctrl_arch_set_mb_uses_numa_nid(bool enabled); /* * The CPU configuration for MPAM is cheap to write, and is only written if it From 918392a9c964bd8f6a177acbf2fcd82e04c91800 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 28 Nov 2023 08:17:10 -0800 Subject: [PATCH 197/247] NVIDIA: SAUCE: [Config] RESCTRL configs added to annotations BugLink: https://bugs.launchpad.net/bugs/2122432 Modified for latest MPAM. Signed-off-by: Brad Figg Signed-off-by: Koba Ko Signed-off-by: Fenghua Yu (forward ported from commit 77bd02cc7fa7428a3b7cfa7ad12bf0d6ae63644b https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-6.14-next) [fenghuay: change 6.14 path to 6.17] Signed-off-by: Fenghua Yu Acked-by: Matt Ochs Acked-by: Carol L Soto Acked-by: Jacob Martin Acked-by: Abdur Rahman Acked-by: Koba Ko Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- debian.nvidia-6.17/config/annotations | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/debian.nvidia-6.17/config/annotations b/debian.nvidia-6.17/config/annotations index f90afe88e3a07..7a30c4243da95 100644 --- a/debian.nvidia-6.17/config/annotations +++ b/debian.nvidia-6.17/config/annotations @@ -240,3 +240,21 @@ CONFIG_TCG_ARM_CRB_FFA policy<{'arm64': 'y'}> CONFIG_TOOLS_SUPPORT_RELR policy<{'amd64': 'y', 'arm64': '-'}> CONFIG_VFIO_CONTAINER policy<{'amd64': 'y', 'arm64': 'n'}> CONFIG_VFIO_IOMMU_TYPE1 policy<{'amd64': 'm', 'arm64': '-'}> +CONFIG_ACPI_MPAM policy<{'amd64': '-', 'arm64': 'y'}> +CONFIG_ARCH_HAS_CPU_RESCTRL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_ARM64_MPAM policy<{'amd64': '-', 'arm64': 'y'}> +CONFIG_ARM64_MPAM_DRIVER policy<{'arm64': 'y'}> +CONFIG_ARM_CPU_RESCTRL policy<{'amd64': '-', 'arm64': '-'}> +CONFIG_CGROUP_RESCTRL policy<{'amd64': 'n', 'arm64': 'n'}> +CONFIG_PROC_CPU_RESCTRL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_RESCTRL_FS policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_RESCTRL_FS_PSEUDO_LOCK policy<{'amd64': 'y', 'arm64': '-'}> +CONFIG_RESCTRL_IOMMU policy<{'amd64': '-', 'arm64': 'y'}> +CONFIG_RESCTRL_PMU policy<{'amd64': 'n', 'arm64': 'y'}> +CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID policy<{'amd64': '-', 'arm64': 'y'}> + +CONFIG_AMPERE_ERRATUM_AC04_CPU_23 policy<{'arm64': 'y'}> +CONFIG_ARM64_SME policy<{'arm64': 'y'}> +CONFIG_EC_HUAWEI_GAOKUN policy<{'arm64': 'n'}> +CONFIG_ARM64_MPAM_DRIVER_DEBUG policy<{'amd64': 'n', 'arm64': 'n'}> +CONFIG_FTRACE_SORT_STARTUP_TEST policy<{'arm64': 'n'}> From a7118491241f7c4fe1d544911c2585f70ca4aec6 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 30 Oct 2025 02:10:23 +0000 Subject: [PATCH 198/247] NVIDIA: SAUCE: arm_mpam: Fix missing SHIFT definitions BugLink: https://bugs.launchpad.net/bugs/2122432 Define the missing SHIFT definitions to fix build errors. 
Fixes: a76ea208e3c8 ("NVIDIA: SAUCE: arm_mpam: Add quirk framework") Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_internal.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 7c7160d6042ad..6ddffa99e36c9 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -285,7 +285,7 @@ struct mpam_quirk { #define IIDR_PROD(x) ((x) << MPAMF_IIDR_PRODUCTID_SHIFT) #define IIDR_VAR(x) ((x) << MPAMF_IIDR_VARIANT_SHIFT) -#define IIDR_REV(x) ((x) << MPAMF_IIDR_REVISON_SHIFT) +#define IIDR_REV(x) ((x) << MPAMF_IIDR_REVISION_SHIFT) #define IIDR_IMP(x) ((x) << MPAMF_IIDR_IMPLEMENTER_SHIFT) #define IIDR_MATCH_ONE (IIDR_PROD(0xfff) | IIDR_VAR(0xf) | IIDR_REV(0xf) | IIDR_IMP(0xfff)) @@ -691,6 +691,11 @@ static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { } #define MPAMF_IIDR_VARIANT GENMASK(19, 16) #define MPAMF_IIDR_PRODUCTID GENMASK(31, 20) +#define MPAMF_IIDR_IMPLEMENTER_SHIFT 0 +#define MPAMF_IIDR_REVISION_SHIFT 12 +#define MPAMF_IIDR_VARIANT_SHIFT 16 +#define MPAMF_IIDR_PRODUCTID_SHIFT 20 + /* MPAMF_AIDR - MPAM architecture ID register */ #define MPAMF_AIDR_ARCH_MINOR_REV GENMASK(3, 0) #define MPAMF_AIDR_ARCH_MAJOR_REV GENMASK(7, 4) From ca8503b3efa5350c2bc383e67300ef8d504e502d Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 20 Nov 2025 07:04:06 +0000 Subject: [PATCH 199/247] NVIDIA: SAUCE: Fix partid_max range issue BugLink: https://bugs.launchpad.net/bugs/2122432 partid is from 0 to partid_max, inclusively. partid_max + 1 is out of valid partid range. Accessing partid_max + 1 will generate error interrupt and cause MPAM disabled. Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_devices.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 89bdae8bd86d6..f0740b5d59b5b 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1832,7 +1832,7 @@ static int mpam_reprogram_ris(void *_arg) spin_lock(&partid_max_lock); partid_max = mpam_partid_max; spin_unlock(&partid_max_lock); - for (partid = 0; partid <= partid_max + 1; partid++) + for (partid = 0; partid <= partid_max; partid++) mpam_reprogram_ris_partid(ris, partid, cfg); return 0; From 79b570ce93681404e60740ffb406f9fd28601d4f Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Thu, 16 Oct 2025 08:34:19 -0500 Subject: [PATCH 200/247] x86,fs/resctrl: Fix NULL pointer dereference with events force-disabled in mbm_event mode BugLink: https://bugs.launchpad.net/bugs/2122432 The following NULL pointer dereference is encountered on mount of resctrl fs after booting a system that supports assignable counters with the "rdt=!mbmtotal,!mbmlocal" kernel parameters: BUG: kernel NULL pointer dereference, address: 0000000000000008 RIP: 0010:mbm_cntr_get Call Trace: rdtgroup_assign_cntr_event rdtgroup_assign_cntrs rdt_get_tree Specifying the kernel parameter "rdt=!mbmtotal,!mbmlocal" effectively disables the legacy X86_FEATURE_CQM_MBM_TOTAL and X86_FEATURE_CQM_MBM_LOCAL features and the MBM events they represent. This results in the per-domain MBM event related data structures to not be allocated during early initialization. resctrl fs initialization follows by implicitly enabling both MBM total and local events on a system that supports assignable counters (mbm_event mode), but this enabling occurs after the per-domain data structures have been created. After booting, resctrl fs assumes that an enabled event can access all its state. 
This results in NULL pointer dereference when resctrl attempts to access the un-allocated structures of an enabled event. Remove the late MBM event enabling from resctrl fs. This leaves a problem where the X86_FEATURE_CQM_MBM_TOTAL and X86_FEATURE_CQM_MBM_LOCAL features may be disabled while assignable counter (mbm_event) mode is enabled without any events to support. Switching between the "default" and "mbm_event" mode without any events is not practical. Create a dependency between the X86_FEATURE_{CQM_MBM_TOTAL,CQM_MBM_LOCAL} and X86_FEATURE_ABMC (assignable counter) hardware features. An x86 system that supports assignable counters now requires support of X86_FEATURE_CQM_MBM_TOTAL or X86_FEATURE_CQM_MBM_LOCAL. This ensures all needed MBM related data structures are created before use and that it is only possible to switch between "default" and "mbm_event" mode when the same events are available in both modes. This dependency does not exist in the hardware but this usage of these feature settings work for known systems. [ bp: Massage commit message. ] Fixes: 13390861b426e ("x86,fs/resctrl: Detect Assignable Bandwidth Monitoring feature details") Co-developed-by: Reinette Chatre Signed-off-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://patch.msgid.link/a62e6ac063d0693475615edd213d5be5e55443e6.1760560934.git.babu.moger@amd.com (cherry picked from commit 19de7113bfac33ba92c004a9b510612bb745cfa0) Signed-off-by: Tushar Dave Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- arch/x86/kernel/cpu/resctrl/monitor.c | 11 ++++++++++- fs/resctrl/monitor.c | 16 +++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 2cd25a0d4637e..fe1a2aa53c16a 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -458,7 +458,16 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } - if (rdt_cpu_has(X86_FEATURE_ABMC)) { + /* + * resctrl assumes a system that supports assignable counters can + * switch to "default" mode. Ensure that there is a "default" mode + * to switch to. This enforces a dependency between the independent + * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL + * hardware features. + */ + if (rdt_cpu_has(X86_FEATURE_ABMC) && + (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) || + rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) { r->mon.mbm_cntr_assignable = true; cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index b1633bf6e6d4d..e62432467817f 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1817,15 +1817,13 @@ int resctrl_mon_resource_init(void) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; if (r->mon.mbm_cntr_assignable) { - if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) - resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); - if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) - resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); - mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; - mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & - (READS_TO_LOCAL_MEM | - READS_TO_LOCAL_S_MEM | - NON_TEMP_WRITE_TO_LOCAL_MEM); + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + 
mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); r->mon.mbm_assign_on_mkdir = true; resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); From 2c12a0a04fb55b935fac06c5e0905eaffe205bd3 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Fri, 12 Dec 2025 21:45:24 +0000 Subject: [PATCH 201/247] NVIDIA: SAUCE: [Config] Update RESCTRL annotations BugLink: https://bugs.launchpad.net/bugs/2122432 Add 'CONFIG_ARM64_MPAM_RESCTRL_FS' to annotations. No code yet exists for 'CONFIG_CGROUP_RESCTRL' and 'CONFIG_RESCTRL_PMU', remove them from annotations. Signed-off-by: Tushar Dave Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- debian.nvidia-6.17/config/annotations | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/debian.nvidia-6.17/config/annotations b/debian.nvidia-6.17/config/annotations index 7a30c4243da95..f6f7f1912aac9 100644 --- a/debian.nvidia-6.17/config/annotations +++ b/debian.nvidia-6.17/config/annotations @@ -244,13 +244,12 @@ CONFIG_ACPI_MPAM policy<{'amd64': '-', 'arm64': ' CONFIG_ARCH_HAS_CPU_RESCTRL policy<{'amd64': 'y', 'arm64': 'y'}> CONFIG_ARM64_MPAM policy<{'amd64': '-', 'arm64': 'y'}> CONFIG_ARM64_MPAM_DRIVER policy<{'arm64': 'y'}> +CONFIG_ARM64_MPAM_RESCTRL_FS policy<{'arm64': 'y'}> CONFIG_ARM_CPU_RESCTRL policy<{'amd64': '-', 'arm64': '-'}> -CONFIG_CGROUP_RESCTRL policy<{'amd64': 'n', 'arm64': 'n'}> CONFIG_PROC_CPU_RESCTRL policy<{'amd64': 'y', 'arm64': 'y'}> CONFIG_RESCTRL_FS policy<{'amd64': 'y', 'arm64': 'y'}> CONFIG_RESCTRL_FS_PSEUDO_LOCK policy<{'amd64': 'y', 'arm64': '-'}> CONFIG_RESCTRL_IOMMU policy<{'amd64': '-', 'arm64': 'y'}> -CONFIG_RESCTRL_PMU policy<{'amd64': 'n', 'arm64': 'y'}> CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID policy<{'amd64': '-', 
'arm64': 'y'}> CONFIG_AMPERE_ERRATUM_AC04_CPU_23 policy<{'arm64': 'y'}> From 1411151de961bc6822a5d212944902fbb6abe0c6 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Fri, 12 Dec 2025 23:11:10 +0000 Subject: [PATCH 202/247] NVIDIA: SAUCE: arm_mpam: resctrl: Fix MPAM kunit BugLink: https://bugs.launchpad.net/bugs/2122432 KUNIT_CASE_PARAM macro's parameter generator function expects signature 'const void* gen_params(const void *prev, char *desc)' but function test_all_bwa_wd_gen_params() has wrong signature, causing compilation failure. Signed-off-by: Tushar Dave Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/test_mpam_resctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c index 3ece77289931e..53289255fc537 100644 --- a/drivers/resctrl/test_mpam_resctrl.c +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -250,7 +250,7 @@ static void test_percent_to_mbw_max(struct kunit *test) KUNIT_EXPECT_EQ(test, res.value, res.max_value << res.shift); } -static const void *test_all_bwa_wd_gen_params(struct kunit *test, const void *prev, +static const void *test_all_bwa_wd_gen_params(const void *prev, char *desc) { uintptr_t param = (uintptr_t)prev; From f98a7a8d3b1e2e6ea4463d9f17fa32e739521a1e Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Mon, 15 Dec 2025 19:30:47 -0600 Subject: [PATCH 203/247] NVIDIA: SAUCE: resctrl/mpam: Align packed mpam_props to fix arm64 KUnit alignment fault BugLink: https://bugs.launchpad.net/bugs/2122432 KUnit builds pack struct mpam_props, which can misalign its DECLARE_BITMAP (features). On arm64, bitops perform unsigned long accesses that fault on misaligned addresses, causing mpam_resctrl KUnit tests to abort (EC=0x25 DABT, FSC=0x21 alignment fault). Keep the struct packed (to preserve padding-sanitization intent) but force its alignment to __alignof__(unsigned long) so bitmap operations are naturally aligned. 
No functional change outside tests. Signed-off-by: Tushar Dave Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Noah Wager --- drivers/resctrl/mpam_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 6ddffa99e36c9..c2cb5129e3e21 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -256,7 +256,7 @@ struct mpam_props { * removed, and will false-positive if the compiler introduces padding that * isn't cleared during sanitisation. */ -} PACKED_FOR_KUNIT; +} PACKED_FOR_KUNIT __aligned(__alignof__(unsigned long)); #define mpam_has_feature(_feat, x) test_bit(_feat, (x)->features) #define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) From 10660bdd0a0714000f038b5798a5823008771ac6 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Mon, 15 Dec 2025 19:30:47 -0600 Subject: [PATCH 204/247] NVIDIA: SAUCE: resctrl/tests: mpam_devices: compare only meaningful bytes of mpam_props BugLink: https://bugs.launchpad.net/bugs/2122432 Aligning struct mpam_props introduces potential tail padding beyond the last field. The test previously used memcmp over the entire struct, which now fails due to padding differences rather than content. Compare only up to the last meaningful field (via offsetof + sizeof) to avoid false negatives. No behavioral change to driver logic. Signed-off-by: Tushar Dave Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Noah Wager --- drivers/resctrl/test_mpam_devices.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/test_mpam_devices.c b/drivers/resctrl/test_mpam_devices.c index 55d0278c19941..dea3a6fdfd195 100644 --- a/drivers/resctrl/test_mpam_devices.c +++ b/drivers/resctrl/test_mpam_devices.c @@ -12,17 +12,19 @@ static void test__props_mismatch(struct kunit *test) { struct mpam_props parent = { 0 }; struct mpam_props child; + size_t props_bytes = offsetof(struct mpam_props, num_mbwu_mon) + + sizeof(parent.num_mbwu_mon); memset(&child, 0xff, sizeof(child)); __props_mismatch(&parent, &child, false); memset(&child, 0, sizeof(child)); - KUNIT_EXPECT_EQ(test, memcmp(&parent, &child, sizeof(child)), 0); + KUNIT_EXPECT_EQ(test, memcmp(&parent, &child, props_bytes), 0); memset(&child, 0xff, sizeof(child)); __props_mismatch(&parent, &child, true); - KUNIT_EXPECT_EQ(test, memcmp(&parent, &child, sizeof(child)), 0); + KUNIT_EXPECT_EQ(test, memcmp(&parent, &child, props_bytes), 0); } static struct list_head fake_classes_list; From 9a1a9284b81539711e70c6049cb4effef642dcf3 Mon Sep 17 00:00:00 2001 From: Ian May Date: Thu, 8 Jan 2026 07:46:54 -0800 Subject: [PATCH 205/247] NVIDIA: [Config]: Update annotations Ignore: yes Signed-off-by: Ian May Signed-off-by: Jacob Martin --- debian.nvidia-6.17/config/annotations | 32 +++++++++++++-------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/debian.nvidia-6.17/config/annotations b/debian.nvidia-6.17/config/annotations index f6f7f1912aac9..3624c02cf2034 100644 --- a/debian.nvidia-6.17/config/annotations +++ b/debian.nvidia-6.17/config/annotations @@ -195,10 +195,18 @@ CONFIG_ULTRASOC_SMB note<'Required for Grace enablem # ---- Annotations without notes ---- +CONFIG_ACPI_MPAM policy<{'amd64': '-', 'arm64': 'y'}> +CONFIG_ARCH_HAS_CPU_RESCTRL policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_ARM64_MPAM policy<{'amd64': '-', 'arm64': 'y'}> +CONFIG_ARM64_MPAM_DRIVER policy<{'arm64': 
'y'}> +CONFIG_ARM64_MPAM_DRIVER_DEBUG policy<{'amd64': '-', 'arm64': 'n'}> +CONFIG_ARM64_MPAM_RESCTRL_FS policy<{'arm64': 'y'}> +CONFIG_ARM_CPU_RESCTRL policy<{'amd64': '-', 'arm64': '-'}> CONFIG_ARM_FFA_TRANSPORT policy<{'arm64': 'y'}> CONFIG_ARM_SMMU_V3_IOMMUFD policy<{'arm64': 'y'}> CONFIG_AS_VERSION policy<{'amd64': '24200', 'arm64': '24200'}> CONFIG_AX88796B_RUST_PHY policy<{'amd64': '-', 'arm64': '-'}> +CONFIG_BATTERY_HUAWEI_GAOKUN policy<{'arm64': '-'}> CONFIG_BCH policy<{'amd64': 'm', 'arm64': 'y'}> CONFIG_BINDGEN_VERSION_TEXT policy<{'amd64': '-', 'arm64': '-'}> CONFIG_BLK_DEV_RUST_NULL policy<{'amd64': '-', 'arm64': '-'}> @@ -214,6 +222,7 @@ CONFIG_DRM_NOUVEAU_CH7006 policy<{'amd64': '-', 'arm64': ' CONFIG_DRM_NOUVEAU_SIL164 policy<{'amd64': '-', 'arm64': '-'}> CONFIG_DRM_NOVA policy<{'amd64': '-', 'arm64': '-'}> CONFIG_DRM_PANIC_SCREEN_QR_CODE policy<{'amd64': '-', 'arm64': '-'}> +CONFIG_EC_HUAWEI_GAOKUN policy<{'arm64': 'n'}> CONFIG_GCC_VERSION policy<{'amd64': '130300', 'arm64': '130300'}> CONFIG_HAVE_RUST policy<{'amd64': 'y', 'arm64': '-'}> CONFIG_IOMMUFD_VFIO_CONTAINER policy<{'arm64': 'y'}> @@ -223,8 +232,13 @@ CONFIG_NVGRACE_EGM policy<{'arm64': 'm'}> CONFIG_NVIDIA_FFA_EC policy<{'arm64': 'y'}> CONFIG_PAHOLE_VERSION policy<{'amd64': '125', 'arm64': '125'}> CONFIG_PINCTRL_MT8901 policy<{'arm64': 'y'}> +CONFIG_PROC_CPU_RESCTRL policy<{'amd64': 'y', 'arm64': 'y'}> CONFIG_R8127 policy<{'amd64': 'n', 'arm64': 'm'}> CONFIG_RELR policy<{'arm64': '-'}> +CONFIG_RESCTRL_FS policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_RESCTRL_FS_PSEUDO_LOCK policy<{'amd64': 'y', 'arm64': '-'}> +CONFIG_RESCTRL_IOMMU policy<{'amd64': '-', 'arm64': 'y'}> +CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID policy<{'amd64': '-', 'arm64': 'y'}> CONFIG_RUSTC_HAS_COERCE_POINTEE policy<{'amd64': '-', 'arm64': '-'}> CONFIG_RUSTC_LLVM_VERSION policy<{'amd64': '180103', 'arm64': '180103'}> CONFIG_RUSTC_SUPPORTS_ARM64 policy<{'arm64': '-'}> @@ -238,22 +252,6 @@ CONFIG_RUST_PHYLIB_ABSTRACTIONS 
policy<{'amd64': '-', 'arm64': ' CONFIG_SAMPLES_RUST policy<{'amd64': '-', 'arm64': '-'}> CONFIG_TCG_ARM_CRB_FFA policy<{'arm64': 'y'}> CONFIG_TOOLS_SUPPORT_RELR policy<{'amd64': 'y', 'arm64': '-'}> +CONFIG_UCSI_HUAWEI_GAOKUN policy<{'arm64': '-'}> CONFIG_VFIO_CONTAINER policy<{'amd64': 'y', 'arm64': 'n'}> CONFIG_VFIO_IOMMU_TYPE1 policy<{'amd64': 'm', 'arm64': '-'}> -CONFIG_ACPI_MPAM policy<{'amd64': '-', 'arm64': 'y'}> -CONFIG_ARCH_HAS_CPU_RESCTRL policy<{'amd64': 'y', 'arm64': 'y'}> -CONFIG_ARM64_MPAM policy<{'amd64': '-', 'arm64': 'y'}> -CONFIG_ARM64_MPAM_DRIVER policy<{'arm64': 'y'}> -CONFIG_ARM64_MPAM_RESCTRL_FS policy<{'arm64': 'y'}> -CONFIG_ARM_CPU_RESCTRL policy<{'amd64': '-', 'arm64': '-'}> -CONFIG_PROC_CPU_RESCTRL policy<{'amd64': 'y', 'arm64': 'y'}> -CONFIG_RESCTRL_FS policy<{'amd64': 'y', 'arm64': 'y'}> -CONFIG_RESCTRL_FS_PSEUDO_LOCK policy<{'amd64': 'y', 'arm64': '-'}> -CONFIG_RESCTRL_IOMMU policy<{'amd64': '-', 'arm64': 'y'}> -CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID policy<{'amd64': '-', 'arm64': 'y'}> - -CONFIG_AMPERE_ERRATUM_AC04_CPU_23 policy<{'arm64': 'y'}> -CONFIG_ARM64_SME policy<{'arm64': 'y'}> -CONFIG_EC_HUAWEI_GAOKUN policy<{'arm64': 'n'}> -CONFIG_ARM64_MPAM_DRIVER_DEBUG policy<{'amd64': 'n', 'arm64': 'n'}> -CONFIG_FTRACE_SORT_STARTUP_TEST policy<{'arm64': 'n'}> From 18a062fe0c4965eb6a2ddddfe8de8e465d05a98e Mon Sep 17 00:00:00 2001 From: Surabhi Chythanya Kumar Date: Thu, 8 Jan 2026 18:13:30 -0800 Subject: [PATCH 206/247] NVIDIA: SAUCE: MEDIATEK: platform: Add PCIe Hotplug Driver for CX7 on DGX Spark BugLink: https://bugs.launchpad.net/bugs/2138269 This driver manages PCIe link for NVIDIA ConnectX-7 (CX7) hot-plug/unplug on DGX Spark systems with GB10 SoC. It disables the PCIe link on cable removal and enables it on cable insertion. 
Upstream-friendly improvements over 6.14 driver: - Separated from MTK pinctrl driver into NVIDIA platform driver - Configuration via ACPI (_CRS and _DSD), no hardcoded values - Device-managed resources (devm_*) for automatic cleanup - Thread-safe state management with locking - Enhanced error handling and logging - Uses standard Linux kernel APIs The driver exposes a sysfs interface to emulate cable plug in/out: echo 1 > /sys/devices/platform/MTKP0001:00/pcie_hotplug/debug_state # plug in echo 0 > /sys/devices/platform/MTKP0001:00/pcie_hotplug/debug_state # plug out It also provides a runtime enable/disable switch via sysfs: echo 1 > /sys/devices/platform/MTKP0001:00/pcie_hotplug/hotplug_enabled # Enable echo 0 > /sys/devices/platform/MTKP0001:00/pcie_hotplug/hotplug_enabled # Disable This allows enabling/disabling hotplug functionality. Hotplug is disabled by default and must be explicitly enabled via userspace. It also implements uevent notifications for coordination with userspace: * cable plug-in: Report plug-in uevent (driver) Enable PCIe link (driver) Rescan CX7 devices (application) * cable removal: Report removal uevent (driver) Remove CX7 devices (application) Disable PCIe link (driver) Signed-off-by: Vaibhav Vyas Signed-off-by: Scott Fudally Signed-off-by: Surabhi Chythanya Kumar Acked-by: Jamie Nguyen Acked-by: Carol L Soto Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- debian.nvidia-6.17/config/annotations | 3 + drivers/platform/arm64/Kconfig | 2 + drivers/platform/arm64/Makefile | 1 + drivers/platform/arm64/nvidia/Kconfig | 17 + drivers/platform/arm64/nvidia/Makefile | 9 + .../platform/arm64/nvidia/mtk-pcie-hotplug.c | 2324 +++++++++++++++++ 6 files changed, 2356 insertions(+) create mode 100644 drivers/platform/arm64/nvidia/Kconfig create mode 100644 drivers/platform/arm64/nvidia/Makefile create mode 100644 drivers/platform/arm64/nvidia/mtk-pcie-hotplug.c diff --git a/debian.nvidia-6.17/config/annotations 
b/debian.nvidia-6.17/config/annotations index 3624c02cf2034..f26617dfcb574 100644 --- a/debian.nvidia-6.17/config/annotations +++ b/debian.nvidia-6.17/config/annotations @@ -141,6 +141,9 @@ CONFIG_MICROSOFT_MANA note<'LP: #2084598'> CONFIG_MTD policy<{'amd64': 'm', 'arm64': 'y'}> CONFIG_MTD note<'Essential for boot on ARM64'> +CONFIG_MTK_PCIE_HOTPLUG policy<{'arm64': 'm'}> +CONFIG_MTK_PCIE_HOTPLUG note<'CX7 PCIe hotplug driver for NVIDIA DGX Spark systems with GB10 SoC.'> + CONFIG_NOUVEAU_DEBUG policy<{'amd64': '-', 'arm64': '-'}> CONFIG_NOUVEAU_DEBUG note<'Disable nouveau for NVIDIA kernels'> diff --git a/drivers/platform/arm64/Kconfig b/drivers/platform/arm64/Kconfig index e76bd7e07e217..2782d5933e178 100644 --- a/drivers/platform/arm64/Kconfig +++ b/drivers/platform/arm64/Kconfig @@ -95,4 +95,6 @@ config NVIDIA_FFA_EC Say M or Y here to include this support. +source "drivers/platform/arm64/nvidia/Kconfig" + endif # ARM64_PLATFORM_DEVICES diff --git a/drivers/platform/arm64/Makefile b/drivers/platform/arm64/Makefile index 4edb84d5ae213..47733ec5f26ad 100644 --- a/drivers/platform/arm64/Makefile +++ b/drivers/platform/arm64/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_EC_HUAWEI_GAOKUN) += huawei-gaokun-ec.o obj-$(CONFIG_EC_LENOVO_YOGA_C630) += lenovo-yoga-c630.o obj-$(CONFIG_EC_LENOVO_YOGA_SLIM7X) += lenovo-yoga-slim7x.o obj-$(CONFIG_NVIDIA_FFA_EC) += nvidia-ffa-ec.o +obj-y += nvidia/ diff --git a/drivers/platform/arm64/nvidia/Kconfig b/drivers/platform/arm64/nvidia/Kconfig new file mode 100644 index 0000000000000..b12b290f30d4f --- /dev/null +++ b/drivers/platform/arm64/nvidia/Kconfig @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# NVIDIA ARM64 Platform-Specific Device Drivers +# + +config MTK_PCIE_HOTPLUG + tristate "CX7 PCIe Hotplug Driver" + depends on EINT_MTK + depends on PCI && ACPI + help + Say Y here to support PCIe device plug in/out detection. + It will disable PCIe link when plug out and enable + PCIe link after plug in. 
+ + This is particularly useful for GB10 SoC. + + If unsure, say N. diff --git a/drivers/platform/arm64/nvidia/Makefile b/drivers/platform/arm64/nvidia/Makefile new file mode 100644 index 0000000000000..37cfbebb8d1af --- /dev/null +++ b/drivers/platform/arm64/nvidia/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for NVIDIA ARM64 platform-specific drivers +# +# CX7 PCIe Hotplug Driver +# Provides hotplug support for CX7 PCIe devices on GB10 SoC-based systems +# + +obj-$(CONFIG_MTK_PCIE_HOTPLUG) += mtk-pcie-hotplug.o diff --git a/drivers/platform/arm64/nvidia/mtk-pcie-hotplug.c b/drivers/platform/arm64/nvidia/mtk-pcie-hotplug.c new file mode 100644 index 0000000000000..06a84a29aa6fd --- /dev/null +++ b/drivers/platform/arm64/nvidia/mtk-pcie-hotplug.c @@ -0,0 +1,2324 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2014-2025 MediaTek Inc. + * Copyright (c) 2025-2026 NVIDIA Corporation + * + * CX7 PCIe Hotplug Driver + * + * Manages PCIe device hotplug using GPIO interrupts and ACPI resources. + * Supports cable insertion/removal detection and device power management. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define HP_PORT_MAX 3 +#define HP_POLL_CNT_MAX 200 +#define MAX_VENDOR_DATA_LEN 16 +#define CX7_HP_MMIO_REGION_COUNT 5 /* TOP, PROTECT, CKM, MAC Port 0, MAC Port 1 */ +#define CX7_HP_MIN_GPIO_COUNT 4 /* Minimum required: BOOT, PRSNT, PERST, EN */ +#define PINCTRL_MAPPING_ENTRY_SIZE 5 /* dev_name, state, ctrl_dev, group, function */ +/* Indices for pinctrl mapping entry strings */ +#define PINCTRL_IDX_DEV_NAME 0 +#define PINCTRL_IDX_STATE 1 +#define PINCTRL_IDX_CTRL_DEV 2 +#define PINCTRL_IDX_GROUP 3 +#define PINCTRL_IDX_FUNCTION 4 + +/* Hardware timing requirements (in microseconds unless noted) */ +#define CX7_HP_DELAY_SHORT_US 10 /* Short delay for register writes */ +#define CX7_HP_DELAY_STANDARD_US 10000 /* Standard delay (10ms) */ +#define CX7_HP_DELAY_BUS_PROTECT_US 5000 /* Bus protection setup delay */ +#define CX7_HP_DELAY_PHY_RESET_US 3000 /* PHY reset delay */ +#define CX7_HP_DELAY_LINK_STABLE_MS 100 /* Link stabilization delay (ms) */ +#define CX7_HP_POLL_SLEEP_US 10000 /* Polling loop sleep interval */ + +#define PLUG_IN_EVT "HOTPLUG_STATE=plugin" +#define REMOVAL_EVT "HOTPLUG_STATE=removal" + +/* Bus protection stages to prevent PCIe core reset glitches */ +#define BUS_PROTECT_INIT 0 +#define BUS_PROTECT_CABLE_REMOVAL 1 +#define BUS_PROTECT_CABLE_PLUGIN 2 +#define BUS_PROTECT_CLEANUP 3 + +enum cx7_hp_state { + STATE_READY = 0, + STATE_PLUG_OUT, /* Cable plug-out */ + STATE_DEV_POWER_OFF, /* Device is powered off */ + STATE_PLUG_IN, /* Cable plug-in detected */ + STATE_DEV_POWER_ON, /* Device is powered on */ + STATE_DEV_FW_START, /* Device firmware is running */ + STATE_RESCAN, /* Device ready, can perform bus rescan */ + STATE_UNKNOWN +}; + +enum pcie_pin_index { + PCIE_PIN_BOOT = 0, /* Device boot status pin */ + PCIE_PIN_PRSNT, /* Presence detection pin */ + PCIE_PIN_PERST, /* PCIe reset pin */ + 
PCIE_PIN_EN, /* Power enable pin */ + PCIE_PIN_CLQ0, /* Clock request pin 0 */ + PCIE_PIN_CLQ1, /* Clock request pin 1 */ + PCIE_PIN_MAX +}; + +struct pcie_port_info { + int domain; + int bus; + int devfn; +}; + +struct rp_bus_mmio_top { + u32 ctrl; + u32 port_bits[HP_PORT_MAX]; + u32 update_bit; +}; + +struct rp_bus_mmio_protect { + u32 mode; + u32 enable; + u32 port_bits[HP_PORT_MAX]; +}; + +struct rp_bus_mmio_mac { + u32 init_ctrl; + u32 ltssm_bit; + u32 phy_rst_bit; +}; + +struct rp_bus_mmio_ckm { + u32 ctrl; + u32 disable_bit; +}; + +struct rp_bus_mmio_info { + struct rp_bus_mmio_top top; + struct rp_bus_mmio_protect protect; + struct rp_bus_mmio_mac mac; + struct rp_bus_mmio_ckm ckm; +}; + +struct gpio_acpi_context { + struct device *dev; + unsigned int debounce_timeout_us; + int pin; + int wake_capable; + int triggering; + int polarity; + unsigned long irq_flags; + int valid; + unsigned int connection_type; + char vendor_data[MAX_VENDOR_DATA_LEN + 1]; +}; + +struct cx7_hp_dev; + +/** + * struct cx7_hp_plat_data - Platform configuration data parsed from ACPI + * + * Platform-specific configuration parsed from ACPI devices: + * - RES0 device (PNP0C02): PCIe configuration and MMIO register offsets via _DSD + * - PEDE device (MTKP0001): Pinctrl mappings via _DSD + */ +struct cx7_hp_plat_data { + int port_nums; + struct pcie_port_info ports[HP_PORT_MAX]; + u32 vendor_id; + u32 device_id; + int num_devices; + struct rp_bus_mmio_info rp_bus_mmio; + u32 ltssm_reg; + u32 ltssm_l0_state; + int pin_nums; + struct pinctrl_map *parsed_pinmap; +}; + +struct cx7_hp_gpio_ctx { + struct gpio_desc *desc; + struct gpio_acpi_context *ctx; + struct cx7_hp_dev *hp_dev; +}; + +struct acpi_gpio_parse_context { + struct gpio_acpi_context *ctx; + struct cx7_hp_dev *hp_dev; +}; + +struct acpi_gpio_walk_context { + struct device *dev; + struct gpio_info { + unsigned int pin; + unsigned int connection_type; + unsigned int triggering; + unsigned int polarity; + unsigned int 
debounce_timeout; + unsigned int wake_capable; + char vendor_data[MAX_VENDOR_DATA_LEN + 1]; + char resource_source[16]; + unsigned int resource_source_index; + } gpios[PCIE_PIN_MAX]; + int count; +}; + +struct cx7_hp_acpi_mmio { + struct acpi_resource_fixed_memory32 + mmio_regions[CX7_HP_MMIO_REGION_COUNT]; + int count; + struct device *dev; +}; + +enum cx7_hp_debug_val { + CX7_HP_DEBUG_PLUG_OUT = 0, + CX7_HP_DEBUG_PLUG_IN, + CX7_HP_DEBUG_MAX_VAL +}; + +struct cx7_hp_mmio_runtime { + void __iomem *top_base; + void __iomem *protect_base; + void __iomem *ckm_base; + void __iomem *mac_port_base[HP_PORT_MAX]; +}; + +/** + * cx7_hp_dev - Hotplug device structure + * + * ACPI resource sources: + * - MMIO addresses: RES0 device (PNP0C02) _CRS, stored in mmio field + * - GPIO resources: PEDE device (MTKP0001) _CRS, stored in pins field + */ +struct cx7_hp_dev { + struct cx7_hp_gpio_ctx *pins; + struct cx7_hp_plat_data *pd; + struct platform_device *pdev; + enum cx7_hp_state state; + int gpio_count; + int boot_pin; + int prsnt_pin; + enum cx7_hp_debug_val debug_state; + bool hotplug_enabled; + spinlock_t lock; + struct pci_dev *cached_root_ports[HP_PORT_MAX]; + struct cx7_hp_mmio_runtime mmio; + struct gpio_device *gdev; + struct notifier_block pci_notifier; +}; + +/* ACPI _DSD device properties GUID: daffd814-6eba-4d8c-8a91-bc9bbf4aa301 */ +static const guid_t device_properties_guid = +GUID_INIT(0xdaffd814, 0x6eba, 0x4d8c, + 0x8a, 0x91, 0xbc, 0x9b, + 0xbf, 0x4a, 0xa3, 0x01); + +/** + * cx7_hp_parse_pinctrl_config_dsd - Parse pinctrl configuration from PEDE device _DSD + * @hp_dev: hotplug device + * + * Parses pin-nums and pinctrl-mappings from _DSD. 
+ * + * Returns: 0 on success, negative error code on failure + */ +static int cx7_hp_parse_pinctrl_config_dsd(struct cx7_hp_dev *hp_dev) +{ + struct acpi_device *adev; + struct device *dev = &hp_dev->pdev->dev; + const union acpi_object *mappings_pkg = NULL, *mapping_entry; + struct pinctrl_map *pinmap; + u32 pin_nums = 0; + int k; + const char *strings[PINCTRL_MAPPING_ENTRY_SIZE]; + + adev = ACPI_COMPANION(dev); + if (!adev) { + dev_err(dev, "Failed to get ACPI companion device\n"); + return -ENODEV; + } + + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + acpi_status status; + const union acpi_object *dsd_pkg, *props_pkg = NULL; + int i, j; + + status = acpi_evaluate_object_typed(adev->handle, "_DSD", NULL, &buffer, + ACPI_TYPE_PACKAGE); + if (ACPI_FAILURE(status)) { + dev_err(dev, "Failed to evaluate _DSD: %s\n", + acpi_format_exception(status)); + return -ENODEV; + } + + dsd_pkg = buffer.pointer; + if (!dsd_pkg || dsd_pkg->type != ACPI_TYPE_PACKAGE) { + dev_err(dev, "Invalid _DSD package\n"); + ACPI_FREE(buffer.pointer); + return -EINVAL; + } + /* Find Device Properties GUID package */ + for (i = 0; i + 1 < dsd_pkg->package.count; i += 2) { + const union acpi_object *guid = &dsd_pkg->package.elements[i]; + const union acpi_object *pkg = + &dsd_pkg->package.elements[i + 1]; + + /* Verify GUID matches Device Properties GUID */ + if (guid->type == ACPI_TYPE_BUFFER && guid->buffer.length == 16 && + pkg->type == ACPI_TYPE_PACKAGE && + guid_equal((guid_t *)guid->buffer.pointer, + &device_properties_guid)) { + props_pkg = pkg; + break; + } + } + + if (!props_pkg) { + dev_err(dev, + "Device Properties GUID package not found in _DSD\n"); + ACPI_FREE(buffer.pointer); + return -EINVAL; + } + + for (j = 0; j < props_pkg->package.count; j++) { + const union acpi_object *prop = &props_pkg->package.elements[j]; + + if (prop->type != ACPI_TYPE_PACKAGE || + prop->package.count != 2 || + prop->package.elements[0].type != ACPI_TYPE_STRING) + continue; + + const char
*prop_name = + prop->package.elements[0].string.pointer; + const union acpi_object *prop_value = + &prop->package.elements[1]; + + if (!strcmp(prop_name, "pin-nums")) { + if (prop_value->type == ACPI_TYPE_INTEGER) { + pin_nums = prop_value->integer.value; + } + } else if (!strcmp(prop_name, "pinctrl-mappings")) { + if (prop_value->type == ACPI_TYPE_PACKAGE) + mappings_pkg = prop_value; + } + } + + if (pin_nums == 0) { + hp_dev->pd->pin_nums = 0; + ACPI_FREE(buffer.pointer); + return 0; + } + + if (!mappings_pkg) { + dev_err(dev, + "Missing required _DSD property: pinctrl-mappings\n"); + ACPI_FREE(buffer.pointer); + return -EINVAL; + } + + if (mappings_pkg->package.count != pin_nums) { + dev_err(dev, + "pinctrl-mappings count mismatch: expected %u, got %u\n", + pin_nums, mappings_pkg->package.count); + ACPI_FREE(buffer.pointer); + return -EINVAL; + } + + /* Allocate pinmap array */ + pinmap = devm_kcalloc(dev, pin_nums, sizeof(*pinmap), GFP_KERNEL); + if (!pinmap) { + ACPI_FREE(buffer.pointer); + return -ENOMEM; + } + + /* Parse each mapping entry */ + for (k = 0; k < pin_nums; k++) { + mapping_entry = &mappings_pkg->package.elements[k]; + if (mapping_entry->type != ACPI_TYPE_PACKAGE || + mapping_entry->package.count != ARRAY_SIZE(strings)) { + dev_err(dev, + "Invalid pinctrl mapping entry %d: expected Package(%zu), " + "got %s(count=%u)\n", + k, ARRAY_SIZE(strings), + mapping_entry->type == ACPI_TYPE_PACKAGE ? + "Package" : "non-Package", + mapping_entry->type == ACPI_TYPE_PACKAGE ? 
+ mapping_entry->package.count : 0); + ACPI_FREE(buffer.pointer); + return -EINVAL; + } + + /* Extract strings: dev_name, state, ctrl_dev, group, function */ + for (int l = 0; l < ARRAY_SIZE(strings); l++) { + if (mapping_entry->package.elements[l].type != + ACPI_TYPE_STRING) { + dev_err(dev, + "Mapping entry %d element %d is not a string\n", + k, l); + ACPI_FREE(buffer.pointer); + return -EINVAL; + } + strings[l] = + mapping_entry->package.elements[l].string.pointer; + } + + /* Populate pinctrl_map structure */ + pinmap[k].dev_name = + devm_kstrdup(dev, strings[PINCTRL_IDX_DEV_NAME], + GFP_KERNEL); + pinmap[k].name = + devm_kstrdup(dev, strings[PINCTRL_IDX_STATE], GFP_KERNEL); + pinmap[k].type = PIN_MAP_TYPE_MUX_GROUP; + pinmap[k].ctrl_dev_name = + devm_kstrdup(dev, strings[PINCTRL_IDX_CTRL_DEV], + GFP_KERNEL); + pinmap[k].data.mux.group = + devm_kstrdup(dev, strings[PINCTRL_IDX_GROUP], GFP_KERNEL); + pinmap[k].data.mux.function = + devm_kstrdup(dev, strings[PINCTRL_IDX_FUNCTION], + GFP_KERNEL); + + if (!pinmap[k].dev_name || !pinmap[k].name || + !pinmap[k].ctrl_dev_name || !pinmap[k].data.mux.group || + !pinmap[k].data.mux.function) { + dev_err(dev, + "Failed to allocate memory for mapping %d\n", + k); + ACPI_FREE(buffer.pointer); + return -ENOMEM; + } + } + + hp_dev->pd->pin_nums = pin_nums; + hp_dev->pd->parsed_pinmap = pinmap; + ACPI_FREE(buffer.pointer); + dev_dbg(dev, "Successfully parsed %u pinctrl mappings from ACPI\n", + pin_nums); + return 0; +} + +/** + * cx7_hp_pinctrl_init - Register pinctrl mappings for the device + * @hp_dev: hotplug device + * + * Parses pinctrl mappings from _DSD and registers them. 
+ * + * Returns: 0 on success, negative error code on failure + */ +static int cx7_hp_pinctrl_init(struct cx7_hp_dev *hp_dev) +{ + int ret; + + ret = cx7_hp_parse_pinctrl_config_dsd(hp_dev); + if (ret) { + dev_err(&hp_dev->pdev->dev, + "Failed to parse pinctrl configuration from ACPI: %d\n", + ret); + return ret; + } + + if (!hp_dev->pd->pin_nums) + return 0; + + ret = + pinctrl_register_mappings(hp_dev->pd->parsed_pinmap, + hp_dev->pd->pin_nums); + if (ret) { + dev_err(&hp_dev->pdev->dev, + "Failed to register pinctrl mappings\n"); + return ret; + } + + dev_dbg(&hp_dev->pdev->dev, "Registered %u pinctrl mappings\n", + hp_dev->pd->pin_nums); + return 0; +} + +/** + * cx7_hp_pinctrl_remove - Unregister pinctrl mappings + * @hp_dev: hotplug device + */ +static void cx7_hp_pinctrl_remove(struct cx7_hp_dev *hp_dev) +{ + if (!hp_dev->pd->pin_nums) + return; + + pinctrl_unregister_mappings(hp_dev->pd->parsed_pinmap); +} + +/** + * cx7_hp_change_pinctrl_state - Change pinctrl state + * @hp_dev: hotplug device + * @new_state: new pinctrl state name + * + * Returns: 0 on success, negative error code on failure + */ +static int cx7_hp_change_pinctrl_state(struct cx7_hp_dev *hp_dev, + const char *new_state) +{ + struct pinctrl *pinctrl; + struct pinctrl_state *state; + int ret; + + pinctrl = devm_pinctrl_get(&hp_dev->pdev->dev); + if (IS_ERR(pinctrl)) { + dev_err(&hp_dev->pdev->dev, "Failed to get pinctrl\n"); + return PTR_ERR(pinctrl); + } + + state = pinctrl_lookup_state(pinctrl, new_state); + if (IS_ERR(state)) { + dev_err(&hp_dev->pdev->dev, "Failed to lookup state:%s\n", + new_state); + return PTR_ERR(state); + } + + ret = pinctrl_select_state(pinctrl, state); + if (ret) { + dev_err(&hp_dev->pdev->dev, + "Failed to select pinctrl state:%s\n", new_state); + return ret; + } + + return 0; +} + +/** + * cx7_hp_send_uevent - Send uevent to userspace + * @hp_dev: hotplug device + * @msg: uevent message string + */ +static void cx7_hp_send_uevent(struct cx7_hp_dev *hp_dev, 
const char *msg) +{ + char *uevent = NULL; + char *envp[2]; + + uevent = kasprintf(GFP_KERNEL, "%s", msg); + if (!uevent) { + dev_err(&hp_dev->pdev->dev, + "Failed to allocate uevent string\n"); + return; + } + + envp[0] = uevent; + envp[1] = NULL; + + if (kobject_uevent_env(&hp_dev->pdev->dev.kobj, KOBJ_CHANGE, envp)) + dev_err(&hp_dev->pdev->dev, "Failed to send uevent\n"); + + kfree(uevent); +} + +/** + * cx7_hp_reg_update_bits - Update specific bits in a register + * @base: MMIO base address + * @offset: Register offset + * @mask: Bits to modify + * @set: true to set bits, false to clear bits + */ +static inline void cx7_hp_reg_update_bits(void __iomem *base, u32 offset, + u32 mask, bool set) +{ + u32 val = readl(base + offset); + + if (set) + val |= mask; + else + val &= ~mask; + + writel(val, base + offset); +} + +/** + * cx7_hp_toggle_update_bit - Toggle control register update bit + * @base: MMIO base address + * @ctrl_offset: Control register offset + * @bits: Bits to set/clear before toggling update + * @update_bit: Update bit mask + * @set: true to set bits, false to clear bits + * + * Performs the sequence: modify bits, clear update bit, set update bit + */ +static void cx7_hp_toggle_update_bit(void __iomem *base, u32 ctrl_offset, + u32 bits, u32 update_bit, bool set) +{ + cx7_hp_reg_update_bits(base, ctrl_offset, bits, set); + cx7_hp_reg_update_bits(base, ctrl_offset, update_bit, false); + cx7_hp_reg_update_bits(base, ctrl_offset, update_bit, true); +} + +/** + * cx7_hp_bus_protect_enable - Enable bus protection for a port + * @dev: hotplug device + * @port_idx: Port index + */ +static void cx7_hp_bus_protect_enable(struct cx7_hp_dev *dev, int port_idx) +{ + struct rp_bus_mmio_info *mmio_info = &dev->pd->rp_bus_mmio; + u32 port_bit = mmio_info->protect.port_bits[port_idx]; + + cx7_hp_reg_update_bits(dev->mmio.protect_base, + mmio_info->protect.mode, port_bit, true); + cx7_hp_reg_update_bits(dev->mmio.protect_base, + mmio_info->protect.enable, port_bit,
true); +} + +/** + * cx7_hp_bus_protect_disable - Disable bus protection for a port + * @dev: hotplug device + * @port_idx: Port index + */ +static void cx7_hp_bus_protect_disable(struct cx7_hp_dev *dev, int port_idx) +{ + struct rp_bus_mmio_info *mmio_info = &dev->pd->rp_bus_mmio; + u32 port_bit = mmio_info->protect.port_bits[port_idx]; + + cx7_hp_reg_update_bits(dev->mmio.protect_base, + mmio_info->protect.enable, port_bit, false); + cx7_hp_reg_update_bits(dev->mmio.protect_base, + mmio_info->protect.mode, port_bit, false); +} + +/** + * cx7_hp_ckm_control - Control clock module + * @dev: hotplug device + * @disable: true to disable clock, false to enable + */ +static void cx7_hp_ckm_control(struct cx7_hp_dev *dev, bool disable) +{ + struct rp_bus_mmio_info *mmio_info = &dev->pd->rp_bus_mmio; + + if (!dev->mmio.ckm_base) + return; + + cx7_hp_reg_update_bits(dev->mmio.ckm_base, mmio_info->ckm.ctrl, + mmio_info->ckm.disable_bit, disable); +} + +/** + * cx7_hp_parse_mmio_resources - ACPI resource callback for parsing MMIO from _CRS + * @ares: ACPI resource being processed + * @data: pointer to cx7_hp_acpi_mmio structure + * + * Returns: AE_OK to continue iteration, AE_ERROR on error + */ +static acpi_status cx7_hp_parse_mmio_resources(struct acpi_resource *ares, + void *data) +{ + struct cx7_hp_acpi_mmio *parsed = data; + + switch (ares->type) { + case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: + if (parsed->count >= CX7_HP_MMIO_REGION_COUNT) { + dev_warn(parsed->dev, + "More than %d MMIO regions found in platform configuration device, ignoring extras\n", + CX7_HP_MMIO_REGION_COUNT); + break; + } + parsed->mmio_regions[parsed->count] = ares->data.fixed_memory32; + parsed->count++; + break; + default: + break; + } + + return AE_OK; +} + +/** + * cx7_hp_find_pcie_config_device - Find PCIe configuration device by HID + * + * Finds the ACPI device that provides PCIe configuration via _DSD properties + * and MMIO resources via _CRS. 
+ * + * Returns: acpi_device pointer on success (with reference), NULL on failure + */ +static struct acpi_device *cx7_hp_find_pcie_config_device(void) +{ + return acpi_dev_get_first_match_dev("PNP0C02", NULL, -1); +} + +/** + * cx7_hp_parse_pcie_config_dsd - Parse PCIe configuration from _DSD + * @pdev: platform device + * @pd: platform data to populate + * + * Parses PCIe MMIO register offsets, bit positions, port configuration, and PCIe device + * identification from PCIe configuration device _DSD. + * + * Returns: 0 on success, negative error code on failure + */ +static int cx7_hp_parse_pcie_config_dsd(struct platform_device *pdev, + struct cx7_hp_plat_data *pd) +{ + struct acpi_device *config_adev; + struct device *dev = &pdev->dev; + u32 val, bit1; + + config_adev = cx7_hp_find_pcie_config_device(); + if (!config_adev) { + dev_err(dev, + "Platform configuration device (PNP0C02) not found - _DSD is required\n"); + return -ENODEV; + } + + if (!acpi_dev_has_props(config_adev)) { + dev_err(dev, + "Platform configuration device has no _DSD properties. 
Check DSDT.\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "mac-init-ctrl-offset", &val)) { + dev_err(dev, + "Missing required _DSD property: mac-init-ctrl-offset\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.mac.init_ctrl = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "mac-ltssm-bit", &val)) { + dev_err(dev, "Missing required _DSD property: mac-ltssm-bit\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.mac.ltssm_bit = BIT(val); + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "mac-phy-rst-bit", &val)) { + dev_err(dev, + "Missing required _DSD property: mac-phy-rst-bit\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.mac.phy_rst_bit = BIT(val); + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "top-ctrl-offset", &val)) { + dev_err(dev, + "Missing required _DSD property: top-ctrl-offset\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.top.ctrl = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "top-update-bit", &val)) { + dev_err(dev, + "Missing required _DSD property: top-update-bit\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.top.update_bit = BIT(val); + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "top-port0-bit", &val)) { + dev_err(dev, "Missing required _DSD property: top-port0-bit\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.top.port_bits[0] = BIT(val); + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "top-port1-bit", &val)) { + dev_err(dev, "Missing required _DSD property: top-port1-bit\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.top.port_bits[1] = BIT(val); + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "protect-mode-offset", &val)) { + 
dev_err(dev, + "Missing required _DSD property: protect-mode-offset\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.protect.mode = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "protect-enable-offset", &val)) { + dev_err(dev, + "Missing required _DSD property: protect-enable-offset\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.protect.enable = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "protect-port0-bit", &val)) { + dev_err(dev, + "Missing required _DSD property: protect-port0-bit\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.protect.port_bits[0] = BIT(val); + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "protect-port1-bit", &val)) { + dev_err(dev, + "Missing required _DSD property: protect-port1-bit\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.protect.port_bits[1] = BIT(val); + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "ckm-ctrl-offset", &val)) { + dev_err(dev, + "Missing required _DSD property: ckm-ctrl-offset\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.ckm.ctrl = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "ckm-disable-bit0", &val)) { + dev_err(dev, + "Missing required _DSD property: ckm-disable-bit0\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "ckm-disable-bit1", &bit1)) { + dev_err(dev, + "Missing required _DSD property: ckm-disable-bit1\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->rp_bus_mmio.ckm.disable_bit = BIT(val) | BIT(bit1); + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "ltssm-reg-offset", &val)) { + dev_err(dev, + "Missing required _DSD property: ltssm-reg-offset\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->ltssm_reg = val; + + if 
(fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "ltssm-l0-state", &val)) { + dev_err(dev, + "Missing required _DSD property: ltssm-l0-state\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->ltssm_l0_state = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "port-nums", &val)) { + dev_err(dev, "Missing required _DSD property: port-nums\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + if (val == 0 || val > HP_PORT_MAX) { + dev_err(dev, + "Invalid _DSD property port-nums: %u (must be 1-%d)\n", + val, HP_PORT_MAX); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->port_nums = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "port0-domain", &val)) { + dev_err(dev, "Missing required _DSD property: port0-domain\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->ports[0].domain = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "port0-bus", &val)) { + dev_err(dev, "Missing required _DSD property: port0-bus\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->ports[0].bus = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "port0-devfn", &val)) { + dev_err(dev, "Missing required _DSD property: port0-devfn\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->ports[0].devfn = val; + + if (pd->port_nums >= 2) { + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "port1-domain", &val)) { + dev_err(dev, + "Missing required _DSD property: port1-domain\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->ports[1].domain = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "port1-bus", &val)) { + dev_err(dev, + "Missing required _DSD property: port1-bus\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->ports[1].bus = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "port1-devfn", &val)) { + dev_err(dev, + "Missing required _DSD 
property: port1-devfn\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->ports[1].devfn = val; + } + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "vendor-id", &val)) { + dev_err(dev, "Missing required _DSD property: vendor-id\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->vendor_id = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "device-id", &val)) { + dev_err(dev, "Missing required _DSD property: device-id\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->device_id = val; + + if (fwnode_property_read_u32 + (acpi_fwnode_handle(config_adev), "num-devices", &val)) { + dev_err(dev, "Missing required _DSD property: num-devices\n"); + acpi_dev_put(config_adev); + return -EINVAL; + } + pd->num_devices = val; + + dev_dbg(dev, "Successfully parsed all required _DSD properties\n"); + + acpi_dev_put(config_adev); + return 0; +} + +/** + * cx7_hp_parse_mmio_resources_from_acpi - Parse MMIO regions from _CRS + * @dev: hotplug device + * @parsed: pointer to parsed MMIO structure + * + * Returns: 0 on success, negative error code on failure + */ +static int cx7_hp_parse_mmio_resources_from_acpi(struct cx7_hp_dev *dev, + struct cx7_hp_acpi_mmio + *parsed) +{ + struct acpi_device *config_adev; + acpi_status status; + int ret = 0; + + if (!dev || !dev->pdev) { + return -EINVAL; + } + + config_adev = cx7_hp_find_pcie_config_device(); + if (!config_adev) + return -ENODEV; + + parsed->count = 0; + memset(parsed->mmio_regions, 0, sizeof(parsed->mmio_regions)); + + status = + acpi_walk_resources(config_adev->handle, METHOD_NAME__CRS, + cx7_hp_parse_mmio_resources, parsed); + if (ACPI_FAILURE(status)) { + dev_err(&dev->pdev->dev, + "Failed to walk platform configuration resources: %s\n", + acpi_format_exception(status)); + ret = -ENODEV; + goto out; + } + + if (parsed->count < CX7_HP_MMIO_REGION_COUNT) { + dev_warn(&dev->pdev->dev, + "Expected %d MMIO regions from platform configuration device, found 
%d\n", + CX7_HP_MMIO_REGION_COUNT, parsed->count); + ret = -ENODEV; + goto out; + } + +out: + acpi_dev_put(config_adev); + return ret; +} + +/** + * cx7_hp_map_mmio_resources - Map all MMIO regions from ACPI _CRS + * @dev: hotplug device + * + * Returns: 0 on success, negative error code on failure + */ +static int cx7_hp_map_mmio_resources(struct cx7_hp_dev *dev) +{ + struct platform_device *pdev = dev->pdev; + struct cx7_hp_acpi_mmio parsed = {.count = 0, .dev = &pdev->dev }; + int ret; + int i; + + ret = cx7_hp_parse_mmio_resources_from_acpi(dev, &parsed); + if (ret) { + dev_err(&pdev->dev, + "Failed to get MMIO regions from platform configuration device\n"); + return ret; + } + + dev_dbg(&pdev->dev, "Found %d MMIO regions in _CRS, mapping...\n", + parsed.count); + + int mapped_count = 0; + for (i = 0; i < parsed.count; i++) { + void __iomem *base = NULL; + u32 addr = parsed.mmio_regions[i].address; + u32 size = parsed.mmio_regions[i].address_length; + + switch (i) { + case 0: + if (dev->pd->port_nums >= 1) { + base = devm_ioremap(&pdev->dev, addr, size); + if (!base) { + dev_err(&pdev->dev, + "Failed to map MAC Port 0 region (0x%08x)\n", + addr); + return -ENOMEM; + } + dev->mmio.mac_port_base[0] = base; + mapped_count++; + } + break; + case 1: + if (dev->pd->port_nums >= 2) { + base = devm_ioremap(&pdev->dev, addr, size); + if (!base) { + dev_err(&pdev->dev, + "Failed to map MAC Port 1 region (0x%08x)\n", + addr); + return -ENOMEM; + } + dev->mmio.mac_port_base[1] = base; + mapped_count++; + } + break; + case 2: + base = devm_ioremap(&pdev->dev, addr, size); + if (!base) { + dev_err(&pdev->dev, + "Failed to map TOP region (0x%08x)\n", + addr); + return -ENOMEM; + } + dev->mmio.top_base = base; + mapped_count++; + break; + case 3: + base = devm_ioremap(&pdev->dev, addr, size); + if (!base) { + dev_err(&pdev->dev, + "Failed to map PROTECT region (0x%08x)\n", + addr); + return -ENOMEM; + } + dev->mmio.protect_base = base; + mapped_count++; + break; + case 4: + 
base = devm_ioremap(&pdev->dev, addr, size); + if (!base) { + dev_err(&pdev->dev, + "Failed to map CKM region (0x%08x)\n", + addr); + return -ENOMEM; + } + dev->mmio.ckm_base = base; + mapped_count++; + break; + default: + dev_warn(&pdev->dev, + "Unexpected MMIO region at 0x%08x (size 0x%x), skipping\n", + addr, size); + break; + } + } + + if (!dev->mmio.top_base || !dev->mmio.protect_base + || !dev->mmio.ckm_base || (dev->pd->port_nums >= 1 + && !dev->mmio.mac_port_base[0]) + || (dev->pd->port_nums >= 2 && !dev->mmio.mac_port_base[1])) { + dev_err(&pdev->dev, + "Required MMIO regions not mapped from ACPI _CRS (mapped %d)\n", + mapped_count); + if (!dev->mmio.top_base) + dev_err(&pdev->dev, " Missing: TOP\n"); + if (!dev->mmio.protect_base) + dev_err(&pdev->dev, " Missing: PROTECT\n"); + if (!dev->mmio.ckm_base) + dev_err(&pdev->dev, " Missing: CKM\n"); + if (dev->pd->port_nums >= 1 && !dev->mmio.mac_port_base[0]) + dev_err(&pdev->dev, + " Missing: MAC Port 0 (port_nums=%d)\n", + dev->pd->port_nums); + if (dev->pd->port_nums >= 2 && !dev->mmio.mac_port_base[1]) + dev_err(&pdev->dev, + " Missing: MAC Port 1 (port_nums=%d)\n", + dev->pd->port_nums); + dev->mmio.top_base = NULL; + dev->mmio.protect_base = NULL; + dev->mmio.ckm_base = NULL; + for (i = 0; i < HP_PORT_MAX; i++) + dev->mmio.mac_port_base[i] = NULL; + return -ENODEV; + } + + dev_dbg(&pdev->dev, + "Successfully mapped all MMIO regions from ACPI _CRS\n"); + return 0; +} + +/** + * cx7_hp_rp_bus_protect - Bus protection handler + * @dev: hotplug device + * @port_idx: port index (0-based) + * @stage: protection stage (BUS_PROTECT_INIT, BUS_PROTECT_CLEANUP, etc.) 
+ */ +static void cx7_hp_rp_bus_protect(struct cx7_hp_dev *dev, int port_idx, + int stage) +{ + switch (stage) { + case BUS_PROTECT_INIT: + { + int ret; + + ret = cx7_hp_map_mmio_resources(dev); + if (ret) { + dev_err(&dev->pdev->dev, + "Failed to map MMIO resources during bus init: %d\n", + ret); + return; + } + } + return; + + case BUS_PROTECT_CLEANUP: + { + int i; + + for (i = 0; i < HP_PORT_MAX; i++) { + if (dev->mmio.mac_port_base[i]) + dev->mmio.mac_port_base[i] = NULL; + } + if (dev->mmio.top_base) + dev->mmio.top_base = NULL; + if (dev->mmio.protect_base) + dev->mmio.protect_base = NULL; + if (dev->mmio.ckm_base) + dev->mmio.ckm_base = NULL; + } + return; + + case BUS_PROTECT_CABLE_REMOVAL: + case BUS_PROTECT_CABLE_PLUGIN: + { + struct rp_bus_mmio_info *mmio_info = + &dev->pd->rp_bus_mmio; + void __iomem *mac_base; + + if (port_idx >= dev->pd->port_nums) + return; + + mac_base = dev->mmio.mac_port_base[port_idx]; + if (!mac_base) + return; + + if (stage == BUS_PROTECT_CABLE_REMOVAL) { + cx7_hp_reg_update_bits(mac_base, + mmio_info->mac.init_ctrl, + mmio_info->mac.ltssm_bit, + false); + cx7_hp_reg_update_bits(mac_base, + mmio_info->mac.init_ctrl, + mmio_info->mac. + phy_rst_bit, false); + return; + } + + cx7_hp_toggle_update_bit(dev->mmio.top_base, + mmio_info->top.ctrl, + mmio_info->top. + port_bits[port_idx], + mmio_info->top.update_bit, + false); + udelay(CX7_HP_DELAY_SHORT_US); + + cx7_hp_bus_protect_enable(dev, port_idx); + usleep_range(CX7_HP_DELAY_BUS_PROTECT_US, + CX7_HP_DELAY_BUS_PROTECT_US + 1000); + + cx7_hp_reg_update_bits(mac_base, + mmio_info->mac.init_ctrl, + mmio_info->mac.phy_rst_bit, + true); + cx7_hp_reg_update_bits(mac_base, + mmio_info->mac.init_ctrl, + mmio_info->mac.ltssm_bit, true); + usleep_range(CX7_HP_DELAY_PHY_RESET_US, + CX7_HP_DELAY_PHY_RESET_US + 1000); + + cx7_hp_bus_protect_disable(dev, port_idx); + + cx7_hp_toggle_update_bit(dev->mmio.top_base, + mmio_info->top.ctrl, + mmio_info->top. 
+ port_bits[port_idx], + mmio_info->top.update_bit, + true); + } + break; + + default: + dev_warn(&dev->pdev->dev, "Unknown bus protect stage: %d\n", + stage); + break; + } +} + +/** + * retrain_pcie_link - Retrain PCIe link + * @dev: PCI device + */ +static void retrain_pcie_link(struct pci_dev *dev) +{ + u16 link_control, lnksta; + int pos, i = 0; + + pos = pci_find_capability(dev, PCI_CAP_ID_EXP); + if (!pos) { + dev_err(&dev->dev, "PCIe capability not found\n"); + return; + } + + pci_read_config_word(dev, pos + PCI_EXP_LNKCTL, &link_control); + link_control |= PCI_EXP_LNKCTL_RL; + + pci_write_config_word(dev, pos + PCI_EXP_LNKCTL, link_control); + + while (i < HP_POLL_CNT_MAX) { + i++; + pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta); + if (lnksta & PCI_EXP_LNKSTA_DLLLA) + break; + usleep_range(CX7_HP_POLL_SLEEP_US, CX7_HP_POLL_SLEEP_US + 1000); + } + + pcie_capability_write_word(dev, PCI_EXP_LNKSTA, PCI_EXP_LNKSTA_LBMS); +} + +/** + * get_port_root_port - Get PCI root port device for a port + * @hp_dev: hotplug device + * @port_idx: port index + * + * Returns cached or newly found root port, or NULL if not found. 
+ */ +static struct pci_dev *get_port_root_port(struct cx7_hp_dev *hp_dev, + int port_idx) +{ + struct pcie_port_info *port; + + if (!hp_dev->pd || port_idx >= hp_dev->pd->port_nums) + return NULL; + + port = &hp_dev->pd->ports[port_idx]; + + if (!hp_dev->cached_root_ports[port_idx]) { + hp_dev->cached_root_ports[port_idx] = + pci_get_domain_bus_and_slot(port->domain, + port->bus, port->devfn); + if (!hp_dev->cached_root_ports[port_idx]) { + dev_warn(&hp_dev->pdev->dev, + "Root port not found for domain %d bus %d\n", + port->domain, port->bus); + return NULL; + } + } + + return hp_dev->cached_root_ports[port_idx]; +} + +/** + * remove_device - Remove PCIe devices and power down hardware + * @dev: hotplug device + */ +static void remove_device(struct cx7_hp_dev *dev) +{ + int i; + + dev_info(&dev->pdev->dev, "Cable removal\n"); + + for (i = 0; i < dev->pd->port_nums; i++) + cx7_hp_rp_bus_protect(dev, i, BUS_PROTECT_CABLE_REMOVAL); + + gpiod_set_value(dev->pins[PCIE_PIN_PERST].desc, 0); + cx7_hp_change_pinctrl_state(dev, "default"); + cx7_hp_ckm_control(dev, true); + gpiod_set_value(dev->pins[PCIE_PIN_EN].desc, 0); +} + +/** + * polling_link_to_l0 - Poll until all PCIe ports reach L0 state + * @dev: hotplug device + * + * Returns: 0 on success, negative error code on failure + */ +static int polling_link_to_l0(struct cx7_hp_dev *dev) +{ + struct pci_dev *pci_dev; + u32 ltssm_reg; + u32 l0_state; + u32 ltssm_vals[HP_PORT_MAX] = { 0 }; + int count = 0; + int i; + bool all_l0; + + ltssm_reg = dev->pd->ltssm_reg; + l0_state = dev->pd->ltssm_l0_state; + + if (!ltssm_reg || !l0_state) + return 0; /* Skip if not configured */ + + /* Poll until all ports reach L0 state */ + all_l0 = false; + while (!all_l0) { + all_l0 = true; + + for (i = 0; i < dev->pd->port_nums; i++) { + pci_dev = get_port_root_port(dev, i); + if (!pci_dev) { + all_l0 = false; + continue; + } + + pci_read_config_dword(pci_dev, ltssm_reg, + &ltssm_vals[i]); + if ((ltssm_vals[i] & l0_state) != l0_state) + 
all_l0 = false; + } + + if (all_l0) + break; + + usleep_range(CX7_HP_POLL_SLEEP_US, CX7_HP_POLL_SLEEP_US + 1000); + count++; + + if (count > HP_POLL_CNT_MAX) { + dev_err(&dev->pdev->dev, + "Timeout waiting for link to reach L0 (reached max count)\n"); + break; + } + } + + if (count > HP_POLL_CNT_MAX) { + return -ETIMEDOUT; + } + + return 0; +} + +/** + * rescan_device - Rescan PCIe bus to discover devices + * @dev: hotplug device + * + * Returns: 0 on success, negative error code on failure + */ +static int rescan_device(struct cx7_hp_dev *dev) +{ + struct pci_dev *pci_dev; + int i, err; + + err = cx7_hp_change_pinctrl_state(dev, "clkreqn"); + if (err) + return err; + + cx7_hp_ckm_control(dev, false); + usleep_range(CX7_HP_DELAY_STANDARD_US, CX7_HP_DELAY_STANDARD_US + 1000); + + for (i = 0; i < dev->pd->port_nums; i++) { + pci_dev = get_port_root_port(dev, i); + if (!pci_dev) + continue; + + err = pm_runtime_resume_and_get(&pci_dev->dev); + if (err < 0) { + dev_err(&dev->pdev->dev, + "Runtime resume failed for %s: %d\n", + pci_name(pci_dev), err); + } + } + + gpiod_set_value(dev->pins[PCIE_PIN_PERST].desc, 1); + + for (i = 0; i < dev->pd->port_nums; i++) + cx7_hp_rp_bus_protect(dev, i, BUS_PROTECT_CABLE_PLUGIN); + + err = polling_link_to_l0(dev); + if (err) + return err; + + for (i = 0; i < dev->pd->port_nums; i++) { + pci_dev = get_port_root_port(dev, i); + if (pci_dev) + retrain_pcie_link(pci_dev); + } + + msleep(CX7_HP_DELAY_LINK_STABLE_MS); + + return 0; +} + +/** + * cx7_hp_work - Work queue handler for hotplug state machine + * @irq: interrupt number + * @dev_id: GPIO context pointer + * + * Processes hotplug state transitions based on current state. 
+ */ +static irqreturn_t cx7_hp_work(int irq, void *dev_id) +{ + struct cx7_hp_gpio_ctx *app_ctx = dev_id; + struct cx7_hp_dev *hp_dev; + enum cx7_hp_state state; + unsigned long flags; + int ret; + + if (!app_ctx || !app_ctx->hp_dev) + return IRQ_NONE; + + hp_dev = app_ctx->hp_dev; + + spin_lock_irqsave(&hp_dev->lock, flags); + if (!hp_dev->hotplug_enabled) { + spin_unlock_irqrestore(&hp_dev->lock, flags); + return IRQ_HANDLED; + } + state = hp_dev->state; + spin_unlock_irqrestore(&hp_dev->lock, flags); + + switch (state) { + case STATE_PLUG_OUT: + remove_device(hp_dev); + break; + case STATE_PLUG_IN: + dev_info(&hp_dev->pdev->dev, "Cable plugin\n"); + gpiod_set_value(hp_dev->pins[PCIE_PIN_EN].desc, 1); + break; + case STATE_DEV_POWER_OFF: + case STATE_DEV_POWER_ON: + case STATE_DEV_FW_START: + break; + case STATE_RESCAN: + ret = rescan_device(hp_dev); + spin_lock_irqsave(&hp_dev->lock, flags); + if (ret) + dev_err(app_ctx->ctx->dev, "Rescan failed: %d\n", ret); + else + hp_dev->state = STATE_READY; + spin_unlock_irqrestore(&hp_dev->lock, flags); + break; + default: + dev_err(app_ctx->ctx->dev, "Unknown state: %d\n", state); + break; + } + + return IRQ_HANDLED; +} + +/** + * hotplug_irq_handler - GPIO interrupt handler for hotplug events + * @irq: interrupt number + * @dev_id: GPIO context pointer + * + * Handles presence detection and boot status GPIO interrupts. 
+ */ +static irqreturn_t hotplug_irq_handler(int irq, void *dev_id) +{ + struct cx7_hp_gpio_ctx *app_ctx = dev_id; + struct cx7_hp_dev *hp_dev = app_ctx->hp_dev; + struct gpio_acpi_context *gpio_ctx = app_ctx->ctx; + unsigned long flags; + int value; + enum cx7_hp_state state; + + value = gpiod_get_value(app_ctx->desc); + + if (gpio_ctx->pin == hp_dev->prsnt_pin) { + if (value) { + cx7_hp_send_uevent(hp_dev, REMOVAL_EVT); + } else { + cx7_hp_send_uevent(hp_dev, PLUG_IN_EVT); + } + return IRQ_HANDLED; + } + + spin_lock_irqsave(&hp_dev->lock, flags); + if (!hp_dev->hotplug_enabled) { + spin_unlock_irqrestore(&hp_dev->lock, flags); + return IRQ_HANDLED; + } + state = hp_dev->state; + + if (gpio_ctx->pin == hp_dev->boot_pin) { + if (value && state == STATE_PLUG_IN) { + hp_dev->state = STATE_DEV_POWER_ON; + } else if (value && state == STATE_DEV_FW_START) { + hp_dev->state = STATE_RESCAN; + } else if (!value && state == STATE_DEV_POWER_ON) { + hp_dev->state = STATE_DEV_FW_START; + } else if (!value && state == STATE_PLUG_OUT) { + hp_dev->state = STATE_DEV_POWER_OFF; + } else { + spin_unlock_irqrestore(&hp_dev->lock, flags); + return IRQ_HANDLED; + } + spin_unlock_irqrestore(&hp_dev->lock, flags); + return IRQ_WAKE_THREAD; + } + + dev_err(gpio_ctx->dev, + "Unknown GPIO pin event: pin=%d irq=%d value=%d\n", + gpio_ctx->pin, irq, value); + spin_unlock_irqrestore(&hp_dev->lock, flags); + return IRQ_HANDLED; +} + +/** + * acpi_gpio_collect_handler - ACPI resource handler to collect all GPIO resources + * @ares: ACPI resource structure + * @context: Pointer to acpi_gpio_walk_context + * + * Returns: AE_OK to continue iteration + */ +static acpi_status acpi_gpio_collect_handler(struct acpi_resource *ares, + void *context) +{ + struct acpi_gpio_walk_context *walk_ctx = context; + struct acpi_resource_gpio *agpio; + int length; + + if (ares->type != ACPI_RESOURCE_TYPE_GPIO) + return AE_OK; + + if (walk_ctx->count >= PCIE_PIN_MAX) { + dev_warn(walk_ctx->dev, + "Too many GPIO 
resources, truncating at %d\n", + PCIE_PIN_MAX); + return AE_OK; + } + + agpio = &ares->data.gpio; + + if (!agpio->pin_table || agpio->pin_table_length == 0) { + dev_warn(walk_ctx->dev, "GPIO resource has no pin table\n"); + return AE_OK; + } + + walk_ctx->gpios[walk_ctx->count].pin = agpio->pin_table[0]; + walk_ctx->gpios[walk_ctx->count].connection_type = + agpio->connection_type; + walk_ctx->gpios[walk_ctx->count].triggering = agpio->triggering; + walk_ctx->gpios[walk_ctx->count].polarity = agpio->polarity; + walk_ctx->gpios[walk_ctx->count].debounce_timeout = + agpio->debounce_timeout; + walk_ctx->gpios[walk_ctx->count].wake_capable = agpio->wake_capable; + + if (agpio->vendor_length && agpio->vendor_data) { + length = min_t(int, agpio->vendor_length, MAX_VENDOR_DATA_LEN); + memcpy(walk_ctx->gpios[walk_ctx->count].vendor_data, + agpio->vendor_data, length); + walk_ctx->gpios[walk_ctx->count].vendor_data[length] = '\0'; + } else { + walk_ctx->gpios[walk_ctx->count].vendor_data[0] = '\0'; + } + + if (agpio->resource_source.string_ptr) { + length = min_t(int, agpio->resource_source.string_length, 15); + memcpy(walk_ctx->gpios[walk_ctx->count].resource_source, + agpio->resource_source.string_ptr, length); + walk_ctx->gpios[walk_ctx->count].resource_source[length] = '\0'; + } else { + walk_ctx->gpios[walk_ctx->count].resource_source[0] = '\0'; + } + walk_ctx->gpios[walk_ctx->count].resource_source_index = + agpio->resource_source.index; + walk_ctx->count++; + return AE_OK; +} + +/** + * cx7_hp_walk_acpi_gpios - Walk ACPI _CRS to collect all GPIO resources + * @pdev: Platform device + * @walk_ctx: Context structure to fill with GPIO information + * + * Returns: 0 on success, negative error code on failure + */ +static int cx7_hp_walk_acpi_gpios(struct platform_device *pdev, + struct acpi_gpio_walk_context *walk_ctx) +{ + struct acpi_device *adev; + acpi_status status; + + adev = ACPI_COMPANION(&pdev->dev); + if (!adev) { + dev_err(&pdev->dev, "Failed to get ACPI 
companion device\n"); + return -ENODEV; + } + + memset(walk_ctx, 0, sizeof(*walk_ctx)); + walk_ctx->dev = &pdev->dev; + + status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS, + acpi_gpio_collect_handler, walk_ctx); + if (ACPI_FAILURE(status)) { + dev_err(&pdev->dev, "Failed to walk ACPI GPIO resources: %s\n", + acpi_format_exception(status)); + return -EIO; + } + + dev_dbg(&pdev->dev, "Found %d GPIO resources via ACPI walk\n", + walk_ctx->count); + + if (walk_ctx->count == 0) { + dev_err(&pdev->dev, "No GPIO resources found in ACPI _CRS\n"); + return -ENODEV; + } + + return 0; +} + +/** + * acpi_gpio_lookup_handler - ACPI resource handler to look up a specific GPIO pin + * @ares: ACPI resource being processed + * @context: Pointer to acpi_gpio_parse_context + * + * Returns: AE_OK to continue iteration + */ +static acpi_status acpi_gpio_lookup_handler(struct acpi_resource *ares, + void *context) +{ + struct acpi_gpio_parse_context *parse_ctx = context; + struct gpio_acpi_context *ctx = parse_ctx->ctx; + struct cx7_hp_dev *hp_dev = parse_ctx->hp_dev; + struct acpi_resource_gpio *agpio; + int length; + + if (ares->type != ACPI_RESOURCE_TYPE_GPIO) + return AE_OK; + + agpio = &ares->data.gpio; + + if (ctx->pin != agpio->pin_table[0]) + return AE_OK; + + ctx->valid = 1; + ctx->debounce_timeout_us = agpio->debounce_timeout * 10; + ctx->wake_capable = agpio->wake_capable; + ctx->triggering = agpio->triggering; + ctx->polarity = agpio->polarity; + ctx->connection_type = agpio->connection_type; + + if (agpio->vendor_length && agpio->vendor_data && hp_dev) { + length = min_t(int, agpio->vendor_length, MAX_VENDOR_DATA_LEN); + memcpy(&ctx->vendor_data[0], agpio->vendor_data, length); + ctx->vendor_data[length] = '\0'; + + if (!strncmp("BOOT", ctx->vendor_data, strlen("BOOT"))) + hp_dev->boot_pin = ctx->pin; + else if (!strncmp("PRSNT", ctx->vendor_data, strlen("PRSNT"))) + hp_dev->prsnt_pin = ctx->pin; + } + + if (agpio->triggering == ACPI_EDGE_SENSITIVE) { + if 
(agpio->polarity == ACPI_ACTIVE_LOW) + ctx->irq_flags = IRQF_TRIGGER_FALLING; + else if (agpio->polarity == ACPI_ACTIVE_HIGH) + ctx->irq_flags = IRQF_TRIGGER_RISING; + else + ctx->irq_flags = + (IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING); + } else { + if (agpio->polarity == ACPI_ACTIVE_LOW) + ctx->irq_flags = IRQF_TRIGGER_LOW; + else + ctx->irq_flags = IRQF_TRIGGER_HIGH; + } + + return AE_OK; +} + +/** + * pci_devices_present_on_domain() - Check if PCI devices exist on a domain + * @domain: PCI domain number to check + * + * Returns: true if any PCI devices are present on the specified domain, + * false otherwise. This is used as a safety check before hardware shutdown. + */ +static bool pci_devices_present_on_domain(int domain) +{ + struct pci_bus *bus; + struct pci_dev *dev; + bool has_endpoint_devices = false; + + bus = pci_find_bus(domain, 1); + if (!bus) + return false; + + list_for_each_entry(dev, &bus->devices, bus_list) { + has_endpoint_devices = true; + break; + } + + return has_endpoint_devices; +} + +static ssize_t debug_state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cx7_hp_dev *hp_dev = dev_get_drvdata(dev); + + if (!hp_dev) + return -EINVAL; + + return scnprintf(buf, PAGE_SIZE, "%d\n", hp_dev->debug_state); +} + +static ssize_t debug_state_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cx7_hp_dev *hp_dev = dev_get_drvdata(dev); + unsigned long val, flags; + int err, i; + + if (!hp_dev || !hp_dev->pd) + return -EINVAL; + + err = kstrtoul(buf, 10, &val); + if (err) + return err; + + spin_lock_irqsave(&hp_dev->lock, flags); + if (!hp_dev->hotplug_enabled) { + spin_unlock_irqrestore(&hp_dev->lock, flags); + dev_info(dev, "Hotplug is disabled.\n"); + return -EPERM; + } + spin_unlock_irqrestore(&hp_dev->lock, flags); + + switch (val) { + case CX7_HP_DEBUG_PLUG_OUT: + /* Safety check: Verify no devices on the bus before hardware shutdown. 
*/ + for (i = 0; i < hp_dev->pd->port_nums; i++) { + if (pci_devices_present_on_domain + (hp_dev->pd->ports[i].domain)) { + dev_err(dev, + "PCI devices still present, remove them first\n"); + return -EBUSY; + } + } + + spin_lock_irqsave(&hp_dev->lock, flags); + hp_dev->state = STATE_PLUG_OUT; + hp_dev->debug_state = val; + spin_unlock_irqrestore(&hp_dev->lock, flags); + remove_device(hp_dev); + return count; + + case CX7_HP_DEBUG_PLUG_IN: + for (i = 0; i < hp_dev->pd->port_nums; i++) { + if (pci_devices_present_on_domain + (hp_dev->pd->ports[i].domain)) { + dev_err(dev, + "PCI devices already present, cannot reinitialize hardware\n"); + return -EBUSY; + } + } + + spin_lock_irqsave(&hp_dev->lock, flags); + hp_dev->state = STATE_PLUG_IN; + hp_dev->debug_state = val; + spin_unlock_irqrestore(&hp_dev->lock, flags); + dev_info(dev, "Cable plugin\n"); + gpiod_set_value(hp_dev->pins[PCIE_PIN_EN].desc, 1); + return count; + + default: + return -EINVAL; + } + + return count; +} + +DEVICE_ATTR_RW(debug_state); + +static ssize_t hotplug_enabled_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cx7_hp_dev *hp_dev = dev_get_drvdata(dev); + + if (!hp_dev) + return -EINVAL; + + return scnprintf(buf, PAGE_SIZE, "%d\n", hp_dev->hotplug_enabled ? 1 : 0); +} + +static ssize_t hotplug_enabled_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cx7_hp_dev *hp_dev = dev_get_drvdata(dev); + unsigned long val; + int err; + + if (!hp_dev) + return -EINVAL; + + err = kstrtoul(buf, 10, &val); + if (err) + return err; + + hp_dev->hotplug_enabled = (val != 0); + dev_info(dev, "Hotplug %s\n", hp_dev->hotplug_enabled ? 
"enabled" : "disabled"); + + return count; +} + +DEVICE_ATTR_RW(hotplug_enabled); + +static struct attribute *cx7_hp_attrs[] = { + &dev_attr_debug_state.attr, + &dev_attr_hotplug_enabled.attr, + NULL +}; + +static const struct attribute_group cx7_hp_attr_group = { + .name = "pcie_hotplug", + .attrs = cx7_hp_attrs +}; + +/** + * gpio_acpi_setup - Setup GPIO ACPI context from _CRS + * @pdev: platform device + * @desc: GPIO descriptor + * @hp_dev: hotplug device + * @gpio_index: GPIO index + * + * Returns: GPIO ACPI context on success, NULL on failure + */ +static struct gpio_acpi_context *gpio_acpi_setup(struct platform_device *pdev, + struct gpio_desc *desc, + struct cx7_hp_dev *hp_dev, + int gpio_index) +{ + struct acpi_gpio_parse_context parse_ctx; + struct gpio_acpi_context *ctx; + struct acpi_device *adev; + acpi_status status; + + adev = ACPI_COMPANION(&pdev->dev); + if (!adev) { + dev_err(&pdev->dev, "Failed to get ACPI companion device\n"); + return NULL; + } + + ctx = devm_kzalloc(&pdev->dev, sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ctx->pin = + desc_to_gpio(desc) - + gpio_device_get_base(gpiod_to_gpio_device(desc)); + ctx->dev = &pdev->dev; + + parse_ctx.ctx = ctx; + parse_ctx.hp_dev = hp_dev; + + status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS, + acpi_gpio_lookup_handler, &parse_ctx); + if (ACPI_FAILURE(status)) { + devm_kfree(&pdev->dev, ctx); + return NULL; + } + + if (ctx->valid) { + if (gpio_index == PCIE_PIN_BOOT && hp_dev->boot_pin == -1) { + hp_dev->boot_pin = ctx->pin; + } else if (gpio_index == PCIE_PIN_PRSNT + && hp_dev->prsnt_pin == -1) { + hp_dev->prsnt_pin = ctx->pin; + } + return ctx; + } + + devm_kfree(&pdev->dev, ctx); + return NULL; +} + +/** + * cx7_hp_setup_irq - Setup IRQ for GPIO + * @app_ctx: GPIO context + * + * Returns: 0 on success, negative error code on failure + */ +static int cx7_hp_setup_irq(struct cx7_hp_gpio_ctx *app_ctx) +{ + struct gpio_acpi_context *ctx = app_ctx->ctx; + int irq, ret; + + irq 
= gpiod_to_irq(app_ctx->desc); + if (irq < 0) { + dev_err(ctx->dev, "Failed to get IRQ for GPIO\n"); + return irq; + } + + if (ctx->wake_capable) + enable_irq_wake(irq); + + ret = devm_request_threaded_irq(ctx->dev, irq, + hotplug_irq_handler, cx7_hp_work, + ctx->irq_flags | IRQF_ONESHOT, + "pcie_hotplug", app_ctx); + if (ret) + dev_err(ctx->dev, "Failed to request IRQ %d: %d\n", irq, ret); + + return ret; +} + +/** + * cx7_hp_put_gpio_device - Release GPIO device reference + * @data: GPIO device pointer + */ +static void cx7_hp_put_gpio_device(void *data) +{ + struct gpio_device *gdev = data; + + gpio_device_put(gdev); +} + +/** + * cx7_hp_discover_pcie_devices - Discover existing PCI devices on managed ports + * @pdev: platform device + * @pd: platform data + * + * Returns: 0 on success, negative error code on failure + */ +static int cx7_hp_discover_pcie_devices(struct platform_device *pdev, + struct cx7_hp_plat_data *pd) +{ + struct pci_dev *pci_dev = NULL; + int device_count = 0; + int i; + + if (!pd->vendor_id || !pd->device_id) + return 0; + + while ((pci_dev = pci_get_device(pd->vendor_id, + pd->device_id, pci_dev)) != NULL) { + if (!pci_dev->state_saved) { + pci_dev_put(pci_dev); + return -EPROBE_DEFER; + } + + for (i = 0; i < pd->port_nums; i++) { + if (pci_domain_nr(pci_dev->bus) == pd->ports[i].domain) + break; + } + + if (i == pd->port_nums) { + dev_err(&pdev->dev, + "Device %s found on unexpected domain %d\n", + pci_name(pci_dev), pci_domain_nr(pci_dev->bus)); + pci_dev_put(pci_dev); + return -ENODEV; + } + + device_count++; + } + + if (pd->num_devices && device_count != pd->num_devices) { + dev_err(&pdev->dev, + "Required number of devices not found. 
Expected=%d Actual=%d\n", + pd->num_devices, device_count); + return -ENODEV; + } + + return 0; +} + +/** + * cx7_hp_init_pcie_data - Initialize PCIe data from _DSD and discover devices + * @pdev: platform device + * @pd: platform data to populate + * + * Returns: 0 on success, negative error code on failure + */ +static int cx7_hp_init_pcie_data(struct platform_device *pdev, + struct cx7_hp_plat_data *pd) +{ + int ret; + + ret = cx7_hp_parse_pcie_config_dsd(pdev, pd); + if (ret) { + dev_err(&pdev->dev, + "Failed to parse PCIe configuration _DSD properties: %d\n", + ret); + return ret; + } + + if (pd->port_nums == 0 || pd->port_nums >= HP_PORT_MAX) { + dev_err(&pdev->dev, + "Invalid port count from _DSD: %d (must be 1-%d)\n", + pd->port_nums, HP_PORT_MAX - 1); + return -EINVAL; + } + + ret = cx7_hp_discover_pcie_devices(pdev, pd); + if (ret) { + dev_dbg(&pdev->dev, "Device discovery failed: %d\n", ret); + return ret; + } + + return 0; +} + +/** + * cx7_hp_enumerate_gpios - Enumerate GPIOs from ACPI + * @pdev: Platform device + * @hp_dev: Hotplug device structure + * + * Returns: Number of GPIOs found, or negative error code + */ +static int cx7_hp_enumerate_gpios(struct platform_device *pdev, + struct cx7_hp_dev *hp_dev) +{ + struct acpi_gpio_walk_context walk_ctx; + struct fwnode_handle *gpio_fwnode = NULL; + struct acpi_device *gpio_adev = NULL; + acpi_handle gpio_handle; + acpi_status status; + int ret, i; + + ret = cx7_hp_walk_acpi_gpios(pdev, &walk_ctx); + if (ret) { + dev_err(&pdev->dev, "Failed to walk ACPI GPIO resources: %d\n", + ret); + return ret; + } + + if (walk_ctx.count < CX7_HP_MIN_GPIO_COUNT) { + dev_err(&pdev->dev, + "Insufficient GPIOs from ACPI: required at least %d, got %d\n", + CX7_HP_MIN_GPIO_COUNT, walk_ctx.count); + return -ENODEV; + } + + /* Find GPIO device using resource_source from first GPIO */ + if (walk_ctx.count == 0 || walk_ctx.gpios[0].resource_source[0] == '\0') { + dev_err(&pdev->dev, + "No resource_source in ACPI GPIO 
resources\n"); + return -ENODEV; + } + + status = + acpi_get_handle(NULL, walk_ctx.gpios[0].resource_source, + &gpio_handle); + if (ACPI_FAILURE(status)) { + dev_err(&pdev->dev, + "Failed to get ACPI handle for GPIO controller %s\n", + walk_ctx.gpios[0].resource_source); + return -ENODEV; + } + + gpio_adev = acpi_fetch_acpi_dev(gpio_handle); + if (!gpio_adev) { + dev_err(&pdev->dev, + "Failed to get ACPI device for GPIO controller %s\n", + walk_ctx.gpios[0].resource_source); + return -ENODEV; + } + + gpio_fwnode = acpi_fwnode_handle(gpio_adev); + hp_dev->gdev = gpio_device_find_by_fwnode(gpio_fwnode); + if (!hp_dev->gdev) { + return dev_err_probe(&pdev->dev, -EPROBE_DEFER, + "GPIO controller not available\n"); + } + + /* Successfully found GPIO device - manage reference */ + ret = devm_add_action_or_reset(&pdev->dev, cx7_hp_put_gpio_device, + hp_dev->gdev); + if (ret) { + gpio_device_put(hp_dev->gdev); + hp_dev->gdev = NULL; + dev_err(&pdev->dev, "Failed to register GPIO device cleanup\n"); + return ret; + } + + hp_dev->gpio_count = walk_ctx.count; + + hp_dev->pins = devm_kzalloc(&pdev->dev, + sizeof(struct cx7_hp_gpio_ctx) * + hp_dev->gpio_count, GFP_KERNEL); + if (!hp_dev->pins) { + dev_err(&pdev->dev, "Failed to allocate memory for GPIOs\n"); + return -ENOMEM; + } + + for (i = 0; i < hp_dev->gpio_count; i++) { + struct cx7_hp_gpio_ctx *app_ctx = &hp_dev->pins[i]; + + app_ctx->desc = + gpio_device_get_desc(hp_dev->gdev, walk_ctx.gpios[i].pin); + if (IS_ERR(app_ctx->desc)) { + dev_err(&pdev->dev, + "Failed to get GPIO descriptor for ACPI pin %u (index %d): %ld\n", + walk_ctx.gpios[i].pin, i, + PTR_ERR(app_ctx->desc)); + return PTR_ERR(app_ctx->desc); + } + + app_ctx->hp_dev = hp_dev; + } + + return hp_dev->gpio_count; +} + +/** + * cx7_hp_pci_notifier - PCI bus notifier to configure MPS for CX7 devices + * @nb: notifier block + * @action: bus notification action + * @data: pointer to device being added/removed + * + * Returns: NOTIFY_OK on success, NOTIFY_DONE if 
not a CX7 device + */ +static int cx7_hp_pci_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct cx7_hp_dev *hp_dev; + unsigned long flags; + + if (action != BUS_NOTIFY_ADD_DEVICE) + return NOTIFY_DONE; + + hp_dev = container_of(nb, struct cx7_hp_dev, pci_notifier); + if (!hp_dev || !hp_dev->pd) + return NOTIFY_DONE; + + spin_lock_irqsave(&hp_dev->lock, flags); + if (!hp_dev->hotplug_enabled) { + spin_unlock_irqrestore(&hp_dev->lock, flags); + return NOTIFY_DONE; + } + spin_unlock_irqrestore(&hp_dev->lock, flags); + + if (!pdev || !hp_dev->pd->vendor_id || !hp_dev->pd->device_id) + return NOTIFY_DONE; + + if (pdev->vendor != hp_dev->pd->vendor_id || + pdev->device != hp_dev->pd->device_id) + return NOTIFY_DONE; + + if (pdev->bus) + pcie_bus_configure_settings(pdev->bus); + + return NOTIFY_OK; +} + +/** + * cx7_hp_probe - Platform device probe function + * @pdev: platform device + * + * Initializes the PCIe hotplug driver, parses ACPI resources, and sets up + * GPIO interrupts and sysfs interface. 
+ * + * Returns: 0 on success, negative error code on failure + */ +static int cx7_hp_probe(struct platform_device *pdev) +{ + struct cx7_hp_plat_data *pd; + struct cx7_hp_gpio_ctx *app_ctx; + struct cx7_hp_dev *hp_dev; + int ret, i; + + pd = devm_kzalloc(&pdev->dev, sizeof(*pd), GFP_KERNEL); + if (!pd) { + dev_err(&pdev->dev, + "Failed to allocate memory for platform data\n"); + return -ENOMEM; + } + + ret = cx7_hp_init_pcie_data(pdev, pd); + if (ret) + return ret; + + hp_dev = devm_kzalloc(&pdev->dev, sizeof(*hp_dev), GFP_KERNEL); + if (!hp_dev) { + dev_err(&pdev->dev, + "Failed to allocate memory for hotplug device\n"); + return -ENOMEM; + } + + hp_dev->pdev = pdev; + hp_dev->pd = pd; + hp_dev->state = STATE_READY; + hp_dev->boot_pin = -1; + hp_dev->prsnt_pin = -1; + hp_dev->hotplug_enabled = false; + spin_lock_init(&hp_dev->lock); + + for (i = 0; i < HP_PORT_MAX; i++) + hp_dev->cached_root_ports[i] = NULL; + + ret = cx7_hp_enumerate_gpios(pdev, hp_dev); + if (ret < 0) { + dev_err(&pdev->dev, "Failed to enumerate GPIOs from ACPI: %d\n", + ret); + return ret; + } + + for (i = 0; i < hp_dev->gpio_count; i++) { + app_ctx = &hp_dev->pins[i]; + + app_ctx->ctx = gpio_acpi_setup(pdev, app_ctx->desc, hp_dev, i); + if (!app_ctx->ctx) { + dev_err(&pdev->dev, "Failed to setup GPIO %d\n", i); + return -ENODEV; + } + + gpiod_set_debounce(app_ctx->desc, + app_ctx->ctx->debounce_timeout_us); + + if (app_ctx->ctx->connection_type == + ACPI_RESOURCE_GPIO_TYPE_INT) { + ret = cx7_hp_setup_irq(app_ctx); + if (ret) { + dev_err(&pdev->dev, + "Failed to setup IRQ for GPIO %d\n", i); + return ret; + } + } + } + + platform_set_drvdata(pdev, hp_dev); + + ret = cx7_hp_pinctrl_init(hp_dev); + if (ret) { + dev_err(&pdev->dev, "Pinmux init failed, ret: %d\n", ret); + return ret; + } + + ret = sysfs_create_group(&pdev->dev.kobj, &cx7_hp_attr_group); + if (ret) { + dev_err(&pdev->dev, "Sysfs creation failed: %d\n", ret); + goto pinctrl_remove; + } + + cx7_hp_rp_bus_protect(hp_dev, 0, 
BUS_PROTECT_INIT); + + hp_dev->pci_notifier.notifier_call = cx7_hp_pci_notifier; + ret = bus_register_notifier(&pci_bus_type, &hp_dev->pci_notifier); + if (ret) { + dev_err(&pdev->dev, "Failed to register PCI bus notifier: %d\n", + ret); + goto sysfs_remove; + } + + if (gpiod_get_value(hp_dev->pins[PCIE_PIN_PRSNT].desc)) { + hp_dev->debug_state = CX7_HP_DEBUG_PLUG_OUT; + cx7_hp_send_uevent(hp_dev, REMOVAL_EVT); + } else { + hp_dev->debug_state = CX7_HP_DEBUG_PLUG_IN; + cx7_hp_send_uevent(hp_dev, PLUG_IN_EVT); + } + + dev_info(&pdev->dev, "PCIe hotplug driver initialized successfully\n"); + return 0; + +sysfs_remove: + sysfs_remove_group(&pdev->dev.kobj, &cx7_hp_attr_group); +pinctrl_remove: + cx7_hp_pinctrl_remove(hp_dev); + return ret; +} + +/** + * cx7_hp_remove - Platform device remove function + * @pdev: platform device + * + * Cleans up GPIO pins, pinctrl, sysfs interface, and bus protection. + */ +static void cx7_hp_remove(struct platform_device *pdev) +{ + struct cx7_hp_dev *hp_dev = platform_get_drvdata(pdev); + int i; + + if (!hp_dev) + return; + + sysfs_remove_group(&pdev->dev.kobj, &cx7_hp_attr_group); + + bus_unregister_notifier(&pci_bus_type, &hp_dev->pci_notifier); + + cx7_hp_rp_bus_protect(hp_dev, 0, BUS_PROTECT_CLEANUP); + + cx7_hp_pinctrl_remove(hp_dev); + + for (i = 0; i < hp_dev->pd->port_nums; i++) { + if (hp_dev->cached_root_ports[i]) + pci_dev_put(hp_dev->cached_root_ports[i]); + } + + platform_set_drvdata(pdev, NULL); +} + +static const struct acpi_device_id cx7_hp_acpi_match[] = { + {"MTKP0001", 0}, + {} +}; + +MODULE_DEVICE_TABLE(acpi, cx7_hp_acpi_match); + +static struct platform_driver cx7_hp_driver = { + .probe = cx7_hp_probe, + .remove = cx7_hp_remove, + .driver = { + .name = "cx7-pcie-hotplug", + .acpi_match_table = ACPI_PTR(cx7_hp_acpi_match), + }, +}; + +module_platform_driver(cx7_hp_driver); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("CX7 PCIe Hotplug Driver for NVIDIA DGX Systems"); From 67c7b6ae9f370ec697f718b4a81b12e645ba597e 
Mon Sep 17 00:00:00 2001 From: Brad Figg Date: Fri, 29 Mar 2024 13:31:34 -0700 Subject: [PATCH 207/247] NVIDIA: [Config] Add nvidia-fs build dependencies BugLink: https://bugs.launchpad.net/bugs/2059814 Signed-off-by: Brad Figg Acked-by: Brad Figg Acked-by: Ian May Signed-off-by: Ian May Signed-off-by: Jacob Martin (cherry picked from commit a64b5977c0cb9bb66af5f3d9fd7ed2a7eaebc131 linux-nvidia-6.14) Signed-off-by: Abdur Rahman --- debian.nvidia-6.17/control.stub.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/debian.nvidia-6.17/control.stub.in b/debian.nvidia-6.17/control.stub.in index b10ee18491eba..808f7dca049c3 100644 --- a/debian.nvidia-6.17/control.stub.in +++ b/debian.nvidia-6.17/control.stub.in @@ -52,6 +52,8 @@ Build-Depends: uuid-dev , zstd , bpftool:native [amd64 arm64] , + nvidia-dkms-kernel [amd64 arm64] , + nvidia-kernel-source [amd64 arm64] , Build-Depends-Indep: asciidoc , bzip2 , From 05d25507e598ad74094878ea251b77866a4949ac Mon Sep 17 00:00:00 2001 From: Abdur Rahman Date: Fri, 16 Jan 2026 15:21:38 -0500 Subject: [PATCH 208/247] UBUNTU: Start new release Ignore: yes Signed-off-by: Abdur Rahman --- debian.nvidia-6.17/changelog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/debian.nvidia-6.17/changelog b/debian.nvidia-6.17/changelog index 5475820539ce4..d41e80f162f8b 100644 --- a/debian.nvidia-6.17/changelog +++ b/debian.nvidia-6.17/changelog @@ -1,3 +1,11 @@ +linux-nvidia-6.17 (6.17.0-1007.7) UNRELEASED; urgency=medium + + CHANGELOG: Do not edit directly. Autogenerated at release. + CHANGELOG: Use the printchanges target to see the current changes. + CHANGELOG: Use the insertchanges target to create the final log. 
+ + -- Abdur Rahman Fri, 16 Jan 2026 15:21:38 -0500 + linux-nvidia-6.17 (6.17.0-1006.6) noble; urgency=medium * noble/linux-nvidia-6.17: 6.17.0-1006.6 -proposed tracker (LP: #2136206) From bc5ca3ed328070f97202cb3bd63cf1842a26a403 Mon Sep 17 00:00:00 2001 From: Abdur Rahman Date: Fri, 16 Jan 2026 15:22:34 -0500 Subject: [PATCH 209/247] UBUNTU: link-to-tracker: update tracking bug BugLink: https://bugs.launchpad.net/bugs/2137561 Properties: no-test-build Signed-off-by: Abdur Rahman --- debian.nvidia-6.17/tracking-bug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian.nvidia-6.17/tracking-bug b/debian.nvidia-6.17/tracking-bug index b3effa0c06b00..780343b0a29a0 100644 --- a/debian.nvidia-6.17/tracking-bug +++ b/debian.nvidia-6.17/tracking-bug @@ -1 +1 @@ -2136206 d2025.12.15-1 +2137561 d2025.12.18-1 From c048342be768c65340c09b4b391fc938c764a339 Mon Sep 17 00:00:00 2001 From: Abdur Rahman Date: Fri, 16 Jan 2026 15:24:37 -0500 Subject: [PATCH 210/247] UBUNTU: [Packaging] debian.nvidia-6.17/dkms-versions -- update from kernel-versions (main/d2025.12.18) BugLink: https://bugs.launchpad.net/bugs/1786013 Signed-off-by: Abdur Rahman --- debian.nvidia-6.17/dkms-versions | 1 + 1 file changed, 1 insertion(+) diff --git a/debian.nvidia-6.17/dkms-versions b/debian.nvidia-6.17/dkms-versions index e9a45983e54e3..131617ee2b513 100644 --- a/debian.nvidia-6.17/dkms-versions +++ b/debian.nvidia-6.17/dkms-versions @@ -1,3 +1,4 @@ zfs-linux 2.3.4-1ubuntu2 modulename=zfs debpath=pool/universe/z/%package%/zfs-dkms_%version%_all.deb arch=amd64 arch=arm64 arch=ppc64el arch=s390x arch=riscv64 rprovides=spl-modules rprovides=spl-dkms rprovides=zfs-modules rprovides=zfs-dkms v4l2loopback 0.15.0-0ubuntu2 modulename=v4l2loopback debpath=pool/universe/v/%package%/v4l2loopback-dkms_%version%_all.deb arch=amd64 rprovides=v4l2loopback-modules rprovides=v4l2loopback-dkms mstflint 4.26.0-1 modulename=mstflint_access debpath=pool/universe/m/%package%/mstflint-dkms_%version%_all.deb 
arch=amd64 arch=arm64 rprovides=mstflint-modules rprovides=mstflint-dkms +nvidia-fs 2.28.0-1 modulename=nvidia-fs debpath=pool/universe/n/%package%/nvidia-fs-dkms_%version%_amd64.deb arch=amd64 arch=arm64 rprovides=nvidia-fs-modules rprovides=nvidia-fs-dkms type=standalone From 899b59aee509b68da27ca8a64923f99c9654562c Mon Sep 17 00:00:00 2001 From: Abdur Rahman Date: Fri, 16 Jan 2026 15:25:40 -0500 Subject: [PATCH 211/247] UBUNTU: Ubuntu-nvidia-6.17-6.17.0-1007.7 Signed-off-by: Abdur Rahman --- debian.nvidia-6.17/changelog | 3398 +++++++++++++++++++++++++++++++- debian.nvidia-6.17/reconstruct | 3 + 2 files changed, 3396 insertions(+), 5 deletions(-) diff --git a/debian.nvidia-6.17/changelog b/debian.nvidia-6.17/changelog index d41e80f162f8b..34e0bc30c37e7 100644 --- a/debian.nvidia-6.17/changelog +++ b/debian.nvidia-6.17/changelog @@ -1,10 +1,3398 @@ -linux-nvidia-6.17 (6.17.0-1007.7) UNRELEASED; urgency=medium +linux-nvidia-6.17 (6.17.0-1007.7) noble; urgency=medium - CHANGELOG: Do not edit directly. Autogenerated at release. - CHANGELOG: Use the printchanges target to see the current changes. - CHANGELOG: Use the insertchanges target to create the final log. 
+ * noble/linux-nvidia-6.17: 6.17.0-1007.7 -proposed tracker (LP: #2137561) - -- Abdur Rahman Fri, 16 Jan 2026 15:21:38 -0500 + * Packaging resync (LP: #1786013) + - [Packaging] debian.nvidia-6.17/dkms-versions -- update from kernel- + versions (main/d2025.12.18) + + * Enable GDS in the 6.8 based linux-nvidia kernel (LP: #2059814) + - NVIDIA: [Config] Add nvidia-fs build dependencies + + * Add PCIe Hotplug Driver for CX7 on DGX Spark (LP: #2138269) + - NVIDIA: SAUCE: MEDIATEK: platform: Add PCIe Hotplug Driver for CX7 on + DGX Spark + + * Backport support for Grace MPAM (LP: #2122432) + - x86,fs/resctrl: Consolidate monitor event descriptions + - x86,fs/resctrl: Replace architecture event enabled checks + - x86/resctrl: Remove the rdt_mon_features global variable + - x86,fs/resctrl: Prepare for more monitor events + - x86/cpufeatures: Add support for Assignable Bandwidth Monitoring + Counters (ABMC) + - x86/resctrl: Add ABMC feature in the command line options + - x86,fs/resctrl: Consolidate monitoring related data from rdt_resource + - x86,fs/resctrl: Detect Assignable Bandwidth Monitoring feature details + - x86/resctrl: Add support to enable/disable AMD ABMC feature + - fs/resctrl: Introduce the interface to display monitoring modes + - fs/resctrl: Add resctrl file to display number of assignable counters + - fs/resctrl: Introduce mbm_cntr_cfg to track assignable counters per + domain + - fs/resctrl: Introduce interface to display number of free MBM counters + - x86/resctrl: Add data structures and definitions for ABMC assignment + - fs/resctrl: Introduce event configuration field in struct mon_evt + - x86,fs/resctrl: Implement resctrl_arch_config_cntr() to assign a counter + with ABMC + - fs/resctrl: Add the functionality to assign MBM events + - fs/resctrl: Add the functionality to unassign MBM events + - fs/resctrl: Pass struct rdtgroup instead of individual members + - fs/resctrl: Introduce counter ID read, reset calls in mbm_event mode + - x86/resctrl: 
Implement resctrl_arch_reset_cntr() and + resctrl_arch_cntr_read() + - fs/resctrl: Support counter read/reset with mbm_event assignment mode + - fs/resctrl: Add event configuration directory under info/L3_MON/ + - fs/resctrl: Provide interface to update the event configurations + - fs/resctrl: Introduce mbm_assign_on_mkdir to enable assignments on mkdir + - fs/resctrl: Auto assign counters on mkdir and clean up on group removal + - fs/resctrl: Introduce mbm_L3_assignments to list assignments in a group + - fs/resctrl: Introduce the interface to modify assignments in a group + - fs/resctrl: Disable BMEC event configuration when mbm_event mode is + enabled + - fs/resctrl: Introduce the interface to switch between monitor modes + - x86/resctrl: Configure mbm_event mode if supported + - MAINTAINERS: resctrl: Add myself as reviewer + - fs/resctrl: Fix counter auto-assignment on mkdir with mbm_event enabled + - NVIDIA: SAUCE: DT: cacheinfo: Expose the code to generate a cache-id + from a device_node + - NVIDIA: SAUCE: ACPI / PPTT: Add a helper to fill a cpumask from a + processor container + - NVIDIA: SAUCE: ACPI / PPTT: Stop acpi_count_levels() expecting callers + to clear levels + - NVIDIA: SAUCE: ACPI / PPTT: Find cache level by cache-id + - NVIDIA: SAUCE: ACPI / PPTT: Add a helper to fill a cpumask from a + cache_id + - NVIDIA: SAUCE: DROP: ACPI / PPTT: Add a for_each_acpi_pptt_entry() + helper + - NVIDIA: SAUCE: arm64: kconfig: Add Kconfig entry for MPAM + - NVIDIA: SAUCE: ACPI / MPAM: Parse the MPAM table + - NVIDIA: SAUCE: DT: dt-bindings: arm: Add MPAM MSC binding + - NVIDIA: SAUCE: arm_mpam: Add probe/remove for mpam msc driver and kbuild + boiler plate + - NVIDIA: SAUCE: arm_mpam: parse resources + - NVIDIA: SAUCE: DT: arm_mpam: Add support for memory controller MSC on DT + platforms + - NVIDIA: SAUCE: arm_mpam: Add the class and component structures for + firmware described ris + - NVIDIA: SAUCE: arm_mpam: Add MPAM MSC register layout definitions + - NVIDIA: 
SAUCE: arm_mpam: Add cpuhp callbacks to probe MSC hardware + - NVIDIA: SAUCE: arm_mpam: Probe hardware to find the supported partid/pmg + values + - NVIDIA: SAUCE: arm_mpam: Add helpers for managing the locking around the + mon_sel registers + - NVIDIA: SAUCE: arm_mpam: Probe the hardware features resctrl supports + - NVIDIA: SAUCE: arm_mpam: Merge supported features during mpam_enable() + into mpam_class + - NVIDIA: SAUCE: arm_mpam: Reset MSC controls from cpuhp callbacks + - NVIDIA: SAUCE: arm_mpam: Add a helper to touch an MSC from any CPU + - NVIDIA: SAUCE: arm_mpam: Extend reset logic to allow devices to be reset + any time + - NVIDIA: SAUCE: arm_mpam: Register and enable IRQs + - NVIDIA: SAUCE: arm_mpam: Use a static key to indicate when mpam is + enabled + - NVIDIA: SAUCE: arm_mpam: Allow configuration to be applied and restored + during cpu online + - NVIDIA: SAUCE: arm_mpam: Probe and reset the rest of the features + - NVIDIA: SAUCE: arm_mpam: Add helpers to allocate monitors + - NVIDIA: SAUCE: arm_mpam: Add mpam_msmon_read() to read monitor value + - NVIDIA: SAUCE: fixup for _msmon_read, reported by Zeng + - NVIDIA: SAUCE: arm_mpam: Track bandwidth counter state for overflow and + power management + - NVIDIA: SAUCE: arm_mpam: Probe for long/lwd mbwu counters + - NVIDIA: SAUCE: arm_mpam: Use long MBWU counters if supported + - NVIDIA: SAUCE: arm_mpam: Add helper to reset saved mbwu state + - NVIDIA: SAUCE: arm_mpam: Add kunit test for bitmap reset + - NVIDIA: SAUCE: arm_mpam: Add kunit tests for props_mismatch() + - NVIDIA: SAUCE: arm64: mpam: Context switch the MPAM registers + - NVIDIA: SAUCE: arm64: mpam: Re-initialise MPAM regs when CPU comes + online + - NVIDIA: SAUCE: arm64: mpam: Advertise the CPUs MPAM limits to the driver + - NVIDIA: SAUCE: arm64: mpam: Add cpu_pm notifier to restore MPAM sysregs + - NVIDIA: SAUCE: arm64: mpam: Add helpers to change a tasks and cpu mpam + partid/pmg values + - NVIDIA: SAUCE: cacheinfo: Add helper to find the cache 
size from + cpu+level + - NVIDIA: SAUCE: arm_mpam: resctrl: Add boilerplate cpuhp and domain + allocation + - NVIDIA: SAUCE: arm_mpam: resctrl: Pick the caches we will use as resctrl + resources + - NVIDIA: SAUCE: arm_mpam: resctrl: Implement + resctrl_arch_reset_all_ctrls() + - NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_get_config() + - NVIDIA: SAUCE: arm_mpam: resctrl: Implement helpers to update + configuration + - NVIDIA: SAUCE: arm_mpam: resctrl: Add plumbing against arm64 task and + cpu hooks + - NVIDIA: SAUCE: arm_mpam: resctrl: Add CDP emulation + - NVIDIA: SAUCE: arm_mpam: resctrl: Add rmid index helpers + - NVIDIA: SAUCE: arm_mpam: resctrl: Convert to/from MPAMs bitmaps and + fixed-point formats + - NVIDIA: SAUCE: arm_mpam: resctrl: Add support for 'MB' resource + - NVIDIA: SAUCE: arm_mpam: resctrl: Reject oversized memory bandwidth + portion bitmaps + - NVIDIA: SAUCE: arm_mpam: resctrl: Fix MB min_bandwidth value exposed to + userspace + - NVIDIA: SAUCE: arm_mpam: resctrl: Add kunit test for control format + conversions + - NVIDIA: SAUCE: arm_mpam: resctrl: Add support for csu counters + - NVIDIA: SAUCE: untested: arm_mpam: resctrl: pick classes for use as mbm + counters + - NVIDIA: SAUCE: arm_mpam: resctrl: Pre-allocate free running monitors + - NVIDIA: SAUCE: arm_mpam: resctrl: Pre-allocate assignable monitors + - NVIDIA: SAUCE: arm_mpam: resctrl: Add kunit test for ABMC/CDP + interactions + - NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_config_cntr() for + ABMC use + - NVIDIA: SAUCE: arm_mpam: resctrl: Allow resctrl to allocate monitors + - NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_rmid_read() and + resctrl_arch_reset_rmid() + - NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_cntr_read() & + resctrl_arch_reset_cntr() + - NVIDIA: SAUCE: untested: arm_mpam: resctrl: Allow monitors to be + configured with filters + - NVIDIA: SAUCE: arm_mpam: resctrl: Add empty definitions for fine-grained + enables + - NVIDIA: SAUCE: arm64: 
mpam: Select ARCH_HAS_CPU_RESCTRL + - NVIDIA: SAUCE: fs/resctrl: Don't touch rmid_ptrs[] in free_rmid() when + there are no monitors + - NVIDIA: SAUCE: fs/resctrl: Avoid a race with dom_data_exit() and + closid_num_dirty_rmid[] + - NVIDIA: SAUCE: fs/resctrl: Avoid a race with dom_data_exit() and + rmid_ptrs[] + - NVIDIA: SAUCE: perf/arm-cmn: Stop claiming all the resources + - NVIDIA: SAUCE: arm_mpam: resctrl: Call resctrl_init() on platforms that + can support resctrl + - NVIDIA: SAUCE: arm_mpam: resctrl: Call resctrl_exit() in the event of + errors + - NVIDIA: SAUCE: arm_mpam: resctrl: Update the rmid reallocation limit + - NVIDIA: SAUCE: arm_mpam: resctrl: Sort the order of the domain lists + - NVIDIA: SAUCE: arm_mpam: Generate a configuration for min controls + - NVIDIA: SAUCE: arm_mpam: Add quirk framework + - NVIDIA: SAUCE: arm_mpam: Add workaround for T241-MPAM-1 + - NVIDIA: SAUCE: arm_mpam: Add workaround for T241-MPAM-4 + - NVIDIA: SAUCE: arm_mpam: Add workaround for T241-MPAM-6 + - NVIDIA: SAUCE: arm_mpam: Quirk CMN-650's CSU NRDY behaviour + - NVIDIA: SAUCE: debugfs: Add helpers for creating cpumask entries in + debugfs + - NVIDIA: SAUCE: arm_mpam: Add debugfs entries to show the MSC/RIS the + driver discovered + - NVIDIA: SAUCE: arm_mpam: Add force-disable debugfs trigger + - NVIDIA: SAUCE: arm_mpam: Expose the number of NRDY retries in debugfs + - NVIDIA: SAUCE: arm_mpam: Add resctrl_arch_round_bw() + - NVIDIA: SAUCE: fs/resctrl,x86/resctrl: Factor mba rounding to be per- + arch + - NVIDIA: SAUCE: arm_mpam: Split the locking around the mon_sel registers + - NVIDIA: SAUCE: arm_mpam: Relax num_rmids parameter advertised to + userspace + - NVIDIA: SAUCE: arm_mpam: Allow the maximum partid to be overridden from + the command line + - NVIDIA: SAUCE: arm_mpam: Allow MSC to be forced to have an unknown + location + - NVIDIA: SAUCE: fs/resctrl: Add this_is_not_abi mount option + - NVIDIA: SAUCE: iommu/arm-smmu-v3: Register SMMU capabilities with MPAM + - 
NVIDIA: SAUCE: iommu/arm-smmu-v3: Add mpam helpers to query and set + state + - NVIDIA: SAUCE: iommu: Add helpers to get and set the QoS state + - NVIDIA: SAUCE: iommu: Add helpers to retrieve iommu_groups by id or + kobject + - NVIDIA: SAUCE: iommu: Add helper to retrieve iommu kset + - NVIDIA: SAUCE: kobject: Add kset_get_next_obj() to allow a kset to be + walked + - NVIDIA: SAUCE: arm_mpam: resctrl: Add iommu helpers to get/set the + partid and pmg + - NVIDIA: SAUCE: fs/resctrl: Add support for assigning iommu_groups to + resctrl groups + - NVIDIA: SAUCE: firmware: arm_scmi: add MPAM-FB SCMI protocol stub + - NVIDIA: SAUCE: arm_mpam: add MPAM-FB MSC firmware access support + - NVIDIA: SAUCE: arm_mpam: Allow duplicate PCC subspace_ids + - NVIDIA: SAUCE: untested: mpam: Convert pcc_channels list to XArray and + cleanup + - NVIDIA: SAUCE: x86/resctrl: Add stub to allow other architecture to + disable monitor overflow + - NVIDIA: SAUCE: arm_mpam: resctrl: Determine if any exposed counter can + overflow + - NVIDIA: SAUCE: fs/restrl: Allow the overflow handler to be disabled + - NVIDIA: SAUCE: fs/resctrl: Uniform data type of + component_id/domid/id/cache_id + - NVIDIA: SAUCE: arm_mpam: Allow cmax/cmin to be configured + - NVIDIA: SAUCE: arm_mpam: Rename mbw conversion to 'fract16' for code re- + use + - NVIDIA: SAUCE: fs/resctrl: Group all the MBA specific properties in a + separate struct + - NVIDIA: SAUCE: fs/resctrl: Abstract duplicate domain test to a helper + - NVIDIA: SAUCE: fs/resctrl: Move MBA supported check to parse_line() + instead of parse_bw() + - NVIDIA: SAUCE: fs/resctrl: Rename resctrl_get_default_ctrl() to include + resource + - NVIDIA: SAUCE: fs/resctrl: Add a schema format to the schema, allowing + it to be different + - NVIDIA: SAUCE: fs/resctrl: Use schema format to check the resource is a + bitmap + - NVIDIA: SAUCE: fs/resctrl: Add specific schema types for 'range' + - NVIDIA: SAUCE: x86/resctrl: Move over to specifying MBA control formats + - 
NVIDIA: SAUCE: arm_mpam: resctrl: Convert MB resource to use percentage + - NVIDIA: SAUCE: fs/resctrl: Remove 'range' schema format + - NVIDIA: SAUCE: fs/resctrl: Add additional files for percentage and + bitmap controls + - NVIDIA: SAUCE: fs/resctrl: Add fflags_from_schema() for files based on + schema format + - NVIDIA: SAUCE: fs/resctrl: Expose the schema format to user-space + - NVIDIA: SAUCE: fs/resctrl: Add L2 and L3 'MAX' resource schema + - NVIDIA: SAUCE: arm_mpam: resctrl: Add the glue code to convert to/from + cmax + - NVIDIA: SAUCE: mm,memory_hotplug: Add lockdep assertion helper + - NVIDIA: SAUCE: fs/resctrl: Take memory hotplug lock whenever taking CPU + hotplug lock + - NVIDIA: SAUCE: fs/resctrl: Add mount option for mb_uses_numa_nid and + arch stubs + - NVIDIA: SAUCE: Fix unused variable warning + - NVIDIA: SAUCE: arm_mpam: resctrl: Pick whether MB can use NUMA nid + instead of cache-id + - NVIDIA: SAUCE: arm_mpam: resctrl: Change domain_hdr online/offline to + work with a set of CPUs + - NVIDIA: SAUCE: untested: arm_mpam: resctrl: Split + mpam_resctrl_alloc_domain() to have CPU and node + - NVIDIA: SAUCE: arm_mpam: resctrl: Add NUMA node notifier for domain + online/offline + - NVIDIA: SAUCE: untested: arm_mpam: resctrl: Allow resctrl to enable NUMA + nid as MB domain-id + - NVIDIA: SAUCE: [Config] RESCTRL configs added to annotations + - NVIDIA: SAUCE: arm_mpam: Fix missing SHIFT definitions + - NVIDIA: SAUCE: Fix partid_max range issue + - x86,fs/resctrl: Fix NULL pointer dereference with events force-disabled + in mbm_event mode + - NVIDIA: SAUCE: [Config] Update RESCTRL annotations + - NVIDIA: SAUCE: arm_mpam: resctrl: Fix MPAM kunit + - NVIDIA: SAUCE: resctrl/mpam: Align packed mpam_props to fix arm64 KUnit + alignment fault + - NVIDIA: SAUCE: resctrl/tests: mpam_devices: compare only meaningful + bytes of mpam_props + + * Add patches to Fix CPU_CYCLES counting on SMT cores by avoiding + PMCCNTR_EL0 (LP: #2136812) + - perf: arm_pmuv3: Factor 
out PMCCNTR_EL0 use conditions + - perf: arm_pmuv3: Don't use PMCCNTR_EL0 on SMT cores + + [ Ubuntu: 6.17.0-14.14 ] + + * questing/linux: 6.17.0-14.14 -proposed tracker (LP: #2137849) + * Packaging resync (LP: #1786013) + - [Packaging] debian.master/dkms-versions -- update from kernel-versions + (main/2026.01.12) + * ubuntu_kselftests:_net/net:gre_gso.sh failing (LP: #2136820) + - SAUCE increase socat timeout in gre_gso.sh + * ubuntu_blktrace_smoke_test fails on questing with rust coreutils + (LP: #2137698) + - SAUCE: Revert "ext4: fail unaligned direct IO write with EINVAL" + * bareudp.sh in ubuntu_kselftests_net fails because of dash default shell + (LP: #2129812) + - selftests: net: use BASH for bareudp testing + * CVE-2025-40256 + - xfrm: also call xfrm_state_delete_tunnel at destroy time for states that + were never added + * Enable PMF on AMD HPT/STX/KRK (LP: #2125022) + - platform/x86/amd/pmf: Add support for adjusting PMF PPT and PPT APU + thresholds + - platform/x86/amd/pmf: Extend custom BIOS inputs for more policies + - platform/x86/amd/pmf: Update ta_pmf_action structure member + - platform/x86/amd/pmf: Add helper to verify BIOS input notifications are + enable/disable + - platform/x86/amd/pmf: Add custom BIOS input support for AMD_CPU_ID_PS + - platform/x86/amd/pmf: Preserve custom BIOS inputs for evaluating the + policies + - platform/x86/amd/pmf: Call enact function sooner to process early + pending requests + - platform/x86/amd/pmf: Add debug logs for pending requests and custom + BIOS inputs + * Questing update: v6.17.8 upstream stable release (LP: #2136850) + - iommufd/selftest: Fix ioctl return value in _test_cmd_trigger_vevents() + - drm/mediatek: Add pm_runtime support for GCE power control + - drm/i915: Fix conversion between clock ticks and nanoseconds + - drm/amdgpu: set default gfx reset masks for gfx6-8 + - drm/amd/display: Don't stretch non-native images by default in eDP + - smb: client: fix refcount leak in smb2_set_path_attr + - 
iommufd: Make vfio_compat's unmap succeed if the range is already empty + - futex: Optimize per-cpu reference counting + - drm/amd: Fix suspend failure with secure display TA + - drm/xe: Move declarations under conditional branch + - drm/xe: Do clean shutdown also when using flr + - drm/amd/display: Add pixel_clock to amd_pp_display_configuration + - drm/amd/pm: Use pm_display_cfg in legacy DPM (v2) + - drm/amd/display: Disable fastboot on DCE 6 too + - drm/amd/pm: Disable MCLK switching on SI at high pixel clocks + - drm/amd: Disable ASPM on SI + - arm64: kprobes: check the return value of set_memory_rox() + - compiler_types: Move unused static inline functions warning to W=2 + - riscv: Build loader.bin exclusively for Canaan K210 + - RISC-V: clear hot-unplugged cores from all task mm_cpumasks to avoid + rfence errors + - riscv: acpi: avoid errors caused by probing DT devices when ACPI is used + - fs: return EOPNOTSUPP from file_setattr/file_getattr syscalls + - ASoC: nau8821: Avoid unnecessary blocking in IRQ handler + - NFS4: Fix state renewals missing after boot + - drm/amdkfd: fix suspend/resume all calls in mes based eviction path + - NFS4: Apply delay_retrans to async operations + - HID: intel-thc-hid: intel-quickspi: Add ARL PCI Device Id's + - HID: quirks: avoid Cooler Master MM712 dongle wakeup bug + - ixgbe: handle IXGBE_VF_GET_PF_LINK_STATE mailbox operation + - HID: nintendo: Wait longer for initial probe + - NFS: check if suid/sgid was cleared after a write as needed + - HID: quirks: Add ALWAYS_POLL quirk for VRS R295 steering wheel + - io_uring: fix unexpected placement on same size resizing + - HID: logitech-hidpp: Add HIDPP_QUIRK_RESET_HI_RES_SCROLL + - ASoC: max98090/91: fixed max98091 ALSA widget powering up/down + - ALSA: hda/realtek: Fix mute led for HP Omen 17-cb0xxx + - ixgbe: handle IXGBE_VF_FEATURES_NEGOTIATE mbox cmd + - wifi: ath11k: zero init info->status in wmi_process_mgmt_tx_comp() + - selftests: net: local_termination: Wait for 
interfaces to come up + - net: fec: correct rx_bytes statistic for the case SHIFT16 is set + - net: phy: micrel: Introduce lanphy_modify_page_reg + - net: phy: micrel: Replace hardcoded pages with defines + - net: phy: micrel: lan8814 fix reset of the QSGMII interface + - rust: Add -fno-isolate-erroneous-paths-dereference to + bindgen_skip_c_flags + - NFSD: Skip close replay processing if XDR encoding fails + - Bluetooth: 6lowpan: fix BDADDR_LE vs ADDR_LE_DEV address type confusion + - Bluetooth: 6lowpan: Don't hold spin lock over sleeping functions + - Bluetooth: hci_conn: Fix not cleaning up PA_LINK connections + - net: dsa: tag_brcm: do not mark link local traffic as offloaded + - net/smc: fix mismatch between CLC header and proposal + - net/handshake: Fix memory leak in tls_handshake_accept() + - net: ethernet: ti: am65-cpsw-qos: fix IET verify/response timeout + - net: ethernet: ti: am65-cpsw-qos: fix IET verify retry mechanism + - net: mdio: fix resource leak in mdiobus_register_device() + - wifi: mac80211: skip rate verification for not captured PSDUs + - Bluetooth: hci_event: Fix not handling PA Sync Lost event + - net/mlx5e: Fix missing error assignment in mlx5e_xfrm_add_state() + - net/mlx5e: Fix maxrate wraparound in threshold between units + - net/mlx5e: Fix wraparound in rate limiting for values above 255 Gbps + - net/mlx5e: Fix potentially misleading debug message + - net/mlx5: Fix typo of MLX5_EQ_DOORBEL_OFFSET + - net/mlx5: Store the global doorbell in mlx5_priv + - net/mlx5e: Prepare for using different CQ doorbells + - net_sched: limit try_bulk_dequeue_skb() batches + - wifi: iwlwifi: mvm: fix beacon template/fixed rate + - wifi: iwlwifi: mld: always take beacon ies in link grading + - virtio-net: fix incorrect flags recording in big mode + - hsr: Fix supervision frame sending on HSRv0 + - hsr: Follow standard for HSRv0 supervision frames + - ACPI: CPPC: Detect preferred core availability on online CPUs + - ACPI: CPPC: Check _CPC validity for only 
the online CPUs + - ACPI: CPPC: Perform fast check switch only for online CPUs + - ACPI: CPPC: Limit perf ctrs in PCC check only to online CPUs + - cpufreq: intel_pstate: Check IDA only before MSR_IA32_PERF_CTL writes + - Bluetooth: L2CAP: export l2cap_chan_hold for modules + - io_uring/rsrc: don't use blk_rq_nr_phys_segments() as number of bvecs + - acpi,srat: Fix incorrect device handle check for Generic Initiator + - regulator: fixed: fix GPIO descriptor leak on register failure + - ASoC: cs4271: Fix regulator leak on probe failure + - ASoC: codecs: va-macro: fix resource leak in probe error path + - drm/vmwgfx: Restore Guest-Backed only cursor plane support + - ASoC: tas2781: fix getting the wrong device number + - pnfs: Fix TLS logic in _nfs4_pnfs_v3_ds_connect() + - pnfs: Fix TLS logic in _nfs4_pnfs_v4_ds_connect() + - pnfs: Set transport security policy to RPC_XPRTSEC_NONE unless using TLS + - simplify nfs_atomic_open_v23() + - NFSv2/v3: Fix error handling in nfs_atomic_open_v23() + - NFS: sysfs: fix leak when nfs_client kobject add fails + - NFSv4: Fix an incorrect parameter when calling nfs4_call_sync() + - drm/amd/amdgpu: Ensure isp_kernel_buffer_alloc() creates a new BO + - acpi/hmat: Fix lockdep warning for hmem_register_resource() + - ASoC: rsnd: fix OF node reference leak in rsnd_ssiu_probe() + - drm/client: fix MODULE_PARM_DESC string for "active" + - irqchip/riscv-intc: Add missing free() callback in riscv_intc_domain_ops + - lib/crypto: arm/curve25519: Disable on CPU_BIG_ENDIAN + - hostfs: Fix only passing host root in boot stage with new mount + - afs: Fix dynamic lookup to fail on cell lookup failure + - mtd: onenand: Pass correct pointer to IRQ handler + - virtio-fs: fix incorrect check for fsvq->kobj + - fs/namespace: correctly handle errors returned by grab_requested_mnt_ns + - perf header: Write bpf_prog (infos|btfs)_cnt to data file + - perf build: Don't fail fast path feature detection when binutils-devel + is not available + - perf lock: 
Fix segfault due to missing kernel map + - perf test shell lock_contention: Extra debug diagnostics + - perf test: Fix lock contention test + - arm64: dts: rockchip: Set correct pinctrl for I2S1 8ch TX on odroid-m1 + - arm64: dts: rockchip: Fix PCIe power enable pin for BigTreeTech CB2 and + Pi2 + - arm64: dts: rockchip: Make RK3588 GPU OPP table naming less generic + - ARM: dts: imx6ull-engicam-microgea-rmm: fix report-rate-hz value + - ARM: dts: imx51-zii-rdu1: Fix audmux node names + - arm64: dts: imx8-ss-img: Avoid gpio0_mipi_csi GPIOs being deferred + - arm64: dts: imx8mp-kontron: Fix USB OTG role switching + - HID: hid-ntrig: Prevent memory leak in ntrig_report_version() + - ARM: dts: BCM53573: Fix address of Luxul XAP-1440's Ethernet PHY + - arm64: dts: rockchip: Fix USB power enable pin for BTT CB2 and Pi2 + - arm64: dts: rockchip: drop reset from rk3576 i2c9 node + - pwm: adp5585: Correct mismatched pwm chip info + - HID: playstation: Fix memory leak in dualshock4_get_calibration_data() + - HID: uclogic: Fix potential memory leak in error path + - LoongArch: KVM: Restore guest PMU if it is enabled + - LoongArch: KVM: Add delay until timer interrupt injected + - LoongArch: KVM: Fix max supported vCPUs set with EIOINTC + - KVM: arm64: Make all 32bit ID registers fully writable + - KVM: SVM: Mark VMCB_LBR dirty when MSR_IA32_DEBUGCTLMSR is updated + - KVM: nSVM: Always recalculate LBR MSR intercepts in svm_update_lbrv() + - KVM: nSVM: Fix and simplify LBR virtualization handling with nested + - KVM: VMX: Fix check for valid GVA on an EPT violation + - nfsd: add missing FATTR4_WORD2_CLONE_BLKSIZE from supported attributes + - gcov: add support for GCC 15 + - kho: warn and exit when unpreserved page wasn't preserved + - strparser: Fix signed/unsigned mismatch bug + - dma-mapping: benchmark: Restore padding to ensure uABI remained + consistent + - maple_tree: fix tracepoint string pointers + - LoongArch: Consolidate early_ioremap()/ioremap_prot() + - LoongArch: 
Use correct accessor to read FWPC/MWPC + - LoongArch: Let {pte,pmd}_modify() record the status of _PAGE_DIRTY + - mm/damon/sysfs: change next_update_jiffies to a global variable + - selftests/tracing: Run sample events to clear page cache events + - wifi: mac80211: reject address change while connecting + - mm/huge_memory: preserve PG_has_hwpoisoned if a folio is split to >0 + order + - mm/mm_init: fix hash table order logging in alloc_large_system_hash() + - mm/damon/stat: change last_refresh_jiffies to a global variable + - mm/kmsan: fix kmsan kmalloc hook when no stack depots are allocated yet + - mm/shmem: fix THP allocation and fallback loop + - mm/mremap: honour writable bit in mremap pte batching + - mm/huge_memory: fix folio split check for anon folios in swapcache + - mmc: sdhci-of-dwcmshc: Change DLL_STRBIN_TAPNUM_DEFAULT to 0x4 + - mmc: pxamci: Simplify pxamci_probe() error handling using devm APIs + - mmc: dw_mmc-rockchip: Fix wrong internal phase calculate + - ASoC: sdw_utils: fix device reference leak in is_sdca_endpoint_present() + - crypto: hisilicon/qm - Fix device reference leak in qm_get_qos_value + - smb: client: fix cifs_pick_channel when channel needs reconnect + - spi: Try to get ACPI GPIO IRQ earlier + - x86/microcode/AMD: Add Zen5 model 0x44, stepping 0x1 minrev + - x86/CPU/AMD: Add additional fixed RDSEED microcode revisions + - selftests/user_events: fix type cast for write_index packed member in + perf_test + - gendwarfksyms: Skip files with no exports + - ftrace: Fix BPF fexit with livepatch + - LoongArch: Consolidate max_pfn & max_low_pfn calculation + - LoongArch: Use physical addresses for CSR_MERRENTRY/CSR_TLBRENTRY + - EDAC/altera: Handle OCRAM ECC enable after warm reset + - EDAC/altera: Use INTTEST register for Ethernet and USB SBE injection + - PM: hibernate: Emit an error when image writing fails + - PM: hibernate: Use atomic64_t for compressed_size variable + - btrfs: zoned: fix conventional zone capacity calculation + - 
btrfs: zoned: fix stripe width calculation + - btrfs: scrub: put bio after errors in scrub_raid56_parity_stripe() + - btrfs: do not update last_log_commit when logging inode due to a new + name + - btrfs: release root after error in data_reloc_print_warning_inode() + - drm/amdkfd: relax checks for over allocation of save area + - drm/amdgpu: disable peer-to-peer access for DCC-enabled GC12 VRAM + surfaces + - drm/i915/psr: fix pipe to vblank conversion + - drm/xe/xe3lpg: Extend Wa_15016589081 for xe3lpg + - drm/xe/xe3: Extend wa_14023061436 + - drm/xe/xe3: Add WA_14024681466 for Xe3_LPG + - pmdomain: imx: Fix reference count leak in imx_gpc_remove + - pmdomain: samsung: plug potential memleak during probe + - pmdomain: samsung: Rework legacy splash-screen handover workaround + - selftests: mptcp: connect: fix fallback note due to OoO + - selftests: mptcp: join: rm: set backup flag + - selftests: mptcp: join: endpoints: longer transfer + - selftests: mptcp: connect: trunc: read all recv data + - selftests: mptcp: join: userspace: longer transfer + - selftests: mptcp: join: properly kill background tasks + - mm/huge_memory: do not change split_huge_page*() target order silently + - mm/memory: do not populate page table entries beyond i_size + - scripts/decode_stacktrace.sh: symbol: avoid trailing whitespaces + - scripts/decode_stacktrace.sh: symbol: preserve alignment + - scripts/decode_stacktrace.sh: fix build ID and PC source parsing + - ASoC: da7213: Convert to DEFINE_RUNTIME_DEV_PM_OPS() + - ASoC: da7213: Use component driver suspend/resume + - KVM: x86: Rename local "ecx" variables to "msr" and "pmc" as appropriate + - KVM: x86: Add support for RDMSR/WRMSRNS w/ immediate on Intel + - KVM: VMX: Inject #UD if guest tries to execute SEAMCALL or TDCALL + - isdn: mISDN: hfcsusb: fix memory leak in hfcsusb_probe() + - net: phy: micrel: Fix lan8814_config_init + - Linux 6.17.9 + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68204 + - 
pmdomain: arm: scmi: Fix genpd leak on provider registration failure + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68203 + - drm/amdgpu: fix lock warning in amdgpu_userq_fence_driver_process + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40267 + - io_uring/rw: ensure allocated iovec gets cleared for early failure + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68198 + - crash: fix crashkernel resource shrink + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68199 + - codetag: debug: handle existing CODETAG_EMPTY in mark_objexts_empty for + slabobj_ext + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40268 + - cifs: client: fix memory leak in smb3_fs_context_parse_param + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40269 + - ALSA: usb-audio: Fix potential overflow of PCM transfer buffer + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68205 + - ALSA: hda/hdmi: Fix breakage at probing nvhdmi-mcp driver + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40270 + - mm, swap: fix potential UAF issue for VMA readahead + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40271 + - fs/proc: fix uaf in proc_readdir_de() + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40272 + - mm/secretmem: fix use-after-free race in fault handler + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68245 + - net: netpoll: fix incorrect refcount handling causing incorrect cleanup + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68240 + - nilfs2: avoid having an active sc_timer before freeing sci + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68241 + - ipv4: route: 
Prevent rt_bind_exception() from rebinding stale fnhe + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68211 + - ksm: use range-walk function to jump over holes in + scan_get_next_rmap_item + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68246 + - ksmbd: close accepted socket when per-IP limit rejects connection + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40273 + - NFSD: free copynotify stateid in nfs4_free_ol_stateid() + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40212 + - nfsd: fix refcount leak in nfsd_set_fh_dentry() + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40274 + - KVM: guest_memfd: Remove bindings on memslot deletion when gmem is dying + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68202 + - sched_ext: Fix unsafe locking in the scx_dump_state() + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68239 + - binfmt_misc: restore write access before closing files opened by + open_exec() + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68247 + - posix-timers: Plug potential memory leak in do_timer_create() + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68208 + - bpf: account for current allocated stack depth in + widen_imprecise_scalars() + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68200 + - bpf: Add bpf_prog_run_data_pointers() + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40275 + - ALSA: usb-audio: Fix NULL pointer dereference in + snd_usb_mixer_controls_badd + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68242 + - NFS: Fix LTP test failures when timestamps are delegated + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + 
CVE-2025-68243 + - NFS: Check the TLS certificate fields in nfs_match_client() + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40276 + - drm/panthor: Flush shmem writes before mapping buffers CPU-uncached + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40277 + - drm/vmwgfx: Validate command header size against SVGA_CMD_MAX_DATASIZE + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68206 + - netfilter: nft_ct: add seqadj extension for natted connections + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68209 + - mlx5: Fix default values in create CQ + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40278 + - net: sched: act_ife: initialize struct tc_ife to fix KMSAN kernel- + infoleak + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40279 + - net: sched: act_connmark: initialize struct tc_ife to fix kernel leak + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40280 + - tipc: Fix use-after-free in tipc_mon_reinit_self(). 
+ * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40281 + - sctp: prevent possible shift-out-of-bounds in sctp_transport_update_rto + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40282 + - Bluetooth: 6lowpan: reset link-local header on ipv6 recv path + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40283 + - Bluetooth: btusb: reorder cleanup in btusb_disconnect to avoid UAF + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40284 + - Bluetooth: MGMT: cancel mesh send timer when hdev removed + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68210 + - erofs: avoid infinite loop due to incomplete zstd-compressed data + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40285 + - smb/server: fix possible refcount leak in smb2_sess_setup() + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40286 + - smb/server: fix possible memory leak in smb2_read() + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40287 + - exfat: fix improper check of dentry.stream.valid_size + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40288 + - drm/amdgpu: Fix NULL pointer dereference in VRAM logic for APU devices + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-40289 + - drm/amdgpu: hide VRAM sysfs attributes on GPUs without VRAM + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68201 + - drm/amdgpu: remove two invalid BUG_ON()s + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68207 + - drm/xe/guc: Synchronize Dead CT worker with unbind + * Questing update: v6.17.8 upstream stable release (LP: #2136850) // + CVE-2025-68244 + - drm/i915: Avoid lock inversion when pinning to GGTT on CHV/BXT+VTD + * Questing update: 
v6.17.8 upstream stable release (LP: #2136833) + - Revert "Bluetooth: L2CAP: convert timeouts to secs_to_jiffies()" + - sched_ext: Mark scx_bpf_dsq_move_set_[slice|vtime]() with KF_RCU + - net: usb: asix_devices: Check return value of usbnet_get_endpoints + - fbdev: atyfb: Check if pll_ops->init_pll failed + - ACPI: button: Call input_free_device() on failing input device + registration + - ACPI: fan: Use platform device for devres-related actions + - virtio-net: drop the multi-buffer XDP packet in zerocopy + - batman-adv: Release references to inactive interfaces + - Bluetooth: rfcomm: fix modem control handling + - net: phy: dp83867: Disable EEE support as not implemented + - fbdev: pvr2fb: Fix leftover reference to ONCHIP_NR_DMA_CHANNELS + - fbdev: valkyriefb: Fix reference count leak in valkyriefb_init + - mptcp: drop bogus optimization in __mptcp_check_push() + - mptcp: restore window probe + - ASoC: qdsp6: q6asm: do not sleep while atomic + - ASoC: renesas: rz-ssi: Use proper dma_buffer_pos after resume + - s390/pci: Restore IRQ unconditionally for the zPCI device + - x86/build: Disable SSE4a + - wifi: ath10k: Fix memory leak on unsupported WMI command + - wifi: ath11k: Add missing platform IDs for quirk table + - wifi: ath12k: free skb during idr cleanup callback + - wifi: ath11k: avoid bit operation on key flags + - drm/msm/a6xx: Fix GMU firmware parser + - ALSA: usb-audio: fix control pipe direction + - ASoC: cs-amp-lib-test: Fix missing include of kunit/test-bug.h + - wifi: mac80211: reset FILS discovery and unsol probe resp intervals + - wifi: mac80211: fix key tailroom accounting leak + - wifi: nl80211: call kfree without a NULL check + - kunit: test_dev_action: Correctly cast 'priv' pointer to long* + - scsi: ufs: core: Initialize value of an attribute returned by uic cmd + - scsi: core: Fix the unit attention counter implementation + - bpf: Do not audit capability check in do_jit() + - nvmet-auth: update sc_c in host response + - crypto: s390/phmac - 
Do not modify the req->nbytes value + - ASoC: Intel: avs: Unprepare a stream when XRUN occurs + - ASoC: fsl_sai: fix bit order for DSD format + - ASoC: fsl_micfil: correct the endian format for DSD + - libbpf: Fix powerpc's stack register definition in bpf_tracing.h + - ASoC: mediatek: Fix double pm_runtime_disable in remove functions + - Bluetooth: ISO: Fix BIS connection dst_type handling + - Bluetooth: btmtksdio: Add pmctrl handling for BT closed state during + reset + - Bluetooth: HCI: Fix tracking of advertisement set/instance 0x00 + - Bluetooth: ISO: Fix another instance of dst_type handling + - Bluetooth: btintel_pcie: Fix event packet loss issue + - Bluetooth: hci_conn: Fix connection cleanup with BIG with 2 or more BIS + - Bluetooth: hci_core: Fix tracking of periodic advertisement + - bpf: Conditionally include dynptr copy kfuncs + - drm/msm: Ensure vm is created in VM_BIND ioctl + - ALSA: usb-audio: add mono main switch to Presonus S1824c + - ALSA: usb-audio: don't log messages meant for 1810c when initializing + 1824c + - ACPI: MRRM: Check revision of MRRM table + - drm/etnaviv: fix flush sequence logic + - tools: ynl: fix string attribute length to include null terminator + - net: hns3: return error code when function fails + - sfc: fix potential memory leak in efx_mae_process_mport() + - tools: ynl: avoid print_field when there is no reply + - dpll: spec: add missing module-name and clock-id to pin-get reply + - ASoC: fsl_sai: Fix sync error in consumer mode + - ASoC: soc_sdw_utils: remove cs42l43 component_name + - drm/amd/pm: fix smu table id bound check issue in smu_cmn_update_table() + - drm/amd/pm/powerplay/smumgr: Fix PCIeBootLinkLevel value on Fiji + - drm/amd/pm/powerplay/smumgr: Fix PCIeBootLinkLevel value on Iceland + - drm/amdgpu: fix SPDX headers on amdgpu_cper.c/h + - drm/amdgpu: fix SPDX header on amd_cper.h + - drm/amdgpu: fix SPDX header on irqsrcs_vcn_5_0.h + - ACPI: fan: Use ACPI handle when retrieving _FST + - block: fix 
op_is_zone_mgmt() to handle REQ_OP_ZONE_RESET_ALL + - block: make REQ_OP_ZONE_OPEN a write operation + - dma-fence: Fix safe access wrapper to call timeline name method + - kbuild: align modinfo section for Secureboot Authenticode EDK2 compat + - regmap: irq: Correct documentation of wake_invert flag + - [Config] Disable ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP for s390x + - s390/mm: Fix memory leak in add_marker() when kvrealloc() fails + - drm/xe: Do not wake device during a GT reset + - drm/sched: avoid killing parent entity on child SIGKILL + - drm/sched: Fix race in drm_sched_entity_select_rq() + - drm/nouveau: Fix race in nouveau_sched_fini() + - drm/i915/dmc: Clear HRR EVT_CTL/HTP to zero on ADL-S + - drm/ast: Clear preserved bits from register output value + - drm/amd: Check that VPE has reached DPM0 in idle handler + - drm/amd/display: Fix incorrect return of vblank enable on unconfigured + crtc + - drm/amd/display: Don't program BLNDGAM_MEM_PWR_FORCE when CM low-power + is disabled on DCN30 + - drm/amd/display: Add HDR workaround for a specific eDP + - mptcp: leverage skb deferral free + - mptcp: fix MSG_PEEK stream corruption + - cpuidle: governors: menu: Rearrange main loop in menu_select() + - cpuidle: governors: menu: Select polling state in some more cases + - PM: hibernate: Combine return paths in power_down() + - PM: sleep: Allow pm_restrict_gfp_mask() stacking + - mfd: kempld: Switch back to earlier ->init() behavior + - soc: aspeed: socinfo: Add AST27xx silicon IDs + - firmware: qcom: scm: preserve assign_mem() error return value + - soc: qcom: smem: Fix endian-unaware access of num_entries + - spi: loopback-test: Don't use %pK through printk + - spi: spi-qpic-snand: handle 'use_ecc' parameter of + qcom_spi_config_cw_read() + - soc: ti: pruss: don't use %pK through printk + - bpf: Don't use %pK through printk + - mmc: sdhci: Disable SD card clock before changing parameters + - pinctrl: single: fix bias pull up/down handling in pin_config_set + - mmc: 
host: renesas_sdhi: Fix the actual clock + - memstick: Add timeout to prevent indefinite waiting + - cpufreq: ti: Add support for AM62D2 + - bpf: Use tnums for JEQ/JNE is_branch_taken logic + - firmware: ti_sci: Enable abort handling of entry to LPM + - firewire: ohci: move self_id_complete tracepoint after validating + register + - irqchip/sifive-plic: Respect mask state when setting affinity + - irqchip/loongson-eiointc: Route interrupt parsed from bios table + - ACPI: sysfs: Use ACPI_FREE() for freeing an ACPI object + - ACPI: video: force native for Lenovo 82K8 + - libbpf: Fix USDT SIB argument handling causing unrecognized register + error + - selftests/bpf: Fix bpf_prog_detach2 usage in test_lirc_mode2 + - arm64: versal-net: Update rtc calibration value + - Revert "UBUNTU: SAUCE: firmware: qcom: scm: Allow QSEECOM on Dell + Inspiron 7441 / Latitude 7455" + - firmware: qcom: scm: Allow QSEECOM on Dell Inspiron 7441 / Latitude 7455 + - kselftest/arm64: tpidr2: Switch to waitpid() over wait4() + - arc: Fix __fls() const-foldability via __builtin_clzl() + - selftests/bpf: Upon failures, exit with code 1 in test_xsk.sh + - irqchip/gic-v2m: Handle Multiple MSI base IRQ Alignment + - ACPI: PRM: Skip handlers with NULL handler_address or NULL VA + - ACPI: resource: Skip IRQ override on ASUS Vivobook Pro N6506CU + - ACPI: scan: Add Intel CVS ACPI HIDs to acpi_ignore_dep_ids[] + - thermal: gov_step_wise: Allow cooling level to be reduced earlier + - thermal: intel: selftests: workload_hint: Mask unsupported types + - power: supply: qcom_battmgr: add OOI chemistry + - hwmon: (k10temp) Add thermal support for AMD Family 1Ah-based models + - hwmon: (k10temp) Add device ID for Strix Halo + - hwmon: (lenovo-ec-sensors) Update P8 supprt + - hwmon: (sbtsi_temp) AMD CPU extended temperature range support + - pinctrl: renesas: rzg2l: Add suspend/resume support for Schmitt control + registers + - pinctrl: keembay: release allocated memory in detach path + - power: supply: 
sbs-charger: Support multiple devices + - io_uring/rsrc: respect submitter_task in io_register_clone_buffers() + - hwmon: sy7636a: add alias + - selftests/bpf: Fix incorrect array size calculation + - block: check for valid bio while splitting + - irqchip/loongson-pch-lpc: Use legacy domain for PCH-LPC IRQ controller + - cpufreq: ondemand: Update the efficient idle check for Intel extended + Families + - arm64: zynqmp: Disable coresight by default + - arm64: zynqmp: Revert usb node drive strength and slew rate for zcu106 + - soc/tegra: fuse: Add Tegra114 nvmem cells and fuse lookups + - ARM: tegra: p880: set correct touchscreen clipping + - ARM: tegra: transformer-20: add missing magnetometer interrupt + - ARM: tegra: transformer-20: fix audio-codec interrupt + - firmware: qcom: tzmem: disable sc7180 platform + - soc: ti: k3-socinfo: Add information for AM62L SR1.1 + - mmc: sdhci-msm: Enable tuning for SDR50 mode for SD card + - pwm: pca9685: Use bulk write to atomicially update registers + - ACPICA: dispatcher: Use acpi_ds_clear_operands() in + acpi_ds_call_control_method() + - tee: allow a driver to allocate a tee_device without a pool + - kunit: Enable PCI on UML without triggering WARN() + - selftests/bpf: Fix arena_spin_lock selftest failure + - bpf: Do not limit bpf_cgroup_from_id to current's namespace + - i3c: mipi-i3c-hci-pci: Add support for Intel Wildcat Lake-U I3C + - rust: kunit: allow `cfg` on `test`s + - video: backlight: lp855x_bl: Set correct EPROM start for LP8556 + - i3c: dw: Add shutdown support to dw_i3c_master driver + - io_uring/zcrx: check all niovs filled with dma addresses + - tools/cpupower: fix error return value in cpupower_write_sysfs() + - io_uring/zcrx: account niov arrays to cgroup + - pmdomain: apple: Add "apple,t8103-pmgr-pwrstate" + - power: supply: qcom_battmgr: handle charging state change notifications + - bpftool: Fix -Wuninitialized-const-pointer warnings with clang >= 21 + - cpuidle: Fail cpuidle device registration if 
there is one already + - selftests/bpf: Fix selftest verifier_arena_large failure + - selftests: ublk: fix behavior when fio is not installed + - spi: rpc-if: Add resume support for RZ/G3E + - ACPI: SPCR: Support Precise Baud Rate field + - clocksource/drivers/vf-pit: Replace raw_readl/writel to readl/writel + - clocksource/drivers/timer-rtl-otto: Work around dying timers + - clocksource/drivers/timer-rtl-otto: Do not interfere with interrupts + - riscv: bpf: Fix uninitialized symbol 'retval_off' + - bpf: Clear pfmemalloc flag when freeing all fragments + - selftests: drv-net: Pull data before parsing headers + - nvme: Use non zero KATO for persistent discovery connections + - uprobe: Do not emulate/sstep original instruction when ip is changed + - hwmon: (asus-ec-sensors) increase timeout for locking ACPI mutex + - hwmon: (dell-smm) Remove Dell Precision 490 custom config data + - hwmon: (dell-smm) Add support for Dell OptiPlex 7040 + - tools/cpupower: Fix incorrect size in cpuidle_state_disable() + - selftests/bpf: Fix flaky bpf_cookie selftest + - tools/power turbostat: Fix incorrect sorting of PMT telemetry + - tools/power x86_energy_perf_policy: Fix incorrect fopen mode usage + - tools/power x86_energy_perf_policy: Enhance HWP enable + - tools/power x86_energy_perf_policy: Prefer driver HWP limits + - mfd: simple-mfd-i2c: Add compatible strings for Layerscape QIXIS FPGA + - mfd: stmpe: Remove IRQ domain upon removal + - mfd: stmpe-i2c: Add missing MODULE_LICENSE + - mfd: qnap-mcu: Handle errors returned from qnap_mcu_write + - mfd: qnap-mcu: Include linux/types.h in qnap-mcu.h shared header + - mfd: madera: Work around false-positive -Wininitialized warning + - mfd: da9063: Split chip variant reading in two bus transactions + - mfd: macsmc: Add "apple,t8103-smc" compatible + - mfd: core: Increment of_node's refcount before linking it to the + platform device + - mfd: cs42l43: Move IRQ enable/disable to encompass force suspend + - mfd: intel-lpss: Add Intel 
Wildcat Lake LPSS PCI IDs + - drm/xe/ptl: Apply Wa_16026007364 + - drm/xe/configfs: Enforce canonical device names + - drm/amd/display: Update tiled to tiled copy command + - drm/amd/display: fix condition for setting timing_adjust_pending + - drm/amd/display: ensure committing streams is seamless + - drm/amdgpu: add range check for RAS bad page address + - drm/amdgpu: Check vcn sram load return value + - drm/amd/display: Remove check DPIA HPD status for BW Allocation + - drm/amd/display: Increase AUX Intra-Hop Done Max Wait Duration + - drm/amd/display: Fix dmub_cmd header alignment + - drm/xe/guc: Add more GuC load error status codes + - drm/xe/pf: Don't resume device from restart worker + - drm/amdgpu: Fix build error when CONFIG_SUSPEND is disabled + - drm/amdgpu: Update IPID value for bad page threshold CPER + - drm/amdgpu: Avoid rma causes GPU duplicate reset + - drm/amdgpu: Effective health check before reset + - drm/amd/amdgpu: Release xcp drm memory after unplug + - drm/amdgpu: Fix vcn v5.0.1 poison irq call trace + - drm/xe: Extend wa_13012615864 to additional Xe2 and Xe3 platforms + - drm/amdgpu: Skip poison aca bank from UE channel + - drm/amd/display: add more cyan skillfish devices + - drm/amdgpu: Initialize jpeg v5_0_1 ras function + - drm/amdgpu: skip mgpu fan boost for multi-vf + - drm/amd/display: fix dmub access race condition + - drm/amd/display: update dpp/disp clock from smu clock table + - drm/amd/pm: Use cached metrics data on aldebaran + - drm/amd/pm: Use cached metrics data on arcturus + - accel/amdxdna: Unify pm and rpm suspend and resume callbacks + - drm/amdgpu/jpeg: Hold pg_lock before jpeg poweroff + - drm/xe/pf: Program LMTT directory pointer on all GTs within a tile + - drm/nouveau: replace snprintf() with scnprintf() in nvkm_snprintbf() + - ASoC: tas2781: Add keyword "init" in profile section + - ASoC: mediatek: Use SND_JACK_AVOUT for HDMI/DP jacks + - drm/amd/display: Reset apply_eamless_boot_optimization when dpms_off + - 
drm/amdgpu: add to custom amdgpu_drm_release drm_dev_enter/exit + - drm/amd/display: Wait until OTG enable state is cleared + - drm/xe: rework PDE PAT index selection + - docs: kernel-doc: avoid script crash on ancient Python + - drm/sharp-memory: Do not access GEM-DMA vaddr directly + - PCI: Disable MSI on RDC PCI to PCIe bridges + - drm/nouveau: always set RMDevidCheckIgnore for GSP-RM + - drm/panel-edp: Add SHP LQ134Z1 panel for Dell XPS 9345 + - selftests/net: Replace non-standard __WORDSIZE with sizeof(long) * 8 + - selftests/net: Ensure assert() triggers in psock_tpacket.c + - wifi: rtw89: print just once for unknown C2H events + - wifi: rtw88: sdio: use indirect IO for device registers before power-on + - wifi: rtw89: add dummy C2H handlers for BCN resend and update done + - drm/amdkfd: return -ENOTTY for unsupported IOCTLs + - selftests: drv-net: devmem: add / correct the IPv6 support + - selftests: drv-net: devmem: flip the direction of Tx tests + - media: pci: ivtv: Don't create fake v4l2_fh + - media: amphion: Delete v4l2_fh synchronously in .release() + - drm/tidss: Use the crtc_* timings when programming the HW + - drm/bridge: cdns-dsi: Fix REG_WAKEUP_TIME value + - drm/bridge: cdns-dsi: Don't fail on MIPI_DSI_MODE_VIDEO_BURST + - drm/tidss: Set crtc modesetting parameters with adjusted mode + - drm/tidss: Remove early fb + - RDMA/mana_ib: Drain send wrs of GSI QP + - media: i2c: Kconfig: Ensure a dependency on HAVE_CLK for + VIDEO_CAMERA_SENSOR + - PCI/ERR: Update device error_state already after reset + - x86/vsyscall: Do not require X86_PF_INSTR to emulate vsyscall + - net: stmmac: Check stmmac_hw_setup() in stmmac_resume() + - ice: Don't use %pK through printk or tracepoints + - thunderbolt: Use is_pciehp instead of is_hotplug_bridge + - ASoC: es8323: enable DAPM power widgets for playback DAC and output + - powerpc/eeh: Use result of error_detected() in uevent + - s390/pci: Use pci_uevent_ers() in PCI recovery + - bridge: Redirect to backup port 
when port is administratively down + - selftests: drv-net: wait for carrier + - net: phy: mscc: report and configure in-band auto-negotiation for + SGMII/QSGMII + - scsi: ufs: host: mediatek: Fix auto-hibern8 timer configuration + - scsi: ufs: host: mediatek: Fix PWM mode switch issue + - scsi: ufs: host: mediatek: Assign power mode userdata before FASTAUTO + mode change + - scsi: ufs: host: mediatek: Change reset sequence for improved stability + - scsi: ufs: host: mediatek: Fix invalid access in vccqx handling + - gpu: nova-core: register: allow fields named `offset` + - drm/panthor: Serialize GPU cache flush operations + - HID: pidff: Use direction fix only for conditional effects + - HID: pidff: PERMISSIVE_CONTROL quirk autodetection + - drm/bridge: display-connector: don't set OP_DETECT for DisplayPorts + - drm/amdkfd: Handle lack of READ permissions in SVM mapping + - drm/amdgpu: refactor bad_page_work for corner case handling + - hwrng: timeriomem - Use us_to_ktime() where appropriate + - iio: adc: spear_adc: mask SPEAR_ADC_STATUS channel and avg sample before + setting register + - iio: adc: imx93_adc: load calibrated values even calibration failed + - usb: gadget: f_ncm: Fix MAC assignment NCM ethernet + - ASoC: es8323: remove DAC enablement write from es8323_probe + - ASoC: es8323: add proper left/right mixer controls via DAPM + - ASoC: codecs: wsa883x: Handle shared reset GPIO for WSA883x speakers + - drm/xe: Make page size consistent in loop + - wifi: rtw89: wow: remove notify during WoWLAN net-detect + - wifi: rtw89: fix BSSID comparison for non-transmitted BSSID + - wifi: rtw89: 8851b: rfk: update IQK TIA setting + - dm error: mark as DM_TARGET_PASSES_INTEGRITY + - char: misc: Make misc_register() reentry for miscdevice who wants + dynamic minor + - char: misc: Does not request module for miscdevice with dynamic minor + - net: When removing nexthops, don't call synchronize_net if it is not + necessary + - net: Call trace_sock_exceed_buf_limit() for 
memcg failure with + SK_MEM_RECV. + - dmaengine: idxd: Add a new IAA device ID for Wildcat Lake family + platforms + - PCI/P2PDMA: Fix incorrect pointer usage in devm_kfree() call + - bnxt_en: Add Hyper-V VF ID + - tty: serial: Modify the use of dev_err_probe() + - ALSA: usb-audio: Add validation of UAC2/UAC3 effect units + - Octeontx2-af: Broadcast XON on all channels + - idpf: do not linearize big TSO packets + - drm/xe/pcode: Initialize data0 for pcode read routine + - drm/panel: ilitek-ili9881c: turn off power-supply when init fails + - drm/panel: ilitek-ili9881c: move display_on/_off dcs calls to + (un-)prepare + - rds: Fix endianness annotation for RDS_MPATH_HASH + - net: wangxun: limit tx_max_coalesced_frames_irq + - iio: imu: bmi270: Match PNP ID found on newer GPD firmware + - media: ipu6: isys: Set embedded data type correctly for metadata formats + - rpmsg: char: Export alias for RPMSG ID rpmsg-raw from table + - net: ipv4: allow directed broadcast routes to use dst hint + - scsi: mpi3mr: Fix device loss during enclosure reboot due to zero link + speed + - wifi: rtw89: coex: Limit Wi-Fi scan slot cost to avoid A2DP glitch + - scsi: mpi3mr: Fix I/O failures during controller reset + - scsi: mpi3mr: Fix controller init failure on fault during queue creation + - scsi: pm80xx: Fix race condition caused by static variables + - extcon: adc-jack: Fix wakeup source leaks on device unbind + - extcon: fsa9480: Fix wakeup source leaks on device unbind + - extcon: axp288: Fix wakeup source leaks on device unbind + - drm/xe: Set GT as wedged before sending wedged uevent + - remoteproc: wkup_m3: Use devm_pm_runtime_enable() helper + - drm/xe/wcl: Extend L3bank mask workaround + - net: phy: fixed_phy: let fixed_phy_unregister free the phy_device + - selftests: drv-net: hds: restore hds settings + - fuse: zero initialize inode private data + - virtio_fs: fix the hash table using in virtio_fs_enqueue_req() + - selftests: pci_endpoint: Skip IRQ test if IRQ is out of 
range. + - drm/xe: Ensure GT is in C0 during resumes + - misc: pci_endpoint_test: Skip IRQ tests if irq is out of range + - drm/amdgpu: Correct the loss of aca bank reg info + - drm/amdgpu: Correct the counts of nr_banks and nr_errors + - drm/amdkfd: fix vram allocation failure for a special case + - drm/amd/display: Support HW cursor 180 rot for any number of pipe splits + - drm/amdkfd: Tie UNMAP_LATENCY to queue_preemption + - drm/amd/display: wait for otg update pending latch before clock + optimization + - drm/amd/display: Consider sink max slice width limitation for dsc + - drm/amdgpu/vpe: cancel delayed work in hw_fini + - drm/xe: Cancel pending TLB inval workers on teardown + - net: Prevent RPS table overwrite of active flows + - eth: fbnic: Reset hw stats upon PCI error + - wifi: iwlwifi: mld: trigger mlo scan only when not in EMLSR + - platform/x86/intel-uncore-freq: Fix warning in partitioned system + - drm/msm/dpu: Filter modes based on adjusted mode clock + - drm/msm: Use of_reserved_mem_region_to_resource() for "memory-region" + - selftests: drv-net: rss_ctx: fix the queue count check + - media: fix uninitialized symbol warnings + - media: pci: mgb4: Fix timings comparison in VIDIOC_S_DV_TIMINGS + - ASoC: SOF: ipc4-pcm: Add fixup for channels + - drm/amdgpu: Notify pmfw bad page threshold exceeded + - drm/amd/display: Increase minimum clock for TMDS 420 with pipe splitting + - drm/amdgpu: Avoid jpeg v5.0.1 poison irq call trace on sriov guest + - drm/amd/display: incorrect conditions for failing dto calculations + - drm/amdgpu: Avoid vcn v5.0.1 poison irq call trace on sriov guest + - drm/amdgpu: Respect max pixel clock for HDMI and DVI-D (v2) + - mips: lantiq: danube: add missing properties to cpu node + - mips: lantiq: danube: add model to EASY50712 dts + - mips: lantiq: danube: add missing device_type in pci node + - mips: lantiq: xway: sysctrl: rename stp clock + - mips: lantiq: danube: rename stp node on EASY50712 reference board + - inet_diag: 
annotate data-races in inet_diag_bc_sk() + - microchip: lan865x: add ndo_eth_ioctl handler to enable PHY ioctl + support + - crypto: qat - use kcalloc() in qat_uclo_map_objs_from_mof() + - scsi: pm8001: Use int instead of u32 to store error codes + - iio: adc: ad7124: do not require mclk + - scsi: ufs: exynos: fsd: Gate ref_clk and put UFS device in reset on + suspend + - media: imx-mipi-csis: Only set clock rate when specified in DT + - wifi: iwlwifi: pcie: remember when interrupts are disabled + - drm/st7571-i2c: add support for inverted pixel format + - ptp: Limit time setting of PTP clocks + - dmaengine: sh: setup_xref error handling + - dmaengine: mv_xor: match alloc_wc and free_wc + - dmaengine: dw-edma: Set status for callback_result + - netfilter: nf_tables: all transaction allocations can now sleep + - drm/msm/dsi/phy: Toggle back buffer resync after preparing PLL + - drm/msm/dsi/phy_7nm: Fix missing initial VCO rate + - drm/amdgpu: Allow kfd CRIU with no buffer objects + - drm/xe/guc: Increase GuC crash dump buffer size + - drm/amd/pm: Increase SMC timeout on SI and warn (v3) + - move_mount(2): take sanity checks in 'beneath' case into do_lock_mount() + - selftests: drv-net: rss_ctx: make the test pass with few queues + - ipv6: Add sanity checks on ipv6_devconf.rpl_seg_enabled + - drm/xe: Extend Wa_22021007897 to Xe3 platforms + - wifi: mac80211: count reg connection element in the size + - drm/panthor: check bo offset alignment in vm bind + - drm: panel-backlight-quirks: Make EDID match optional + - ixgbe: reduce number of reads when getting OROM data + - netlink: specs: fou: change local-v6/peer-v6 check + - net: nfc: nci: Increase NCI_DATA_TIMEOUT to 3000 ms + - media: adv7180: Add missing lock in suspend callback + - media: adv7180: Do not write format to device in set_fmt + - media: adv7180: Only validate format in querystd + - media: verisilicon: Explicitly disable selection api ioctls for decoders + - wifi: mac80211: Fix 6 GHz Band capabilities 
element advertisement in + lower bands + - platform/x86: think-lmi: Add extra TC BIOS error messages + - platform/x86/intel-uncore-freq: Present unique domain ID per package + - ALSA: usb-audio: apply quirk for MOONDROP Quark2 + - PCI: imx6: Enable the Vaux supply if available + - drm/xe/guc: Set upper limit of H2G retries over CTB + - net: call cond_resched() less often in __release_sock() + - smsc911x: add second read of EEPROM mac when possible corruption seen + - drm/xe: improve dma-resv handling for backup object + - iommu/amd: Add support to remap/unmap IOMMU buffers for kdump + - iommu/amd: Skip enabling command/event buffers for kdump + - iommu/amd: Reuse device table for kdump + - crypto: ccp: Skip SEV and SNP INIT for kdump boot + - iommu/apple-dart: Clear stream error indicator bits for T8110 DARTs + - bus: mhi: host: pci_generic: Add support for all Foxconn T99W696 SKU + variants + - drm/amdgpu: Correct info field of bad page threshold exceed CPER + - drm/amd: add more cyan skillfish PCI ids + - drm/amdgpu: don't enable SMU on cyan skillfish + - drm/amdgpu: add support for cyan skillfish gpu_info + - drm/amd/display: Fix pbn_div Calculation Error + - drm/amd/display: dont wait for pipe update during medupdate/highirq + - drm/amd/pm: refine amdgpu pm sysfs node error code + - drm/amd/display: Indicate when custom brightness curves are in use + - selftests: ncdevmem: don't retry EFAULT + - net: dsa: felix: support phy-mode = "10g-qxgmii" + - usb: gadget: f_hid: Fix zero length packet transfer + - serial: qcom-geni: Add DFS clock mode support to GENI UART driver + - serdev: Drop dev_pm_domain_detach() call + - tty/vt: Add missing return value for VT_RESIZE in vt_ioctl() + - eeprom: at25: support Cypress FRAMs without device ID + - drm/msm/adreno: Add speedbins for A663 GPU + - drm/msm: Fix 32b size truncation + - dt-bindings: display/msm/gmu: Update Adreno 623 bindings + - drm/msm: make sure to not queue up recovery more than once + - char: Use 
list_del_init() in misc_deregister() to reinitialize list + pointer + - drm/msm/adreno: Add speedbin data for A623 GPU + - drm/msm/adreno: Add fenced regwrite support + - drm/msm/a6xx: Switch to GMU AO counter + - idpf: link NAPIs to queues + - selftests: net: make the dump test less sensitive to mem accounting + - PCI: endpoint: pci-epf-test: Limit PCIe BAR size for fixed BARs + - wifi: rtw89: Add USB ID 2001:332a for D-Link AX9U rev. A1 + - wifi: rtw89: Add USB ID 2001:3327 for D-Link AX18U rev. A1 + - wifi: iwlwifi: fw: Add ASUS to PPAG and TAS list + - drm/xe/i2c: Enable bus mastering + - media: ov08x40: Fix the horizontal flip control + - media: i2c: og01a1b: Specify monochrome media bus format instead of + Bayer + - media: qcom: camss: csiphy-3ph: Add CSIPHY 2ph DPHY v2.0.1 init sequence + - drm/bridge: write full Audio InfoFrame + - drm/xe/guc: Always add CT disable action during second init step + - f2fs: fix wrong layout information on 16KB page + - selftests: mptcp: join: allow more time to send ADD_ADDR + - scsi: ufs: host: mediatek: Enhance recovery on resume failure + - scsi: ufs: ufs-qcom: Align programming sequence of Shared ICE for UFS + controller v5 + - scsi: ufs: host: mediatek: Fix unbalanced IRQ enable issue + - scsi: ufs: host: mediatek: Enhance recovery on hibernation exit failure + - net: phy: marvell: Fix 88e1510 downshift counter errata + - scsi: ufs: host: mediatek: Correct system PM flow + - scsi: ufs: host: mediatek: Disable auto-hibern8 during power mode + changes + - scsi: ufs: host: mediatek: Fix adapt issue after PA_Init + - wifi: cfg80211: update the time stamps in hidden ssid + - wifi: mac80211: Fix HE capabilities element check + - fbcon: Use screen info to find primary device + - phy: cadence: cdns-dphy: Enable lower resolutions in dphy + - Fix access to video_is_primary_device() when compiled without + CONFIG_VIDEO + - phy: renesas: r8a779f0-ether-serdes: add new step added to latest + datasheet + - phy: rockchip: 
phy-rockchip-inno-csidphy: allow writes to grf register 0 + - drm/msm/registers: Generate _HI/LO builders for reg64 + - net: sh_eth: Disable WoL if system can not suspend + - selftests: net: replace sleeps in fcnal-test with waits + - media: redrat3: use int type to store negative error codes + - platform/x86/amd/pmf: Fix the custom bios input handling mechanism + - selftests: traceroute: Use require_command() + - selftests: traceroute: Return correct value on failure + - openrisc: Add R_OR1K_32_PCREL relocation type module support + - netfilter: nf_reject: don't reply to icmp error messages + - x86/kvm: Prefer native qspinlock for dedicated vCPUs irrespective of + PV_UNHALT + - x86/virt/tdx: Use precalculated TDVPR page physical address + - selftests: Disable dad for ipv6 in fcnal-test.sh + - eth: 8139too: Make 8139TOO_PIO depend on !NO_IOPORT_MAP + - [Config] No longer enable `CONFIG_8139TOO_PIO` for armhf + - selftests: Replace sleep with slowwait + - net: devmem: expose tcp_recvmsg_locked errors + - selftests: net: lib.sh: Don't defer failed commands + - HID: asus: add Z13 folio to generic group for multitouch to work + - watchdog: s3c2410_wdt: Fix max_timeout being calculated larger + - crypto: sun8i-ce - remove channel timeout field + - PCI: dwc: Verify the single eDMA IRQ in dw_pcie_edma_irq_verify() + - crypto: ccp - Fix incorrect payload size calculation in + psp_poulate_hsti() + - crypto: caam - double the entropy delay interval for retry + - can: rcar_canfd: Update bit rate constants for RZ/G3E and R-Car Gen4 + - net: mana: Reduce waiting time if HWC not responding + - ionic: use int type for err in ionic_get_module_eeprom_by_page + - net/cls_cgroup: Fix task_get_classid() during qdisc run + - wifi: mt76: mt7921: Add 160MHz beamformee capability for mt7922 device + - wifi: mt76: mt7925: add pci restore for hibernate + - wifi: mt76: mt7996: Fix mt7996_reverse_frag0_hdr_trans for MLO + - wifi: mt76: mt7996: Set def_wcid pointer in 
mt7996_mac_sta_init_link() + - wifi: mt76: mt7996: Temporarily disable EPCS + - wifi: mt76: mt7996: support writing MAC TXD for AddBA Request + - wifi: mt76: mt76_eeprom_override to int + - ALSA: serial-generic: remove shared static buffer + - wifi: mt76: mt7996: fix memory leak on mt7996_mcu_sta_key_tlv error + - wifi: mt76: mt7996: disable promiscuous mode by default + - wifi: mt76: use altx queue for offchannel tx on connac+ + - wifi: mt76: improve phy reset on hw restart + - drm/amdgpu: Use memdup_array_user in amdgpu_cs_wait_fences_ioctl + - drm/amdgpu: Release hive reference properly + - drm/amd/display: Fix DMCUB loading sequence for DCN3.2 + - drm/amd/display: Set up pixel encoding for YCBCR422 + - drm/amd/display: fix dml ms order of operations + - drm/amd/display: Don't use non-registered VUPDATE on DCE 6 + - drm/amd/display: Keep PLL0 running on DCE 6.0 and 6.4 + - drm/amd/display: Fix DVI-D/HDMI adapters + - drm/amd/display: Disable VRR on DCE 6 + - drm/amd/display/dml2: Guard dml21_map_dc_state_into_dml_display_cfg with + DC_FP_START + - net: phy: clear EEE runtime state in PHY_HALTED/PHY_ERROR + - ethernet: Extend device_get_mac_address() to use NVMEM + - scsi: ufs: ufs-qcom: Disable lane clocks during phy hibern8 + - HID: i2c-hid: Resolve touchpad issues on Dell systems during S4 + - hinic3: Queue pair endianness improvements + - hinic3: Fix missing napi->dev in netif_queue_set_napi + - tools: ynl-gen: validate nested arrays + - drm/xe/guc: Return an error code if the GuC load fails + - drm/amdgpu: reject gang submissions under SRIOV + - selftests/Makefile: include $(INSTALL_DEP_TARGETS) in clean target to + clean net/lib dependency + - scsi: ufs: core: Disable timestamp functionality if not supported + - scsi: lpfc: Clean up allocated queues when queue setup mbox commands + fail + - scsi: lpfc: Decrement ndlp kref after FDISC retries exhausted + - scsi: lpfc: Check return status of lpfc_reset_flush_io_context during + TGT_RESET + - scsi: lpfc: 
Remove ndlp kref decrement clause for F_Port_Ctrl in + lpfc_cleanup + - scsi: lpfc: Define size of debugfs entry for xri rebalancing + - scsi: lpfc: Ensure PLOGI_ACC is sent prior to PRLI in Point to Point + topology + - allow finish_no_open(file, ERR_PTR(-E...)) + - usb: mon: Increase BUFF_MAX to 64 MiB to support multi-MB URBs + - usb: xhci: plat: Facilitate using autosuspend for xhci plat devices + - wifi: rtw89: disable RTW89_PHYSTS_IE09_FTR_0 for ppdu status + - wifi: rtw89: obtain RX path from ppdu status IE00 + - wifi: rtw89: renew a completion for each H2C command waiting C2H event + - usb: xhci-pci: add support for hosts with zero USB3 ports + - ipv6: np->rxpmtu race annotation + - RDMA/irdma: Update Kconfig + - IB/ipoib: Ignore L3 master device + - bnxt_en: Add fw log trace support for 5731X/5741X chips + - mei: make a local copy of client uuid in connect + - ASoC: qcom: sc8280xp: explicitly set S16LE format in + sc8280xp_be_hw_params_fixup() + - net: phy: clear link parameters on admin link down + - net: ethernet: microchip: sparx5: make it selectable for ARCH_LAN969X + - bus: mhi: core: Improve mhi_sync_power_up handling for SYS_ERR state + - iommu/vt-d: Replace snprintf with scnprintf in dmar_latency_snapshot() + - wifi: ath10k: Fix connection after GTK rekeying + - iommu/vt-d: Remove LPIG from page group response descriptor + - wifi: mac80211: Get the correct interface for non-netdev skb status + - wifi: mac80211: Track NAN interface start/stop + - net: intel: fm10k: Fix parameter idx set but not used + - sparc/module: Add R_SPARC_UA64 relocation handling + - sparc64: fix prototypes of reads[bwl]() + - vfio: return -ENOTTY for unsupported device feature + - ptp_ocp: make ptp_ocp driver compatible with PTP_EXTTS_REQUEST2 + - crypto: hisilicon/qm - invalidate queues in use + - crypto: hisilicon/qm - clear all VF configurations in the hardware + - ASoC: ops: improve snd_soc_get_volsw + - PCI/PM: Skip resuming to D0 if device is disconnected + - 
selftests: forwarding: Reorder (ar)ping arguments to obey POSIX getopt + - remoteproc: qcom: q6v5: Avoid handling handover twice + - wifi: ath12k: Increase DP_REO_CMD_RING_SIZE to 256 + - net: dsa: microchip: Set SPI as bus interface during reset for KSZ8463 + - bng_en: make bnge_alloc_ring() self-unwind on failure + - ALSA: usb-audio: don't apply interface quirk to Presonus S1824c + - tcp: Update bind bucket state on port release + - ovl: make sure that ovl_create_real() returns a hashed dentry + - drm/amd/display: Add missing post flip calls + - drm/amd/display: Add AVI infoframe copy in copy_stream_update_to_stream + - drm/amd/display: Add fast sync field in ultra sleep more for DMUB + - drm/amd/display: Init dispclk from bootup clock for DCN314 + - drm/amd/display: Fix for test crash due to power gating + - drm/amd/display: change dc stream color settings only in atomic commit + - NFSv4: handle ERR_GRACE on delegation recalls + - NFSv4.1: fix mount hang after CREATE_SESSION failure + - net: bridge: Install FDB for bridge MAC on VLAN 0 + - net: phy: dp83640: improve phydev and driver removal handling + - scsi: ufs: core: Change MCQ interrupt enable flow + - scsi: libfc: Fix potential buffer overflow in fc_ct_ms_fill() + - accel/habanalabs/gaudi2: fix BMON disable configuration + - scsi: mpt3sas: Add support for 22.5 Gbps SAS link rate + - accel/habanalabs: return ENOMEM if less than requested pages were pinned + - accel/habanalabs/gaudi2: read preboot status after recovering from dirty + state + - ASoC: renesas: msiof: add .symmetric_xxx on snd_soc_dai_driver + - ASoC: renesas: msiof: use reset controller + - ASoC: renesas: msiof: tidyup DMAC stop timing + - ASoC: renesas: msiof: set SIFCTR register + - ext4: increase IO priority of fastcommit + - drm/amdgpu: Add fallback to pipe reset if KCQ ring reset fails + - drm/amdgpu: Fix fence signaling race condition in userqueue + - ASoC: stm32: sai: manage context in set_sysclk callback + - ASoC: tlv320aic3x: Fix 
class-D initialization for tlv320aic3007 + - ACPI: scan: Update honor list for RPMI System MSI + - platform/x86: x86-android-tablets: Stop using EPROBE_DEFER + - vfio/pci: Fix INTx handling on legacy non-PCI 2.3 devices + - vfio/nvgrace-gpu: Add GB300 SKU to the devid table + - selftest: net: Fix error message if empty variable + - net/mlx5e: Don't query FEC statistics when FEC is disabled + - Bluetooth: btintel: Add support for BlazarIW core + - net: macb: avoid dealing with endianness in macb_set_hwaddr() + - Bluetooth: btusb: Add new VID/PID 13d3/3627 for MT7925 + - Bluetooth: btintel_pcie: Define hdev->wakeup() callback + - Bluetooth: ISO: Don't initiate CIS connections if there are no buffers + - Bluetooth: btusb: Check for unexpected bytes when defragmenting HCI + frames + - Bluetooth: ISO: Use sk_sndtimeo as conn_timeout + - Bluetooth: btusb: Add new VID/PID 13d3/3633 for MT7922 + - net: stmmac: est: Drop frames causing HLBS error + - exfat: limit log print for IO error + - 6pack: drop redundant locking and refcounting + - page_pool: Clamp pool size to max 16K pages + - net/mlx5e: Prevent entering switchdev mode with inconsistent netns + - ksmbd: use sock_create_kern interface to create kernel socket + - smb: client: update cfid->last_access_time in + open_cached_dir_by_dentry() + - smb: client: transport: avoid reconnects triggered by pending task work + - usb: xhci-pci: Fix USB2-only root hub registration + - drm/amd/display: Add fallback path for YCBCR422 + - ACPICA: Update dsmethod.c to get rid of unused variable warning + - RDMA/bnxt_re: Fix a potential memory leak in destroy_gsi_sqp + - RDMA/irdma: Fix SD index calculation + - RDMA/irdma: Remove unused struct irdma_cq fields + - RDMA/irdma: Set irdma_cq cq_num field during CQ create + - RDMA/uverbs: Fix umem release in UVERBS_METHOD_CQ_CREATE + - RDMA/hns: Fix recv CQ and QP cache affinity + - RDMA/hns: Fix the modification of max_send_sge + - RDMA/hns: Fix wrong WQE data when QP wraps around + - 
btrfs: mark dirty extent range for out of bound prealloc extents + - clk: qcom: gcc-ipq6018: rework nss_port5 clock to multiple conf + - clk: renesas: rzv2h: Re-assert reset on deassert timeout + - clk: samsung: exynos990: Add missing USB clock registers to HSI0 + - fs/hpfs: Fix error code for new_inode() failure in + mkdir/create/mknod/symlink + - clocksource: hyper-v: Skip unnecessary checks for the root partition + - hyperv: Add missing field to hv_output_map_device_interrupt + - um: Fix help message for ssl-non-raw + - clk: sunxi-ng: sun6i-rtc: Add A523 specifics + - rtc: pcf2127: clear minute/second interrupt + - ARM: at91: pm: save and restore ACR during PLL disable/enable + - clk: at91: add ACR in all PLL settings + - clk: at91: sam9x7: Add peripheral clock id for pmecc + - clk: at91: clk-master: Add check for divide by 3 + - clk: at91: clk-sam9x60-pll: force write to PLL_UPDT register + - clk: ti: am33xx: keep WKUP_DEBUGSS_CLKCTRL enabled + - clk: scmi: Add duty cycle ops only when duty cycle is supported + - clk: clocking-wizard: Fix output clock register offset for Versal + platforms + - NTB: epf: Allow arbitrary BAR mapping + - 9p: fix /sys/fs/9p/caches overwriting itself + - cpufreq: tegra186: Initialize all cores to max frequencies + - 9p: sysfs_init: don't hardcode error to ENOMEM + - scsi: ufs: core: Include UTP error in INT_FATAL_ERRORS + - fbdev: core: Fix ubsan warning in pixel_to_pat + - ACPI: property: Return present device nodes only on fwnode interface + - LoongArch: Handle new atomic instructions for probes + - tools bitmap: Add missing asm-generic/bitsperlong.h include + - tools: lib: thermal: don't preserve owner in install + - tools: lib: thermal: use pkg-config to locate libnl3 + - ALSA: hda/realtek: Add quirk for ASUS ROG Zephyrus Duo + - rtc: zynqmp: Restore alarm functionality after kexec transition + - rtc: pcf2127: fix watchdog interrupt mask on pcf2131 + - net: wwan: t7xx: add support for HP DRMR-H01 + - kbuild: uapi: Strip comments 
before size type check + - ASoC: meson: aiu-encoder-i2s: fix bit clock polarity + - ASoC: rt722: add settings for rt722VB + - drm/amdgpu: Report individual reset error + - ceph: add checking of wait_for_completion_killable() return value + - ceph: fix potential race condition in ceph_ioctl_lazyio() + - ceph: refactor wake_up_bit() pattern of calling + - x86: uaccess: don't use runtime-const rewriting in modules + - rust: condvar: fix broken intra-doc link + - rust: devres: fix private intra-doc link + - rust: kbuild: workaround `rustdoc` doctests modifier bug + - rust: kbuild: treat `build_error` and `rustdoc` as kernel objects + - media: uvcvideo: Use heuristic to find stream entity + - Revert "wifi: ath10k: avoid unnecessary wait for service ready message" + - tracing: tprobe-events: Fix to register tracepoint correctly + - tracing: tprobe-events: Fix to put tracepoint_user when disable the + tprobe + - net: libwx: fix device bus LAN ID + - scsi: ufs: core: Fix a race condition related to the "hid" attribute + group + - riscv: ptdump: use seq_puts() in pt_dump_seq_puts() macro + - Revert "wifi: ath12k: Fix missing station power save configuration" + - scsi: ufs: core: Revert "Make HID attributes visible" + - Bluetooth: btrtl: Fix memory leak in rtlbt_parse_firmware_v2() + - net: dsa: tag_brcm: legacy: fix untagged rx on unbridged ports for + bcm63xx + - selftests/net: fix out-of-order delivery of FIN in gro:tcp test + - selftests/net: use destination options instead of hop-by-hop + - selftests: netdevsim: Fix ethtool-coalesce.sh fail by installing + ethtool-common.sh + - net: vlan: sync VLAN features with lower device + - net: dsa: b53: fix resetting speed and pause on forced link + - net: dsa: b53: fix bcm63xx RGMII port link adjustment + - net: dsa: b53: fix enabling ip multicast + - net: dsa: b53: stop reading ARL entries if search is done + - net: dsa: b53: properly bound ARL searches for < 4 ARL bin chips + - sctp: Hold RCU read lock while iterating over 
address list + - sctp: Hold sock lock while iterating over address list + - net: ionic: add dma_wmb() before ringing TX doorbell + - net: ionic: map SKB after pseudo-header checksum prep + - octeontx2-pf: Fix devm_kcalloc() error checking + - bnxt_en: Fix a possible memory leak in bnxt_ptp_init + - bnxt_en: Always provide max entry and entry size in coredump segments + - bnxt_en: Fix warning in bnxt_dl_reload_down() + - wifi: mac80211_hwsim: Limit destroy_on_close radio removal to netgroup + - io_uring: fix types for region size calulation + - net/mlx5e: Fix return value in case of module EEPROM read error + - net: ti: icssg-prueth: Fix fdb hash size configuration + - net/mlx5e: SHAMPO, Fix header mapping for 64K pages + - net/mlx5e: SHAMPO, Fix skb size check for 64K pages + - net/mlx5e: SHAMPO, Fix header formulas for higher MTUs and 64K pages + - net: wan: framer: pef2256: Switch to devm_mfd_add_devices() + - net: dsa: microchip: Fix reserved multicast address table programming + - net: bridge: fix MST static key usage + - selftests/vsock: avoid false-positives when checking dmesg + - tracing: Fix memory leaks in create_field_var() + - drm/amd/display: Enable mst when it's detected but yet to be initialized + - wifi: cfg80211: add an hrtimer based delayed work item + - wifi: mac80211: use wiphy_hrtimer_work for ml_reconf_work + - wifi: mac80211: use wiphy_hrtimer_work for ttlm_work + - wifi: mac80211: use wiphy_hrtimer_work for csa.switch_work + - riscv: Fix memory leak in module_frob_arch_sections() + - rtc: rx8025: fix incorrect register reference + - x86/microcode/AMD: Add more known models to entry sign checking + - smb: client: validate change notify buffer before copy + - x86/amd_node: Fix AMD root device caching + - xfs: fix delalloc write failures in software-provided atomic writes + - xfs: fix various problems in xfs_atomic_write_cow_iomap_begin + - x86/CPU/AMD: Add missing terminator for zen5_rdseed_microcode + - drm: define NVIDIA DRM format modifiers 
for GB20x + - drm/nouveau: Advertise correct modifiers on GB20x + - drm/amdgpu/smu: Handle S0ix for vangogh + - drm/amdkfd: Don't clear PT after process killed + - virtio_net: fix alignment for virtio_net_hdr_v1_hash + - lib/crypto: curve25519-hacl64: Fix older clang KASAN workaround for GCC + - scsi: ufs: ufs-pci: Fix S0ix/S3 for Intel controllers + - scsi: ufs: ufs-pci: Set UFSHCD_QUIRK_PERFORM_LINK_STARTUP_ONCE for Intel + ADL + - scsi: ufs: core: Add a quirk to suppress link_startup_again + - drm/amd/display: update color on atomic commit time + - extcon: adc-jack: Cleanup wakeup source only if it was enabled + - kunit: Extend kconfig help text for KUNIT_UML_PCI + - ALSA: hda/tas2781: Enable init_profile_id for device initialization + - ACPI: SPCR: Check for table version when using precise baudrate + - kbuild: Strip trailing padding bytes from modules.builtin.modinfo + - drm/amdgpu: Fix unintended error log in VCN5_0_0 + - drm/amd/display: Fix vupdate_offload_work doc + - drm/amdgpu: Fix function header names in amdgpu_connectors.c + - drm/amdgpu/userq: assign an error code for invalid userq va + - drm/msm/dpu: Fix adjusted mode clock check for 3d merge + - drm/amd/display: Reject modes with too high pixel clock on DCE6-10 + - drm/amd/display: use GFP_NOWAIT for allocation in interrupt handler + - drm/amd/display: Fix black screen with HDMI outputs + - selftests: drv-net: Reload pkt pointer after calling filter_udphdr + - dt-bindings: eeprom: at25: use "size" for FRAMs without device ID + - Linux 6.17.8 + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68316 + - scsi: ufs: core: Fix invalid probe error return value + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40292 + - virtio-net: fix received length check in big packets + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68180 + - drm/amd/display: Fix NULL deref in debugfs odm_combine_segments + * Questing 
update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40327 + - perf/core: Fix system hang caused by cpu-clock usage + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40328 + - smb: client: fix potential UAF in smb2_close_cached_fid() + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40291 + - io_uring: fix regbuf vector size truncation + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68322 + - parisc: Avoid crash due to unaligned access in unwinder + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40293 + - iommufd: Don't overflow during division for dirty tracking + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40294 + - Bluetooth: MGMT: Fix OOB access in parse_adv_monitor_pattern() + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40329 + - drm/sched: Fix deadlock in drm_sched_entity_kill_jobs_cb + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40295 + - fscrypt: fix left shift underflow when inode->i_blkbits > PAGE_SHIFT + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40296 + - platform/x86: int3472: Fix double free of GPIO device during unregister + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40297 + - net: bridge: fix use-after-free due to MST port state bypass + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68320 + - lan966x: Fix sleeping in atomic context + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68169 + - netpoll: Fix deadlock in memory allocation under spinlock + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68197 + - bnxt_en: Fix null pointer dereference in bnxt_bs_trace_check_wrap() + * Questing update: v6.17.8 upstream stable release (LP: 
#2136833) // + CVE-2025-40330 + - bnxt_en: Shutdown FW DMA in bnxt_shutdown() + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68192 + - net: usb: qmi_wwan: initialize MAC header offset in qmimux_rx_fixup + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40331 + - sctp: Prevent TOCTOU out-of-bounds write + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68187 + - net: mdio: Check regmap pointer returned by device_node_to_regmap() + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68167 + - gpiolib: fix invalid pointer access in debugfs + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68319 + - netconsole: Acquire su_mutex before navigating configs hierarchy + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40298 + - gve: Implement settime64 with -EOPNOTSUPP + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40299 + - gve: Implement gettimex64 with -EOPNOTSUPP + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40301 + - Bluetooth: hci_event: validate skb length for unknown CC opcode + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40358 + - riscv: stacktrace: Disable KASAN checks for non-current tasks + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68186 + - ring-buffer: Do not warn in ring_buffer_map_get_reader() when reader + catches up + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68184 + - drm/mediatek: Disable AFBC support on Mediatek DRM driver + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40302 + - media: videobuf2: forbid remove_bufs when legacy fileio is active + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40303 + - btrfs: ensure no 
dirty metadata is written back for an fs with errors + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40362 + - ceph: fix multifs mds auth caps issue + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40332 + - drm/amdkfd: Fix mmap write lock not release + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40304 + - fbdev: Add bounds checking in bit_putcs to fix vmalloc-out-of-bounds + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40305 + - 9p/trans_fd: p9_fd_request: kick rx thread if EPOLLIN + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68318 + - clk: thead: th1520-ap: set all AXI clocks to CLK_IS_CRITICAL + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40209 + - btrfs: fix memory leak of qgroup_list in btrfs_add_qgroup_relation + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68183 + - ima: don't clear IMA_DIGSIG flag when setting or removing non-IMA xattr + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68173 + - ftrace: Fix softlockup in ftrace_module_enable + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40306 + - orangefs: fix xattr related buffer overflow... 
+ * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40307 + - exfat: validate cluster allocation bits of the allocation bitmap + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40308 + - Bluetooth: bcsp: receive data only if registered + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40309 + - Bluetooth: SCO: Fix UAF on sco_conn_free + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68174 + - amd/amdkfd: enhance kfd process check in switch partition + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40310 + - amd/amdkfd: resolve a race in amdgpu_amdkfd_device_fini_sw + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40361 + - fs: ext4: change GFP_KERNEL to GFP_NOFS to avoid deadlock + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40311 + - accel/habanalabs: support mapping cb with vmalloc-backed coherent memory + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68185 + - nfs4_setup_readdir(): insufficient locking for ->d_parent->d_inode + dereferencing + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68176 + - PCI: cadence: Check for the existence of cdns_pcie::ops before using it + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68190 + - drm/amdgpu/atom: Check kcalloc() for WS buffer in + amdgpu_atom_execute_table_locked() + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68168 + - jfs: fix uninitialized waitqueue in transaction manager + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40312 + - jfs: Verify inode mode when loading from disk + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40333 + - f2fs: fix infinite loop in __insert_extent_tree() + * 
Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68321 + - page_pool: always add GFP_NOWARN for ATOMIC allocations + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40334 + - drm/amdgpu: validate userq buffer virtual address and size + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68191 + - udp_tunnel: use netdev_warn() instead of netdev_WARN() + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68309 + - PCI/AER: Fix NULL pointer access by aer_info + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40313 + - ntfs3: pretend $Extend records as regular files + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40335 + - drm/amdgpu: validate userq input args + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40314 + - usb: cdns3: gadget: Use-after-free during failed initialization and exit + of cdnsp gadget + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40336 + - drm/gpusvm: fix hmm_pfn_to_map_order() usage + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68193 + - drm/xe/guc: Add devm release action to safely tear down CT + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68175 + - media: nxp: imx8-isi: Fix streaming cleanup on release + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68188 + - tcp: use dst_dev_rcu() in tcp_fastopen_active_disable_ofo_check() + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68315 + - f2fs: fix to detect potential corrupted nid in free_nid_list + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40337 + - net: stmmac: Correctly handle Rx checksum offload errors + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + 
CVE-2025-40338 + - ASoC: Intel: avs: Do not share the name pointer between components + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40339 + - drm/amdgpu: fix nullptr err of vm_handle_moved + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68194 + - media: imon: make send_packet() more robust + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40363 + - net: ipv6: fix field-spanning memcpy warning in AH output + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68311 + - tty: serial: ip22zilog: Use platform device for probing + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40340 + - drm/xe: Fix oops in xe_gem_fault when running core_hotunplug test. + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68196 + - drm/amd/display: Cache streams targeting link when performing LT + automation + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68178 + - blk-cgroup: fix possible deadlock while configuring policy + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40341 + - futex: Don't leak robust_list pointer on exec race + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40342 + - nvme-fc: use lock accessing port_state and rport state + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40343 + - nvmet-fc: avoid scheduling association deletion twice + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68177 + - cpufreq/longhaul: handle NULL policy in longhaul_exit + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68317 + - io_uring/zctx: check chained notif contexts + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40315 + - usb: gadget: f_fs: Fix epfile null pointer access 
after ep enable. + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40316 + - drm/mediatek: Fix device use-after-free on unbind + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40360 + - drm/sysfb: Do not dereference NULL pointer in plane reset + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68179 + - s390: Disable ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68310 + - s390/pci: Avoid deadlock between PCI error recovery and mlx5 crdump + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40317 + - regmap: slimbus: fix bus_context pointer in regmap init calls + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40359 + - perf/x86/intel: Fix KASAN global-out-of-bounds warning + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68181 + - drm/radeon: Remove calls to drm_put_dev() + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68170 + - drm/radeon: Do not kfree() devres managed rdev + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40213 + - Bluetooth: MGMT: fix crash in set_mesh_sync and set_mesh_complete + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40318 + - Bluetooth: hci_sync: fix race in hci_cmd_sync_dequeue_once + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68312 + - usbnet: Prevents free active kevent + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40344 + - ASoC: Intel: avs: Disable periods-elapsed work when closing PCM + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68172 + - crypto: aspeed - fix double free caused by devm + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + 
CVE-2025-40319 + - bpf: Sync pending IRQ work before freeing ring buffer + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68182 + - wifi: iwlwifi: fix potential use after free in iwl_mld_remove_link() + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68314 + - drm/msm: make sure last_fence is always updated + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68189 + - drm/msm: Fix GEM free for imported dma-bufs + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68171 + - x86/fpu: Ensure XFD state on signal delivery + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-68313 + - x86/CPU/AMD: Add RDSEED fix for Zen5 + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40320 + - smb: client: fix potential cfid UAF in smb2_query_info_compound + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40321 + - wifi: brcmfmac: fix crash while sending Action Frames in standalone AP + Mode + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40322 + - fbdev: bitblit: bound-check glyph index in bit_putcs* + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40211 + - ACPI: video: Fix use-after-free in acpi_video_switch_brightness() + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40323 + - fbcon: Set fb_display[i]->mode to NULL when the mode is released + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40210 + - Revert "NFSD: Remove the cap on number of operations per NFSv4 COMPOUND" + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40324 + - NFSD: Fix crash in nfsd4_read_release() + * Questing update: v6.17.8 upstream stable release (LP: #2136833) // + CVE-2025-40326 + - NFSD: Define actions for the new time_deleg 
FATTR4 attributes + * Questing update: v6.17.7 upstream stable release (LP: #2136813) + - sched_ext: Move internal type and accessor definitions to ext_internal.h + - sched_ext: Put event_stats_cpu in struct scx_sched_pcpu + - sched_ext: Sync error_irq_work before freeing scx_sched + - timekeeping: Fix aux clocks sysfs initialization loop bound + - x86/bugs: Report correct retbleed mitigation status + - x86/bugs: Qualify RETBLEED_INTEL_MSG + - genirq/chip: Add buslock back in to irq_set_handler() + - genirq/manage: Add buslock back in to __disable_irq_nosync() + - genirq/manage: Add buslock back in to enable_irq() + - audit: record fanotify event regardless of presence of rules + - EDAC/ie31200: Add two more Intel Alder Lake-S SoCs for EDAC support + - perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into INTEL_FIXED_BITS_MASK + - perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of + current->mm == NULL + - perf: Have get_perf_callchain() return NULL if crosstask and user are + set + - perf: Skip user unwind if the task is a kernel thread + - EDAC: Fix wrong executable file modes for C source files + - seccomp: passthrough uprobe systemcall without filtering + - sched_ext: Keep bypass on between enable failure and + scx_disable_workfn() + - x86/bugs: Add attack vector controls for VMSCAPE + - x86/bugs: Fix reporting of LFENCE retpoline + - EDAC/mc_sysfs: Increase legacy channel support to 16 + - cpuset: Use new excpus for nocpu error check when enabling root + partition + - btrfs: abort transaction on specific error places when walking log tree + - btrfs: abort transaction in the process_one_buffer() log tree walk + callback + - btrfs: zoned: return error from btrfs_zone_finish_endio() + - btrfs: zoned: refine extent allocator hint selection + - btrfs: scrub: replace max_t()/min_t() with clamp() in + scrub_throttle_dev_io() + - btrfs: always drop log root tree reference in btrfs_replay_log() + - btrfs: use level argument in log tree walk callback 
replay_one_buffer() + - btrfs: abort transaction if we fail to update inode in log replay dir + fixup + - btrfs: tree-checker: add inode extref checks + - btrfs: use smp_mb__after_atomic() when forcing COW in + create_pending_snapshot() + - sched_ext: Make qmap dump operation non-destructive + - arch: Add the macro COMPILE_OFFSETS to all the asm-offsets.c + - btrfs: tree-checker: fix bounds check in check_inode_extref() + - Linux 6.17.7 + * [UBUNTU 24.04] KVM: s390: improve interrupt cpu for wakeup (LP: #2132317) + - KVM: s390: improve interrupt cpu for wakeup + * Questing update: v6.17.6 upstream stable release (LP: #2134982) + - sched/fair: Block delayed tasks on throttled hierarchy during dequeue + - vfio/cdx: update driver to build without CONFIG_GENERIC_MSI_IRQ + - expfs: Fix exportfs_can_encode_fh() for EXPORT_FH_FID + - cgroup/misc: fix misc_res_type kernel-doc warning + - dlm: move to rinfo for all middle conversion cases + - exec: Fix incorrect type for ret + - s390/pkey: Forward keygenflags to ep11_unwrapkey + - hfs: clear offset and space out of valid records in b-tree node + - hfs: make proper initalization of struct hfs_find_data + - hfs: validate record offset in hfsplus_bmap_alloc + - hfsplus: fix KMSAN uninit-value issue in hfsplus_delete_cat() + - dlm: check for defined force value in dlm_lockspace_release + - hfsplus: return EIO when type of hidden directory mismatch in + hfsplus_fill_super() + - PCI: Test for bit underflow in pcie_set_readrq() + - lkdtm: fortify: Fix potential NULL dereference on kmalloc failure + - arm64: sysreg: Correct sign definitions for EIESB and DoubleLock + - m68k: bitops: Fix find_*_bit() signatures + - powerpc/32: Remove PAGE_KERNEL_TEXT to fix startup failure + - riscv: mm: Return intended SATP mode for noXlvl options + - riscv: mm: Use mmu-type from FDT to limit SATP mode + - riscv: cpufeature: add validation for zfa, zfh and zfhmin + - drivers/perf: hisi: Relax the event ID check in the framework + - s390/mm: Use 
__GFP_ACCOUNT for user page table allocations + - smb: client: queue post_recv_credits_work also if the peer raises the + credit target + - smb: client: limit the range of info->receive_credit_target + - smb: client: make use of ib_wc_status_msg() and skip IB_WC_WR_FLUSH_ERR + logging + - smb: server: let smb_direct_flush_send_list() invalidate a remote key + first + - Unbreak 'make tools/*' for user-space targets + - platform/mellanox: mlxbf-pmc: add sysfs_attr_init() to count_clock init + - cpufreq/amd-pstate: Fix a regression leading to EPP 0 after hibernate + - net/mlx5e: Return 1 instead of 0 in invalid case in + mlx5e_mpwrq_umr_entry_size() + - rtnetlink: Allow deleting FDB entries in user namespace + - net: enetc: fix the deadlock of enetc_mdio_lock + - net: enetc: correct the value of ENETC_RXB_TRUESIZE + - dpaa2-eth: fix the pointer passed to PTR_ALIGN on Tx path + - net: phy: realtek: fix rtl8221b-vm-cg name + - can: bxcan: bxcan_start_xmit(): use can_dev_dropped_skb() instead of + can_dropped_invalid_skb() + - can: esd: acc_start_xmit(): use can_dev_dropped_skb() instead of + can_dropped_invalid_skb() + - can: rockchip-canfd: rkcanfd_start_xmit(): use can_dev_dropped_skb() + instead of can_dropped_invalid_skb() + - selftests: net: fix server bind failure in sctp_vrf.sh + - net/mlx5e: RX, Fix generating skb from non-linear xdp_buff for legacy RQ + - net/mlx5e: RX, Fix generating skb from non-linear xdp_buff for striding + RQ + - net/smc: fix general protection fault in __smc_diag_dump + - net: ethernet: ti: am65-cpts: fix timestamp loss due to race conditions + - arm64, mm: avoid always making PTE dirty in pte_mkwrite() + - erofs: avoid infinite loops due to corrupted subpage compact indexes + - net: hibmcge: select FIXED_PHY + - ptp: ocp: Fix typo using index 1 instead of i in SMA initialization loop + - net: hsr: prevent creation of HSR device with slaves from another netns + - espintcp: use datagram_poll_queue for socket readiness + - net: datagram: 
introduce datagram_poll_queue for custom receive queues + - ovpn: use datagram_poll_queue for socket readiness in TCP + - net: bonding: fix possible peer notify event loss or dup issue + - hung_task: fix warnings caused by unaligned lock pointers + - mm: don't spin in add_stack_record when gfp flags don't allow + - dma-debug: don't report false positives with + DMA_BOUNCE_UNALIGNED_KMALLOC + - arch_topology: Fix incorrect error check in + topology_parse_cpu_capacity() + - riscv: hwprobe: Fix stale vDSO data for late-initialized keys at boot + - io_uring/sqpoll: switch away from getrusage() for CPU accounting + - io_uring/sqpoll: be smarter on when to update the stime usage + - btrfs: send: fix duplicated rmdir operations when using extrefs + - btrfs: ref-verify: fix IS_ERR() vs NULL check in btrfs_build_ref_tree() + - gpio: pci-idio-16: Define maximum valid register address offset + - gpio: 104-idio-16: Define maximum valid register address offset + - xfs: fix locking in xchk_nlinks_collect_dir + - platform/x86: alienware-wmi-wmax: Add AWCC support to Dell G15 5530 + - Revert "cpuidle: menu: Avoid discarding useful information" + - riscv: cpufeature: avoid uninitialized variable in + has_thead_homogeneous_vlenb() + - rust: device: fix device context of Device::parent() + - slab: Avoid race on slab->obj_exts in alloc_slab_obj_exts + - slab: Fix obj_ext mistakenly considered NULL due to race condition + - smb: client: get rid of d_drop() in cifs_do_rename() + - ACPICA: Work around bogus -Wstringop-overread warning since GCC 11 + - arm64: mte: Do not warn if the page is already tagged in copy_highpage() + - can: netlink: can_changelink(): allow disabling of automatic restart + - cifs: Fix TCP_Server_Info::credits to be signed + - devcoredump: Fix circular locking dependency with devcd->mutex. 
+ - hwmon: (pmbus/max34440) Update adpm12160 coeff due to latest FW + - MIPS: Malta: Fix keyboard resource preventing i8042 driver from + registering + - rv: Make rtapp/pagefault monitor depends on CONFIG_MMU + - net: bonding: update the slave array for broadcast mode + - net: stmmac: dwmac-rk: Fix disabling set_clock_selection + - net: usb: rtl8150: Fix frame padding + - net: ravb: Enforce descriptor type ordering + - net: ravb: Ensure memory write completes before ringing TX doorbell + - mptcp: pm: in-kernel: C-flag: handle late ADD_ADDR + - selftests: mptcp: join: mark 'flush re-add' as skipped if not supported + - selftests: mptcp: join: mark implicit tests as skipped if not supported + - selftests: mptcp: join: mark 'delete re-add signal' as skipped if not + supported + - mm/mremap: correctly account old mapping after MREMAP_DONTUNMAP remap + - drm/xe: Check return value of GGTT workqueue allocation + - drm/amd/display: increase max link count and fix link->enc NULL pointer + access + - mm/damon/core: use damos_commit_quota_goal() for new goal commit + - mm/damon/core: fix list_add_tail() call on damon_call() + - spi: rockchip-sfc: Fix DMA-API usage + - firmware: arm_ffa: Add support for IMPDEF value in the memory access + descriptor + - spi: spi-nxp-fspi: add the support for sample data from DQS pad + - spi: spi-nxp-fspi: re-config the clock rate when operation require new + clock rate + - spi: spi-nxp-fspi: add extra delay after dll locked + - spi: spi-nxp-fspi: limit the clock rate for different sample clock + source selection + - spi: cadence-quadspi: Fix pm_runtime unbalance on dma EPROBE_DEFER + - arm64: dts: broadcom: bcm2712: Add default GIC address cells + - arm64: dts: broadcom: bcm2712: Define VGIC interrupt + - include: trace: Fix inflight count helper on failed initialization + - firmware: arm_scmi: Fix premature SCMI_XFER_FLAG_IS_RAW clearing in raw + mode + - spi: airoha: return an error for continuous mode dirmap creation cases + - spi: airoha: 
add support of dual/quad wires spi modes to exec_op() + handler + - spi: airoha: switch back to non-dma mode in the case of error + - spi: airoha: fix reading/writing of flashes with more than one plane per + lun + - sysfs: check visibility before changing group attribute ownership + - RISC-V: Define pgprot_dmacoherent() for non-coherent devices + - RISC-V: Don't print details of CPUs disabled in DT + - riscv: hwprobe: avoid uninitialized variable use in hwprobe_arch_id() + - hwmon: (pmbus/isl68137) Fix child node reference leak on early return + - hwmon: (sht3x) Fix error handling + - io_uring: fix incorrect unlikely() usage in io_waitid_prep() + - nbd: override creds to kernel when calling sock_{send,recv}msg() + - drm/panic: Fix drawing the logo on a small narrow screen + - drm/panic: Fix qr_code, ensure vmargin is positive + - drm/panic: Fix 24bit pixel crossing page boundaries + - of/irq: Convert of_msi_map_id() callers to of_msi_xlate() + - of/irq: Add msi-parent check to of_msi_xlate() + - block: require LBA dma_alignment when using PI + - gpio: ljca: Fix duplicated IRQ mapping + - io_uring: correct __must_hold annotation in io_install_fixed_file + - sched: Remove never used code in mm_cid_get() + - USB: serial: option: add UNISOC UIS7720 + - USB: serial: option: add Quectel RG255C + - USB: serial: option: add Telit FN920C04 ECM compositions + - usb/core/quirks: Add Huawei ME906S to wakeup quirk + - usb: raw-gadget: do not limit transfer length + - xhci: dbc: enable back DbC in resume if it was enabled before suspend + - xhci: dbc: fix bogus 1024 byte prefix if ttyDBC read races with stall + event + - x86/microcode: Fix Entrysign revision check for Zen1/Naples + - binder: remove "invalid inc weak" check + - mei: me: add wildcat lake P DID + - objtool/rust: add one more `noreturn` Rust function + - nvmem: rcar-efuse: add missing MODULE_DEVICE_TABLE + - misc: fastrpc: Fix dma_buf object leak in fastrpc_map_lookup + - most: usb: hdm_probe: Fix calling 
put_device() before device + initialization + - tcpm: switch check for role_sw device with fw_node + - dt-bindings: serial: sh-sci: Fix r8a78000 interrupts + - dt-bindings: usb: dwc3-imx8mp: dma-range is required only for imx8mp + - dt-bindings: usb: qcom,snps-dwc3: Fix bindings for X1E80100 + - serial: 8250_dw: handle reset control deassert error + - serial: 8250_exar: add support for Advantech 2 port card with Device ID + 0x0018 + - serial: 8250_mtk: Enable baud clock and manage in runtime PM + - serial: sc16is7xx: remove useless enable of enhanced features + - staging: gpib: Fix device reference leak in fmh_gpib driver + - staging: gpib: Fix no EOI on 1 and 2 byte writes + - staging: gpib: Return -EINTR on device clear + - staging: gpib: Fix sending clear and trigger events + - mm/migrate: remove MIGRATEPAGE_UNMAP + - treewide: remove MIGRATEPAGE_SUCCESS + - vmw_balloon: indicate success when effectively deflating during + migration + - xfs: always warn about deprecated mount options + - gpio: regmap: Allow to allocate regmap-irq device + - gpio: regmap: add the .fixed_direction_output configuration parameter + - gpio: idio-16: Define fixed direction of the GPIO lines + - Linux 6.17.6 + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40084 + - ksmbd: transport_ipc: validate payload size before reading handle + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40222 + - tty: serial: sh-sci: fix RSCI FIFO overrun handling + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40223 + - most: usb: Fix use-after-free in hdm_disconnect + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40106 + - comedi: fix divide-by-zero in comedi_buf_munge() + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40224 + - hwmon: (cgbc-hwmon) Add missing NULL check after devm_kzalloc() + * Questing update: v6.17.6 upstream stable release 
(LP: #2134982) // + CVE-2025-40225 + - drm/panthor: Fix kernel panic on partial unmap of a GPU VA region + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40226 + - firmware: arm_scmi: Account for failed debug initialization + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40227 + - mm/damon/sysfs: dealloc commit test ctx always + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40228 + - mm/damon/sysfs: catch commit test ctx alloc failure + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40229 + - mm/damon/core: fix potential memory leak by cleaning ops_filter in + damon_destroy_scheme + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40230 + - mm: prevent poison consumption when splitting THP + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40231 + - vsock: fix lock inversion in vsock_assign_transport() + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40233 + - ocfs2: clear extent cache after moving/defragmenting extents + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40235 + - btrfs: directly free partially initialized fs_info in + btrfs_check_leaked_roots() + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40236 + - virtio-net: zero unused hash fields + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40237 + - fs/notify: call exportfs_encode_fid with s_umount + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40238 + - net/mlx5: Fix IPsec cleanup over MPV device + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40239 + - net: phy: micrel: always set shared->phydev for LAN8814 + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40240 + - sctp: avoid 
NULL dereference when chunk data buffer is missing + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40241 + - erofs: fix crafted invalid cases for encoded extents + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40242 + - gfs2: Fix unlikely race in gdlm_put_lock + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40243 + - hfs: fix KMSAN uninit-value issue in hfs_find_set_zero_bits() + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40244 + - hfsplus: fix KMSAN uninit-value issue in __hfsplus_ext_cache_extent() + * Questing update: v6.17.6 upstream stable release (LP: #2134982) // + CVE-2025-40245 + - nios2: ensure that memblock.current_limit is set when setting pfn limits + * Questing update: v6.17.5 upstream stable release (LP: #2133557) + - docs: kdoc: handle the obsolescensce of docutils.ErrorString() + - Revert "fs: make vfs_fileattr_[get|set] return -EOPNOTSUPP" + - PCI: vmd: Override irq_startup()/irq_shutdown() in + vmd_init_dev_msi_info() + - ata: libata-core: relax checks in ata_read_log_directory() + - arm64/sysreg: Fix GIC CDEOI instruction encoding + - ixgbevf: fix getting link speed data for E610 devices + - rust: cfi: only 64-bit arm and x86 support CFI_CLANG + - x86/CPU/AMD: Prevent reset reasons from being retained across reboot + - slab: reset slab->obj_ext when freeing and it is OBJEXTS_ALLOC_FAIL + - Revert "io_uring/rw: drop -EOPNOTSUPP check in + __io_complete_rw_common()" + - io_uring: protect mem region deregistration + - Revert "drm/amd/display: Only restore backlight after amdgpu_dm_init or + dm_resume" + - r8152: add error handling in rtl8152_driver_init + - net: usb: lan78xx: Fix lost EEPROM write timeout error(-ETIMEDOUT) in + lan78xx_write_raw_eeprom + - f2fs: fix wrong block mapping for multi-devices + - gve: Check valid ts bit on RX descriptor before hw timestamping + - jbd2: ensure that all ongoing I/O 
complete before freeing blocks + - ext4: wait for ongoing I/O to complete before freeing blocks + - btrfs: fix clearing of BTRFS_FS_RELOC_RUNNING if relocation already + running + - btrfs: fix memory leak on duplicated memory in the qgroup assign ioctl + - btrfs: only set the device specific options after devices are opened + - btrfs: fix incorrect readahead expansion length + - can: gs_usb: gs_make_candev(): populate net_device->dev_port + - can: gs_usb: increase max interface to U8_MAX + - cxl/acpi: Fix setup of memory resource in cxl_acpi_set_cache_size() + - ALSA: hda/intel: Add MSI X870E Tomahawk to denylist + - ALSA: hda/realtek: Add quirk entry for HP ZBook 17 G6 + - drm/amdgpu: use atomic functions with memory barriers for vm fault info + - drm/amdgpu: fix gfx12 mes packet status return check + - drm/xe: Increase global invalidation timeout to 1000us + - perf/core: Fix address filter match with backing files + - perf/core: Fix MMAP event path names with backing files + - perf/core: Fix MMAP2 event device with backing files + - drm/amd: Check whether secure display TA loaded successfully + - PM: hibernate: Add pm_hibernation_mode_is_suspend() + - drm/amd: Fix hybrid sleep + - usb: gadget: Store endpoint pointer in usb_request + - usb: gadget: Introduce free_usb_request helper + - HID: multitouch: fix sticky fingers + - dax: skip read lock assertion for read-only filesystems + - coredump: fix core_pattern input validation + - can: m_can: m_can_plat_remove(): add missing pm_runtime_disable() + - can: m_can: m_can_handle_state_errors(): fix CAN state transition to + Error Active + - can: m_can: m_can_chip_config(): bring up interface in correct state + - can: m_can: fix CAN state in system PM + - net: mtk: wed: add dma mask limitation and GFP_DMA32 for device with + more than 4GB DRAM + - net: dlink: handle dma_map_single() failure properly + - doc: fix seg6_flowlabel path + - can: j1939: add missing calls in NETDEV_UNREGISTER notification handler + - dpll: 
zl3073x: Refactor DPLL initialization + - dpll: zl3073x: Handle missing or corrupted flash configuration + - r8169: fix packet truncation after S4 resume on RTL8168H/RTL8111H + - net: phy: bcm54811: Fix GMII/MII/MII-Lite selection + - net: phy: realtek: Avoid PHYCR2 access if PHYCR2 not present + - amd-xgbe: Avoid spurious link down messages during interface toggle + - Octeontx2-af: Fix missing error code in cgx_probe() + - tcp: fix tcp_tso_should_defer() vs large RTT + - net: airoha: Take into account out-of-order tx completions in + airoha_dev_xmit() + - selftests: net: check jq command is supported + - net: core: fix lockdep splat on device unregister + - ksmbd: fix recursive locking in RPC handle list access + - tg3: prevent use of uninitialized remote_adv and local_adv variables + - tls: trim encrypted message to match the plaintext on short splice + - tls: wait for async encrypt in case of error during latter iterations of + sendmsg + - tls: always set record_type in tls_process_cmsg + - tls: don't rely on tx_work during send() + - netdevsim: set the carrier when the device goes up + - net: usb: lan78xx: fix use of improperly initialized dev->chipid in + lan78xx_reset + - drm/panthor: Ensure MCU is disabled on suspend + - nvme-multipath: Skip nr_active increments in RETRY disposition + - riscv: kprobes: Fix probe address validation + - drm/bridge: lt9211: Drop check for last nibble of version register + - powerpc/fadump: skip parameter area allocation when fadump is disabled + - ASoC: codecs: Fix gain setting ranges for Renesas IDT821034 codec + - ASoC: nau8821: Cancel jdet_work before handling jack ejection + - ASoC: nau8821: Generalize helper to clear IRQ status + - ASoC: nau8821: Consistently clear interrupts before unmasking + - ASoC: nau8821: Add DMI quirk to bypass jack debounce circuit + - drm/i915/guc: Skip communication warning on reset in progress + - drm/i915/frontbuffer: Move bo refcounting + intel_frontbuffer_{get,release}() + - drm/i915/fb: Fix 
the set_tiling vs. addfb race, again + - drm/amdgpu: add ip offset support for cyan skillfish + - drm/amdgpu: add support for cyan skillfish without IP discovery + - drm/amdgpu: fix handling of harvesting for ip_discovery firmware + - drm/amdgpu: handle wrap around in reemit handling + - drm/amdgpu: set an error on all fences from a bad context + - drm/amdgpu: drop unused structures in amdgpu_drm.h + - drm/amd/powerplay: Fix CIK shutdown temperature + - drm/xe: Enable media sampler power gating + - drm/draw: fix color truncation in drm_draw_fill24 + - drm/rockchip: vop2: use correct destination rectangle height check + - HID: intel-thc-hid: Intel-quickspi: switch first interrupt from level to + edge detection + - sched/fair: Fix pelt lost idle time detection + - ALSA: firewire: amdtp-stream: fix enum kernel-doc warnings + - accel/qaic: Synchronize access to DBC request queue head & tail pointer + - nvme-auth: update sc_c in host response + - cxl/trace: Subtract to find an hpa_alias0 in cxl_poison events + - selftests/bpf: make arg_parsing.c more robust to crashes + - blk-mq: fix stale tag depth for shared sched tags in + blk_mq_update_nr_requests() + - block: Remove elevator_lock usage from blkg_conf frozen operations + - HID: hid-input: only ignore 0 battery events for digitizers + - HID: multitouch: fix name of Stylus input devices + - drm/xe/evict: drop bogus assert + - selftests: arg_parsing: Ensure data is flushed to disk before reading. 
+ - nvme/tcp: handle tls partially sent records in write_space() + - rust: cpufreq: fix formatting + - arm64: debug: always unmask interrupts in el0_softstp() + - arm64: cputype: Add Neoverse-V3AE definitions + - arm64: errata: Apply workarounds for Neoverse-V3AE + - xfs: rename the old_crc variable in xlog_recover_process + - xfs: fix log CRC mismatches between i386 and other architectures + - NFSD: Rework encoding and decoding of nfsd4_deviceid + - NFSD: Minor cleanup in layoutcommit processing + - NFSD: Implement large extent array support in pNFS + - NFSD: Fix last write offset handling in layoutcommit + - phy: cdns-dphy: Store hs_clk_rate and return it + - phy: cadence: cdns-dphy: Fix PLL lock and O_CMN_READY polling + - x86/resctrl: Refactor resctrl_arch_rmid_read() + - x86/resctrl: Fix miscount of bandwidth event when reactivating + previously unavailable RMID + - cxl: Fix match_region_by_range() to use region_res_match_cxl_range() + - phy: cadence: cdns-dphy: Update calibration wait time for startup state + machine + - drm/xe: Use devm_ioremap_wc for VRAM mapping and drop manual unmap + - drm/xe: Use dynamic allocation for tile and device VRAM region + structures + - drm/xe: Move struct xe_vram_region to a dedicated header + - drm/xe: Unify the initialization of VRAM regions + - drm/xe: Move rebar to be done earlier + - PM: hibernate: Fix pm_hibernation_mode_is_suspend() build breakage + - drm/xe: Fix an IS_ERR() vs NULL bug in xe_tile_alloc_vram() + - Linux 6.17.5 + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40086 + - drm/xe: Don't allow evicting of BOs in same VM in array of VM binds + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40087 + - NFSD: Define a proc_layoutcommit for the FlexFiles layout type + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40088 + - hfsplus: fix slab-out-of-bounds read in hfsplus_strcasecmp() + * Questing update: v6.17.5 
upstream stable release (LP: #2133557) // + CVE-2025-40162 + - ASoC: amd/sdw_utils: avoid NULL deref when devm_kasprintf() fails + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40085 + - ALSA: usb-audio: Fix NULL pointer deference in try_to_register_card + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40172 + - accel/qaic: Treat remaining == 0 as error in find_and_map_user_pages() + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40177 + - accel/qaic: Fix bootlog initialization ordering + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40163 + - sched/deadline: Stop dl_server before CPU goes offline + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40174 + - x86/mm: Fix SMP ordering in switch_mm_irqs_off() + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40089 + - cxl/features: Add check for no entries in cxl_feature_info + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40176 + - tls: wait for pending async decryptions if tls_strp_msg_hold fails + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40164 + - usbnet: Fix using smp_processor_id() in preemptible code warnings + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40091 + - ixgbe: fix too early devlink_free() in ixgbe_remove() + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40175 + - idpf: cleanup remaining SKBs in PTP flows + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40173 + - net/ip6_tunnel: Prevent perpetual tunnel growth + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40092 + - usb: gadget: f_ncm: Refactor bind path to use __free() + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + 
CVE-2025-40093 + - usb: gadget: f_ecm: Refactor bind path to use __free() + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40094 + - usb: gadget: f_acm: Refactor bind path to use __free() + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40095 + - usb: gadget: f_rndis: Refactor bind path to use __free() + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40165 + - media: nxp: imx8-isi: m2m: Fix streaming cleanup on release + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40096 + - drm/sched: Fix potential double free in + drm_sched_job_add_resv_dependencies + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40097 + - ALSA: hda: Fix missing pointer check in hda_component_manager_init + function + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40098 + - ALSA: hda: cs35l41: Fix NULL pointer dereference in + cs35l41_get_acpi_mute_state() + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40099 + - cifs: parse_dfs_referrals: prevent oob on malformed input + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40100 + - btrfs: do not assert we found block group item when creating free space + tree + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40101 + - btrfs: fix memory leaks when rejecting a non SINGLE data profile without + an RST + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40167 + - ext4: detect invalid INLINE_DATA + EXTENTS flag combination + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40102 + - KVM: arm64: Prevent access to vCPU events before init + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40103 + - smb: client: Fix refcount leak for cifs_sb_tlink + * Questing update: 
v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40104 + - ixgbevf: fix mailbox API compatibility by negotiating supported features + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40166 + - drm/xe/guc: Check GuC running state before deregistering exec queue + * Questing update: v6.17.5 upstream stable release (LP: #2133557) // + CVE-2025-40105 + - vfs: Don't leak disconnected dentries on umount + * The machine didn’t go into suspend and got stuck (LP: #2132095) + - platform/x86: alienware-wmi-wmax: Fix NULL pointer dereference in sleep + handlers + * CAP_PERFMON insufficient to get perf data (LP: #2131046) + - SAUCE: perf/core: Allow CAP_PERFMON for paranoid level 4 + * Poweroff not working consistently after upgrading kernel 6.14.0-17.17 or + later (LP: #2115860) + - drm/amd: Unify shutdown() callback behavior + - drm/amd: Stop exporting amdgpu_device_ip_suspend() outside amdgpu_device + - drm/amd: Remove comment about handling errors in + amdgpu_device_ip_suspend_phase1() + - drm/amd: Don't always set IP block HW status to false + - drm/amd: Pass IP suspend errors up to callers + - drm/amd: Avoid evicting resources at S5 + * Re-enable INTEL_SKL_INT3472 for kernels >= 6.16 for Intel IPU camera + (LP: #2128792) + - Revert "UBUNTU: [Config] FTBFS: disable INTEL_SKL_INT3472" + - Revert "UBUNTU: SAUCE: platform/x86: int3472: Add handshake GPIO + function" + * Support Samsung S5K3J1 sensor for Intel MIPI camera (LP: #2121852) + - SAUCE: media: ipu-bridge: Support s5k3j1 sensor + * Questing update: v6.17.4 upstream stable release (LP: #2131259) + - fs: always return zero on success from replace_fd() + - fscontext: do not consume log entries when returning -EMSGSIZE + - btrfs: fix the incorrect max_bytes value for find_lock_delalloc_range() + - arm64: map [_text, _stext) virtual address range non-executable+read- + only + - rseq: Protect event mask against membarrier IPI + - statmount: don't call path_put() under namespace 
semaphore + - listmount: don't call path_put() under namespace semaphore + - clocksource/drivers/clps711x: Fix resource leaks in error paths + - memcg: skip cgroup_file_notify if spinning is not allowed + - page_pool: Fix PP_MAGIC_MASK to avoid crashing on some 32-bit arches + - PM: runtime: Update kerneldoc return codes + - dma-mapping: fix direction in dma_alloc direction traces + - cpufreq: Make drivers using CPUFREQ_ETERNAL specify transition latency + - nfsd: unregister with rpcbind when deleting a transport + - KVM: x86: Add helper to retrieve current value of user return MSR + - KVM: SVM: Emulate PERF_CNTR_GLOBAL_STATUS_SET for PerfMonV2 + - iio: frequency: adf4350: Fix ADF4350_REG3_12BIT_CLKDIV_MODE + - media: v4l2-subdev: Fix alloc failure check in + v4l2_subdev_call_state_try() + - asm-generic/io.h: Skip trace helpers if rwmmio events are disabled + - clk: npcm: select CONFIG_AUXILIARY_BUS + - clk: thead: th1520-ap: describe gate clocks with clk_gate + - clk: thead: th1520-ap: fix parent of padctrl0 clock + - clk: thead: Correct parent for DPU pixel clocks + - clk: renesas: r9a08g045: Add MSTOP for GPIO + - perf disasm: Avoid undefined behavior in incrementing NULL + - perf test trace_btf_enum: Skip if permissions are insufficient + - perf evsel: Avoid container_of on a NULL leader + - libperf event: Ensure tracing data is multiple of 8 sized + - clk: qcom: common: Fix NULL vs IS_ERR() check in qcom_cc_icc_register() + - clk: qcom: Select the intended config in QCS_DISPCC_615 + - perf parse-events: Handle fake PMUs in CPU terms + - clk: at91: peripheral: fix return value + - clk: renesas: cpg-mssr: Fix memory leak in cpg_mssr_reserved_init() + - perf: Completely remove possibility to override MAX_NR_CPUS + - perf drm_pmu: Fix fd_dir leaks in for_each_drm_fdinfo_in_dir() + - perf util: Fix compression checks returning -1 as bool + - rtc: x1205: Fix Xicor X1205 vendor prefix + - rtc: optee: fix memory leak on driver removal + - perf arm_spe: Correct setting 
remote access + - perf arm_spe: Correct memory level for remote access + - perf vendor events arm64 AmpereOneX: Fix typo - should be + l1d_cache_access_prefetches + - perf test: AMD IBS swfilt skip kernel tests if paranoia is >1 + - perf test shell lbr: Avoid failures with perf event paranoia + - perf trace: Fix IS_ERR() vs NULL check bug + - perf session: Fix handling when buffer exceeds 2 GiB + - perf test: Don't leak workload gopipe in PERF_RECORD_* + - perf evsel: Fix uniquification when PMU given without suffix + - perf test: Avoid uncore_imc/clockticks in uniquification test + - perf evsel: Ensure the fallback message is always written to + - perf build-id: Ensure snprintf string is empty when size is 0 + - clk: mediatek: mt8195-infra_ao: Fix parent for infra_ao_hdmi_26m + - clk: mediatek: clk-mux: Do not pass flags to + clk_mux_determine_rate_flags() + - clk: nxp: lpc18xx-cgu: convert from round_rate() to determine_rate() + - clk: nxp: Fix pll0 rate check condition in LPC18xx CGU driver + - clk: tegra: do not overallocate memory for bpmp clocks + - nfsd: fix assignment of ia_ctime.tv_nsec on delegated mtime update + - nfsd: ignore ATTR_DELEG when checking ia_valid before notify_change() + - vfs: add ATTR_CTIME_SET flag + - nfsd: use ATTR_CTIME_SET for delegated ctime updates + - nfsd: track original timestamps in nfs4_delegation + - nfsd: fix SETATTR updates for delegated timestamps + - nfsd: fix timestamp updates in CB_GETATTR + - tracing: Fix the bug where bpf_get_stackid returns -EFAULT on the ARM64 + - PM: core: Annotate loops walking device links as _srcu + - PM: core: Add two macros for walking device links + - PM: sleep: Do not wait on SYNC_STATE_ONLY device links + - cpufreq: tegra186: Set target frequency for all cpus in policy + - scsi: mvsas: Fix use-after-free bugs in mvs_work_queue + - perf bpf-filter: Fix opts declaration on older libbpfs + - scsi: ufs: sysfs: Make HID attributes visible + - mshv: Handle NEED_RESCHED_LAZY before transferring to 
guest + - perf bpf_counter: Fix handling of cpumap fixing hybrid + - ASoC: SOF: ipc4-topology: Correct the minimum host DMA buffer size + - ASoC: SOF: ipc4-topology: Account for different ChainDMA host buffer + size + - ASoC: SOF: Intel: hda-pcm: Place the constraint on period time instead + of buffer time + - LoongArch: Add cflag -fno-isolate-erroneous-paths-dereference + - LoongArch: Fix build error for LTO with LLVM-18 + - LoongArch: Init acpi_gbl_use_global_lock to false + - ASoC: SOF: Intel: Read the LLP via the associated Link DMA channel + - net: usb: lan78xx: Fix lost EEPROM read timeout error(-ETIMEDOUT) in + lan78xx_read_raw_eeprom + - net/mlx4: prevent potential use after free in mlx4_en_do_uc_filter() + - drm/xe/hw_engine_group: Fix double write lock release in error path + - drm/xe/i2c: Don't rely on d3cold.allowed flag in system PM path + - s390/cio: Update purge function to unregister the unused subchannels + - drm/vmwgfx: Fix a null-ptr access in the cursor snooper + - drm/vmwgfx: Fix Use-after-free in validation + - drm/vmwgfx: Fix copy-paste typo in validation + - net/sctp: fix a null dereference in sctp_disposition + sctp_sf_do_5_1D_ce() + - tcp: Don't call reqsk_fastopen_remove() in tcp_conn_request(). 
+ - net: mscc: ocelot: Fix use-after-free caused by cyclic delayed work + - selftest: net: ovpn: Fix uninit return values + - ice: ice_adapter: release xa entry on adapter allocation failure + - net: fsl_pq_mdio: Fix device node reference leak in fsl_pq_mdio_probe + - tools build: Align warning options with perf + - perf python: split Clang options when invoking Popen + - tcp: take care of zero tp->window_clamp in tcp_set_rcvlowat() + - mailbox: zynqmp-ipi: Remove redundant mbox_controller_unregister() call + - mailbox: zynqmp-ipi: Remove dev.parent check in zynqmp_ipi_free_mboxes + - mailbox: zynqmp-ipi: Fix out-of-bounds access in mailbox cleanup loop + - mailbox: zynqmp-ipi: Fix SGI cleanup on unbind + - bpf: Fix metadata_dst leak __bpf_redirect_neigh_v{4,6} + - net: mdio: mdio-i2c: Hold the i2c bus lock during smbus transactions + - net: sparx5/lan969x: fix flooding configuration on bridge join/leave + - net/mlx5: Prevent tunnel mode conflicts between FDB and NIC IPsec tables + - net/mlx5e: Prevent tunnel reformat when tunnel mode not allowed + - mailbox: mtk-cmdq: Remove pm_runtime APIs from cmdq_mbox_send_data() + - drm/amdgpu: Add additional DCE6 SCL registers + - drm/amd/display: Add missing DCE6 SCL_HORZ_FILTER_INIT* SRIs + - drm/amd/display: Properly clear SCL_*_FILTER_CONTROL on DCE6 + - drm/amd/display: Properly disable scaling on DCE6 + - drm/amd/display: Disable scaling on DCE6 for now + - drm/amdkfd: Fix kfd process ref leaking when userptr unmapping + - net: pse-pd: tps23881: Fix current measurement scaling + - crypto: skcipher - Fix reqsize handling + - netfilter: nft_objref: validate objref and objrefmap expressions + - bridge: br_vlan_fill_forward_path_pvid: use br_vlan_group_rcu() + - selftests: netfilter: nft_fib.sh: fix spurious test failures + - selftests: netfilter: query conntrack state to check for port clash + resolution + - io_uring/zcrx: increment fallback loop src offset + - net: airoha: Fix loopback mode configuration for GDM2 port + 
- cifs: Fix copy_to_iter return value check + - smb: client: fix missing timestamp updates after utime(2) + - rtc: isl12022: Fix initial enable_irq/disable_irq balance + - cifs: Query EA $LXMOD in cifs_query_path_info() for WSL reparse points + - tpm_tis: Fix incorrect arguments in tpm_tis_probe_irq_single + - gpio: wcd934x: mark the GPIO controller as sleeping + - bpf: Avoid RCU context warning when unpinning htab with internal structs + - kbuild: always create intermediate vmlinux.unstripped + - kbuild: keep .modinfo section in vmlinux.unstripped + - kbuild: Restore pattern to avoid stripping .rela.dyn from vmlinux + - kbuild: Add '.rel.*' strip pattern for vmlinux + - s390: vmlinux.lds.S: Reorder sections + - s390/vmlinux.lds.S: Move .vmlinux.info to end of allocatable sections + - ACPICA: acpidump: drop ACPI_NONSTRING attribute from file_name + - ACPI: property: Fix buffer properties extraction for subnodes + - ACPI: TAD: Add missing sysfs_remove_group() for ACPI_TAD_RT + - ACPICA: Debugger: drop ACPI_NONSTRING attribute from name_seg + - ACPI: debug: fix signedness issues in read/write helpers + - ACPI: battery: Add synchronization between interface updates + - arm64: dts: qcom: msm8916: Add missing MDSS reset + - arm64: dts: qcom: msm8939: Add missing MDSS reset + - arm64: dts: qcom: sdm845: Fix slimbam num-channels/ees + - Revert "UBUNTU: SAUCE: arm64: dts: qcom: x1e80100-pmics: Disable pm8010 + by default" + - arm64: dts: qcom: x1e80100-pmics: Disable pm8010 by default + - arm64: dts: ti: k3-am62a-main: Fix main padcfg length + - arm64: dts: ti: k3-am62p: Fix supported hardware for 1GHz OPP + - arm64: kprobes: call set_memory_rox() for kprobe page + - arm64: mte: Do not flag the zero page as PG_mte_tagged + - ARM: AM33xx: Implement TI advisory 1.0.36 (EMU0/EMU1 pins state on + reset) + - ARM: OMAP2+: pm33xx-core: ix device node reference leaks in + amx3_idle_init + - firmware: arm_scmi: quirk: Prevent writes to string constants + - perf/arm-cmn: Fix CMN S3 
DTM offset + - KVM: s390: Fix to clear PTE when discarding a swapped page + - KVM: arm64: Fix debug checking for np-guests using huge mappings + - KVM: arm64: Fix page leak in user_mem_abort() + - x86/kvm: Force legacy PCI hole to UC when overriding MTRRs for TDX/SNP + - KVM: SVM: Re-load current, not host, TSC_AUX on #VMEXIT from SEV-ES + guest + - KVM: TDX: Fix uninitialized error code for __tdx_bringup() + - dt-bindings: phy: rockchip-inno-csi-dphy: make power-domains non- + required + - xen: take system_transition_mutex on suspend + - xen/events: Cleanup find_virq() return codes + - xen/manage: Fix suspend error path + - xen/events: Return -EEXIST for bound VIRQs + - xen/events: Update virq_to_irq on migration + - firmware: exynos-acpm: fix PMIC returned errno + - firmware: meson_sm: fix device leak at probe + - media: cec: extron-da-hd-4k-plus: drop external-module make commands + - media: cx18: Add missing check after DMA map + - media: i2c: mt9p031: fix mbus code initialization + - media: i2c: mt9v111: fix incorrect type for ret + - media: mc: Fix MUST_CONNECT handling for pads with no links + - media: pci: ivtv: Add missing check after DMA map + - media: pci: mg4b: fix uninitialized iio scan data + - media: platform: mtk-mdp3: Add missing MT8188 compatible to comp_dt_ids + - media: s5p-mfc: remove an unused/uninitialized variable + - media: staging/ipu7: fix isys device runtime PM usage in firmware + closing + - media: uvcvideo: Avoid variable shadowing in uvc_ctrl_cleanup_fh + - media: venus: firmware: Use correct reset sequence for IRIS2 + - media: venus: pm_helpers: add fallback for the opp-table + - media: vivid: fix disappearing messages + - media: vsp1: Export missing vsp1_isp_free_buffer symbol + - media: ti: j721e-csi2rx: Use devm_of_platform_populate + - media: ti: j721e-csi2rx: Fix source subdev link creation + - media: lirc: Fix error handling in lirc_register() + - drm/exynos: exynos7_drm_decon: remove ctx->suspended + - drm/panthor: Fix memory 
leak in panthor_ioctl_group_create() + - drm/msm/a6xx: Fix PDC sleep sequence + - drm/rcar-du: dsi: Fix 1/2/3 lane support + - drm/nouveau: fix bad ret code in nouveau_bo_move_prep + - drm/xe/uapi: loosen used tracking restriction + - drm/amd/display: Incorrect Mirror Cositing + - drm/amd/display: Enable Dynamic DTBCLK Switch + - drm/amd/display: Fix unsafe uses of kernel mode FPU + - blk-crypto: fix missing blktrace bio split events + - btrfs: avoid potential out-of-bounds in btrfs_encode_fh() + - bus: mhi: ep: Fix chained transfer handling in read path + - bus: mhi: host: Do not use uninitialized 'dev' pointer in + mhi_init_irq_setup() + - cdx: Fix device node reference leak in cdx_msi_domain_init + - clk: qcom: tcsrcc-x1e80100: Set the bi_tcxo as parent to eDP refclk + - clk: samsung: exynos990: Use PLL_CON0 for PLL parent muxes + - clk: samsung: exynos990: Fix CMU_TOP mux/div bit widths + - clk: samsung: exynos990: Replace bogus divs with fixed-factor clocks + - copy_sighand: Handle architectures where sizeof(unsigned long) < + sizeof(u64) + - cpufreq: CPPC: Avoid using CPUFREQ_ETERNAL as transition delay + - cpufreq: intel_pstate: Fix object lifecycle issue in + update_qos_request() + - crypto: aspeed - Fix dma_unmap_sg() direction + - crypto: atmel - Fix dma_unmap_sg() direction + - crypto: rockchip - Fix dma_unmap_sg() nents value + - eventpoll: Replace rwlock with spinlock + - fbdev: Fix logic error in "offb" name match + - fs/ntfs3: Fix a resource leak bug in wnd_extend() + - fs: quota: create dedicated workqueue for quota_release_work + - fsnotify: pass correct offset to fsnotify_mmap_perm() + - fuse: fix possibly missing fuse_copy_finish() call in fuse_notify() + - fuse: fix livelock in synchronous file put from fuseblk workers + - gpio: mpfs: fix setting gpio direction to output + - i3c: Fix default I2C adapter timeout value + - iio/adc/pac1934: fix channel disable configuration + - iio: dac: ad5360: use int type to store negative error codes + - iio: 
dac: ad5421: use int type to store negative error codes + - iio: frequency: adf4350: Fix prescaler usage. + - iio: xilinx-ams: Fix AMS_ALARM_THR_DIRECT_MASK + - iio: xilinx-ams: Unmask interrupts after updating alarms + - init: handle bootloader identifier in kernel parameters + - iio: imu: inv_icm42600: Simplify pm_runtime setup + - iio: imu: inv_icm42600: Drop redundant pm_runtime reinitialization in + resume + - iio: imu: inv_icm42600: Avoid configuring if already pm_runtime + suspended + - iommu/vt-d: PRS isn't usable if PDS isn't supported + - ipmi: Rework user message limit handling + - ipmi:msghandler:Change seq_lock to a mutex + - kernel/sys.c: fix the racy usage of task_lock(tsk->group_leader) in + sys_prlimit64() paths + - KEYS: trusted_tpm1: Compare HMAC values in constant time + - kho: only fill kimage if KHO is finalized + - lib/genalloc: fix device leak in of_gen_pool_get() + - loop: fix backing file reference leak on validation error + - md: fix mssing blktrace bio split events + - of: unittest: Fix device reference count leak in + of_unittest_pci_node_verify + - openat2: don't trigger automounts with RESOLVE_NO_XDEV + - padata: Reset next CPU when reorder sequence wraps around + - parisc: don't reference obsolete termio struct for TC* constants + - parisc: Remove spurious if statement from raw_copy_from_user() + - nvme-pci: Add TUXEDO IBS Gen8 to Samsung sleep quirk + - pinctrl: samsung: Drop unused S3C24xx driver data + - PM: EM: Fix late boot with holes in CPU topology + - PM: hibernate: Fix hybrid-sleep + - PM: hibernate: Restrict GFP mask in power_down() + - power: supply: max77976_charger: fix constant current reporting + - powerpc/powernv/pci: Fix underflow and leak issue + - powerpc/pseries/msi: Fix potential underflow and leak issue + - pwm: berlin: Fix wrong register in suspend/resume + - pwm: Fix incorrect variable used in error message + - Revert "ipmi: fix msg stack when IPMI is disconnected" + - sched/deadline: Fix race in 
push_dl_task() + - scsi: hpsa: Fix potential memory leak in hpsa_big_passthru_ioctl() + - scsi: sd: Fix build warning in sd_revalidate_disk() + - sctp: Fix MAC comparison to be constant-time + - smb client: fix bug with newly created file in cached dir + - sparc64: fix hugetlb for sun4u + - sparc: fix error handling in scan_one_device() + - xtensa: simdisk: add input size check in proc_write_simdisk + - xsk: Harden userspace-supplied xdp_desc validation + - mtd: rawnand: fsmc: Default to autodetect buswidth + - mtd: nand: raw: gpmi: fix clocks when CONFIG_PM=N + - mmc: core: SPI mode remove cmd7 + - mmc: mmc_spi: multiple block read remove read crc ack + - memory: samsung: exynos-srom: Fix of_iomap leak in exynos_srom_probe + - memory: stm32_omm: Fix req2ack update test + - rtc: interface: Ensure alarm irq is enabled when UIE is enabled + - rtc: interface: Fix long-standing race when setting alarm + - rseq/selftests: Use weak symbol reference, not definition, to link with + glibc + - PCI: xilinx-nwl: Fix ECAM programming + - PCI: tegra: Convert struct tegra_msi mask_lock into raw spinlock + - PCI/sysfs: Ensure devices are powered for config reads + - PCI/IOV: Add PCI rescan-remove locking when enabling/disabling SR-IOV + - PCI/ERR: Fix uevent on failure to recover + - PCI/AER: Fix missing uevent on recovery when a reset is requested + - PCI/AER: Support errors introduced by PCIe r6.0 + - PCI: Ensure relaxed tail alignment does not increase min_align + - PCI: Fix failure detection during resource resize + - PCI: j721e: Fix module autoloading + - PCI: j721e: Fix programming sequence of "strap" settings + - PCI: keystone: Use devm_request_irq() to free "ks-pcie-error-irq" on + exit + - PCI: rcar-gen4: Fix PHY initialization + - PCI: rcar-host: Drop PMSR spinlock + - PCI: rcar-host: Convert struct rcar_msi mask_lock into raw spinlock + - PCI: tegra194: Fix broken tegra_pcie_ep_raise_msi_irq() + - PCI: tegra194: Handle errors in BPMP response + - PCI: tegra194: Reset 
BARs when running in PCIe endpoint mode + - PCI/pwrctrl: Fix device leak at registration + - PCI/pwrctrl: Fix device and OF node leak at bus scan + - PCI/pwrctrl: Fix device leak at device stop + - spi: cadence-quadspi: Flush posted register writes before INDAC access + - spi: cadence-quadspi: Flush posted register writes before DAC access + - spi: cadence-quadspi: Fix cqspi_setup_flash() + - xfs: use deferred intent items for reaping crosslinked blocks + - x86/fred: Remove ENDBR64 from FRED entry points + - x86/umip: Check that the instruction opcode is at least two bytes + - x86/umip: Fix decoding of register forms of 0F 01 (SGDT and SIDT + aliases) + - mptcp: pm: in-kernel: usable client side with C-flag + - mptcp: reset blackhole on success with non-loopback ifaces + - selftests: mptcp: join: validate C-flag + def limit + - s390/cio/ioasm: Fix __xsch() condition code handling + - s390/dasd: enforce dma_alignment to ensure proper buffer validation + - s390/dasd: Return BLK_STS_INVAL for EINVAL from do_dasd_request + - s390: Add -Wno-pointer-sign to KBUILD_CFLAGS_DECOMPRESSOR + - slab: prevent warnings when slab obj_exts vector allocation fails + - slab: mark slab->obj_exts allocation failures unconditionally + - wifi: ath11k: HAL SRNG: don't deinitialize and re-initialize again + - wifi: iwlwifi: Fix dentry reference leak in iwl_mld_add_link_debugfs + - wifi: rtw89: avoid possible TX wait initialization race + - wifi: mt76: mt7925u: Add VID/PID for Netgear A9000 + - wifi: mt76: mt7921u: Add VID/PID for Netgear A7500 + - mm/thp: fix MTE tag mismatch when replacing zero-filled subpages + - mm/rmap: fix soft-dirty and uffd-wp bit loss when remapping zero-filled + mTHP subpage to shared zeropage + - mm/page_alloc: only set ALLOC_HIGHATOMIC for __GPF_HIGH allocations + - mm/hugetlb: early exit from hugetlb_pages_alloc_boot() when + max_huge_pages=0 + - mm/damon/vaddr: do not repeat pte_offset_map_lock() until success + - mm/damon/lru_sort: use param_ctx for 
damon_attrs staging + - nfsd: decouple the xprtsec policy check from check_nfsd_access() + - NFSD: Fix destination buffer size in nfsd4_ssc_setup_dul() + - nfsd: nfserr_jukebox in nlm_fopen should lead to a retry + - media: iris: Call correct power off callback in cleanup path + - media: iris: Fix firmware reference leak and unmap memory after load + - media: iris: fix module removal if firmware download failed + - media: iris: vpu3x: Add MNoC low power handshake during hardware power- + off + - media: iris: Fix port streaming handling + - media: iris: Fix buffer count reporting in internal buffer check + - media: iris: Allow substate transition to load resources during output + streaming + - media: iris: Always destroy internal buffers on firmware release + response + - media: iris: Simplify session stop logic by relying on vb2 checks + - media: iris: Update vbuf flags before v4l2_m2m_buf_done + - media: iris: Send dummy buffer address for all codecs during drain + - media: iris: Fix missing LAST flag handling during drain + - media: iris: Fix format check for CAPTURE plane in try_fmt + - media: iris: Allow stop on firmware only if start was issued. 
+ - ext4: add ext4_sb_bread_nofail() helper function for + ext4_free_branches() + - ext4: fail unaligned direct IO write with EINVAL + - ext4: verify orphan file size is not too big + - ext4: increase i_disksize to offset + len in + ext4_update_disksize_before_punch() + - ext4: correctly handle queries for metadata mappings + - ext4: avoid potential buffer over-read in parse_apply_sb_mount_options() + - ext4: fix an off-by-one issue during moving extents + - ext4: guard against EA inode refcount underflow in xattr update + - ext4: validate ea_ino and size in check_xattrs + - ACPICA: Allow to skip Global Lock initialization + - ext4: free orphan info with kvfree + - ipmi: Fix handling of messages with provided receive message pointer + - Squashfs: add additional inode sanity checking + - Squashfs: reject negative file sizes in squashfs_read_inode() + - mm/ksm: fix incorrect KSM counter handling in mm_struct during fork + - media: mc: Clear minor number before put device + - arm64: dts: qcom: qcs615: add missing dt property in QUP SEs + - ACPI: property: Disregard references in data-only subnode lists + - ACPI: property: Add code comments explaining what is going on + - ACPI: property: Do not pass NULL handles to acpi_attach_data() + - irqchip/sifive-plic: Avoid interrupt ID 0 handling during suspend/resume + - copy_file_range: limit size if in compat mode + - minixfs: Verify inode mode when loading from disk + - pid: Add a judgment for ns null in pid_nr_ns + - fs: Add 'initramfs_options' to set initramfs mount options + - cramfs: Verify inode mode when loading from disk + - nsfs: validate extensible ioctls + - mnt_ns_tree_remove(): DTRT if mnt_ns had never been added to mnt_ns_list + - writeback: Avoid softlockup when switching many inodes + - writeback: Avoid excessively long inode switching times + - iomap: error out on file IO when there is no inline_data buffer + - pidfs: validate extensible ioctls + - mount: handle NULL values in mnt_ns_release() + - Linux 
6.17.4 + * Questing update: v6.17.4 upstream stable release (LP: #2131259) // Race + condition in perf build causes build failure due to missing unistd_64.h + header on arm64 (LP: #2131702) + - perf tools: Fix arm64 libjvmti build by generating unistd_64.h + * Questing update: v6.17.3 upstream stable release (LP: #2129610) + - arch: copy_thread: pass clone_flags as u64 + - filelock: add FL_RECLAIM to show_fl_flags() macro + - init: INITRAMFS_PRESERVE_MTIME should depend on BLK_DEV_INITRD + - pid: use ns_capable_noaudit() when determining net sysctl permissions + - Fix CC_HAS_ASM_GOTO_OUTPUT on non-x86 architectures + - [Config]: Update CC configs for v6.17.3 + - seccomp: Fix a race with WAIT_KILLABLE_RECV if the tracer replies too + fast + - kbuild: Add missing $(objtree) prefix to powerpc crtsavres.o artifact + - selftests: arm64: Check fread return value in exec_target + - selftests: arm64: Fix -Waddress warning in tpidr2 test + - kselftest/arm64/gcs: Correctly check return value when disabling GCS + - hfsplus: fix slab-out-of-bounds read in hfsplus_uni2asc() + - gfs2: Fix GLF_INVALIDATE_IN_PROGRESS flag clearing in do_xmote + - gfs2: Remove space before newline + - gfs2: Further sanitize lock_dlm.c + - gfs2: Fix LM_FLAG_TRY* logic in add_to_queue + - gfs2: Remove duplicate check in do_xmote + - gfs2: Get rid of GLF_INVALIDATE_IN_PROGRESS + - gfs2: do_xmote cleanup + - gfs2: Add proper lockspace locking + - powerpc/8xx: Remove left-over instruction and comments in + DataStoreTLBMiss handler + - powerpc/603: Really copy kernel PGD entries into all PGDIRs + - powerpc/ftrace: ensure ftrace record ops are always set for NOPs + - powerpc64/modules: correctly iterate over stubs in + setup_ftrace_ool_stubs + - uprobes: uprobe_warn should use passed task + - raid6: riscv: Clean up unused header file inclusion + - coresight: trbe: Prevent overflow in PERF_IDX2OFF() + - perf: arm_spe: Prevent overflow in PERF_IDX2OFF() + - erofs: avoid reading more for fragment maps + - 
smb: client: fix sending the iwrap custom IRD/ORD negotiation messages + - smb: server: fix IRD/ORD negotiation with the client + - perf/x86/intel: Use early_initcall() to hook bts_init() + - perf/x86/intel: Fix IA32_PMC_x_CFG_B MSRs access error + - x86/vdso: Fix output operand size of RDPID + - selftests: cgroup: Make test_pids backwards compatible + - sched/fair: Get rid of sched_domains_curr_level hack for tl->cpumask() + - [Config]: Update CONFIG_SCHED_MC for v6.17.3 + - lsm: CONFIG_LSM can depend on CONFIG_SECURITY + - cpuset: fix failure to enable isolated partition when containing + isolcpus + - btrfs: return any hit error from extent_writepage_io() + - btrfs: fix symbolic link reading when bs > ps + - pinctrl: renesas: rzg2l: Fix invalid unsigned return in rzg3s_oen_read() + - arm64: dts: renesas: rzg2lc-smarc: Disable CAN-FD channel0 + - bpf: Tidy verifier bug message + - regmap: Remove superfluous check for !config in __regmap_init() + - selftests/bpf: Copy test_kmods when installing selftest + - rust: cpumask: Mark CpumaskVar as transparent + - bpf/selftests: Fix test_tcpnotify_user + - bpf: Remove migrate_disable in kprobe_multi_link_prog_run + - libbpf: Fix reuse of DEVMAP + - tools/nolibc: fix error return value of clock_nanosleep() + - ARM: dts: renesas: porter: Fix CAN pin group + - leds: max77705: Function return instead of variable assignment + - leds: flash: leds-qcom-flash: Update torch current clamp setting + - s390/bpf: Do not write tail call counter into helper and kfunc frames + - s390/bpf: Write back tail call counter for BPF_PSEUDO_CALL + - s390/bpf: Write back tail call counter for BPF_TRAMP_F_CALL_ORIG + - cpufreq: scmi: Account for malformed DT in scmi_dev_used_by_cpus() + - arm64: dts: renesas: sparrow-hawk: Invert microSD voltage selector on + EVTB1 + - arm64: dts: renesas: sparrow-hawk: Set VDDQ18_25_AVB voltage on EVTB1 + - libbpf: Export bpf_object__prepare symbol + - firmware: arm_scmi: Mark VirtIO ready before registering + 
scmi_virtio_driver + - arm64: dts: imx93-kontron: Fix GPIO for panel regulator + - arm64: dts: imx93-kontron: Fix USB port assignment + - arm64: dts: imx95: Correct the lpuart7 and lpuart8 srcid + - bpf: Remove preempt_disable in bpf_try_get_buffers + - ACPI: processor: idle: Fix memory leak when register cpuidle device + failed + - genirq: Add irq_chip_(startup/shutdown)_parent() + - PCI/MSI: Add startup/shutdown for per device domains + - irqchip/sg2042-msi: Fix broken affinity setting + - scripts/misc-check: update export checks for EXPORT_SYMBOL_FOR_MODULES() + - soc: qcom: rpmh-rsc: Unconditionally clear _TRIGGER bit for TCS + - pinctrl: meson-gxl: add missing i2c_d pinmux + - blk-mq: check kobject state_in_sysfs before deleting in + blk_mq_unregister_hctx + - selftests/futex: Remove the -g parameter from futex_priv_hash + - ARM: at91: pm: fix MCKx restore routine + - arm64: dts: apple: t8103-j457: Fix PCIe ethernet iommu-map + - regulator: scmi: Use int type to store negative error codes + - selftests/futex: Fix some futex_numa_mpol subtests + - tools/nolibc: avoid error in dup2() if old fd equals new fd + - selftests/nolibc: fix EXPECT_NZ macro + - leds: leds-lp55xx: Use correct address for memory programming + - PCI/MSI: Check MSI_FLAG_PCI_MSI_MASK_PARENT in + cond_[startup|shutdown]_parent() + - block: use int to store blk_stack_limits() return value + - ARM: dts: stm32: stm32mp151c-plyaqm: Use correct dai-format property + - dt-bindings: vendor-prefixes: Add undocumented vendor prefixes + - genirq/test: Fix depth tests on architectures with NOREQUEST by default. 
+ - genirq/test: Select IRQ_DOMAIN + - genirq/test: Depend on SPARSE_IRQ + - genirq/test: Drop CONFIG_GENERIC_IRQ_MIGRATION assumptions + - genirq/test: Ensure CPU 1 is online for hotplug test + - selftests/bpf: Fix count write in testapp_xdp_metadata_copy() + - vdso/datastore: Gate time data behind CONFIG_GENERIC_GETTIMEOFDAY + - PM: sleep: core: Clear power.must_resume in noirq suspend error path + - blk-mq: fix elevator depth_updated method + - vdso: Add struct __kernel_old_timeval forward declaration to gettime.h + - ARM: dts: ti: omap: am335x-baltos: Fix ti,en-ck32k-xtal property in DTS + to use correct boolean syntax + - ARM: dts: ti: omap: omap3-devkit8000-lcd: Fix ti,keep-vref-on property + to use correct boolean syntax in DTS + - ARM: dts: omap: am335x-cm-t335: Remove unused mcasp num-serializer + property + - PM / devfreq: mtk-cci: Fix potential error pointer dereference in + probe() + - power: supply: cw2015: Fix a alignment coding style issue + - hwmon: (asus-ec-sensors) Narrow lock for X870E-CREATOR WIFI + - pinctrl: renesas: Use int type to store negative error codes + - pinctrl: eswin: Fix regulator error check and Kconfig dependency + - null_blk: Fix the description of the cache_size module argument + - blk-throttle: fix access race during throttle policy activation + - selftests: vDSO: Fix -Wunitialized in powerpc VDSO_CALL() wrapper + - selftests: vDSO: vdso_test_abi: Correctly skip whole test with missing + vDSO + - irqchip/gic-v5: Fix loop in gicv5_its_create_itt_two_level() cleanup + path + - irqchip/gic-v5: Fix error handling in gicv5_its_irq_domain_alloc() + - tick: Do not set device to detached state in tick_shutdown() + - arm64: dts: mediatek: mt8195: Remove suspend-breaking reset from pcie0 + - arm64: dts: mediatek: mt8183: Fix out of range pull values + - nbd: restrict sockets to TCP and UDP + - PM / devfreq: rockchip-dfi: double count on RK3588 + - firmware: firmware: meson-sm: fix compile-test default + - dts: arm: amlogic: fix pwm node 
for c3 + - soc: mediatek: mtk-svs: fix device leaks on mt8183 probe failure + - soc: mediatek: mtk-svs: fix device leaks on mt8192 probe failure + - cpuidle: qcom-spm: fix device and OF node leaks at probe + - block: cleanup bio_issue + - block: initialize bio issue time in blk_mq_submit_bio() + - block: factor out a helper bio_submit_split_bioset() + - block: skip unnecessary checks for split bio + - block: fix ordering of recursive split IO + - blk-mq: remove useless checkings in blk_mq_update_nr_requests() + - blk-mq: check invalid nr_requests in queue_requests_store() + - blk-mq: convert to serialize updating nr_requests with + update_nr_hwq_lock + - blk-mq: cleanup shared tags case in blk_mq_update_nr_requests() + - blk-mq: split bitmap grow and resize case in blk_mq_update_nr_requests() + - blk-mq-sched: add new parameter nr_requests in blk_mq_alloc_sched_tags() + - blk-mq: fix potential deadlock while nr_requests grown + - arm64: dts: allwinner: a527: cubie-a5e: Add ethernet PHY reset setting + - arm64: dts: allwinner: t527: avaota-a1: Add ethernet PHY reset setting + - arm64: dts: rockchip: Add RTC on rk3576-evb1-v10 + - arm64: dts: rockchip: Add WiFi on rk3576-evb1-v10 + - arm64: dts: rockchip: Fix network on rk3576 evb1 board + - arm64: dts: ti: k3-j742s2-mcu-wakeup: Override firmware-name for MCU R5F + cores + - arm64: dts: ti: k3: Rename rproc reserved-mem nodes to 'memory@addr' + - Revert "arm64: dts: ti: k3-j721e-sk: Fix reversed C6x carveout + locations" + - Revert "arm64: dts: ti: k3-j721e-beagleboneai64: Fix reversed C6x + carveout locations" + - arm64: dts: mediatek: mt8188: Change efuse fallback compatible to mt8186 + - arm64: dts: mediatek: mt8186-tentacruel: Fix touchscreen model + - arm64: dts: ti: k3-pinctrl: Fix the bug in existing macros + - arm64: dts: renesas: r9a09g047e57-smarc: Fix gpio key's pin control node + - arm64: dts: mediatek: mt6331: Fix pmic, regulators, rtc, keys node names + - mmc: core: Fix variable shadowing in 
mmc_route_rpmb_frames() + - arm64: dts: mediatek: mt6795-xperia-m5: Fix mmc0 latch-ck value + - arm64: dts: mediatek: mt7986a: Fix PCI-Express T-PHY node address + - arm64: dts: mediatek: mt8395-kontron-i1200: Fix MT6360 regulator nodes + - arm64: dts: mediatek: mt8516-pumpkin: Fix machine compatible + - arm64: dts: allwinner: a527: cubie-a5e: Add LEDs + - arm64: dts: allwinner: a527: cubie-a5e: Drop external 32.768 KHz crystal + - arm64: dts: allwinner: t527: avaota-a1: hook up external 32k crystal + - arm64: dts: allwinner: t527: orangepi-4a: hook up external 32k crystal + - pwm: tiehrpwm: Don't drop runtime PM reference in .free() + - pwm: tiehrpwm: Make code comment in .free() more useful + - pwm: tiehrpwm: Fix various off-by-one errors in duty-cycle calculation + - pwm: tiehrpwm: Fix corner case in clock divisor calculation + - ACPICA: Apply ACPI_NONSTRING + - ACPICA: Fix largest possible resource descriptor index + - riscv, bpf: Sign extend struct ops return values properly + - nvme-auth: update bi_directional flag + - nvmet-fc: move lsop put work to nvmet_fc_ls_req_op + - nvmet-fcloop: call done callback even when remote port is gone + - nvme-tcp: send only permitted commands for secure concat + - i3c: master: svc: Use manual response for IBI events + - i3c: master: svc: Recycle unused IBI slot + - block: update validation of atomic writes boundary for stacked devices + - block: fix stacking of atomic writes when atomics are not supported + - selftests: watchdog: skip ping loop if WDIOF_KEEPALIVEPING not supported + - selftests/kselftest_harness: Add harness-selftest.expected to TEST_FILES + - blk-throttle: fix throtl_data leak during disk release + - bpf: Explicitly check accesses to bpf_sock_addr + - mmc: select REGMAP_MMIO with MMC_LOONGSON2 + - selftests/futex: Fix futex_wait() for 32bit ARM + - selftest/futex: Make the error check more precise for futex_numa_mpol + - selftest/futex: Compile also with libnuma < 2.0.16 + - bpf: dont report verifier bug 
for missing bpf_scc_visit on speculative + path + - bpf, arm64: Call bpf_jit_binary_pack_finalize() in bpf_jit_free() + - arm64: dts: apple: t600x: Add missing WiFi properties + - arm64: dts: apple: t600x: Add bluetooth device nodes + - arm64: dts: apple: Add ethernet0 alias for J375 template + - selftests: always install UAPI headers to the correct directory + - smp: Fix up and expand the smp_call_function_many() kerneldoc + - mfd: max77705: max77705_charger: move active discharge setting to mfd + parent + - power: supply: max77705_charger: refactoring: rename charger to chg + - power: supply: max77705_charger: use regfields for config registers + - power: supply: max77705_charger: rework interrupts + - tools/nolibc: make time_t robust if __kernel_old_time_t is missing in + host headers + - spi: fix return code when spi device has too many chipselects + - clocksource/drivers/timer-tegra186: Avoid 64-bit divide operation + - clocksource/drivers/tegra186: Avoid 64-bit division + - bpf: Mark kfuncs as __noclone + - once: fix race by moving DO_ONCE to separate section + - hwmon: (mlxreg-fan) Separate methods of fan setting coming from + different subsystems + - tools/nolibc: add stdbool.h to nolibc includes + - thermal/drivers/qcom: Make LMH select QCOM_SCM + - thermal/drivers/qcom/lmh: Add missing IRQ includes + - i2c: mediatek: fix potential incorrect use of I2C_MASTER_WRRD + - i2c: spacemit: ensure bus release check runs when wait_bus_idle() fails + - i2c: spacemit: remove stop function to avoid bus error + - i2c: spacemit: disable SDA glitch fix to avoid restart delay + - i2c: spacemit: check SDA instead of SCL after bus reset + - i2c: spacemit: ensure SDA is released after bus reset + - i2c: designware: Fix clock issue when PM is disabled + - i2c: designware: Add disabling clocks when probe fails + - libbpf: Fix error when st-prefix_ops and ops from differ btf + - bpf: Enforce expected_attach_type for tailcall compatibility + - i3c: fix big-endian FIFO transfers 
+ - mfd: max77705: Setup the core driver as an interrupt controller + - drm/sched: Fix a race in DRM_GPU_SCHED_STAT_NO_HANG test + - drm/panel-edp: Add disable to 100ms for MNB601LS1-4 + - drm/display: bridge-connector: correct CEC bridge pointers in + drm_bridge_connector_init + - drm/panel-edp: Add 50ms disable delay for four panels + - drm/vmwgfx: fix missing assignment to ts + - drm/amd/display: Reduce Stack Usage by moving 'audio_output' into + 'stream_res' v4 + - drm/panel: novatek-nt35560: Fix invalid return value + - drm/amdgpu: fix link error for !PM_SLEEP + - drm/amdgpu: Fix jpeg v4.0.3 poison irq call trace on sriov guest + - drm/amdgpu: Fix vcn v4.0.3 poison irq call trace on sriov guest + - PCI: endpoint: pci-ep-msi: Fix NULL vs IS_ERR() check in + pci_epf_write_msi_msg() + - PCI: xgene-msi: Return negative -EINVAL in xgene_msi_handler_setup() + - drm/radeon/r600_cs: clean up of dead code in r600_cs + - f2fs: fix condition in __allow_reserved_blocks() + - f2fs: fix to avoid overflow while left shift operation + - f2fs: fix to zero data after EOF for compressed file correctly + - drm/bridge: it6505: select REGMAP_I2C + - wifi: rtw88: Lock rtwdev->mutex before setting the LED + - HID: steelseries: refactor probe() and remove() + - media: zoran: Remove zoran_fh structure + - phy: rockchip: naneng-combphy: Enable U3 OTG port for RK3568 + - drm/bridge: cdns-dsi: Fix the _atomic_check() + - usb: host: max3421-hcd: Fix error pointer dereference in probe cleanup + - usb: misc: qcom_eud: Access EUD_MODE_MANAGER2 through secure calls + - PCI/pwrctrl: Fix double cleanup on devm_add_action_or_reset() failure + - misc: pci_endpoint_test: Fix array underflow in + pci_endpoint_test_ioctl() + - serial: max310x: Add error checking in probe() + - drm/amd/display: Remove redundant semicolons + - drm/amd/display: Add NULL pointer checks in dc_stream cursor attribute + functions + - crypto: keembay - Add missing check after sg_nents_for_len() + - hwrng: nomadik - add 
ARM_AMBA dependency + - docs: iio: ad3552r: Fix malformed code-block directive + - fwctl/mlx5: Fix memory alloc/free in mlx5ctl_fw_rpc() + - scsi: pm80xx: Restore support for expanders + - scsi: pm80xx: Fix array-index-out-of-of-bounds on rmmod + - scsi: libsas: Add dev_parent_is_expander() helper + - scsi: pm80xx: Use dev_parent_is_expander() helper + - scsi: pm80xx: Add helper function to get the local phy id + - scsi: pm80xx: Fix pm8001_abort_task() for chip_8006 when using an + expander + - mptcp: Fix up subflow's memcg when CONFIG_SOCK_CGROUP_DATA=n. + - scsi: myrs: Fix dma_alloc_coherent() error check + - f2fs: fix to clear unusable_cap for checkpoint=enable + - f2fs: fix to avoid NULL pointer dereference in + f2fs_check_quota_consistency() + - f2fs: fix to allow removing qf_name + - Revert "UBUNTU: SAUCE: drm/dp: drm_edp_backlight_set_level: do not + always send 3-byte commands" + - drm/dp: drm_edp_backlight_set_level: do not always send 3-byte commands + - crypto: octeontx2 - Call strscpy() with correct size argument + - drm: re-allow no-op changes on non-primary planes in async flips + - media: rj54n1cb0c: Fix memleak in rj54n1_probe() + - media: staging/ipu7: convert to use pci_alloc_irq_vectors() API + - media: staging/ipu7: Don't set name for IPU7 PCI device + - media: staging/ipu7: cleanup the MMU correctly in IPU7 driver release + - media: i2c: vd55g1: Fix duster register address + - drm/panel: Allow powering on panel follower after panel is enabled + - HID: i2c-hid: Make elan touch controllers power on after panel is + enabled + - RDMA/mlx5: Better estimate max_qp_wr to reflect WQE count + - RDMA/mlx5: Fix vport loopback forcing for MPV device + - wifi: rtw88: Use led->brightness_set_blocking for PCI too + - net: phy: introduce phy_id_compare_vendor() PHY ID helper + - net: phy: as21xxx: better handle PHY HW reset on soft-reboot + - PCI: rcar-host: Pass proper IRQ domain to generic_handle_domain_irq() + - fuse: remove unneeded offset assignment when 
filling write pages + - PCI: qcom: Restrict port parsing only to PCIe bridge child nodes + - cdx: don't select CONFIG_GENERIC_MSI_IRQ + - PCI/ACPI: Fix pci_acpi_preserve_config() memory leak + - HID: i2c-hid: Fix test in i2c_hid_core_register_panel_follower() + - ALSA: lx_core: use int type to store negative error codes + - media: st-delta: avoid excessive stack usage + - drm/amdgpu/vcn: Add regdump helper functions + - drm/amdgpu/vcn: Hold pg_lock before vcn power off + - drm/amdgpu: Check vcn state before profile switch + - accel/amdxdna: Use int instead of u32 to store error codes + - efi: Explain OVMF acronym in OVMF_DEBUG_LOG help text + - net: dst: introduce dst->dev_rcu + - ipv6: mcast: Add ip6_mc_find_idev() helper + - ipv6: start using dst_dev_rcu() + - ipv6: use RCU in ip6_xmit() + - ipv6: use RCU in ip6_output() + - net: use dst_dev_rcu() in sk_setup_caps() + - tcp_metrics: use dst_dev_net_rcu() + - ipv4: start using dst_dev_rcu() + - crypto: hisilicon/zip - remove unnecessary validation for high- + performance mode configurations + - crypto: hisilicon - re-enable address prefetch after device resuming + - crypto: hisilicon - check the sva module status while enabling or + disabling address prefetch + - crypto: hisilicon/qm - check whether the input function and PF are on + the same device + - crypto: hisilicon/qm - request reserved interrupt for virtual function + - inet: ping: check sock_net() in ping_get_port() and ping_lookup() + - dmaengine: Fix dma_async_tx_descriptor->tx_submit documentation + - coresight: trbe: Add ISB after TRBLIMITR write + - coresight: Fix missing include for FIELD_GET + - coresight: Only register perf symlink for sinks with alloc_buffer + - drm/amdgpu: Power up UVD 3 for FW validation (v2) + - drm/amd/pm: Disable ULV even if unsupported (v3) + - drm/amd/pm: Fix si_upload_smc_data (v3) + - drm/amd/pm: Adjust si_upload_smc_data register programming (v3) + - drm/amd/pm: Treat zero vblank time as too short in si_dpm (v3) + - 
drm/amd/pm: Disable MCLK switching with non-DC at 120 Hz+ (v2) + - drm/amd/pm: Disable SCLK switching on Oland with high pixel clocks (v3) + - wifi: mac80211: Make CONNECTION_MONITOR optional for MLO sta + - wifi: mwifiex: send world regulatory domain to driver + - wifi: brcmfmac: fix 43752 SDIO FWVID incorrectly labelled as Cypress + (CYW) + - drm/msm: Do not validate SSPP when it is not ready + - PCI: tegra: Fix devm_kcalloc() argument order for port->phys allocation + - wifi: mac80211: consider links for validating SCAN_FLAG_AP in scan + request during MLO + - PCI: qcom: Add equalization settings for 8.0 GT/s and 32.0 GT/s + - tcp: fix __tcp_close() to only send RST when required + - fanotify: Validate the return value of mnt_ns_from_dentry() before + dereferencing + - drm/amdkfd: Fix error code sign for EINVAL in svm_ioctl() + - usb: phy: twl6030: Fix incorrect type for ret + - usb: gadget: configfs: Correctly set use_os_string at bind + - tty: n_gsm: Don't block input queue by waiting MSC + - misc: genwqe: Fix incorrect cmd field being reported in error + - pps: fix warning in pps_register_cdev when register device fail + - drm/msm: Fix obj leak in VM_BIND error path + - drm/msm: Fix missing VM_BIND offset/range validation + - wifi: iwlwifi: Remove redundant header files + - drm/msm/mdp4: stop supporting no-IOMMU configuration + - drm/msm: stop supporting no-IOMMU configuration + - idpf: fix Rx descriptor ready check barrier in splitq + - ASoC: Intel: bytcht_es8316: Fix invalid quirk input mapping + - ASoC: Intel: bytcr_rt5640: Fix invalid quirk input mapping + - ASoC: Intel: bytcr_rt5651: Fix invalid quirk input mapping + - ipv6: snmp: do not use SNMP_MIB_SENTINEL anymore + - ipv6: snmp: do not track per idev ICMP6_MIB_RATELIMITHOST + - drm/msm: Fix bootup splat with separate_gpu_drm modparam + - drm/msm/dpu: fix incorrect type for ret + - wifi: mac80211: fix reporting of all valid links in sta_set_sinfo() + - fs: ntfs3: Fix integer overflow in run_unpack() + 
- fs/ntfs3: reject index allocation if $BITMAP is empty but blocks exist + - iio: consumers: Fix handling of negative channel scale in + iio_convert_raw_to_processed() + - iio: consumers: Fix offset handling in iio_convert_raw_to_processed() + - mm/slub: Fix cmp_loc_by_count() to return 0 when counts are equal + - tools: ynl: fix undefined variable name + - RDMA/mlx5: Fix page size bitmap calculation for KSM mode + - netfilter: ipset: Remove unused htable_bits in macro ahash_region + - ipvs: Use READ_ONCE/WRITE_ONCE for ipvs->enable + - HID: steelseries: Fix STEELSERIES_SRWS1 handling in steelseries_remove() + - watchdog: intel_oc_wdt: Do not try to write into const memory + - watchdog: mpc8xxx_wdt: Reload the watchdog timer when enabling the + watchdog + - PCI: endpoint: pci-epf-test: Fix doorbell test support + - drivers/base/node: handle error properly in register_one_node() + - RDMA/cm: Rate limit destroy CM ID timeout error message + - wifi: mt76: mt7996: Fix mt7996_mcu_sta_ba wcid configuration + - wifi: mt76: mt7996: Fix mt7996_mcu_bss_mld_tlv routine + - wifi: mt76: fix potential memory leak in mt76_wmac_probe() + - wifi: mt76: mt7996: Use proper link_id in link_sta_rc_update callback + - wifi: mt76: mt7996: Check phy before init msta_link in + mt7996_mac_sta_add_links() + - wifi: mt76: mt7996: Fix tx-queues initialization for second phy on + mt7996 + - wifi: mt76: mt7996: Fix RX packets configuration for primary WED device + - wifi: mt76: mt7996: Convert mt7996_wed_rro_addr to LE + - wifi: mt76: mt7915: fix mt7981 pre-calibration + - wifi: mt76: mt7996: remove redundant per-phy mac80211 calls during + restart + - ASoC: Intel: hda-sdw-bpt: set persistent_buffer false + - srcu/tiny: Remove preempt_disable/enable() in srcu_gp_start_if_needed() + - drm/amdgpu: Fix allocating extra dwords for rings (v2) + - f2fs: fix to update map->m_next_extent correctly in f2fs_map_blocks() + - f2fs: fix to truncate first page in error path of f2fs_truncate() + - f2fs: fix to 
avoid migrating empty section + - f2fs: fix to mitigate overhead of f2fs_zero_post_eof_page() + - RISC-V: KVM: Write hgatp register with valid mode bits + - ALSA: pcm: Disable bottom softirqs as part of spin_lock_irq() on + PREEMPT_RT + - ACPI: NFIT: Fix incorrect ndr_desc being reportedin dev_err message + - scsi: qla2xxx: edif: Fix incorrect sign of error code + - scsi: qla2xxx: Fix incorrect sign of error code in START_SP_W_RETRIES() + - scsi: qla2xxx: Fix incorrect sign of error code in qla_nvme_xmt_ls_rsp() + - HID: hidraw: tighten ioctl command parsing + - f2fs: fix zero-sized extent for precache extents + - smc: Fix use-after-free in __pnet_find_base_ndev(). + - smc: Use __sk_dst_get() and dst_dev_rcu() in in smc_clc_prfx_set(). + - smc: Use __sk_dst_get() and dst_dev_rcu() in smc_clc_prfx_match(). + - smc: Use __sk_dst_get() and dst_dev_rcu() in smc_vlan_by_tcpsk(). + - tls: Use __sk_dst_get() and dst_dev_rcu() in get_netdev_for_sock(). + - mptcp: Call dst_release() in mptcp_active_enable(). + - mptcp: Use __sk_dst_get() and dst_dev_rcu() in mptcp_active_enable(). 
+ - Revert "usb: xhci: Avoid Stop Endpoint retry loop if the endpoint seems + Running" + - RDMA/core: Resolve MAC of next-hop device without ARP support + - IB/sa: Fix sa_local_svc_timeout_ms read race + - Documentation: trace: historgram-design: Separate sched_waking histogram + section heading and the following diagram + - ASoC: SOF: ipc4-pcm: Fix incorrect comparison with number of tdm_slots + - wifi: ath12k: initialize eirp_power before use + - wifi: ath12k: fix overflow warning on num_pwr_levels + - wifi: ath12k: fix signal in radiotap for WCN7850 + - wifi: ath12k: fix HAL_PHYRX_COMMON_USER_INFO handling in monitor mode + - wifi: ath12k: fix the fetching of combined rssi + - wifi: ath12k: Add fallback for invalid channel number in PHY metadata + - wifi: ath12k: fix wrong logging ID used for CE + - wifi: ath10k: avoid unnecessary wait for service ready message + - iommu/vt-d: debugfs: Fix legacy mode page table dump logic + - wifi: mac80211: fix Rx packet handling when pubsta information is not + available + - ASoC: Intel: sof_sdw: Prevent jump to NULL add_sidecar callback + - sparc: fix accurate exception reporting in copy_{from_to}_user for + UltraSPARC + - sparc: fix accurate exception reporting in copy_{from_to}_user for + UltraSPARC III + - sparc: fix accurate exception reporting in copy_{from_to}_user for + Niagara + - sparc: fix accurate exception reporting in copy_to_user for Niagara 4 + - sparc: fix accurate exception reporting in copy_{from,to}_user for M7 + - vfio/pds: replace bitmap_free with vfree + - crypto: comp - Use same definition of context alloc and free ops + - crypto: hisilicon/qm - set NULL to qm->debug.qm_diff_regs + - wifi: ath12k: Fix peer lookup in ath12k_dp_mon_rx_deliver_msdu() + - rpmsg: qcom_smd: Fix fallback to qcom,ipc parse + - remoteproc: qcom_q6v5_mss: support loading MBN file on msm8974 + - RDMA/rxe: Fix race in do_task() when draining + - selftests/mm: fix va_high_addr_switch.sh failure on x86_64 + - wifi: rtw89: fix leak 
in rtw89_core_send_nullfunc() + - wifi: rtw89: avoid circular locking dependency in ser_state_run() + - PCI: tegra194: Fix duplicate PLL disable in + pex_ep_event_pex_rst_assert() + - remoteproc: qcom: q6v5: Avoid disabling handover IRQ twice + - remoteproc: qcom: pas: Shutdown lite ADSP DTB on X1E + - wifi: ath12k: Refactor RX TID deletion handling into helper function + - wifi: ath12k: Fix flush cache failure during RX queue update + - wifi: cfg80211: fix width unit in cfg80211_radio_chandef_valid() + - dm vdo: return error on corrupted metadata in start_restoring_volume + functions + - coresight: fix indentation error in cscfg_remove_owned_csdev_configs() + - coresight-etm4x: Conditionally access register TRCEXTINSELR + - coresight: tmc: Support atclk + - coresight: catu: Support atclk + - coresight: etm4x: Support atclk + - coresight: Appropriately disable programming clocks + - coresight: Appropriately disable trace bus clocks + - coresight: Avoid enable programming clock duplicately + - coresight: trbe: Return NULL pointer for allocation failures + - coresight: tpda: fix the logic to setup the element size + - coresight: Fix incorrect handling for return value of devm_kzalloc + - NFSv4.1: fix backchannel max_resp_sz verification check + - net: ethtool: tsconfig: set command must provide a reply + - netfilter: nfnetlink: reset nlh pointer during batch replay + - netfilter: nf_conntrack: do not skip entries in /proc/net/nf_conntrack + - scsi: ufs: core: Fix data race in CPU latency PM QoS request handling + - scsi: mpt3sas: Fix crash in transport port remove by using ioc_info() + - usb: vhci-hcd: Prevent suspending virtually attached devices + - PCI: rcar-gen4: Add missing 1ms delay after PWR reset assertion + - PCI: rcar-gen4: Assure reset occurs before DBI access + - PCI: rcar-gen4: Fix inverted break condition in PHY initialization + - ASoC: qcom: sc8280xp: use sa8775p/ subdir for QCS9100 / QCS9075 + - iommu/vt-d: Disallow dirty tracking if incoherent page 
walk + - iommu/selftest: prevent use of uninitialized variable + - RDMA/siw: Always report immediate post SQ errors + - net: enetc: Fix probing error message typo for the ENETCv4 PF driver + - net: usb: Remove disruptive netif_wake_queue in rtl8150_set_multicast + - ptp: Add a upper bound on max_vclocks + - vhost: vringh: Fix copy_to_iter return value check + - net: macb: remove illusion about TBQPH/RBQPH being per-queue + - net: macb: move ring size computation to functions + - net: macb: single dma_alloc_coherent() for DMA descriptors + - Bluetooth: btintel_pcie: Refactor Device Coredump + - Bluetooth: MGMT: Fix not exposing debug UUID on + MGMT_OP_READ_EXP_FEATURES_INFO + - Bluetooth: ISO: Fix possible UAF on iso_conn_free + - Bluetooth: ISO: free rx_skb if not consumed + - Bluetooth: ISO: don't leak skb in ISO_CONT RX + - Bluetooth: hci_sync: Fix using random address for BIG/PA advertisements + - KEYS: X.509: Fix Basic Constraints CA flag parsing + - hwrng: ks-sa - fix division by zero in ks_sa_rng_init + - cramfs: fix incorrect physical page address calculation + - ocfs2: fix double free in user_cluster_connect() + - drivers/base/node: fix double free in register_one_node() + - f2fs: fix UAF issue in f2fs_merge_page_bio() + - mtd: rawnand: atmel: Fix error handling path in + atmel_nand_controller_add_nands + - PCI: j721e: Fix incorrect error message in probe() + - idpf: fix mismatched free function for dma_alloc_coherent + - tcp: use skb->len instead of skb->truesize in tcp_can_ingest() + - nfp: fix RSS hash key size when RSS is not supported + - net: ena: return 0 in ena_get_rxfh_key_size() when RSS hash key is not + configurable + - net: dlink: handle copy_thresh allocation failure + - net/mlx5: Stop polling for command response if interface goes down + - net/mlx5: pagealloc: Fix reclaim race during command interface teardown + - net/mlx5: fw reset, add reset timeout work + - smb: client: fix crypto buffers in non-linear memory + - bonding: fix xfrm offload 
feature setup on active-backup mode + - net: enetc: initialize SW PIR and CIR based HW PIR and CIR values + - iommufd: Register iommufd mock devices with fwspec + - Revert "net/mlx5e: Update and set Xon/Xoff upon MTU set" + - NFSD: filecache: add STATX_DIOALIGN and STATX_DIO_READ_ALIGN support + - nfs/localio: avoid issuing misaligned IO using O_DIRECT + - octeontx2-vf: fix bitmap leak + - octeontx2-pf: fix bitmap leak + - vhost: vringh: Modify the return value check + - selftests/bpf: Fix typos and grammar in test sources + - selftests/bpf: move get_ksyms and get_addrs to trace_helpers.c + - selftests/bpf: Fix realloc size in bpf_get_addrs + - bpf: Skip scalar adjustment for BPF_NEG if dst is a pointer + - bpf: Reject negative offsets for ALU ops + - tpm: Disable TPM2_TCG_HMAC by default + - ALSA: hda/hdmi: Add pin fix for HP ProDesk model + - ALSA: hda/realtek: Add quirk for HP Spectre 14t-ea100 + - Squashfs: fix uninit-value in squashfs_get_parent + - uio_hv_generic: Let userspace take care of interrupt mask + - hisi_acc_vfio_pci: Fix reference leak in hisi_acc_vfio_debug_init + - io_uring/waitid: always prune wait queue entry in io_waitid_wait() + - io_uring/zcrx: fix overshooting recv limit + - ASoC: wcd934x: fix error handling in wcd934x_codec_parse_data() + - ASoC: SOF: ipc3-topology: Fix multi-core and static pipelines tear down + - ASoC: codecs: wcd937x: set the comp soundwire port correctly + - ASoC: codecs: wcd937x: make stub functions inline + - ASoC: SOF: ipc4-pcm: fix delay calculation when DSP resamples + - ASoC: SOF: ipc4-pcm: fix start offset calculation for chain DMA + - fs: udf: fix OOB read in lengthAllocDescs handling + - net: nfc: nci: Add parameter validation for packet data + - mfd: rz-mtu3: Fix MTU5 NFCR register offset + - mfd: intel_soc_pmic_chtdc_ti: Set use_single_read regmap_config flag + - mfd: vexpress-sysreg: Check the return value of devm_gpiochip_add_data() + - tracing: Fix lock imbalance in s_start() memory allocation failure 
path + - tracing: Fix race condition in kprobe initialization causing NULL + pointer dereference + - tracing: Fix wakeup tracers on failure of acquiring calltime + - tracing: Fix irqoff tracers on failure of acquiring calltime + - tracing: Have trace_marker use per-cpu data to read user space + - tracing: Fix tracing_mark_raw_write() to use buf and not ubuf + - tracing: Stop fortify-string from warning in tracing_mark_raw_write() + - dm: fix queue start/stop imbalance under suspend/load/resume races + - dm: fix NULL pointer dereference in __dm_suspend() + - LoongArch: Automatically disable kaslr if boot from kexec_file + - pwm: loongson: Fix LOONGSON_PWM_FREQ_DEFAULT + - LoongArch: BPF: Sign-extend struct ops return values properly + - LoongArch: BPF: No support of struct argument in trampoline programs + - LoongArch: BPF: Don't align trampoline size + - LoongArch: BPF: Make trampoline size stable + - LoongArch: BPF: Make error handling robust in + arch_prepare_bpf_trampoline() + - LoongArch: BPF: Remove duplicated bpf_flush_icache() + - LoongArch: BPF: No text_poke() for kernel text + - LoongArch: BPF: Remove duplicated flags check + - LoongArch: BPF: Fix uninitialized symbol 'retval_off' + - mm/ksm: fix flag-dropping behavior in ksm_madvise + - ksmbd: Fix race condition in RPC handle list access + - ksmbd: fix error code overwriting in smb2_get_info_filesystem() + - ksmbd: add max ip connections parameter + - ext4: fix potential null deref in ext4_mb_init() + - ext4: fix checks for orphan inodes + - KVM: SVM: Skip fastpath emulation on VM-Exit if next RIP isn't valid + - fbdev: simplefb: Fix use after free in simplefb_detach_genpds() + - mm: hugetlb: avoid soft lockup when mprotect to large memory area + - selftests/mm: skip soft-dirty tests when CONFIG_MEM_SOFT_DIRTY is + disabled + - nvdimm: ndtest: Return -ENOMEM if devm_kcalloc() fails in ndtest_probe() + - misc: fastrpc: Save actual DMA size in fastrpc_map structure + - misc: fastrpc: Fix fastrpc_map_lookup 
operation + - misc: fastrpc: fix possible map leak in fastrpc_put_args + - misc: fastrpc: Skip reference for DMA handles + - Input: atmel_mxt_ts - allow reset GPIO to sleep + - Input: uinput - zero-initialize uinput_ff_upload_compat to avoid info + leak + - sunrpc: fix null pointer dereference on zero-length checksum + - PCI/AER: Avoid NULL pointer dereference in aer_ratelimit() + - remoteproc: pru: Fix potential NULL pointer dereference in + pru_rproc_set_ctable() + - PCI: endpoint: pci-epf-test: Add NULL check for DMA channels before + release + - thunderbolt: Fix use-after-free in tb_dp_dprx_work + - tee: fix register_shm_helper() + - pinctrl: check the return value of pinmux_ops::get_function_name() + - bus: fsl-mc: Check return value of platform_get_resource() + - net/9p: Fix buffer overflow in USB transport layer + - net: usb: asix: hold PM usage ref to avoid PM/MDIO + RTNL deadlock + - usb: typec: tipd: Clear interrupts first + - arm64: dts: qcom: qcm2290: Disable USB SS bus instances in park mode + - usb: cdns3: cdnsp-pci: remove redundant pci_disable_device() call + - scsi: ufs: core: Fix PM QoS mutex initialization + - drm/amdgpu/vcn: Fix double-free of vcn dump buffer + - Linux 6.17.3 + * CVE-2025-40019 + - crypto: essiv - Check ssize for decryption and in-place encryption + * CVE-2025-40214 + - af_unix: Initialise scc_index in unix_add_edge(). 
+ * Miscellaneous Ubuntu changes + - [SAUCE] Fix selftest/net/rtnetlink.sh for Big Endian + + -- Abdur Rahman Fri, 16 Jan 2026 15:25:40 -0500 linux-nvidia-6.17 (6.17.0-1006.6) noble; urgency=medium diff --git a/debian.nvidia-6.17/reconstruct b/debian.nvidia-6.17/reconstruct index 2e620630217d9..8c46e786cef15 100644 --- a/debian.nvidia-6.17/reconstruct +++ b/debian.nvidia-6.17/reconstruct @@ -29,6 +29,9 @@ chmod +x 'debian/templates/image.preinst.in' chmod +x 'debian/templates/image.prerm.in' chmod +x 'debian/tests/rebuild' chmod +x 'debian/tests/ubuntu-regression-suite' +chmod -x 'drivers/edac/ecs.c' +chmod -x 'drivers/edac/mem_repair.c' +chmod -x 'drivers/edac/scrub.c' chmod +x 'drivers/net/ethernet/realtek/r8127/Makefile' chmod +x 'drivers/net/ethernet/realtek/r8127/r8127.h' chmod +x 'drivers/net/ethernet/realtek/r8127/r8127_dash.h' From f4362ad9a1b415d78d0674a0b97c111a477ab992 Mon Sep 17 00:00:00 2001 From: Leon Yen Date: Thu, 11 Dec 2025 20:38:36 +0800 Subject: [PATCH 212/247] NVIDIA: SAUCE: wifi: mt76: mt7925: Fix incorrect MLO mode in firmware control BugLink: https://bugs.launchpad.net/bugs/2138755 The selection of MLO mode should depend on the capabilities of the STA rather than those of the peer AP to avoid compatibility issues with certain APs, such as Xiaomi BE5000 WiFi7 router. 
Fixes: 69acd6d910b0c ("wifi: mt76: mt7925: add mt7925_change_vif_links") Signed-off-by: Leon Yen (backported from https://lore.kernel.org/all/20251211123836.4169436-1-leon.yen@mediatek.com/) Signed-off-by: Muteeb Akram Acked-by: Jamie Nguyen Acked-by: Carol L Soto Acked-by: Jacob Martin Acked-by: Abdur Rahman Signed-off-by: Brad Figg --- drivers/net/wireless/mediatek/mt76/mt7925/mcu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c index 10d68d241ba1f..90a68a59af2e7 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c @@ -1280,6 +1280,8 @@ int mt7925_mcu_set_mlo_roc(struct mt792x_bss_conf *mconf, u16 sel_links, .roc[1].len = cpu_to_le16(sizeof(struct roc_acquire_tlv)) }; + struct wiphy *wiphy = mvif->phy->mt76->hw->wiphy; + if (!mconf || hweight16(vif->valid_links) < 2 || hweight16(sel_links) != 2) return -EPERM; @@ -1302,7 +1304,8 @@ int mt7925_mcu_set_mlo_roc(struct mt792x_bss_conf *mconf, u16 sel_links, is_AG_band |= links[i].chan->band == NL80211_BAND_2GHZ; } - if (vif->cfg.eml_cap & IEEE80211_EML_CAP_EMLSR_SUPP) + if (!(wiphy->iftype_ext_capab[0].mld_capa_and_ops & + IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS)) type = is_AG_band ? MT7925_ROC_REQ_MLSR_AG : MT7925_ROC_REQ_MLSR_AA; else From aa49274b93a7360206f78494ac3ec1789773d28c Mon Sep 17 00:00:00 2001 From: Abdur Rahman Date: Tue, 20 Jan 2026 16:51:03 -0500 Subject: [PATCH 213/247] UBUNTU: Start new release Ignore: yes Signed-off-by: Abdur Rahman --- debian.nvidia-6.17/changelog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/debian.nvidia-6.17/changelog b/debian.nvidia-6.17/changelog index 34e0bc30c37e7..9846739ee2b16 100644 --- a/debian.nvidia-6.17/changelog +++ b/debian.nvidia-6.17/changelog @@ -1,3 +1,11 @@ +linux-nvidia-6.17 (6.17.0-1008.8) UNRELEASED; urgency=medium + + CHANGELOG: Do not edit directly. 
Autogenerated at release. + CHANGELOG: Use the printchanges target to see the current changes. + CHANGELOG: Use the insertchanges target to create the final log. + + -- Abdur Rahman Tue, 20 Jan 2026 16:51:03 -0500 + linux-nvidia-6.17 (6.17.0-1007.7) noble; urgency=medium * noble/linux-nvidia-6.17: 6.17.0-1007.7 -proposed tracker (LP: #2137561) From 2a103655cb659982f6be989b13c320048b4a31ee Mon Sep 17 00:00:00 2001 From: Abdur Rahman Date: Tue, 20 Jan 2026 16:53:03 -0500 Subject: [PATCH 214/247] UBUNTU: link-to-tracker: update tracking bug BugLink: https://bugs.launchpad.net/bugs/2138765 Properties: no-test-build Signed-off-by: Abdur Rahman --- debian.nvidia-6.17/tracking-bug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian.nvidia-6.17/tracking-bug b/debian.nvidia-6.17/tracking-bug index 780343b0a29a0..6400076c36ec6 100644 --- a/debian.nvidia-6.17/tracking-bug +++ b/debian.nvidia-6.17/tracking-bug @@ -1 +1 @@ -2137561 d2025.12.18-1 +2138765 d2025.12.18-2 From c77e62818537e7d9c5305810f91cabfaada468f7 Mon Sep 17 00:00:00 2001 From: Abdur Rahman Date: Tue, 20 Jan 2026 16:54:13 -0500 Subject: [PATCH 215/247] UBUNTU: Ubuntu-nvidia-6.17-6.17.0-1008.8 Signed-off-by: Abdur Rahman --- debian.nvidia-6.17/changelog | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/debian.nvidia-6.17/changelog b/debian.nvidia-6.17/changelog index 9846739ee2b16..0e7f10d1594ec 100644 --- a/debian.nvidia-6.17/changelog +++ b/debian.nvidia-6.17/changelog @@ -1,10 +1,12 @@ -linux-nvidia-6.17 (6.17.0-1008.8) UNRELEASED; urgency=medium +linux-nvidia-6.17 (6.17.0-1008.8) noble; urgency=medium - CHANGELOG: Do not edit directly. Autogenerated at release. - CHANGELOG: Use the printchanges target to see the current changes. - CHANGELOG: Use the insertchanges target to create the final log. 
+ * noble/linux-nvidia-6.17: 6.17.0-1008.8 -proposed tracker (LP: #2138765) - -- Abdur Rahman Tue, 20 Jan 2026 16:51:03 -0500 + * mt7925: Incorrect MLO mode in firmware control (LP: #2138755) + - NVIDIA: SAUCE: wifi: mt76: mt7925: Fix incorrect MLO mode in firmware + control + + -- Abdur Rahman Tue, 20 Jan 2026 16:54:12 -0500 linux-nvidia-6.17 (6.17.0-1007.7) noble; urgency=medium From 222e25ff771fa40aeb2f4ee2f94e523330fedea2 Mon Sep 17 00:00:00 2001 From: Kartik Rajput Date: Mon, 1 Dec 2025 13:48:51 +0530 Subject: [PATCH 216/247] NVIDIA: VR: SAUCE: soc/tegra: misc: Use SMCCC to get chipid BugLink: https://bugs.launchpad.net/bugs/2138329 Tegra410 and Tegra241 have deprecated HIDREV register. It is recommended to use ARM SMCCC calls to get chip_id, major and minor revisions. Use ARM SMCCC to get chip_id, major and minor revision. Signed-off-by: Kartik Rajput Signed-off-by: Matthew R. Ochs Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/soc/tegra/fuse/tegra-apbmisc.c | 31 ++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/soc/tegra/fuse/tegra-apbmisc.c b/drivers/soc/tegra/fuse/tegra-apbmisc.c index 0ce94fdc536fb..7ef1782534d5e 100644 --- a/drivers/soc/tegra/fuse/tegra-apbmisc.c +++ b/drivers/soc/tegra/fuse/tegra-apbmisc.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -27,6 +28,11 @@ #define PMC_STRAPPING_OPT_A_RAM_CODE_MASK_SHORT \ (0x3 << PMC_STRAPPING_OPT_A_RAM_CODE_SHIFT) +#define TEGRA_SMCCC_PLATFORM(x) ((x >> 8) & 0xff) +#define TEGRA_SMCCC_CHIP_ID(x) ((x >> 4) & 0xff) +#define TEGRA_SMCCC_MAJOR_REV(x) (x & 0xf) +#define TEGRA_SMCCC_MINOR_REV(x) (x & 0xf) + static void __iomem *apbmisc_base; static bool long_ram_code; static u32 strapping; @@ -41,21 +47,46 @@ u32 tegra_read_chipid(void) u8 tegra_get_chip_id(void) { +#ifdef CONFIG_HAVE_ARM_SMCCC_DISCOVERY + s32 soc_id = arm_smccc_get_soc_id_version(); + + if (soc_id >= 0) + return 
TEGRA_SMCCC_CHIP_ID(soc_id); +#endif return (tegra_read_chipid() >> 8) & 0xff; } u8 tegra_get_major_rev(void) { +#ifdef CONFIG_HAVE_ARM_SMCCC_DISCOVERY + s32 soc_id = arm_smccc_get_soc_id_version(); + + if (soc_id >= 0) + return TEGRA_SMCCC_MAJOR_REV(soc_id); +#endif return (tegra_read_chipid() >> 4) & 0xf; } u8 tegra_get_minor_rev(void) { +#ifdef CONFIG_HAVE_ARM_SMCCC_DISCOVERY + s32 revision = arm_smccc_get_soc_id_revision(); + + if (revision >= 0) + return TEGRA_SMCCC_MINOR_REV(revision); +#endif return (tegra_read_chipid() >> 16) & 0xf; + } u8 tegra_get_platform(void) { +#ifdef CONFIG_HAVE_ARM_SMCCC_DISCOVERY + s32 revision = arm_smccc_get_soc_id_revision(); + + if (revision >= 0) + return TEGRA_SMCCC_PLATFORM(revision); +#endif return (tegra_read_chipid() >> 20) & 0xf; } From ed96f968e6c3e8b5285a46e0b7cc05f60851bb7d Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Wed, 5 Nov 2025 18:38:49 +0800 Subject: [PATCH 217/247] arch_topology: Provide a stub topology_core_has_smt() for !CONFIG_GENERIC_ARCH_TOPOLOGY BugLink: https://bugs.launchpad.net/bugs/2138375 The arm_pmu driver is using topology_core_has_smt() for retrieving the SMT implementation which depends on CONFIG_GENERIC_ARCH_TOPOLOGY. The config is optional on arm platforms so provide a !CONFIG_GENERIC_ARCH_TOPOLOGY stub for topology_core_has_smt(). Fixes: c3d78c34ad00 ("perf: arm_pmuv3: Don't use PMCCNTR_EL0 on SMT cores") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511041757.vuCGOmFc-lkp@intel.com/ Suggested-by: Will Deacon Signed-off-by: Yicong Yang Reviewed-by: Mark Brown Signed-off-by: Will Deacon (cherry picked from commit 7ab06ea41af53aa1713186ceaa154179e4b0d4c9) Signed-off-by: Nirmoy Das Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- include/linux/arch_topology.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index daa1af2e8204b..0c2a8b846c20c 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -100,6 +100,10 @@ static inline bool topology_core_has_smt(int cpu) return cpu_topology[cpu].thread_id != -1; } -#endif +#else + +static inline bool topology_core_has_smt(int cpu) { return false; } + +#endif /* CONFIG_GENERIC_ARCH_TOPOLOGY */ #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ From f198764ea997285f369d115202a577f6dee55b0a Mon Sep 17 00:00:00 2001 From: Srirangan Madhavan Date: Thu, 20 Feb 2025 20:39:06 -0800 Subject: [PATCH 218/247] NVIDIA: VR: SAUCE: cxl: add support for cxl reset BugLink: https://bugs.launchpad.net/bugs/2138266 Type 2 devices are being introduced and will require finer-grained reset mechanisms beyond bus-wide reset methods. Add support for CXL reset per CXL v3.2 Section 9.6/9.7 Signed-off-by: Srirangan Madhavan (backported from https://lore.kernel.org/all/20250221043906.1593189-3-smadhavan@nvidia.com/) [Nirmoy: Add #include "../cxl/cxlpci.h" and fix a compile error with if (reg & CXL_DVSEC_CXL_RST_CAPABLE == 0)] Signed-off-by: Nirmoy Das Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/cxl/cxlpci.h | 40 ++++++++---- drivers/pci/pci.c | 147 +++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 2 +- 3 files changed, 175 insertions(+), 14 deletions(-) diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 54e219b0049ea..67ad5b007498e 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -16,19 +16,33 @@ /* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */ #define CXL_DVSEC_PCIE_DEVICE 0 -#define CXL_DVSEC_CAP_OFFSET 0xA -#define CXL_DVSEC_MEM_CAPABLE BIT(2) -#define CXL_DVSEC_HDM_COUNT_MASK GENMASK(5, 4) -#define CXL_DVSEC_CTRL_OFFSET 0xC -#define CXL_DVSEC_MEM_ENABLE BIT(2) -#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) -#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) -#define CXL_DVSEC_MEM_INFO_VALID BIT(0) -#define CXL_DVSEC_MEM_ACTIVE BIT(1) -#define CXL_DVSEC_MEM_SIZE_LOW_MASK GENMASK(31, 28) -#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) -#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) -#define CXL_DVSEC_MEM_BASE_LOW_MASK GENMASK(31, 28) +#define CXL_DVSEC_CAP_OFFSET 0xA +#define CXL_DVSEC_CACHE_CAPABLE BIT(0) +#define CXL_DVSEC_MEM_CAPABLE BIT(2) +#define CXL_DVSEC_HDM_COUNT_MASK GENMASK(5, 4) +#define CXL_DVSEC_CACHE_WBI_CAPABLE BIT(6) +#define CXL_DVSEC_CXL_RST_CAPABLE BIT(7) +#define CXL_DVSEC_CXL_RST_TIMEOUT_MASK GENMASK(10, 8) +#define CXL_DVSEC_CXL_RST_MEM_CLR_CAPABLE BIT(11) +#define CXL_DVSEC_CTRL_OFFSET 0xC +#define CXL_DVSEC_MEM_ENABLE BIT(2) +#define CXL_DVSEC_CTRL2_OFFSET 0x10 +#define CXL_DVSEC_DISABLE_CACHING BIT(0) +#define CXL_DVSEC_INIT_CACHE_WBI BIT(1) +#define CXL_DVSEC_INIT_CXL_RESET BIT(2) +#define CXL_DVSEC_CXL_RST_MEM_CLR_ENABLE BIT(3) +#define CXL_DVSEC_STATUS2_OFFSET 0x12 +#define CXL_DVSEC_CACHE_INVALID BIT(0) +#define CXL_DVSEC_CXL_RST_COMPLETE BIT(1) +#define CXL_DVSEC_CXL_RESET_ERR BIT(2) +#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + ((i) * 
0x10)) +#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + ((i) * 0x10)) +#define CXL_DVSEC_MEM_INFO_VALID BIT(0) +#define CXL_DVSEC_MEM_ACTIVE BIT(1) +#define CXL_DVSEC_MEM_SIZE_LOW_MASK GENMASK(31, 28) +#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + ((i) * 0x10)) +#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + ((i) * 0x10)) +#define CXL_DVSEC_MEM_BASE_LOW_MASK GENMASK(31, 28) #define CXL_DVSEC_RANGE_MAX 2 diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 372de7961d2a6..9a6943688e6db 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -30,6 +30,7 @@ #include #include #include +#include "../cxl/cxlpci.h" #include "pci.h" DEFINE_MUTEX(pci_slot_mutex); @@ -5133,6 +5134,151 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) return rc; } +static int cxl_reset_prepare(struct pci_dev *dev, u16 dvsec) +{ + u32 timeout_us = 100, timeout_tot_us = 10000; + u16 reg, cap; + int rc; + + if (!pci_wait_for_pending_transaction(dev)) + pci_err(dev, "timed out waiting for pending transaction; performing cxl reset anyway\n"); + + /* Check if the device is cache capable. */ + rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CAP_OFFSET, &cap); + if (rc) + return rc; + + if (!(cap & CXL_DVSEC_CACHE_CAPABLE)) + return 0; + + /* Disable cache. WB and invalidate cache if capability is advertised */ + rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, ®); + if (rc) + return rc; + reg |= CXL_DVSEC_DISABLE_CACHING; + /* + * DEVCTL2 bits are written only once. So check WB+I capability while + * keeping disable caching set. + */ + if (cap & CXL_DVSEC_CACHE_WBI_CAPABLE) + reg |= CXL_DVSEC_INIT_CACHE_WBI; + pci_write_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, reg); + + /* + * From Section 9.6: "Software may leverage the cache size reported in + * the DVSEC CXL Capability2 register to compute a suitable timeout + * value". + * Given there is no conversion factor for cache size -> timeout, + * setting timer for default 10ms. 
+ */ + do { + if (timeout_tot_us == 0) + return -ETIMEDOUT; + usleep_range(timeout_us, timeout_us + 1); + timeout_tot_us -= timeout_us; + rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, + ®); + if (rc) + return rc; + } while (!(reg & CXL_DVSEC_CACHE_INVALID)); + + return 0; +} + +static int cxl_reset_init(struct pci_dev *dev, u16 dvsec) +{ + /* + * Timeout values ref CXL Spec v3.2 Ch 8 Control and Status Registers, + * under section 8.1.3.1 DVSEC CXL Capability. + */ + u32 reset_timeouts_ms[] = { 10, 100, 1000, 10000, 100000 }; + u16 reg; + u32 timeout_ms; + int rc, ind; + + /* Check if CXL Reset MEM CLR is supported. */ + rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CAP_OFFSET, ®); + if (rc) + return rc; + + if (reg & CXL_DVSEC_CXL_RST_MEM_CLR_CAPABLE) { + rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, + ®); + if (rc) + return rc; + + reg |= CXL_DVSEC_CXL_RST_MEM_CLR_ENABLE; + pci_write_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, reg); + } + + /* Read timeout value. */ + rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CAP_OFFSET, ®); + if (rc) + return rc; + ind = FIELD_GET(CXL_DVSEC_CXL_RST_TIMEOUT_MASK, reg); + timeout_ms = reset_timeouts_ms[ind]; + + /* Write reset config. */ + rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, ®); + if (rc) + return rc; + + reg |= CXL_DVSEC_INIT_CXL_RESET; + pci_write_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, reg); + + /* Wait till timeout and then check reset status is complete. 
*/ + msleep(timeout_ms); + rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_STATUS2_OFFSET, ®); + if (rc) + return rc; + if (reg & CXL_DVSEC_CXL_RESET_ERR || + ~reg & CXL_DVSEC_CXL_RST_COMPLETE) + return -ETIMEDOUT; + + rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, ®); + if (rc) + return rc; + reg &= (~CXL_DVSEC_DISABLE_CACHING); + pci_write_config_word(dev, dvsec + CXL_DVSEC_CTRL2_OFFSET, reg); + + return 0; +} + +/** + * cxl_reset - initiate a cxl reset + * @dev: device to reset + * @probe: if true, return 0 if device can be reset this way + * + * Initiate a cxl reset on @dev. + */ +static int cxl_reset(struct pci_dev *dev, bool probe) +{ + u16 dvsec, reg; + int rc; + + dvsec = pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL, + CXL_DVSEC_PCIE_DEVICE); + if (!dvsec) + return -ENOTTY; + + /* Check if CXL Reset is supported. */ + rc = pci_read_config_word(dev, dvsec + CXL_DVSEC_CAP_OFFSET, ®); + if (rc) + return -ENOTTY; + + if ((reg & CXL_DVSEC_CXL_RST_CAPABLE) == 0) + return -ENOTTY; + + if (probe) + return 0; + + rc = cxl_reset_prepare(dev, dvsec); + if (rc) + return rc; + + return cxl_reset_init(dev, dvsec); +} + void pci_dev_lock(struct pci_dev *dev) { /* block PM suspend, driver probe, etc. 
*/ @@ -5219,6 +5365,7 @@ const struct pci_reset_fn_method pci_reset_fn_methods[] = { { pci_dev_acpi_reset, .name = "acpi" }, { pcie_reset_flr, .name = "flr" }, { pci_af_flr, .name = "af_flr" }, + { cxl_reset, .name = "cxl_reset" }, { pci_pm_reset, .name = "pm" }, { pci_reset_bus_function, .name = "bus" }, { cxl_reset_bus_function, .name = "cxl_bus" }, diff --git a/include/linux/pci.h b/include/linux/pci.h index 17e244200d2c4..f06af598122f6 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -51,7 +51,7 @@ PCI_STATUS_PARITY) /* Number of reset methods used in pci_reset_fn_methods array in pci.c */ -#define PCI_NUM_RESET_METHODS 8 +#define PCI_NUM_RESET_METHODS 9 #define PCI_RESET_PROBE true #define PCI_RESET_DO_RESET false From d423d35e17bf9c259f9c8b46bf4a6844f9b5a76a Mon Sep 17 00:00:00 2001 From: Vishal Aslot Date: Tue, 14 Oct 2025 19:40:05 -0700 Subject: [PATCH 219/247] NVIDIA: VR: SAUCE: cxl_test: enable zero sized decoders under hb0 BugLink: https://bugs.launchpad.net/bugs/2138266 The cxl core in linux updated to supported committed decoders of zero size, because this is allowed by the CXL spec. This patch updates cxl_test to enable decoders 1 and 2 in the host-bridge 0 port, in a switch uport under hb0, and the endpoints ports with size zero simulating committed zero sized decoders. Signed-off-by: Vishal Aslot (backported from https://lore.kernel.org/all/20251015024019.1189713-1-vaslot@nvidia.com/) Signed-off-by: Nirmoy Das Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- tools/testing/cxl/test/cxl.c | 96 +++++++++++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 2 deletions(-) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 6a25cca5636f7..f4dceecf7e335 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -726,6 +726,45 @@ static void default_mock_decoder(struct cxl_decoder *cxld) cxld->reset = mock_decoder_reset; } +static void size_zero_mock_decoder_ep(struct cxl_decoder *cxld, u64 base) +{ + struct cxl_endpoint_decoder *cxled; + + cxled = to_cxl_endpoint_decoder(&cxld->dev); + cxld->hpa_range = (struct range){ + .start = base, + .end = base - 1, /* Size 0 */ + }; + + cxld->interleave_ways = 2; + cxld->interleave_granularity = 4096; + cxld->target_type = CXL_DECODER_HOSTONLYMEM; + cxld->flags = CXL_DECODER_F_ENABLE; + cxled->state = CXL_DECODER_STATE_AUTO; + cxld->commit = mock_decoder_commit; + cxld->reset = mock_decoder_reset; +} + +static void size_zero_mock_decoder_sw(struct device *dev, u64 base, int i) +{ + struct cxl_switch_decoder *cxlsd; + struct cxl_decoder *cxld; + + cxlsd = to_cxl_switch_decoder(dev); + cxld = &cxlsd->cxld; + cxld->flags = CXL_DECODER_F_ENABLE; + cxld->target_type = CXL_DECODER_HOSTONLYMEM; + if (i == 0) + cxld->interleave_ways = 2; + else + cxld->interleave_ways = 1; + cxld->interleave_granularity = 4096; + cxld->hpa_range = (struct range) { + .start = base, + .end = base - 1, /* Size 0 */ + }; +} + static int first_decoder(struct device *dev, const void *data) { struct cxl_decoder *cxld; @@ -738,6 +777,30 @@ static int first_decoder(struct device *dev, const void *data) return 0; } +static int second_decoder(struct device *dev, const void *data) +{ + struct cxl_decoder *cxld; + + if (!is_switch_decoder(dev)) + return 0; + cxld = to_cxl_decoder(dev); + if (cxld->id == 1) + return 1; + return 0; +} + +static int third_decoder(struct 
device *dev, const void *data) +{ + struct cxl_decoder *cxld; + + if (!is_switch_decoder(dev)) + return 0; + cxld = to_cxl_decoder(dev); + if (cxld->id == 2) + return 1; + return 0; +} + static void mock_init_hdm_decoder(struct cxl_decoder *cxld) { struct acpi_cedt_cfmws *window = mock_cfmws[0]; @@ -750,7 +813,7 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) struct cxl_dport *dport; struct device *dev; bool hb0 = false; - u64 base; + u64 base = window->base_hpa; int i; if (is_endpoint_decoder(&cxld->dev)) { @@ -774,6 +837,20 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) port = cxled_to_port(cxled); } + /* + * Decoders 1 and 2 of the endpoint under host bridge 0 should be enabled as zero-sized. + * It would be even better to make sure that the parent switch uport decoder was + * also enabled before enabling the size zero decoders but there is no harm in doing it + * anyway. + */ + if (hb0 && (cxld->id == 1 || cxld->id == 2)) { + port = to_cxl_port(cxld->dev.parent); + size_zero_mock_decoder_ep(cxld, base); + /* Commit the zero-sized decoder */ + port->commit_end = cxld->id; + return; + } + /* * The first decoder on the first 2 devices on the first switch * attached to host-bridge0 mock a fake / static RAM region. 
All @@ -787,7 +864,6 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) return; } - base = window->base_hpa; cxld->hpa_range = (struct range) { .start = base, .end = base + size - 1, @@ -845,6 +921,22 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) .end = base + size - 1, }; put_device(dev); + + /* Enable the next two decoders also and make them zero sized */ + dev = device_find_child(&iter->dev, NULL, second_decoder); + WARN_ON(!dev); + if (dev) { + size_zero_mock_decoder_sw(dev, base, i); + iter->commit_end = 1; + put_device(dev); + } + dev = device_find_child(&iter->dev, NULL, third_decoder); + WARN_ON(!dev); + if (dev) { + size_zero_mock_decoder_sw(dev, base, i); + iter->commit_end = 2; + put_device(dev); + } } } From baae30ad3a923babf1f5b92781e353b06ae470fb Mon Sep 17 00:00:00 2001 From: Vishal Aslot Date: Tue, 14 Oct 2025 19:40:06 -0700 Subject: [PATCH 220/247] NVIDIA: VR: SAUCE: cxl: Allow zero sized HDM decoders BugLink: https://bugs.launchpad.net/bugs/2138266 CXL spec permits committing zero sized decoders. Linux currently considers them as an error. Zero-sized decoders are helpful when the BIOS is committing them. Often BIOS will also lock them to prevent them being changed due to the TSP requirement. For example, if the type 3 device is part of a TCB. The host bridge, switch, and end-point decoders can all be committed with zero-size. If they are locked along the VH, it is often to prevent hotplugging of a new device that could not be attested post boot and cannot be included in TCB. The caller leaves the decoder allocated but does not add it. It simply continues to the next decoder. Signed-off-by: Vishal Aslot (backported from https://lore.kernel.org/all/20251015024019.1189713-1-vaslot@nvidia.com/) Signed-off-by: Nirmoy Das Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/cxl/core/hdm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index e9e1d555cec65..4550915fee1ef 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -1046,13 +1046,14 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, return -ENXIO; } + port->commit_end = cxld->id; + if (size == 0) { - dev_warn(&port->dev, + dev_dbg(&port->dev, "decoder%d.%d: Committed with zero size\n", port->id, cxld->id); - return -ENXIO; + return -ENOSPC; } - port->commit_end = cxld->id; } else { if (cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); @@ -1210,6 +1211,8 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, rc = init_hdm_decoder(port, cxld, target_map, hdm, i, &dpa_base, info); if (rc) { + if (rc == -ENOSPC) + continue; dev_warn(&port->dev, "Failed to initialize decoder%d.%d\n", port->id, i); From bd4c79069f610028914d1067c8d3b88246e3614b Mon Sep 17 00:00:00 2001 From: Koba Ko Date: Tue, 25 Nov 2025 10:23:16 +0000 Subject: [PATCH 221/247] NVIDIA: VR: SAUCE: cxl/hdm: Fix infinite loop in DPA partition discovery BugLink: https://bugs.launchpad.net/bugs/2138266 The loop condition in __cxl_dpa_reserve() is missing the comparison operator, causing potential infinite loop and array out-of-bounds: for (int i = 0; cxlds->nr_partitions; i++) Should be: for (int i = 0; i < cxlds->nr_partitions; i++) Without the '<' operator, if no partition matches the decoder's DPA resource, 'i' increments beyond the part[] array bounds (size 2), triggering UBSAN errors and corrupting the part index. Fixes: be5cbd0840275 ("cxl: Kill enum cxl_decoder_mode") Signed-off-by: Koba Ko Signed-off-by: Nirmoy Das Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/cxl/core/hdm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 4550915fee1ef..e930191057c04 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -413,7 +413,7 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, * is not set. */ if (cxled->part < 0) - for (int i = 0; cxlds->nr_partitions; i++) + for (int i = 0; i < cxlds->nr_partitions; i++) if (resource_contains(&cxlds->part[i].res, res)) { cxled->part = i; break; From 11783e1c4bf1e41826c3e69282e4d150287b3f17 Mon Sep 17 00:00:00 2001 From: Koba Ko Date: Tue, 25 Nov 2025 13:07:35 +0000 Subject: [PATCH 222/247] NVIDIA: VR: SAUCE: cxl/region: Validate partition index before array access BugLink: https://bugs.launchpad.net/bugs/2138266 Check partition index bounds before accessing cxlds->part[] to prevent out-of-bounds when part is -1 or invalid. Fixes: 5ec67596e368 ("cxl/region: Drop goto pattern of construct_region()") Signed-off-by: Koba Ko Signed-off-by: Nirmoy Das Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/cxl/core/region.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index be45211843282..adebbb1db5078 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3410,6 +3410,14 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, int rc, part = READ_ONCE(cxled->part); struct cxl_region *cxlr; + if (part < 0 || part >= cxlds->nr_partitions) { + dev_err(cxlmd->dev.parent, + "%s:%s: invalid partition index %d (max %u)\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + part, cxlds->nr_partitions); + return ERR_PTR(-ENXIO); + } + do { cxlr = __create_region(cxlrd, cxlds->part[part].mode, atomic_read(&cxlrd->region_id)); From d1df8431c200fd1c00f5b6ece76efe1f1d8866d1 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Mon, 12 Jan 2026 11:14:02 -0800 Subject: [PATCH 223/247] NVIDIA: VR: SAUCE: [Config] Add a CXL config for CXL type 3 devices BugLink: https://bugs.launchpad.net/bugs/2138266 Signed-off-by: Nirmoy Das Acked-by: Jamie Nguyen Acked-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- debian.nvidia-6.17/config/annotations | 3 +++ 1 file changed, 3 insertions(+) diff --git a/debian.nvidia-6.17/config/annotations b/debian.nvidia-6.17/config/annotations index f26617dfcb574..e3f884ae0cafb 100644 --- a/debian.nvidia-6.17/config/annotations +++ b/debian.nvidia-6.17/config/annotations @@ -96,6 +96,9 @@ CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE note<'LP: #2028576: Perf governo CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL policy<{'amd64': 'n', 'arm64': 'n'}> CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL note<'LP: #2028576: Perf governor required for NVIDIA workloads'> +CONFIG_CXL_MEM_RAW_COMMANDS policy<{'amd64': 'n', 'arm64': 'y'}> +CONFIG_CXL_MEM_RAW_COMMANDS note<'Enable CXL raw commands for memory devices'> + CONFIG_DRM_NOUVEAU policy<{'amd64': 'n', 'arm64': 'n'}> CONFIG_DRM_NOUVEAU note<'Disable nouveau for NVIDIA kernels'> From 6ba6a19bea658fd694af3f11519a057c733de8cf Mon Sep 17 00:00:00 2001 From: Akhil R Date: Mon, 18 Aug 2025 10:03:45 +0530 Subject: [PATCH 224/247] i2c: tegra: Add Tegra256 support BugLink: https://bugs.launchpad.net/bugs/2138238 Add compatible and the hardware struct for Tegra256. Tegra256 controllers use a different parent clock. Hence the timing parameters are different from the previous generations to meet the expected frequencies. Signed-off-by: Akhil R Acked-by: Thierry Reding Signed-off-by: Wolfram Sang (cherry picked from commit 6e3cb25e62f2081f19057f1abe62c014b8e814de) Signed-off-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/i2c/busses/i2c-tegra.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 4eb31b913c1a7..e533460bccc39 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -1649,7 +1649,33 @@ static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { .has_interface_timing_reg = true, }; +static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { + .has_continue_xfer_support = true, + .has_per_pkt_xfer_complete_irq = true, + .clk_divisor_hs_mode = 7, + .clk_divisor_std_mode = 0x7a, + .clk_divisor_fast_mode = 0x40, + .clk_divisor_fast_plus_mode = 0x19, + .has_config_load_reg = true, + .has_multi_master_mode = true, + .has_slcg_override_reg = true, + .has_mst_fifo = true, + .has_mst_reset = true, + .quirks = &tegra194_i2c_quirks, + .supports_bus_clear = true, + .has_apb_dma = false, + .tlow_std_mode = 0x8, + .thigh_std_mode = 0x7, + .tlow_fast_fastplus_mode = 0x3, + .thigh_fast_fastplus_mode = 0x3, + .setup_hold_time_std_mode = 0x08080808, + .setup_hold_time_fast_fast_plus_mode = 0x02020202, + .setup_hold_time_hs_mode = 0x090909, + .has_interface_timing_reg = true, +}; + static const struct of_device_id tegra_i2c_of_match[] = { + { .compatible = "nvidia,tegra256-i2c", .data = &tegra256_i2c_hw, }, { .compatible = "nvidia,tegra194-i2c", .data = &tegra194_i2c_hw, }, { .compatible = "nvidia,tegra186-i2c", .data = &tegra186_i2c_hw, }, #if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) From abf4b63a8ba32d1ec64e09381f74515336f8094e Mon Sep 17 00:00:00 2001 From: Kartik Rajput Date: Tue, 18 Nov 2025 19:36:15 +0530 Subject: [PATCH 225/247] NVIDIA: VR: SAUCE: i2c: tegra: Do not configure DMA if not supported BugLink: https://bugs.launchpad.net/bugs/2138238 On Tegra264, not all I2C controllers have the necessary interface 
to GPC DMA, this causes failures when function tegra_i2c_init_dma() is called. Ensure that "dmas" device-tree property is present before initializing DMA in function tegra_i2c_init_dma(). Signed-off-by: Kartik Rajput Reviewed-by: Jon Hunter Acked-by: Thierry Reding (backported from https://lore.kernel.org/linux-tegra/20251118140620.549-1-akhilrajeev@nvidia.com/) Signed-off-by: Matthew R. Ochs Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/i2c/busses/i2c-tegra.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index e533460bccc39..bd26b232ffb33 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -449,6 +449,11 @@ static int tegra_i2c_init_dma(struct tegra_i2c_dev *i2c_dev) if (IS_VI(i2c_dev)) return 0; + if (!of_property_present(i2c_dev->dev->of_node, "dmas")) { + dev_dbg(i2c_dev->dev, "DMA not available, falling back to PIO\n"); + return 0; + } + if (i2c_dev->hw->has_apb_dma) { if (!IS_ENABLED(CONFIG_TEGRA20_APB_DMA)) { dev_dbg(i2c_dev->dev, "APB DMA support not enabled\n"); From 0ccc6e65e436c327647d7a99328ee91fbefe1b94 Mon Sep 17 00:00:00 2001 From: Akhil R Date: Tue, 18 Nov 2025 19:36:16 +0530 Subject: [PATCH 226/247] NVIDIA: VR: SAUCE: i2c: tegra: Use separate variables for fast and fastplus BugLink: https://bugs.launchpad.net/bugs/2138238 The current implementation uses a single value of THIGH, TLOW and setup hold time for both fast and fastplus. But these values can be different for each speed mode and should be using separate variables. Split the variables used for fast and fast plus mode. Signed-off-by: Akhil R Reviewed-by: Jon Hunter Acked-by: Thierry Reding (backported from https://lore.kernel.org/linux-tegra/20251118140620.549-1-akhilrajeev@nvidia.com/) Signed-off-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/i2c/busses/i2c-tegra.c | 119 ++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 46 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index bd26b232ffb33..c0382c9a04308 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -196,12 +196,16 @@ enum msg_end_type { * @has_apb_dma: Support of APBDMA on corresponding Tegra chip. * @tlow_std_mode: Low period of the clock in standard mode. * @thigh_std_mode: High period of the clock in standard mode. - * @tlow_fast_fastplus_mode: Low period of the clock in fast/fast-plus modes. - * @thigh_fast_fastplus_mode: High period of the clock in fast/fast-plus modes. + * @tlow_fast_mode: Low period of the clock in fast mode. + * @thigh_fast_mode: High period of the clock in fast mode. + * @tlow_fastplus_mode: Low period of the clock in fast-plus mode. + * @thigh_fastplus_mode: High period of the clock in fast-plus mode. * @setup_hold_time_std_mode: Setup and hold time for start and stop conditions * in standard mode. - * @setup_hold_time_fast_fast_plus_mode: Setup and hold time for start and stop - * conditions in fast/fast-plus modes. + * @setup_hold_time_fast_mode: Setup and hold time for start and stop + * conditions in fast mode. + * @setup_hold_time_fastplus_mode: Setup and hold time for start and stop + * conditions in fast-plus mode. * @setup_hold_time_hs_mode: Setup and hold time for start and stop conditions * in HS mode. 
* @has_interface_timing_reg: Has interface timing register to program the tuned @@ -224,10 +228,13 @@ struct tegra_i2c_hw_feature { bool has_apb_dma; u32 tlow_std_mode; u32 thigh_std_mode; - u32 tlow_fast_fastplus_mode; - u32 thigh_fast_fastplus_mode; + u32 tlow_fast_mode; + u32 thigh_fast_mode; + u32 tlow_fastplus_mode; + u32 thigh_fastplus_mode; u32 setup_hold_time_std_mode; - u32 setup_hold_time_fast_fast_plus_mode; + u32 setup_hold_time_fast_mode; + u32 setup_hold_time_fastplus_mode; u32 setup_hold_time_hs_mode; bool has_interface_timing_reg; }; @@ -677,25 +684,21 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) if (IS_VI(i2c_dev)) tegra_i2c_vi_init(i2c_dev); - switch (t->bus_freq_hz) { - case I2C_MAX_STANDARD_MODE_FREQ + 1 ... I2C_MAX_FAST_MODE_PLUS_FREQ: - default: - tlow = i2c_dev->hw->tlow_fast_fastplus_mode; - thigh = i2c_dev->hw->thigh_fast_fastplus_mode; - tsu_thd = i2c_dev->hw->setup_hold_time_fast_fast_plus_mode; - - if (t->bus_freq_hz > I2C_MAX_FAST_MODE_FREQ) - non_hs_mode = i2c_dev->hw->clk_divisor_fast_plus_mode; - else - non_hs_mode = i2c_dev->hw->clk_divisor_fast_mode; - break; - - case 0 ... 
I2C_MAX_STANDARD_MODE_FREQ: + if (t->bus_freq_hz <= I2C_MAX_STANDARD_MODE_FREQ) { tlow = i2c_dev->hw->tlow_std_mode; thigh = i2c_dev->hw->thigh_std_mode; tsu_thd = i2c_dev->hw->setup_hold_time_std_mode; non_hs_mode = i2c_dev->hw->clk_divisor_std_mode; - break; + } else if (t->bus_freq_hz <= I2C_MAX_FAST_MODE_FREQ) { + tlow = i2c_dev->hw->tlow_fast_mode; + thigh = i2c_dev->hw->thigh_fast_mode; + tsu_thd = i2c_dev->hw->setup_hold_time_fast_mode; + non_hs_mode = i2c_dev->hw->clk_divisor_fast_mode; + } else { + tlow = i2c_dev->hw->tlow_fastplus_mode; + thigh = i2c_dev->hw->thigh_fastplus_mode; + tsu_thd = i2c_dev->hw->setup_hold_time_fastplus_mode; + non_hs_mode = i2c_dev->hw->clk_divisor_fast_plus_mode; } /* make sure clock divisor programmed correctly */ @@ -1496,10 +1499,13 @@ static const struct tegra_i2c_hw_feature tegra20_i2c_hw = { .has_apb_dma = true, .tlow_std_mode = 0x4, .thigh_std_mode = 0x2, - .tlow_fast_fastplus_mode = 0x4, - .thigh_fast_fastplus_mode = 0x2, + .tlow_fast_mode = 0x4, + .thigh_fast_mode = 0x2, + .tlow_fastplus_mode = 0x4, + .thigh_fastplus_mode = 0x2, .setup_hold_time_std_mode = 0x0, - .setup_hold_time_fast_fast_plus_mode = 0x0, + .setup_hold_time_fast_mode = 0x0, + .setup_hold_time_fastplus_mode = 0x0, .setup_hold_time_hs_mode = 0x0, .has_interface_timing_reg = false, }; @@ -1521,10 +1527,13 @@ static const struct tegra_i2c_hw_feature tegra30_i2c_hw = { .has_apb_dma = true, .tlow_std_mode = 0x4, .thigh_std_mode = 0x2, - .tlow_fast_fastplus_mode = 0x4, - .thigh_fast_fastplus_mode = 0x2, + .tlow_fast_mode = 0x4, + .thigh_fast_mode = 0x2, + .tlow_fastplus_mode = 0x4, + .thigh_fastplus_mode = 0x2, .setup_hold_time_std_mode = 0x0, - .setup_hold_time_fast_fast_plus_mode = 0x0, + .setup_hold_time_fast_mode = 0x0, + .setup_hold_time_fastplus_mode = 0x0, .setup_hold_time_hs_mode = 0x0, .has_interface_timing_reg = false, }; @@ -1546,10 +1555,13 @@ static const struct tegra_i2c_hw_feature tegra114_i2c_hw = { .has_apb_dma = true, .tlow_std_mode = 0x4, 
.thigh_std_mode = 0x2, - .tlow_fast_fastplus_mode = 0x4, - .thigh_fast_fastplus_mode = 0x2, + .tlow_fast_mode = 0x4, + .thigh_fast_mode = 0x2, + .tlow_fastplus_mode = 0x4, + .thigh_fastplus_mode = 0x2, .setup_hold_time_std_mode = 0x0, - .setup_hold_time_fast_fast_plus_mode = 0x0, + .setup_hold_time_fast_mode = 0x0, + .setup_hold_time_fastplus_mode = 0x0, .setup_hold_time_hs_mode = 0x0, .has_interface_timing_reg = false, }; @@ -1571,10 +1583,13 @@ static const struct tegra_i2c_hw_feature tegra124_i2c_hw = { .has_apb_dma = true, .tlow_std_mode = 0x4, .thigh_std_mode = 0x2, - .tlow_fast_fastplus_mode = 0x4, - .thigh_fast_fastplus_mode = 0x2, + .tlow_fast_mode = 0x4, + .thigh_fast_mode = 0x2, + .tlow_fastplus_mode = 0x4, + .thigh_fastplus_mode = 0x2, .setup_hold_time_std_mode = 0x0, - .setup_hold_time_fast_fast_plus_mode = 0x0, + .setup_hold_time_fast_mode = 0x0, + .setup_hold_time_fastplus_mode = 0x0, .setup_hold_time_hs_mode = 0x0, .has_interface_timing_reg = true, }; @@ -1596,10 +1611,13 @@ static const struct tegra_i2c_hw_feature tegra210_i2c_hw = { .has_apb_dma = true, .tlow_std_mode = 0x4, .thigh_std_mode = 0x2, - .tlow_fast_fastplus_mode = 0x4, - .thigh_fast_fastplus_mode = 0x2, + .tlow_fast_mode = 0x4, + .thigh_fast_mode = 0x2, + .tlow_fastplus_mode = 0x4, + .thigh_fastplus_mode = 0x2, .setup_hold_time_std_mode = 0, - .setup_hold_time_fast_fast_plus_mode = 0, + .setup_hold_time_fast_mode = 0, + .setup_hold_time_fastplus_mode = 0, .setup_hold_time_hs_mode = 0, .has_interface_timing_reg = true, }; @@ -1621,10 +1639,13 @@ static const struct tegra_i2c_hw_feature tegra186_i2c_hw = { .has_apb_dma = false, .tlow_std_mode = 0x4, .thigh_std_mode = 0x3, - .tlow_fast_fastplus_mode = 0x4, - .thigh_fast_fastplus_mode = 0x2, + .tlow_fast_mode = 0x4, + .thigh_fast_mode = 0x2, + .tlow_fastplus_mode = 0x4, + .thigh_fastplus_mode = 0x2, .setup_hold_time_std_mode = 0, - .setup_hold_time_fast_fast_plus_mode = 0, + .setup_hold_time_fast_mode = 0, + .setup_hold_time_fastplus_mode = 
0, .setup_hold_time_hs_mode = 0, .has_interface_timing_reg = true, }; @@ -1646,10 +1667,13 @@ static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { .has_apb_dma = false, .tlow_std_mode = 0x8, .thigh_std_mode = 0x7, - .tlow_fast_fastplus_mode = 0x2, - .thigh_fast_fastplus_mode = 0x2, + .tlow_fast_mode = 0x2, + .thigh_fast_mode = 0x2, + .tlow_fastplus_mode = 0x2, + .thigh_fastplus_mode = 0x2, .setup_hold_time_std_mode = 0x08080808, - .setup_hold_time_fast_fast_plus_mode = 0x02020202, + .setup_hold_time_fast_mode = 0x02020202, + .setup_hold_time_fastplus_mode = 0x02020202, .setup_hold_time_hs_mode = 0x090909, .has_interface_timing_reg = true, }; @@ -1671,10 +1695,13 @@ static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { .has_apb_dma = false, .tlow_std_mode = 0x8, .thigh_std_mode = 0x7, - .tlow_fast_fastplus_mode = 0x3, - .thigh_fast_fastplus_mode = 0x3, + .tlow_fast_mode = 0x3, + .thigh_fast_mode = 0x3, + .tlow_fastplus_mode = 0x3, + .thigh_fastplus_mode = 0x3, .setup_hold_time_std_mode = 0x08080808, - .setup_hold_time_fast_fast_plus_mode = 0x02020202, + .setup_hold_time_fast_mode = 0x02020202, + .setup_hold_time_fastplus_mode = 0x02020202, .setup_hold_time_hs_mode = 0x090909, .has_interface_timing_reg = true, }; From 627c6261b62720bbae05b853caa52d349384b469 Mon Sep 17 00:00:00 2001 From: Akhil R Date: Tue, 18 Nov 2025 19:36:17 +0530 Subject: [PATCH 227/247] NVIDIA: VR: SAUCE: i2c: tegra: Update Tegra256 timing parameters BugLink: https://bugs.launchpad.net/bugs/2138238 Update the timing parameters of Tegra256 so that the signals are compliant with the I2C specification for SCL low time. Signed-off-by: Akhil R Reviewed-by: Jon Hunter Acked-by: Thierry Reding (backported from https://lore.kernel.org/linux-tegra/20251118140620.549-1-akhilrajeev@nvidia.com/) Signed-off-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/i2c/busses/i2c-tegra.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index c0382c9a04308..470d0d32d5712 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -1684,7 +1684,7 @@ static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { .clk_divisor_hs_mode = 7, .clk_divisor_std_mode = 0x7a, .clk_divisor_fast_mode = 0x40, - .clk_divisor_fast_plus_mode = 0x19, + .clk_divisor_fast_plus_mode = 0x14, .has_config_load_reg = true, .has_multi_master_mode = true, .has_slcg_override_reg = true, @@ -1695,14 +1695,13 @@ static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { .has_apb_dma = false, .tlow_std_mode = 0x8, .thigh_std_mode = 0x7, - .tlow_fast_mode = 0x3, - .thigh_fast_mode = 0x3, - .tlow_fastplus_mode = 0x3, - .thigh_fastplus_mode = 0x3, + .tlow_fast_mode = 0x4, + .thigh_fast_mode = 0x2, + .tlow_fastplus_mode = 0x4, + .thigh_fastplus_mode = 0x4, .setup_hold_time_std_mode = 0x08080808, - .setup_hold_time_fast_mode = 0x02020202, - .setup_hold_time_fastplus_mode = 0x02020202, - .setup_hold_time_hs_mode = 0x090909, + .setup_hold_time_fast_mode = 0x04010101, + .setup_hold_time_fastplus_mode = 0x04020202, .has_interface_timing_reg = true, }; From b9fa3f2c49734f1257f66c60f58e696ccad73bde Mon Sep 17 00:00:00 2001 From: Akhil R Date: Tue, 18 Nov 2025 19:36:18 +0530 Subject: [PATCH 228/247] NVIDIA: VR: SAUCE: i2c: tegra: Add HS mode support BugLink: https://bugs.launchpad.net/bugs/2138238 Add support for High Speed (HS) mode transfers for Tegra194 and later chips. While HS mode has been documented in the technical reference manuals since Tegra20, the hardware implementation appears to be broken on all chips prior to Tegra194. When HS mode is not supported, set the frequency to FM+ instead. 
Signed-off-by: Akhil R Signed-off-by: Kartik Rajput Reviewed-by: Jon Hunter Acked-by: Thierry Reding (backported from https://lore.kernel.org/linux-tegra/20251118140620.549-1-akhilrajeev@nvidia.com/) Signed-off-by: Matthew R. Ochs Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/i2c/busses/i2c-tegra.c | 59 ++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 470d0d32d5712..3cbda03633162 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -85,6 +85,7 @@ #define PACKET_HEADER0_PROTOCOL GENMASK(7, 4) #define PACKET_HEADER0_PROTOCOL_I2C 1 +#define I2C_HEADER_HS_MODE BIT(22) #define I2C_HEADER_CONT_ON_NAK BIT(21) #define I2C_HEADER_READ BIT(19) #define I2C_HEADER_10BIT_ADDR BIT(18) @@ -200,6 +201,8 @@ enum msg_end_type { * @thigh_fast_mode: High period of the clock in fast mode. * @tlow_fastplus_mode: Low period of the clock in fast-plus mode. * @thigh_fastplus_mode: High period of the clock in fast-plus mode. + * @tlow_hs_mode: Low period of the clock in HS mode. + * @thigh_hs_mode: High period of the clock in HS mode. * @setup_hold_time_std_mode: Setup and hold time for start and stop conditions * in standard mode. * @setup_hold_time_fast_mode: Setup and hold time for start and stop @@ -210,6 +213,7 @@ enum msg_end_type { * in HS mode. * @has_interface_timing_reg: Has interface timing register to program the tuned * timing settings. + * @enable_hs_mode_support: Enable support for high speed (HS) mode transfers. 
*/ struct tegra_i2c_hw_feature { bool has_continue_xfer_support; @@ -232,11 +236,14 @@ struct tegra_i2c_hw_feature { u32 thigh_fast_mode; u32 tlow_fastplus_mode; u32 thigh_fastplus_mode; + u32 tlow_hs_mode; + u32 thigh_hs_mode; u32 setup_hold_time_std_mode; u32 setup_hold_time_fast_mode; u32 setup_hold_time_fastplus_mode; u32 setup_hold_time_hs_mode; bool has_interface_timing_reg; + bool enable_hs_mode_support; }; /** @@ -646,6 +653,7 @@ static int tegra_i2c_master_reset(struct tegra_i2c_dev *i2c_dev) static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) { u32 val, clk_divisor, clk_multiplier, tsu_thd, tlow, thigh, non_hs_mode; + u32 max_bus_freq_hz; struct i2c_timings *t = &i2c_dev->timings; int err; @@ -684,6 +692,14 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) if (IS_VI(i2c_dev)) tegra_i2c_vi_init(i2c_dev); + if (i2c_dev->hw->enable_hs_mode_support) + max_bus_freq_hz = I2C_MAX_HIGH_SPEED_MODE_FREQ; + else + max_bus_freq_hz = I2C_MAX_FAST_MODE_PLUS_FREQ; + + if (WARN_ON(t->bus_freq_hz > max_bus_freq_hz)) + t->bus_freq_hz = max_bus_freq_hz; + if (t->bus_freq_hz <= I2C_MAX_STANDARD_MODE_FREQ) { tlow = i2c_dev->hw->tlow_std_mode; thigh = i2c_dev->hw->thigh_std_mode; @@ -694,11 +710,22 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) thigh = i2c_dev->hw->thigh_fast_mode; tsu_thd = i2c_dev->hw->setup_hold_time_fast_mode; non_hs_mode = i2c_dev->hw->clk_divisor_fast_mode; - } else { + } else if (t->bus_freq_hz <= I2C_MAX_FAST_MODE_PLUS_FREQ) { tlow = i2c_dev->hw->tlow_fastplus_mode; thigh = i2c_dev->hw->thigh_fastplus_mode; tsu_thd = i2c_dev->hw->setup_hold_time_fastplus_mode; non_hs_mode = i2c_dev->hw->clk_divisor_fast_plus_mode; + } else { + /* + * When using HS mode, i.e. when the bus frequency is greater than fast plus mode, + * the non-hs timing registers will be used for sending the master code byte for + * transition to HS mode. Configure the non-hs timing registers for Fast Mode to + * send the master code byte at 400kHz. 
+ */ + tlow = i2c_dev->hw->tlow_fast_mode; + thigh = i2c_dev->hw->thigh_fast_mode; + tsu_thd = i2c_dev->hw->setup_hold_time_fast_mode; + non_hs_mode = i2c_dev->hw->clk_divisor_fast_mode; } /* make sure clock divisor programmed correctly */ @@ -720,6 +747,18 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) if (i2c_dev->hw->has_interface_timing_reg && tsu_thd) i2c_writel(i2c_dev, tsu_thd, I2C_INTERFACE_TIMING_1); + /* Write HS mode registers. These will get used only for HS mode*/ + if (i2c_dev->hw->enable_hs_mode_support) { + tlow = i2c_dev->hw->tlow_hs_mode; + thigh = i2c_dev->hw->thigh_hs_mode; + tsu_thd = i2c_dev->hw->setup_hold_time_hs_mode; + + val = FIELD_PREP(I2C_HS_INTERFACE_TIMING_THIGH, thigh) | + FIELD_PREP(I2C_HS_INTERFACE_TIMING_TLOW, tlow); + i2c_writel(i2c_dev, val, I2C_HS_INTERFACE_TIMING_0); + i2c_writel(i2c_dev, tsu_thd, I2C_HS_INTERFACE_TIMING_1); + } + clk_multiplier = (tlow + thigh + 2) * (non_hs_mode + 1); err = clk_set_rate(i2c_dev->div_clk, @@ -1217,6 +1256,9 @@ static void tegra_i2c_push_packet_header(struct tegra_i2c_dev *i2c_dev, if (msg->flags & I2C_M_RD) packet_header |= I2C_HEADER_READ; + if (i2c_dev->timings.bus_freq_hz > I2C_MAX_FAST_MODE_PLUS_FREQ) + packet_header |= I2C_HEADER_HS_MODE; + if (i2c_dev->dma_mode && !i2c_dev->msg_read) *dma_buf++ = packet_header; else @@ -1508,6 +1550,7 @@ static const struct tegra_i2c_hw_feature tegra20_i2c_hw = { .setup_hold_time_fastplus_mode = 0x0, .setup_hold_time_hs_mode = 0x0, .has_interface_timing_reg = false, + .enable_hs_mode_support = false, }; static const struct tegra_i2c_hw_feature tegra30_i2c_hw = { @@ -1536,6 +1579,7 @@ static const struct tegra_i2c_hw_feature tegra30_i2c_hw = { .setup_hold_time_fastplus_mode = 0x0, .setup_hold_time_hs_mode = 0x0, .has_interface_timing_reg = false, + .enable_hs_mode_support = false, }; static const struct tegra_i2c_hw_feature tegra114_i2c_hw = { @@ -1564,6 +1608,7 @@ static const struct tegra_i2c_hw_feature tegra114_i2c_hw = { 
.setup_hold_time_fastplus_mode = 0x0, .setup_hold_time_hs_mode = 0x0, .has_interface_timing_reg = false, + .enable_hs_mode_support = false, }; static const struct tegra_i2c_hw_feature tegra124_i2c_hw = { @@ -1592,6 +1637,7 @@ static const struct tegra_i2c_hw_feature tegra124_i2c_hw = { .setup_hold_time_fastplus_mode = 0x0, .setup_hold_time_hs_mode = 0x0, .has_interface_timing_reg = true, + .enable_hs_mode_support = false, }; static const struct tegra_i2c_hw_feature tegra210_i2c_hw = { @@ -1620,6 +1666,7 @@ static const struct tegra_i2c_hw_feature tegra210_i2c_hw = { .setup_hold_time_fastplus_mode = 0, .setup_hold_time_hs_mode = 0, .has_interface_timing_reg = true, + .enable_hs_mode_support = false, }; static const struct tegra_i2c_hw_feature tegra186_i2c_hw = { @@ -1648,6 +1695,7 @@ static const struct tegra_i2c_hw_feature tegra186_i2c_hw = { .setup_hold_time_fastplus_mode = 0, .setup_hold_time_hs_mode = 0, .has_interface_timing_reg = true, + .enable_hs_mode_support = false, }; static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { @@ -1671,17 +1719,20 @@ static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { .thigh_fast_mode = 0x2, .tlow_fastplus_mode = 0x2, .thigh_fastplus_mode = 0x2, + .tlow_hs_mode = 0x8, + .thigh_hs_mode = 0x3, .setup_hold_time_std_mode = 0x08080808, .setup_hold_time_fast_mode = 0x02020202, .setup_hold_time_fastplus_mode = 0x02020202, .setup_hold_time_hs_mode = 0x090909, .has_interface_timing_reg = true, + .enable_hs_mode_support = true, }; static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { .has_continue_xfer_support = true, .has_per_pkt_xfer_complete_irq = true, - .clk_divisor_hs_mode = 7, + .clk_divisor_hs_mode = 9, .clk_divisor_std_mode = 0x7a, .clk_divisor_fast_mode = 0x40, .clk_divisor_fast_plus_mode = 0x14, @@ -1699,10 +1750,14 @@ static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { .thigh_fast_mode = 0x2, .tlow_fastplus_mode = 0x4, .thigh_fastplus_mode = 0x4, + .tlow_hs_mode = 0x3, + .thigh_hs_mode = 0x2, 
.setup_hold_time_std_mode = 0x08080808, .setup_hold_time_fast_mode = 0x04010101, .setup_hold_time_fastplus_mode = 0x04020202, + .setup_hold_time_hs_mode = 0x030303, .has_interface_timing_reg = true, + .enable_hs_mode_support = true, }; static const struct of_device_id tegra_i2c_of_match[] = { From 2e221cb4a1e806a508f7cb8bcc839eb475d1f3b2 Mon Sep 17 00:00:00 2001 From: Kartik Rajput Date: Tue, 18 Nov 2025 19:36:19 +0530 Subject: [PATCH 229/247] NVIDIA: VR: SAUCE: i2c: tegra: Add support for SW mutex register BugLink: https://bugs.launchpad.net/bugs/2138238 Add support for SW mutex register introduced in Tegra264 to provide an option to share the interface between multiple firmwares and/or VMs. This involves following steps: - A firmware/OS writes its unique ID to the mutex REQUEST field. - Ownership is established when reading the GRANT field returns the same ID. - If GRANT shows a different non-zero ID, the firmware/OS retries until timeout. - After completing access, it releases the mutex by writing 0. However, the hardware does not ensure any protection based on the values. The driver/firmware should honor the peer who already holds the mutex. Signed-off-by: Kartik Rajput Signed-off-by: Akhil R Reviewed-by: Jon Hunter Acked-by: Thierry Reding (backported from https://lore.kernel.org/linux-tegra/20251118140620.549-1-akhilrajeev@nvidia.com/) Signed-off-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/i2c/busses/i2c-tegra.c | 93 ++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 3cbda03633162..84f2d5f4b794c 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -137,6 +137,14 @@ #define I2C_MASTER_RESET_CNTRL 0x0a8 +#define I2C_SW_MUTEX 0x0ec +#define I2C_SW_MUTEX_REQUEST GENMASK(3, 0) +#define I2C_SW_MUTEX_GRANT GENMASK(7, 4) +#define I2C_SW_MUTEX_ID_CCPLEX 9 + +/* SW mutex acquire timeout value in microseconds. */ +#define I2C_SW_MUTEX_TIMEOUT_US (25 * USEC_PER_MSEC) + /* configuration load timeout in microseconds */ #define I2C_CONFIG_LOAD_TIMEOUT 1000000 @@ -214,6 +222,7 @@ enum msg_end_type { * @has_interface_timing_reg: Has interface timing register to program the tuned * timing settings. * @enable_hs_mode_support: Enable support for high speed (HS) mode transfers. + * @has_mutex: Has mutex register for mutual exclusion with other firmwares or VMs. 
*/ struct tegra_i2c_hw_feature { bool has_continue_xfer_support; @@ -244,6 +253,7 @@ struct tegra_i2c_hw_feature { u32 setup_hold_time_hs_mode; bool has_interface_timing_reg; bool enable_hs_mode_support; + bool has_mutex; }; /** @@ -388,6 +398,76 @@ static void i2c_readsl(struct tegra_i2c_dev *i2c_dev, void *data, readsl(i2c_dev->base + tegra_i2c_reg_addr(i2c_dev, reg), data, len); } +static bool tegra_i2c_mutex_acquired(struct tegra_i2c_dev *i2c_dev) +{ + unsigned int reg = tegra_i2c_reg_addr(i2c_dev, I2C_SW_MUTEX); + u32 val, id; + + val = readl(i2c_dev->base + reg); + id = FIELD_GET(I2C_SW_MUTEX_GRANT, val); + + return id == I2C_SW_MUTEX_ID_CCPLEX; +} + +static bool tegra_i2c_mutex_trylock(struct tegra_i2c_dev *i2c_dev) +{ + unsigned int reg = tegra_i2c_reg_addr(i2c_dev, I2C_SW_MUTEX); + u32 val, id; + + val = readl(i2c_dev->base + reg); + id = FIELD_GET(I2C_SW_MUTEX_GRANT, val); + if (id != 0 && id != I2C_SW_MUTEX_ID_CCPLEX) + return false; + + val = FIELD_PREP(I2C_SW_MUTEX_REQUEST, I2C_SW_MUTEX_ID_CCPLEX); + writel(val, i2c_dev->base + reg); + + return tegra_i2c_mutex_acquired(i2c_dev); +} + +static int tegra_i2c_mutex_lock(struct tegra_i2c_dev *i2c_dev) +{ + bool locked; + int ret; + + if (!i2c_dev->hw->has_mutex) + return 0; + + if (i2c_dev->atomic_mode) + ret = read_poll_timeout_atomic(tegra_i2c_mutex_trylock, locked, locked, + USEC_PER_MSEC, I2C_SW_MUTEX_TIMEOUT_US, + false, i2c_dev); + else + ret = read_poll_timeout(tegra_i2c_mutex_trylock, locked, locked, USEC_PER_MSEC, + I2C_SW_MUTEX_TIMEOUT_US, false, i2c_dev); + + if (ret) + dev_warn(i2c_dev->dev, "failed to acquire mutex\n"); + + return ret; +} + +static int tegra_i2c_mutex_unlock(struct tegra_i2c_dev *i2c_dev) +{ + unsigned int reg = tegra_i2c_reg_addr(i2c_dev, I2C_SW_MUTEX); + u32 val, id; + + if (!i2c_dev->hw->has_mutex) + return 0; + + val = readl(i2c_dev->base + reg); + + id = FIELD_GET(I2C_SW_MUTEX_GRANT, val); + if (id && id != I2C_SW_MUTEX_ID_CCPLEX) { + dev_warn(i2c_dev->dev, "unable to 
unlock mutex, mutex is owned by: %u\n", id); + return -EPERM; + } + + writel(0, i2c_dev->base + reg); + + return 0; +} + static void tegra_i2c_mask_irq(struct tegra_i2c_dev *i2c_dev, u32 mask) { u32 int_mask; @@ -1443,6 +1523,10 @@ static int tegra_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], return ret; } + ret = tegra_i2c_mutex_lock(i2c_dev); + if (ret) + return ret; + for (i = 0; i < num; i++) { enum msg_end_type end_type = MSG_END_STOP; @@ -1472,6 +1556,7 @@ static int tegra_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], break; } + ret = tegra_i2c_mutex_unlock(i2c_dev); pm_runtime_put(i2c_dev->dev); return ret ?: i; @@ -1551,6 +1636,7 @@ static const struct tegra_i2c_hw_feature tegra20_i2c_hw = { .setup_hold_time_hs_mode = 0x0, .has_interface_timing_reg = false, .enable_hs_mode_support = false, + .has_mutex = false, }; static const struct tegra_i2c_hw_feature tegra30_i2c_hw = { @@ -1580,6 +1666,7 @@ static const struct tegra_i2c_hw_feature tegra30_i2c_hw = { .setup_hold_time_hs_mode = 0x0, .has_interface_timing_reg = false, .enable_hs_mode_support = false, + .has_mutex = false, }; static const struct tegra_i2c_hw_feature tegra114_i2c_hw = { @@ -1609,6 +1696,7 @@ static const struct tegra_i2c_hw_feature tegra114_i2c_hw = { .setup_hold_time_hs_mode = 0x0, .has_interface_timing_reg = false, .enable_hs_mode_support = false, + .has_mutex = false, }; static const struct tegra_i2c_hw_feature tegra124_i2c_hw = { @@ -1638,6 +1726,7 @@ static const struct tegra_i2c_hw_feature tegra124_i2c_hw = { .setup_hold_time_hs_mode = 0x0, .has_interface_timing_reg = true, .enable_hs_mode_support = false, + .has_mutex = false, }; static const struct tegra_i2c_hw_feature tegra210_i2c_hw = { @@ -1667,6 +1756,7 @@ static const struct tegra_i2c_hw_feature tegra210_i2c_hw = { .setup_hold_time_hs_mode = 0, .has_interface_timing_reg = true, .enable_hs_mode_support = false, + .has_mutex = false, }; static const struct tegra_i2c_hw_feature tegra186_i2c_hw = { @@ 
-1696,6 +1786,7 @@ static const struct tegra_i2c_hw_feature tegra186_i2c_hw = { .setup_hold_time_hs_mode = 0, .has_interface_timing_reg = true, .enable_hs_mode_support = false, + .has_mutex = false, }; static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { @@ -1727,6 +1818,7 @@ static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { .setup_hold_time_hs_mode = 0x090909, .has_interface_timing_reg = true, .enable_hs_mode_support = true, + .has_mutex = false, }; static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { @@ -1758,6 +1850,7 @@ static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { .setup_hold_time_hs_mode = 0x030303, .has_interface_timing_reg = true, .enable_hs_mode_support = true, + .has_mutex = true, }; static const struct of_device_id tegra_i2c_of_match[] = { From ec94826777e669218c0538266bdfa9e26c30e185 Mon Sep 17 00:00:00 2001 From: Akhil R Date: Tue, 18 Nov 2025 19:36:20 +0530 Subject: [PATCH 230/247] NVIDIA: VR: SAUCE: i2c: tegra: Add Tegra264 support BugLink: https://bugs.launchpad.net/bugs/2138238 Add support for Tegra264 SoC which supports 17 generic I2C controllers, two of which are in the AON (always-on) partition of the SoC. In addition to the features supported by Tegra194 it also supports a SW mutex register to allow sharing the same I2C instance across multiple firmware. Signed-off-by: Akhil R Signed-off-by: Kartik Rajput Reviewed-by: Jon Hunter Acked-by: Thierry Reding (backported from https://lore.kernel.org/linux-tegra/20251118140620.549-1-akhilrajeev@nvidia.com/) Signed-off-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/i2c/busses/i2c-tegra.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 84f2d5f4b794c..d05015ef425d5 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -1853,7 +1853,40 @@ static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { .has_mutex = true, }; +static const struct tegra_i2c_hw_feature tegra264_i2c_hw = { + .has_continue_xfer_support = true, + .has_per_pkt_xfer_complete_irq = true, + .clk_divisor_hs_mode = 1, + .clk_divisor_std_mode = 0x1d, + .clk_divisor_fast_mode = 0x15, + .clk_divisor_fast_plus_mode = 0x8, + .has_config_load_reg = true, + .has_multi_master_mode = true, + .has_slcg_override_reg = true, + .has_mst_fifo = true, + .has_mst_reset = true, + .quirks = &tegra194_i2c_quirks, + .supports_bus_clear = true, + .has_apb_dma = false, + .tlow_std_mode = 0x8, + .thigh_std_mode = 0x7, + .tlow_fast_mode = 0x2, + .thigh_fast_mode = 0x2, + .tlow_fastplus_mode = 0x2, + .thigh_fastplus_mode = 0x2, + .tlow_hs_mode = 0x4, + .thigh_hs_mode = 0x2, + .setup_hold_time_std_mode = 0x08080808, + .setup_hold_time_fast_mode = 0x02020202, + .setup_hold_time_fastplus_mode = 0x02020202, + .setup_hold_time_hs_mode = 0x090909, + .has_interface_timing_reg = true, + .enable_hs_mode_support = true, + .has_mutex = true, +}; + static const struct of_device_id tegra_i2c_of_match[] = { + { .compatible = "nvidia,tegra264-i2c", .data = &tegra264_i2c_hw, }, { .compatible = "nvidia,tegra256-i2c", .data = &tegra256_i2c_hw, }, { .compatible = "nvidia,tegra194-i2c", .data = &tegra194_i2c_hw, }, { .compatible = "nvidia,tegra186-i2c", .data = &tegra186_i2c_hw, }, From fb0681f33515c1765f715b16e0de41dca99f6691 Mon Sep 17 00:00:00 2001 From: Kartik Rajput Date: Wed, 7 Jan 2026 19:56:46 +0530 Subject: 
[PATCH 231/247] NVIDIA: VR: SAUCE: i2c: tegra: Introduce tegra_i2c_variant to identify DVC and VI BugLink: https://bugs.launchpad.net/bugs/2138238 Replace the per-instance boolean flags with an enum tegra_i2c_variant since DVC and VI are mutually exclusive. Update IS_DVC/IS_VI and variant initialization accordingly. Suggested-by: Jon Hunter Signed-off-by: Kartik Rajput (backported from https://lore.kernel.org/all/20260107142649.14917-1-kkartik@nvidia.com/) Signed-off-by: Matthew R. Ochs Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/i2c/busses/i2c-tegra.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index d05015ef425d5..9a09079dcc9cf 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -171,6 +171,18 @@ enum msg_end_type { MSG_END_CONTINUE, }; +/* + * tegra_i2c_variant: Identifies the variant of I2C controller. + * @TEGRA_I2C_VARIANT_DEFAULT: Identifies the default I2C controller. + * @TEGRA_I2C_VARIANT_DVC: Identifies the DVC I2C controller, has a different register layout. + * @TEGRA_I2C_VARIANT_VI: Identifies the VI I2C controller, has a different register layout. 
+ */ +enum tegra_i2c_variant { + TEGRA_I2C_VARIANT_DEFAULT, + TEGRA_I2C_VARIANT_DVC, + TEGRA_I2C_VARIANT_VI, +}; + /** * struct tegra_i2c_hw_feature : per hardware generation features * @has_continue_xfer_support: continue-transfer supported @@ -269,8 +281,7 @@ struct tegra_i2c_hw_feature { * @base_phys: physical base address of the I2C controller * @cont_id: I2C controller ID, used for packet header * @irq: IRQ number of transfer complete interrupt - * @is_dvc: identifies the DVC I2C controller, has a different register layout - * @is_vi: identifies the VI I2C controller, has a different register layout + * @variant: This represents the I2C controller variant. * @msg_complete: transfer completion notifier * @msg_buf_remaining: size of unsent data in the message buffer * @msg_len: length of message in current transfer @@ -323,12 +334,13 @@ struct tegra_i2c_dev { bool atomic_mode; bool dma_mode; bool msg_read; - bool is_dvc; - bool is_vi; + enum tegra_i2c_variant variant; }; -#define IS_DVC(dev) (IS_ENABLED(CONFIG_ARCH_TEGRA_2x_SOC) && (dev)->is_dvc) -#define IS_VI(dev) (IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) && (dev)->is_vi) +#define IS_DVC(dev) (IS_ENABLED(CONFIG_ARCH_TEGRA_2x_SOC) && \ + (dev)->variant == TEGRA_I2C_VARIANT_DVC) +#define IS_VI(dev) (IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) && \ + (dev)->variant == TEGRA_I2C_VARIANT_VI) static void dvc_writel(struct tegra_i2c_dev *i2c_dev, u32 val, unsigned int reg) @@ -1915,13 +1927,15 @@ static void tegra_i2c_parse_dt(struct tegra_i2c_dev *i2c_dev) multi_mode = device_property_read_bool(i2c_dev->dev, "multi-master"); i2c_dev->multimaster_mode = multi_mode; + i2c_dev->variant = TEGRA_I2C_VARIANT_DEFAULT; + if (IS_ENABLED(CONFIG_ARCH_TEGRA_2x_SOC) && of_device_is_compatible(np, "nvidia,tegra20-i2c-dvc")) - i2c_dev->is_dvc = true; + i2c_dev->variant = TEGRA_I2C_VARIANT_DVC; if (IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) && of_device_is_compatible(np, "nvidia,tegra210-i2c-vi")) - i2c_dev->is_vi = true; + i2c_dev->variant = 
TEGRA_I2C_VARIANT_VI; } static int tegra_i2c_init_clocks(struct tegra_i2c_dev *i2c_dev) From baa7990e00c946522e220815c623f170eb102fa3 Mon Sep 17 00:00:00 2001 From: Kartik Rajput Date: Wed, 7 Jan 2026 19:56:47 +0530 Subject: [PATCH 232/247] NVIDIA: VR: SAUCE: i2c: tegra: Move variant to tegra_i2c_hw_feature BugLink: https://bugs.launchpad.net/bugs/2138238 Move the variant field into tegra_i2c_hw_feature and populate it for all SoCs. Add dedicated SoC data for "nvidia,tegra20-i2c-dvc" and "nvidia,tegra210-i2c-vi" compatibles. Drop the compatible-string checks from tegra_i2c_parse_dt to initialize the Tegra I2C variant. Signed-off-by: Kartik Rajput (backported from https://lore.kernel.org/all/20260107142649.14917-1-kkartik@nvidia.com/) Signed-off-by: Matthew R. Ochs Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/i2c/busses/i2c-tegra.c | 98 ++++++++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 17 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 9a09079dcc9cf..cb6455fb3ee1b 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -235,6 +235,7 @@ enum tegra_i2c_variant { * timing settings. * @enable_hs_mode_support: Enable support for high speed (HS) mode transfers. * @has_mutex: Has mutex register for mutual exclusion with other firmwares or VMs. + * @variant: This represents the I2C controller variant. 
*/ struct tegra_i2c_hw_feature { bool has_continue_xfer_support; @@ -266,6 +267,7 @@ struct tegra_i2c_hw_feature { bool has_interface_timing_reg; bool enable_hs_mode_support; bool has_mutex; + enum tegra_i2c_variant variant; }; /** @@ -281,7 +283,6 @@ struct tegra_i2c_hw_feature { * @base_phys: physical base address of the I2C controller * @cont_id: I2C controller ID, used for packet header * @irq: IRQ number of transfer complete interrupt - * @variant: This represents the I2C controller variant. * @msg_complete: transfer completion notifier * @msg_buf_remaining: size of unsent data in the message buffer * @msg_len: length of message in current transfer @@ -334,13 +335,12 @@ struct tegra_i2c_dev { bool atomic_mode; bool dma_mode; bool msg_read; - enum tegra_i2c_variant variant; }; #define IS_DVC(dev) (IS_ENABLED(CONFIG_ARCH_TEGRA_2x_SOC) && \ - (dev)->variant == TEGRA_I2C_VARIANT_DVC) + (dev)->hw->variant == TEGRA_I2C_VARIANT_DVC) #define IS_VI(dev) (IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) && \ - (dev)->variant == TEGRA_I2C_VARIANT_VI) + (dev)->hw->variant == TEGRA_I2C_VARIANT_VI) static void dvc_writel(struct tegra_i2c_dev *i2c_dev, u32 val, unsigned int reg) @@ -1649,8 +1649,42 @@ static const struct tegra_i2c_hw_feature tegra20_i2c_hw = { .has_interface_timing_reg = false, .enable_hs_mode_support = false, .has_mutex = false, + .variant = TEGRA_I2C_VARIANT_DEFAULT, }; +#if IS_ENABLED(CONFIG_ARCH_TEGRA_2x_SOC) +static const struct tegra_i2c_hw_feature tegra20_dvc_i2c_hw = { + .has_continue_xfer_support = false, + .has_per_pkt_xfer_complete_irq = false, + .clk_divisor_hs_mode = 3, + .clk_divisor_std_mode = 0, + .clk_divisor_fast_mode = 0, + .clk_divisor_fast_plus_mode = 0, + .has_config_load_reg = false, + .has_multi_master_mode = false, + .has_slcg_override_reg = false, + .has_mst_fifo = false, + .has_mst_reset = false, + .quirks = &tegra_i2c_quirks, + .supports_bus_clear = false, + .has_apb_dma = true, + .tlow_std_mode = 0x4, + .thigh_std_mode = 0x2, + 
.tlow_fast_mode = 0x4, + .thigh_fast_mode = 0x2, + .tlow_fastplus_mode = 0x4, + .thigh_fastplus_mode = 0x2, + .setup_hold_time_std_mode = 0x0, + .setup_hold_time_fast_mode = 0x0, + .setup_hold_time_fastplus_mode = 0x0, + .setup_hold_time_hs_mode = 0x0, + .has_interface_timing_reg = false, + .enable_hs_mode_support = false, + .has_mutex = false, + .variant = TEGRA_I2C_VARIANT_DVC, +}; +#endif + static const struct tegra_i2c_hw_feature tegra30_i2c_hw = { .has_continue_xfer_support = true, .has_per_pkt_xfer_complete_irq = false, @@ -1679,6 +1713,7 @@ static const struct tegra_i2c_hw_feature tegra30_i2c_hw = { .has_interface_timing_reg = false, .enable_hs_mode_support = false, .has_mutex = false, + .variant = TEGRA_I2C_VARIANT_DEFAULT, }; static const struct tegra_i2c_hw_feature tegra114_i2c_hw = { @@ -1709,6 +1744,7 @@ static const struct tegra_i2c_hw_feature tegra114_i2c_hw = { .has_interface_timing_reg = false, .enable_hs_mode_support = false, .has_mutex = false, + .variant = TEGRA_I2C_VARIANT_DEFAULT, }; static const struct tegra_i2c_hw_feature tegra124_i2c_hw = { @@ -1739,6 +1775,7 @@ static const struct tegra_i2c_hw_feature tegra124_i2c_hw = { .has_interface_timing_reg = true, .enable_hs_mode_support = false, .has_mutex = false, + .variant = TEGRA_I2C_VARIANT_DEFAULT, }; static const struct tegra_i2c_hw_feature tegra210_i2c_hw = { @@ -1769,8 +1806,42 @@ static const struct tegra_i2c_hw_feature tegra210_i2c_hw = { .has_interface_timing_reg = true, .enable_hs_mode_support = false, .has_mutex = false, + .variant = TEGRA_I2C_VARIANT_DEFAULT, }; +#if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) +static const struct tegra_i2c_hw_feature tegra210_vi_i2c_hw = { + .has_continue_xfer_support = true, + .has_per_pkt_xfer_complete_irq = true, + .clk_divisor_hs_mode = 1, + .clk_divisor_std_mode = 0x19, + .clk_divisor_fast_mode = 0x19, + .clk_divisor_fast_plus_mode = 0x10, + .has_config_load_reg = true, + .has_multi_master_mode = false, + .has_slcg_override_reg = true, + .has_mst_fifo 
= false, + .has_mst_reset = false, + .quirks = &tegra_i2c_quirks, + .supports_bus_clear = true, + .has_apb_dma = true, + .tlow_std_mode = 0x4, + .thigh_std_mode = 0x2, + .tlow_fast_mode = 0x4, + .thigh_fast_mode = 0x2, + .tlow_fastplus_mode = 0x4, + .thigh_fastplus_mode = 0x2, + .setup_hold_time_std_mode = 0, + .setup_hold_time_fast_mode = 0, + .setup_hold_time_fastplus_mode = 0, + .setup_hold_time_hs_mode = 0, + .has_interface_timing_reg = true, + .enable_hs_mode_support = false, + .has_mutex = false, + .variant = TEGRA_I2C_VARIANT_VI, +}; +#endif + static const struct tegra_i2c_hw_feature tegra186_i2c_hw = { .has_continue_xfer_support = true, .has_per_pkt_xfer_complete_irq = true, @@ -1799,6 +1870,7 @@ static const struct tegra_i2c_hw_feature tegra186_i2c_hw = { .has_interface_timing_reg = true, .enable_hs_mode_support = false, .has_mutex = false, + .variant = TEGRA_I2C_VARIANT_DEFAULT, }; static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { @@ -1831,6 +1903,7 @@ static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { .has_interface_timing_reg = true, .enable_hs_mode_support = true, .has_mutex = false, + .variant = TEGRA_I2C_VARIANT_DEFAULT, }; static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { @@ -1863,6 +1936,7 @@ static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { .has_interface_timing_reg = true, .enable_hs_mode_support = true, .has_mutex = true, + .variant = TEGRA_I2C_VARIANT_DEFAULT, }; static const struct tegra_i2c_hw_feature tegra264_i2c_hw = { @@ -1895,6 +1969,7 @@ static const struct tegra_i2c_hw_feature tegra264_i2c_hw = { .has_interface_timing_reg = true, .enable_hs_mode_support = true, .has_mutex = true, + .variant = TEGRA_I2C_VARIANT_DEFAULT, }; static const struct of_device_id tegra_i2c_of_match[] = { @@ -1903,7 +1978,7 @@ static const struct of_device_id tegra_i2c_of_match[] = { { .compatible = "nvidia,tegra194-i2c", .data = &tegra194_i2c_hw, }, { .compatible = "nvidia,tegra186-i2c", .data = &tegra186_i2c_hw, }, #if 
IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) - { .compatible = "nvidia,tegra210-i2c-vi", .data = &tegra210_i2c_hw, }, + { .compatible = "nvidia,tegra210-i2c-vi", .data = &tegra210_vi_i2c_hw, }, #endif { .compatible = "nvidia,tegra210-i2c", .data = &tegra210_i2c_hw, }, { .compatible = "nvidia,tegra124-i2c", .data = &tegra124_i2c_hw, }, @@ -1911,7 +1986,7 @@ static const struct of_device_id tegra_i2c_of_match[] = { { .compatible = "nvidia,tegra30-i2c", .data = &tegra30_i2c_hw, }, { .compatible = "nvidia,tegra20-i2c", .data = &tegra20_i2c_hw, }, #if IS_ENABLED(CONFIG_ARCH_TEGRA_2x_SOC) - { .compatible = "nvidia,tegra20-i2c-dvc", .data = &tegra20_i2c_hw, }, + { .compatible = "nvidia,tegra20-i2c-dvc", .data = &tegra20_dvc_i2c_hw, }, #endif {}, }; @@ -1919,23 +1994,12 @@ MODULE_DEVICE_TABLE(of, tegra_i2c_of_match); static void tegra_i2c_parse_dt(struct tegra_i2c_dev *i2c_dev) { - struct device_node *np = i2c_dev->dev->of_node; bool multi_mode; i2c_parse_fw_timings(i2c_dev->dev, &i2c_dev->timings, true); multi_mode = device_property_read_bool(i2c_dev->dev, "multi-master"); i2c_dev->multimaster_mode = multi_mode; - - i2c_dev->variant = TEGRA_I2C_VARIANT_DEFAULT; - - if (IS_ENABLED(CONFIG_ARCH_TEGRA_2x_SOC) && - of_device_is_compatible(np, "nvidia,tegra20-i2c-dvc")) - i2c_dev->variant = TEGRA_I2C_VARIANT_DVC; - - if (IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) && - of_device_is_compatible(np, "nvidia,tegra210-i2c-vi")) - i2c_dev->variant = TEGRA_I2C_VARIANT_VI; } static int tegra_i2c_init_clocks(struct tegra_i2c_dev *i2c_dev) From 8c067c6131252a862e13c7e729e1a54c026c64f5 Mon Sep 17 00:00:00 2001 From: Kartik Rajput Date: Wed, 7 Jan 2026 19:56:48 +0530 Subject: [PATCH 233/247] NVIDIA: VR: SAUCE: i2c: tegra: Add logic to support different register offsets BugLink: https://bugs.launchpad.net/bugs/2138238 Tegra410 use different offsets for existing I2C registers, update the logic to use appropriate offsets per SoC. 
As the registers offsets are now also defined for dvc and vi, following function are not required and they are removed: - tegra_i2c_reg_addr(): No translation required. - dvc_writel(): Replaced with i2c_writel() with DVC check. - dvc_readl(): Replaced with i2c_readl(). Signed-off-by: Kartik Rajput (backported from https://lore.kernel.org/all/20260107142649.14917-1-kkartik@nvidia.com/) Signed-off-by: Matthew R. Ochs Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/i2c/busses/i2c-tegra.c | 393 +++++++++++++++++++++------------ 1 file changed, 251 insertions(+), 142 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index cb6455fb3ee1b..821e7627e56ec 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -30,47 +30,37 @@ #define BYTES_PER_FIFO_WORD 4 -#define I2C_CNFG 0x000 #define I2C_CNFG_DEBOUNCE_CNT GENMASK(14, 12) #define I2C_CNFG_PACKET_MODE_EN BIT(10) #define I2C_CNFG_NEW_MASTER_FSM BIT(11) #define I2C_CNFG_MULTI_MASTER_MODE BIT(17) -#define I2C_STATUS 0x01c -#define I2C_SL_CNFG 0x020 + #define I2C_SL_CNFG_NACK BIT(1) #define I2C_SL_CNFG_NEWSL BIT(2) -#define I2C_SL_ADDR1 0x02c -#define I2C_SL_ADDR2 0x030 -#define I2C_TLOW_SEXT 0x034 -#define I2C_TX_FIFO 0x050 -#define I2C_RX_FIFO 0x054 -#define I2C_PACKET_TRANSFER_STATUS 0x058 -#define I2C_FIFO_CONTROL 0x05c + #define I2C_FIFO_CONTROL_TX_FLUSH BIT(1) #define I2C_FIFO_CONTROL_RX_FLUSH BIT(0) #define I2C_FIFO_CONTROL_TX_TRIG(x) (((x) - 1) << 5) #define I2C_FIFO_CONTROL_RX_TRIG(x) (((x) - 1) << 2) -#define I2C_FIFO_STATUS 0x060 + #define I2C_FIFO_STATUS_TX GENMASK(7, 4) #define I2C_FIFO_STATUS_RX GENMASK(3, 0) -#define I2C_INT_MASK 0x064 -#define I2C_INT_STATUS 0x068 + #define I2C_INT_BUS_CLR_DONE BIT(11) #define I2C_INT_PACKET_XFER_COMPLETE BIT(7) #define I2C_INT_NO_ACK BIT(3) #define I2C_INT_ARBITRATION_LOST BIT(2) #define I2C_INT_TX_FIFO_DATA_REQ 
BIT(1) #define I2C_INT_RX_FIFO_DATA_REQ BIT(0) -#define I2C_CLK_DIVISOR 0x06c + #define I2C_CLK_DIVISOR_STD_FAST_MODE GENMASK(31, 16) #define I2C_CLK_DIVISOR_HSMODE GENMASK(15, 0) -#define DVC_CTRL_REG1 0x000 #define DVC_CTRL_REG1_INTR_EN BIT(10) -#define DVC_CTRL_REG3 0x008 + #define DVC_CTRL_REG3_SW_PROG BIT(26) #define DVC_CTRL_REG3_I2C_DONE_INTR_EN BIT(30) -#define DVC_STATUS 0x00c + #define DVC_STATUS_I2C_DONE_INTR BIT(30) #define I2C_ERR_NONE 0x00 @@ -94,50 +84,38 @@ #define I2C_HEADER_CONTINUE_XFER BIT(15) #define I2C_HEADER_SLAVE_ADDR_SHIFT 1 -#define I2C_BUS_CLEAR_CNFG 0x084 #define I2C_BC_SCLK_THRESHOLD GENMASK(23, 16) #define I2C_BC_STOP_COND BIT(2) #define I2C_BC_TERMINATE BIT(1) #define I2C_BC_ENABLE BIT(0) -#define I2C_BUS_CLEAR_STATUS 0x088 + #define I2C_BC_STATUS BIT(0) -#define I2C_CONFIG_LOAD 0x08c #define I2C_MSTR_CONFIG_LOAD BIT(0) -#define I2C_CLKEN_OVERRIDE 0x090 #define I2C_MST_CORE_CLKEN_OVR BIT(0) -#define I2C_INTERFACE_TIMING_0 0x094 -#define I2C_INTERFACE_TIMING_THIGH GENMASK(13, 8) -#define I2C_INTERFACE_TIMING_TLOW GENMASK(5, 0) -#define I2C_INTERFACE_TIMING_1 0x098 -#define I2C_INTERFACE_TIMING_TBUF GENMASK(29, 24) -#define I2C_INTERFACE_TIMING_TSU_STO GENMASK(21, 16) -#define I2C_INTERFACE_TIMING_THD_STA GENMASK(13, 8) -#define I2C_INTERFACE_TIMING_TSU_STA GENMASK(5, 0) - -#define I2C_HS_INTERFACE_TIMING_0 0x09c -#define I2C_HS_INTERFACE_TIMING_THIGH GENMASK(13, 8) -#define I2C_HS_INTERFACE_TIMING_TLOW GENMASK(5, 0) -#define I2C_HS_INTERFACE_TIMING_1 0x0a0 -#define I2C_HS_INTERFACE_TIMING_TSU_STO GENMASK(21, 16) -#define I2C_HS_INTERFACE_TIMING_THD_STA GENMASK(13, 8) -#define I2C_HS_INTERFACE_TIMING_TSU_STA GENMASK(5, 0) - -#define I2C_MST_FIFO_CONTROL 0x0b4 +#define I2C_INTERFACE_TIMING_THIGH GENMASK(13, 8) +#define I2C_INTERFACE_TIMING_TLOW GENMASK(5, 0) +#define I2C_INTERFACE_TIMING_TBUF GENMASK(29, 24) +#define I2C_INTERFACE_TIMING_TSU_STO GENMASK(21, 16) +#define I2C_INTERFACE_TIMING_THD_STA GENMASK(13, 8) +#define 
I2C_INTERFACE_TIMING_TSU_STA GENMASK(5, 0) + +#define I2C_HS_INTERFACE_TIMING_THIGH GENMASK(13, 8) +#define I2C_HS_INTERFACE_TIMING_TLOW GENMASK(5, 0) +#define I2C_HS_INTERFACE_TIMING_TSU_STO GENMASK(21, 16) +#define I2C_HS_INTERFACE_TIMING_THD_STA GENMASK(13, 8) +#define I2C_HS_INTERFACE_TIMING_TSU_STA GENMASK(5, 0) + #define I2C_MST_FIFO_CONTROL_RX_FLUSH BIT(0) #define I2C_MST_FIFO_CONTROL_TX_FLUSH BIT(1) #define I2C_MST_FIFO_CONTROL_RX_TRIG(x) (((x) - 1) << 4) #define I2C_MST_FIFO_CONTROL_TX_TRIG(x) (((x) - 1) << 16) -#define I2C_MST_FIFO_STATUS 0x0b8 #define I2C_MST_FIFO_STATUS_TX GENMASK(23, 16) #define I2C_MST_FIFO_STATUS_RX GENMASK(7, 0) -#define I2C_MASTER_RESET_CNTRL 0x0a8 - -#define I2C_SW_MUTEX 0x0ec #define I2C_SW_MUTEX_REQUEST GENMASK(3, 0) #define I2C_SW_MUTEX_GRANT GENMASK(7, 4) #define I2C_SW_MUTEX_ID_CCPLEX 9 @@ -159,6 +137,149 @@ */ #define I2C_PIO_MODE_PREFERRED_LEN 32 +struct tegra_i2c_regs { + unsigned int cnfg; + unsigned int status; + unsigned int sl_cnfg; + unsigned int sl_addr1; + unsigned int sl_addr2; + unsigned int tlow_sext; + unsigned int tx_fifo; + unsigned int rx_fifo; + unsigned int packet_transfer_status; + unsigned int fifo_control; + unsigned int fifo_status; + unsigned int int_mask; + unsigned int int_status; + unsigned int clk_divisor; + unsigned int bus_clear_cnfg; + unsigned int bus_clear_status; + unsigned int config_load; + unsigned int clken_override; + unsigned int interface_timing_0; + unsigned int interface_timing_1; + unsigned int hs_interface_timing_0; + unsigned int hs_interface_timing_1; + unsigned int master_reset_cntrl; + unsigned int mst_fifo_control; + unsigned int mst_fifo_status; + unsigned int sw_mutex; + unsigned int dvc_ctrl_reg1; + unsigned int dvc_ctrl_reg3; + unsigned int dvc_status; +}; + +static const struct tegra_i2c_regs tegra20_i2c_regs = { + .cnfg = 0x000, + .status = 0x01c, + .sl_cnfg = 0x020, + .sl_addr1 = 0x02c, + .sl_addr2 = 0x030, + .tx_fifo = 0x050, + .rx_fifo = 0x054, + 
.packet_transfer_status = 0x058, + .fifo_control = 0x05c, + .fifo_status = 0x060, + .int_mask = 0x064, + .int_status = 0x068, + .clk_divisor = 0x06c, + .bus_clear_cnfg = 0x084, + .bus_clear_status = 0x088, + .config_load = 0x08c, + .clken_override = 0x090, + .interface_timing_0 = 0x094, + .interface_timing_1 = 0x098, + .hs_interface_timing_0 = 0x09c, + .hs_interface_timing_1 = 0x0a0, + .master_reset_cntrl = 0x0a8, + .mst_fifo_control = 0x0b4, + .mst_fifo_status = 0x0b8, +}; + +#if IS_ENABLED(CONFIG_ARCH_TEGRA_2x_SOC) +static const struct tegra_i2c_regs tegra20_dvc_i2c_regs = { + .dvc_ctrl_reg1 = 0x000, + .dvc_ctrl_reg3 = 0x008, + .dvc_status = 0x00c, + .cnfg = 0x040, + .status = 0x05c, + .tx_fifo = 0x060, + .rx_fifo = 0x064, + .packet_transfer_status = 0x068, + .fifo_control = 0x06c, + .fifo_status = 0x070, + .int_mask = 0x074, + .int_status = 0x078, + .clk_divisor = 0x07c, + .bus_clear_cnfg = 0x0c4, + .bus_clear_status = 0x0c8, + .config_load = 0x0cc, + .clken_override = 0x0d0, + .interface_timing_0 = 0x0d4, + .interface_timing_1 = 0x0d8, + .hs_interface_timing_0 = 0x0dc, + .hs_interface_timing_1 = 0x0e0, + .master_reset_cntrl = 0x0e8, + .mst_fifo_control = 0x0c4, + .mst_fifo_status = 0x0c8, +}; +#endif + +#if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) +static const struct tegra_i2c_regs tegra210_vi_i2c_regs = { + .cnfg = 0x0c00, + .status = 0x0c70, + .tlow_sext = 0x0cd0, + .tx_fifo = 0x0d40, + .rx_fifo = 0x0d50, + .packet_transfer_status = 0x0d60, + .fifo_control = 0x0d70, + .fifo_status = 0x0d80, + .int_mask = 0x0d90, + .int_status = 0x0da0, + .clk_divisor = 0x0db0, + .bus_clear_cnfg = 0x0e10, + .bus_clear_status = 0x0e20, + .config_load = 0x0e30, + .clken_override = 0x0e40, + .interface_timing_0 = 0x0e50, + .interface_timing_1 = 0x0e60, + .hs_interface_timing_0 = 0x0e70, + .hs_interface_timing_1 = 0x0e80, + .master_reset_cntrl = 0x0ea0, + .mst_fifo_control = 0x0ed0, + .mst_fifo_status = 0x0ee0, +}; +#endif + +static const struct tegra_i2c_regs tegra264_i2c_regs = { 
+ .cnfg = 0x000, + .status = 0x01c, + .sl_cnfg = 0x020, + .sl_addr1 = 0x02c, + .sl_addr2 = 0x030, + .tx_fifo = 0x050, + .rx_fifo = 0x054, + .packet_transfer_status = 0x058, + .fifo_control = 0x05c, + .fifo_status = 0x060, + .int_mask = 0x064, + .int_status = 0x068, + .clk_divisor = 0x06c, + .bus_clear_cnfg = 0x084, + .bus_clear_status = 0x088, + .config_load = 0x08c, + .clken_override = 0x090, + .interface_timing_0 = 0x094, + .interface_timing_1 = 0x098, + .hs_interface_timing_0 = 0x09c, + .hs_interface_timing_1 = 0x0a0, + .master_reset_cntrl = 0x0a8, + .mst_fifo_control = 0x0b4, + .mst_fifo_status = 0x0b8, + .sw_mutex = 0x0ec, +}; + /* * msg_end_type: The bus control which needs to be sent at end of transfer. * @MSG_END_STOP: Send stop pulse. @@ -236,6 +357,7 @@ enum tegra_i2c_variant { * @enable_hs_mode_support: Enable support for high speed (HS) mode transfers. * @has_mutex: Has mutex register for mutual exclusion with other firmwares or VMs. * @variant: This represents the I2C controller variant. + * @regs: Register offsets for the specific SoC variant. */ struct tegra_i2c_hw_feature { bool has_continue_xfer_support; @@ -268,6 +390,7 @@ struct tegra_i2c_hw_feature { bool enable_hs_mode_support; bool has_mutex; enum tegra_i2c_variant variant; + const struct tegra_i2c_regs *regs; }; /** @@ -342,51 +465,26 @@ struct tegra_i2c_dev { #define IS_VI(dev) (IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) && \ (dev)->hw->variant == TEGRA_I2C_VARIANT_VI) -static void dvc_writel(struct tegra_i2c_dev *i2c_dev, u32 val, - unsigned int reg) -{ - writel_relaxed(val, i2c_dev->base + reg); -} - -static u32 dvc_readl(struct tegra_i2c_dev *i2c_dev, unsigned int reg) -{ - return readl_relaxed(i2c_dev->base + reg); -} - -/* - * If necessary, i2c_writel() and i2c_readl() will offset the register - * in order to talk to the I2C block inside the DVC block. 
- */ -static u32 tegra_i2c_reg_addr(struct tegra_i2c_dev *i2c_dev, unsigned int reg) -{ - if (IS_DVC(i2c_dev)) - reg += (reg >= I2C_TX_FIFO) ? 0x10 : 0x40; - else if (IS_VI(i2c_dev)) - reg = 0xc00 + (reg << 2); - - return reg; -} - static void i2c_writel(struct tegra_i2c_dev *i2c_dev, u32 val, unsigned int reg) { - writel_relaxed(val, i2c_dev->base + tegra_i2c_reg_addr(i2c_dev, reg)); + writel_relaxed(val, i2c_dev->base + reg); /* read back register to make sure that register writes completed */ - if (reg != I2C_TX_FIFO) - readl_relaxed(i2c_dev->base + tegra_i2c_reg_addr(i2c_dev, reg)); + if (!IS_DVC(i2c_dev) && reg != i2c_dev->hw->regs->tx_fifo) + readl_relaxed(i2c_dev->base + reg); else if (IS_VI(i2c_dev)) - readl_relaxed(i2c_dev->base + tegra_i2c_reg_addr(i2c_dev, I2C_INT_STATUS)); + readl_relaxed(i2c_dev->base + i2c_dev->hw->regs->int_status); } static u32 i2c_readl(struct tegra_i2c_dev *i2c_dev, unsigned int reg) { - return readl_relaxed(i2c_dev->base + tegra_i2c_reg_addr(i2c_dev, reg)); + return readl_relaxed(i2c_dev->base + reg); } static void i2c_writesl(struct tegra_i2c_dev *i2c_dev, void *data, unsigned int reg, unsigned int len) { - writesl(i2c_dev->base + tegra_i2c_reg_addr(i2c_dev, reg), data, len); + writesl(i2c_dev->base + reg, data, len); } static void i2c_writesl_vi(struct tegra_i2c_dev *i2c_dev, void *data, @@ -407,12 +505,12 @@ static void i2c_writesl_vi(struct tegra_i2c_dev *i2c_dev, void *data, static void i2c_readsl(struct tegra_i2c_dev *i2c_dev, void *data, unsigned int reg, unsigned int len) { - readsl(i2c_dev->base + tegra_i2c_reg_addr(i2c_dev, reg), data, len); + readsl(i2c_dev->base + reg, data, len); } static bool tegra_i2c_mutex_acquired(struct tegra_i2c_dev *i2c_dev) { - unsigned int reg = tegra_i2c_reg_addr(i2c_dev, I2C_SW_MUTEX); + unsigned int reg = i2c_dev->hw->regs->sw_mutex; u32 val, id; val = readl(i2c_dev->base + reg); @@ -423,7 +521,7 @@ static bool tegra_i2c_mutex_acquired(struct tegra_i2c_dev *i2c_dev) static bool 
tegra_i2c_mutex_trylock(struct tegra_i2c_dev *i2c_dev) { - unsigned int reg = tegra_i2c_reg_addr(i2c_dev, I2C_SW_MUTEX); + unsigned int reg = i2c_dev->hw->regs->sw_mutex; u32 val, id; val = readl(i2c_dev->base + reg); @@ -461,7 +559,7 @@ static int tegra_i2c_mutex_lock(struct tegra_i2c_dev *i2c_dev) static int tegra_i2c_mutex_unlock(struct tegra_i2c_dev *i2c_dev) { - unsigned int reg = tegra_i2c_reg_addr(i2c_dev, I2C_SW_MUTEX); + unsigned int reg = i2c_dev->hw->regs->sw_mutex; u32 val, id; if (!i2c_dev->hw->has_mutex) @@ -484,16 +582,16 @@ static void tegra_i2c_mask_irq(struct tegra_i2c_dev *i2c_dev, u32 mask) { u32 int_mask; - int_mask = i2c_readl(i2c_dev, I2C_INT_MASK) & ~mask; - i2c_writel(i2c_dev, int_mask, I2C_INT_MASK); + int_mask = i2c_readl(i2c_dev, i2c_dev->hw->regs->int_mask) & ~mask; + i2c_writel(i2c_dev, int_mask, i2c_dev->hw->regs->int_mask); } static void tegra_i2c_unmask_irq(struct tegra_i2c_dev *i2c_dev, u32 mask) { u32 int_mask; - int_mask = i2c_readl(i2c_dev, I2C_INT_MASK) | mask; - i2c_writel(i2c_dev, int_mask, I2C_INT_MASK); + int_mask = i2c_readl(i2c_dev, i2c_dev->hw->regs->int_mask) | mask; + i2c_writel(i2c_dev, int_mask, i2c_dev->hw->regs->int_mask); } static void tegra_i2c_dma_complete(void *args) @@ -621,14 +719,14 @@ static void tegra_dvc_init(struct tegra_i2c_dev *i2c_dev) { u32 val; - val = dvc_readl(i2c_dev, DVC_CTRL_REG3); + val = i2c_readl(i2c_dev, i2c_dev->hw->regs->dvc_ctrl_reg3); val |= DVC_CTRL_REG3_SW_PROG; val |= DVC_CTRL_REG3_I2C_DONE_INTR_EN; - dvc_writel(i2c_dev, val, DVC_CTRL_REG3); + i2c_writel(i2c_dev, val, i2c_dev->hw->regs->dvc_ctrl_reg3); - val = dvc_readl(i2c_dev, DVC_CTRL_REG1); + val = i2c_readl(i2c_dev, i2c_dev->hw->regs->dvc_ctrl_reg1); val |= DVC_CTRL_REG1_INTR_EN; - dvc_writel(i2c_dev, val, DVC_CTRL_REG1); + i2c_writel(i2c_dev, val, i2c_dev->hw->regs->dvc_ctrl_reg1); } static void tegra_i2c_vi_init(struct tegra_i2c_dev *i2c_dev) @@ -637,34 +735,34 @@ static void tegra_i2c_vi_init(struct tegra_i2c_dev *i2c_dev) 
value = FIELD_PREP(I2C_INTERFACE_TIMING_THIGH, 2) | FIELD_PREP(I2C_INTERFACE_TIMING_TLOW, 4); - i2c_writel(i2c_dev, value, I2C_INTERFACE_TIMING_0); + i2c_writel(i2c_dev, value, i2c_dev->hw->regs->interface_timing_0); value = FIELD_PREP(I2C_INTERFACE_TIMING_TBUF, 4) | FIELD_PREP(I2C_INTERFACE_TIMING_TSU_STO, 7) | FIELD_PREP(I2C_INTERFACE_TIMING_THD_STA, 4) | FIELD_PREP(I2C_INTERFACE_TIMING_TSU_STA, 4); - i2c_writel(i2c_dev, value, I2C_INTERFACE_TIMING_1); + i2c_writel(i2c_dev, value, i2c_dev->hw->regs->interface_timing_1); value = FIELD_PREP(I2C_HS_INTERFACE_TIMING_THIGH, 3) | FIELD_PREP(I2C_HS_INTERFACE_TIMING_TLOW, 8); - i2c_writel(i2c_dev, value, I2C_HS_INTERFACE_TIMING_0); + i2c_writel(i2c_dev, value, i2c_dev->hw->regs->hs_interface_timing_0); value = FIELD_PREP(I2C_HS_INTERFACE_TIMING_TSU_STO, 11) | FIELD_PREP(I2C_HS_INTERFACE_TIMING_THD_STA, 11) | FIELD_PREP(I2C_HS_INTERFACE_TIMING_TSU_STA, 11); - i2c_writel(i2c_dev, value, I2C_HS_INTERFACE_TIMING_1); + i2c_writel(i2c_dev, value, i2c_dev->hw->regs->hs_interface_timing_1); value = FIELD_PREP(I2C_BC_SCLK_THRESHOLD, 9) | I2C_BC_STOP_COND; - i2c_writel(i2c_dev, value, I2C_BUS_CLEAR_CNFG); + i2c_writel(i2c_dev, value, i2c_dev->hw->regs->bus_clear_cnfg); - i2c_writel(i2c_dev, 0x0, I2C_TLOW_SEXT); + i2c_writel(i2c_dev, 0x0, i2c_dev->hw->regs->tlow_sext); } static int tegra_i2c_poll_register(struct tegra_i2c_dev *i2c_dev, u32 reg, u32 mask, u32 delay_us, u32 timeout_us) { - void __iomem *addr = i2c_dev->base + tegra_i2c_reg_addr(i2c_dev, reg); + void __iomem *addr = i2c_dev->base + reg; u32 val; if (!i2c_dev->atomic_mode) @@ -683,11 +781,11 @@ static int tegra_i2c_flush_fifos(struct tegra_i2c_dev *i2c_dev) if (i2c_dev->hw->has_mst_fifo) { mask = I2C_MST_FIFO_CONTROL_TX_FLUSH | I2C_MST_FIFO_CONTROL_RX_FLUSH; - offset = I2C_MST_FIFO_CONTROL; + offset = i2c_dev->hw->regs->mst_fifo_control; } else { mask = I2C_FIFO_CONTROL_TX_FLUSH | I2C_FIFO_CONTROL_RX_FLUSH; - offset = I2C_FIFO_CONTROL; + offset = 
i2c_dev->hw->regs->fifo_control; } val = i2c_readl(i2c_dev, offset); @@ -710,9 +808,9 @@ static int tegra_i2c_wait_for_config_load(struct tegra_i2c_dev *i2c_dev) if (!i2c_dev->hw->has_config_load_reg) return 0; - i2c_writel(i2c_dev, I2C_MSTR_CONFIG_LOAD, I2C_CONFIG_LOAD); + i2c_writel(i2c_dev, I2C_MSTR_CONFIG_LOAD, i2c_dev->hw->regs->config_load); - err = tegra_i2c_poll_register(i2c_dev, I2C_CONFIG_LOAD, 0xffffffff, + err = tegra_i2c_poll_register(i2c_dev, i2c_dev->hw->regs->config_load, 0xffffffff, 1000, I2C_CONFIG_LOAD_TIMEOUT); if (err) { dev_err(i2c_dev->dev, "failed to load config\n"); @@ -733,10 +831,10 @@ static int tegra_i2c_master_reset(struct tegra_i2c_dev *i2c_dev) * SW needs to wait for 2us after assertion and de-assertion of this soft * reset. */ - i2c_writel(i2c_dev, 0x1, I2C_MASTER_RESET_CNTRL); + i2c_writel(i2c_dev, 0x1, i2c_dev->hw->regs->master_reset_cntrl); fsleep(2); - i2c_writel(i2c_dev, 0x0, I2C_MASTER_RESET_CNTRL); + i2c_writel(i2c_dev, 0x0, i2c_dev->hw->regs->master_reset_cntrl); fsleep(2); return 0; @@ -778,8 +876,8 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) if (i2c_dev->hw->has_multi_master_mode) val |= I2C_CNFG_MULTI_MASTER_MODE; - i2c_writel(i2c_dev, val, I2C_CNFG); - i2c_writel(i2c_dev, 0, I2C_INT_MASK); + i2c_writel(i2c_dev, val, i2c_dev->hw->regs->cnfg); + i2c_writel(i2c_dev, 0, i2c_dev->hw->regs->int_mask); if (IS_VI(i2c_dev)) tegra_i2c_vi_init(i2c_dev); @@ -824,12 +922,12 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) clk_divisor = FIELD_PREP(I2C_CLK_DIVISOR_HSMODE, i2c_dev->hw->clk_divisor_hs_mode) | FIELD_PREP(I2C_CLK_DIVISOR_STD_FAST_MODE, non_hs_mode); - i2c_writel(i2c_dev, clk_divisor, I2C_CLK_DIVISOR); + i2c_writel(i2c_dev, clk_divisor, i2c_dev->hw->regs->clk_divisor); if (i2c_dev->hw->has_interface_timing_reg) { val = FIELD_PREP(I2C_INTERFACE_TIMING_THIGH, thigh) | FIELD_PREP(I2C_INTERFACE_TIMING_TLOW, tlow); - i2c_writel(i2c_dev, val, I2C_INTERFACE_TIMING_0); + i2c_writel(i2c_dev, val, 
i2c_dev->hw->regs->interface_timing_0); } /* @@ -837,7 +935,7 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) * Otherwise, preserve the chip default values. */ if (i2c_dev->hw->has_interface_timing_reg && tsu_thd) - i2c_writel(i2c_dev, tsu_thd, I2C_INTERFACE_TIMING_1); + i2c_writel(i2c_dev, tsu_thd, i2c_dev->hw->regs->interface_timing_1); /* Write HS mode registers. These will get used only for HS mode*/ if (i2c_dev->hw->enable_hs_mode_support) { @@ -847,8 +945,8 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) val = FIELD_PREP(I2C_HS_INTERFACE_TIMING_THIGH, thigh) | FIELD_PREP(I2C_HS_INTERFACE_TIMING_TLOW, tlow); - i2c_writel(i2c_dev, val, I2C_HS_INTERFACE_TIMING_0); - i2c_writel(i2c_dev, tsu_thd, I2C_HS_INTERFACE_TIMING_1); + i2c_writel(i2c_dev, val, i2c_dev->hw->regs->hs_interface_timing_0); + i2c_writel(i2c_dev, tsu_thd, i2c_dev->hw->regs->hs_interface_timing_1); } clk_multiplier = (tlow + thigh + 2) * (non_hs_mode + 1); @@ -861,12 +959,12 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) } if (!IS_DVC(i2c_dev) && !IS_VI(i2c_dev)) { - u32 sl_cfg = i2c_readl(i2c_dev, I2C_SL_CNFG); + u32 sl_cfg = i2c_readl(i2c_dev, i2c_dev->hw->regs->sl_cnfg); sl_cfg |= I2C_SL_CNFG_NACK | I2C_SL_CNFG_NEWSL; - i2c_writel(i2c_dev, sl_cfg, I2C_SL_CNFG); - i2c_writel(i2c_dev, 0xfc, I2C_SL_ADDR1); - i2c_writel(i2c_dev, 0x00, I2C_SL_ADDR2); + i2c_writel(i2c_dev, sl_cfg, i2c_dev->hw->regs->sl_cnfg); + i2c_writel(i2c_dev, 0xfc, i2c_dev->hw->regs->sl_addr1); + i2c_writel(i2c_dev, 0x00, i2c_dev->hw->regs->sl_addr2); } err = tegra_i2c_flush_fifos(i2c_dev); @@ -874,7 +972,7 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev) return err; if (i2c_dev->multimaster_mode && i2c_dev->hw->has_slcg_override_reg) - i2c_writel(i2c_dev, I2C_MST_CORE_CLKEN_OVR, I2C_CLKEN_OVERRIDE); + i2c_writel(i2c_dev, I2C_MST_CORE_CLKEN_OVR, i2c_dev->hw->regs->clken_override); err = tegra_i2c_wait_for_config_load(i2c_dev); if (err) @@ -895,9 +993,9 @@ static int 
tegra_i2c_disable_packet_mode(struct tegra_i2c_dev *i2c_dev) */ udelay(DIV_ROUND_UP(2 * 1000000, i2c_dev->timings.bus_freq_hz)); - cnfg = i2c_readl(i2c_dev, I2C_CNFG); + cnfg = i2c_readl(i2c_dev, i2c_dev->hw->regs->cnfg); if (cnfg & I2C_CNFG_PACKET_MODE_EN) - i2c_writel(i2c_dev, cnfg & ~I2C_CNFG_PACKET_MODE_EN, I2C_CNFG); + i2c_writel(i2c_dev, cnfg & ~I2C_CNFG_PACKET_MODE_EN, i2c_dev->hw->regs->cnfg); return tegra_i2c_wait_for_config_load(i2c_dev); } @@ -917,10 +1015,10 @@ static int tegra_i2c_empty_rx_fifo(struct tegra_i2c_dev *i2c_dev) return -EINVAL; if (i2c_dev->hw->has_mst_fifo) { - val = i2c_readl(i2c_dev, I2C_MST_FIFO_STATUS); + val = i2c_readl(i2c_dev, i2c_dev->hw->regs->mst_fifo_status); rx_fifo_avail = FIELD_GET(I2C_MST_FIFO_STATUS_RX, val); } else { - val = i2c_readl(i2c_dev, I2C_FIFO_STATUS); + val = i2c_readl(i2c_dev, i2c_dev->hw->regs->fifo_status); rx_fifo_avail = FIELD_GET(I2C_FIFO_STATUS_RX, val); } @@ -929,7 +1027,7 @@ static int tegra_i2c_empty_rx_fifo(struct tegra_i2c_dev *i2c_dev) if (words_to_transfer > rx_fifo_avail) words_to_transfer = rx_fifo_avail; - i2c_readsl(i2c_dev, buf, I2C_RX_FIFO, words_to_transfer); + i2c_readsl(i2c_dev, buf, i2c_dev->hw->regs->rx_fifo, words_to_transfer); buf += words_to_transfer * BYTES_PER_FIFO_WORD; buf_remaining -= words_to_transfer * BYTES_PER_FIFO_WORD; @@ -945,7 +1043,7 @@ static int tegra_i2c_empty_rx_fifo(struct tegra_i2c_dev *i2c_dev) * when (words_to_transfer was > rx_fifo_avail) earlier * in this function. 
*/ - val = i2c_readl(i2c_dev, I2C_RX_FIFO); + val = i2c_readl(i2c_dev, i2c_dev->hw->regs->rx_fifo); val = cpu_to_le32(val); memcpy(buf, &val, buf_remaining); buf_remaining = 0; @@ -970,10 +1068,10 @@ static int tegra_i2c_fill_tx_fifo(struct tegra_i2c_dev *i2c_dev) u32 val; if (i2c_dev->hw->has_mst_fifo) { - val = i2c_readl(i2c_dev, I2C_MST_FIFO_STATUS); + val = i2c_readl(i2c_dev, i2c_dev->hw->regs->mst_fifo_status); tx_fifo_avail = FIELD_GET(I2C_MST_FIFO_STATUS_TX, val); } else { - val = i2c_readl(i2c_dev, I2C_FIFO_STATUS); + val = i2c_readl(i2c_dev, i2c_dev->hw->regs->fifo_status); tx_fifo_avail = FIELD_GET(I2C_FIFO_STATUS_TX, val); } @@ -1004,9 +1102,9 @@ static int tegra_i2c_fill_tx_fifo(struct tegra_i2c_dev *i2c_dev) i2c_dev->msg_buf = buf + words_to_transfer * BYTES_PER_FIFO_WORD; if (IS_VI(i2c_dev)) - i2c_writesl_vi(i2c_dev, buf, I2C_TX_FIFO, words_to_transfer); + i2c_writesl_vi(i2c_dev, buf, i2c_dev->hw->regs->tx_fifo, words_to_transfer); else - i2c_writesl(i2c_dev, buf, I2C_TX_FIFO, words_to_transfer); + i2c_writesl(i2c_dev, buf, i2c_dev->hw->regs->tx_fifo, words_to_transfer); buf += words_to_transfer * BYTES_PER_FIFO_WORD; } @@ -1028,7 +1126,7 @@ static int tegra_i2c_fill_tx_fifo(struct tegra_i2c_dev *i2c_dev) i2c_dev->msg_buf_remaining = 0; i2c_dev->msg_buf = NULL; - i2c_writel(i2c_dev, val, I2C_TX_FIFO); + i2c_writel(i2c_dev, val, i2c_dev->hw->regs->tx_fifo); } return 0; @@ -1040,13 +1138,13 @@ static irqreturn_t tegra_i2c_isr(int irq, void *dev_id) struct tegra_i2c_dev *i2c_dev = dev_id; u32 status; - status = i2c_readl(i2c_dev, I2C_INT_STATUS); + status = i2c_readl(i2c_dev, i2c_dev->hw->regs->int_status); if (status == 0) { dev_warn(i2c_dev->dev, "IRQ status 0 %08x %08x %08x\n", - i2c_readl(i2c_dev, I2C_PACKET_TRANSFER_STATUS), - i2c_readl(i2c_dev, I2C_STATUS), - i2c_readl(i2c_dev, I2C_CNFG)); + i2c_readl(i2c_dev, i2c_dev->hw->regs->packet_transfer_status), + i2c_readl(i2c_dev, i2c_dev->hw->regs->status), + i2c_readl(i2c_dev, i2c_dev->hw->regs->cnfg)); 
i2c_dev->msg_err |= I2C_ERR_UNKNOWN_INTERRUPT; goto err; } @@ -1089,9 +1187,9 @@ static irqreturn_t tegra_i2c_isr(int irq, void *dev_id) } } - i2c_writel(i2c_dev, status, I2C_INT_STATUS); + i2c_writel(i2c_dev, status, i2c_dev->hw->regs->int_status); if (IS_DVC(i2c_dev)) - dvc_writel(i2c_dev, DVC_STATUS_I2C_DONE_INTR, DVC_STATUS); + i2c_writel(i2c_dev, DVC_STATUS_I2C_DONE_INTR, i2c_dev->hw->regs->dvc_status); /* * During message read XFER_COMPLETE interrupt is triggered prior to @@ -1127,10 +1225,10 @@ static irqreturn_t tegra_i2c_isr(int irq, void *dev_id) if (i2c_dev->hw->supports_bus_clear) tegra_i2c_mask_irq(i2c_dev, I2C_INT_BUS_CLR_DONE); - i2c_writel(i2c_dev, status, I2C_INT_STATUS); + i2c_writel(i2c_dev, status, i2c_dev->hw->regs->int_status); if (IS_DVC(i2c_dev)) - dvc_writel(i2c_dev, DVC_STATUS_I2C_DONE_INTR, DVC_STATUS); + i2c_writel(i2c_dev, DVC_STATUS_I2C_DONE_INTR, i2c_dev->hw->regs->dvc_status); if (i2c_dev->dma_mode) { dmaengine_terminate_async(i2c_dev->dma_chan); @@ -1150,9 +1248,9 @@ static void tegra_i2c_config_fifo_trig(struct tegra_i2c_dev *i2c_dev, int err; if (i2c_dev->hw->has_mst_fifo) - reg = I2C_MST_FIFO_CONTROL; + reg = i2c_dev->hw->regs->mst_fifo_control; else - reg = I2C_FIFO_CONTROL; + reg = i2c_dev->hw->regs->fifo_control; if (i2c_dev->dma_mode) { if (len & 0xF) @@ -1163,7 +1261,7 @@ static void tegra_i2c_config_fifo_trig(struct tegra_i2c_dev *i2c_dev, dma_burst = 8; if (i2c_dev->msg_read) { - reg_offset = tegra_i2c_reg_addr(i2c_dev, I2C_RX_FIFO); + reg_offset = i2c_dev->hw->regs->rx_fifo; slv_config.src_addr = i2c_dev->base_phys + reg_offset; slv_config.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; @@ -1174,7 +1272,7 @@ static void tegra_i2c_config_fifo_trig(struct tegra_i2c_dev *i2c_dev, else val = I2C_FIFO_CONTROL_RX_TRIG(dma_burst); } else { - reg_offset = tegra_i2c_reg_addr(i2c_dev, I2C_TX_FIFO); + reg_offset = i2c_dev->hw->regs->tx_fifo; slv_config.dst_addr = i2c_dev->base_phys + reg_offset; slv_config.dst_addr_width = 
DMA_SLAVE_BUSWIDTH_4_BYTES; @@ -1217,7 +1315,7 @@ static unsigned long tegra_i2c_poll_completion(struct tegra_i2c_dev *i2c_dev, ktime_t ktimeout = ktime_add_ms(ktime, timeout_ms); do { - u32 status = i2c_readl(i2c_dev, I2C_INT_STATUS); + u32 status = i2c_readl(i2c_dev, i2c_dev->hw->regs->int_status); if (status) tegra_i2c_isr(i2c_dev->irq, i2c_dev); @@ -1276,14 +1374,14 @@ static int tegra_i2c_issue_bus_clear(struct i2c_adapter *adap) val = FIELD_PREP(I2C_BC_SCLK_THRESHOLD, 9) | I2C_BC_STOP_COND | I2C_BC_TERMINATE; - i2c_writel(i2c_dev, val, I2C_BUS_CLEAR_CNFG); + i2c_writel(i2c_dev, val, i2c_dev->hw->regs->bus_clear_cnfg); err = tegra_i2c_wait_for_config_load(i2c_dev); if (err) return err; val |= I2C_BC_ENABLE; - i2c_writel(i2c_dev, val, I2C_BUS_CLEAR_CNFG); + i2c_writel(i2c_dev, val, i2c_dev->hw->regs->bus_clear_cnfg); tegra_i2c_unmask_irq(i2c_dev, I2C_INT_BUS_CLR_DONE); time_left = tegra_i2c_wait_completion(i2c_dev, &i2c_dev->msg_complete, 50); @@ -1294,7 +1392,7 @@ static int tegra_i2c_issue_bus_clear(struct i2c_adapter *adap) return -ETIMEDOUT; } - val = i2c_readl(i2c_dev, I2C_BUS_CLEAR_STATUS); + val = i2c_readl(i2c_dev, i2c_dev->hw->regs->bus_clear_status); if (!(val & I2C_BC_STATUS)) { dev_err(i2c_dev->dev, "un-recovered arbitration lost\n"); return -EIO; @@ -1319,14 +1417,14 @@ static void tegra_i2c_push_packet_header(struct tegra_i2c_dev *i2c_dev, if (i2c_dev->dma_mode && !i2c_dev->msg_read) *dma_buf++ = packet_header; else - i2c_writel(i2c_dev, packet_header, I2C_TX_FIFO); + i2c_writel(i2c_dev, packet_header, i2c_dev->hw->regs->tx_fifo); packet_header = i2c_dev->msg_len - 1; if (i2c_dev->dma_mode && !i2c_dev->msg_read) *dma_buf++ = packet_header; else - i2c_writel(i2c_dev, packet_header, I2C_TX_FIFO); + i2c_writel(i2c_dev, packet_header, i2c_dev->hw->regs->tx_fifo); packet_header = I2C_HEADER_IE_ENABLE; @@ -1354,7 +1452,7 @@ static void tegra_i2c_push_packet_header(struct tegra_i2c_dev *i2c_dev, if (i2c_dev->dma_mode && !i2c_dev->msg_read) *dma_buf++ = 
packet_header; else - i2c_writel(i2c_dev, packet_header, I2C_TX_FIFO); + i2c_writel(i2c_dev, packet_header, i2c_dev->hw->regs->tx_fifo); } static int tegra_i2c_error_recover(struct tegra_i2c_dev *i2c_dev, @@ -1475,7 +1573,7 @@ static int tegra_i2c_xfer_msg(struct tegra_i2c_dev *i2c_dev, tegra_i2c_unmask_irq(i2c_dev, int_mask); dev_dbg(i2c_dev->dev, "unmasked IRQ: %02x\n", - i2c_readl(i2c_dev, I2C_INT_MASK)); + i2c_readl(i2c_dev, i2c_dev->hw->regs->int_mask)); if (i2c_dev->dma_mode) { time_left = tegra_i2c_wait_completion(i2c_dev, @@ -1650,6 +1748,7 @@ static const struct tegra_i2c_hw_feature tegra20_i2c_hw = { .enable_hs_mode_support = false, .has_mutex = false, .variant = TEGRA_I2C_VARIANT_DEFAULT, + .regs = &tegra20_i2c_regs, }; #if IS_ENABLED(CONFIG_ARCH_TEGRA_2x_SOC) @@ -1682,6 +1781,7 @@ static const struct tegra_i2c_hw_feature tegra20_dvc_i2c_hw = { .enable_hs_mode_support = false, .has_mutex = false, .variant = TEGRA_I2C_VARIANT_DVC, + .regs = &tegra20_dvc_i2c_regs, }; #endif @@ -1714,6 +1814,7 @@ static const struct tegra_i2c_hw_feature tegra30_i2c_hw = { .enable_hs_mode_support = false, .has_mutex = false, .variant = TEGRA_I2C_VARIANT_DEFAULT, + .regs = &tegra20_i2c_regs, }; static const struct tegra_i2c_hw_feature tegra114_i2c_hw = { @@ -1745,6 +1846,7 @@ static const struct tegra_i2c_hw_feature tegra114_i2c_hw = { .enable_hs_mode_support = false, .has_mutex = false, .variant = TEGRA_I2C_VARIANT_DEFAULT, + .regs = &tegra20_i2c_regs, }; static const struct tegra_i2c_hw_feature tegra124_i2c_hw = { @@ -1776,6 +1878,7 @@ static const struct tegra_i2c_hw_feature tegra124_i2c_hw = { .enable_hs_mode_support = false, .has_mutex = false, .variant = TEGRA_I2C_VARIANT_DEFAULT, + .regs = &tegra20_i2c_regs, }; static const struct tegra_i2c_hw_feature tegra210_i2c_hw = { @@ -1807,6 +1910,7 @@ static const struct tegra_i2c_hw_feature tegra210_i2c_hw = { .enable_hs_mode_support = false, .has_mutex = false, .variant = TEGRA_I2C_VARIANT_DEFAULT, + .regs = 
&tegra20_i2c_regs, }; #if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) @@ -1839,6 +1943,7 @@ static const struct tegra_i2c_hw_feature tegra210_vi_i2c_hw = { .enable_hs_mode_support = false, .has_mutex = false, .variant = TEGRA_I2C_VARIANT_VI, + .regs = &tegra210_vi_i2c_regs, }; #endif @@ -1871,6 +1976,7 @@ static const struct tegra_i2c_hw_feature tegra186_i2c_hw = { .enable_hs_mode_support = false, .has_mutex = false, .variant = TEGRA_I2C_VARIANT_DEFAULT, + .regs = &tegra20_i2c_regs, }; static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { @@ -1904,6 +2010,7 @@ static const struct tegra_i2c_hw_feature tegra194_i2c_hw = { .enable_hs_mode_support = true, .has_mutex = false, .variant = TEGRA_I2C_VARIANT_DEFAULT, + .regs = &tegra20_i2c_regs, }; static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { @@ -1937,6 +2044,7 @@ static const struct tegra_i2c_hw_feature tegra256_i2c_hw = { .enable_hs_mode_support = true, .has_mutex = true, .variant = TEGRA_I2C_VARIANT_DEFAULT, + .regs = &tegra20_i2c_regs, }; static const struct tegra_i2c_hw_feature tegra264_i2c_hw = { @@ -1970,6 +2078,7 @@ static const struct tegra_i2c_hw_feature tegra264_i2c_hw = { .enable_hs_mode_support = true, .has_mutex = true, .variant = TEGRA_I2C_VARIANT_DEFAULT, + .regs = &tegra264_i2c_regs, }; static const struct of_device_id tegra_i2c_of_match[] = { From 51acc19762662c09a69f52919f784da2407f0739 Mon Sep 17 00:00:00 2001 From: Kartik Rajput Date: Wed, 7 Jan 2026 19:56:49 +0530 Subject: [PATCH 234/247] NVIDIA: VR: SAUCE: i2c: tegra: Add support for Tegra410 BugLink: https://bugs.launchpad.net/bugs/2138238 Add support for the Tegra410 SoC, which has 4 I2C controllers. The controllers are feature-equivalent to Tegra264; only the register offsets differ. Signed-off-by: Kartik Rajput (backported from https://lore.kernel.org/all/20260107142649.14917-1-kkartik@nvidia.com/) Signed-off-by: Matthew R. 
Ochs Acked-by: Carol L Soto Acked-by: Jamie Nguyen Acked-by: Nirmoy Das Acked-by: Abdur Rahman Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/i2c/busses/i2c-tegra.c | 64 ++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 821e7627e56ec..44afef32d6569 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -280,6 +280,35 @@ static const struct tegra_i2c_regs tegra264_i2c_regs = { .sw_mutex = 0x0ec, }; +static const struct tegra_i2c_regs tegra410_i2c_regs = { + .cnfg = 0x000, + .status = 0x01c, + .sl_cnfg = 0x020, + .sl_addr1 = 0x02c, + .sl_addr2 = 0x030, + .tlow_sext = 0x034, + .tx_fifo = 0x054, + .rx_fifo = 0x058, + .packet_transfer_status = 0x05c, + .fifo_control = 0x060, + .fifo_status = 0x064, + .int_mask = 0x068, + .int_status = 0x06c, + .clk_divisor = 0x070, + .bus_clear_cnfg = 0x088, + .bus_clear_status = 0x08c, + .config_load = 0x090, + .clken_override = 0x094, + .interface_timing_0 = 0x098, + .interface_timing_1 = 0x09c, + .hs_interface_timing_0 = 0x0a0, + .hs_interface_timing_1 = 0x0a4, + .master_reset_cntrl = 0x0ac, + .mst_fifo_control = 0x0b8, + .mst_fifo_status = 0x0bc, + .sw_mutex = 0x0f0, +}; + /* * msg_end_type: The bus control which needs to be sent at end of transfer. * @MSG_END_STOP: Send stop pulse. 
@@ -2081,6 +2110,40 @@ static const struct tegra_i2c_hw_feature tegra264_i2c_hw = { .regs = &tegra264_i2c_regs, }; +static const struct tegra_i2c_hw_feature tegra410_i2c_hw = { + .has_continue_xfer_support = true, + .has_per_pkt_xfer_complete_irq = true, + .clk_divisor_hs_mode = 1, + .clk_divisor_std_mode = 0x3f, + .clk_divisor_fast_mode = 0x2c, + .clk_divisor_fast_plus_mode = 0x11, + .has_config_load_reg = true, + .has_multi_master_mode = true, + .has_slcg_override_reg = true, + .has_mst_fifo = true, + .has_mst_reset = true, + .quirks = &tegra194_i2c_quirks, + .supports_bus_clear = true, + .has_apb_dma = false, + .tlow_std_mode = 0x8, + .thigh_std_mode = 0x7, + .tlow_fast_mode = 0x2, + .thigh_fast_mode = 0x2, + .tlow_fastplus_mode = 0x2, + .thigh_fastplus_mode = 0x2, + .tlow_hs_mode = 0x8, + .thigh_hs_mode = 0x6, + .setup_hold_time_std_mode = 0x08080808, + .setup_hold_time_fast_mode = 0x02020202, + .setup_hold_time_fastplus_mode = 0x02020202, + .setup_hold_time_hs_mode = 0x0b0b0b, + .has_interface_timing_reg = true, + .enable_hs_mode_support = true, + .has_mutex = true, + .variant = TEGRA_I2C_VARIANT_DEFAULT, + .regs = &tegra410_i2c_regs, +}; + static const struct of_device_id tegra_i2c_of_match[] = { { .compatible = "nvidia,tegra264-i2c", .data = &tegra264_i2c_hw, }, { .compatible = "nvidia,tegra256-i2c", .data = &tegra256_i2c_hw, }, @@ -2391,6 +2454,7 @@ static const struct acpi_device_id tegra_i2c_acpi_match[] = { {.id = "NVDA0101", .driver_data = (kernel_ulong_t)&tegra210_i2c_hw}, {.id = "NVDA0201", .driver_data = (kernel_ulong_t)&tegra186_i2c_hw}, {.id = "NVDA0301", .driver_data = (kernel_ulong_t)&tegra194_i2c_hw}, + {.id = "NVDA2017", .driver_data = (kernel_ulong_t)&tegra410_i2c_hw}, { } }; MODULE_DEVICE_TABLE(acpi, tegra_i2c_acpi_match); From 73ae998c082f3b35645b2466bdfc3d2e188af5f0 Mon Sep 17 00:00:00 2001 From: Muteeb Akram Date: Tue, 6 Jan 2026 01:58:48 +0000 Subject: [PATCH 235/247] NVIDIA: SAUCE: r8127: print GPL_CLAIM with KERN_INFO BugLink: 
https://bugs.launchpad.net/bugs/2137588 Add KERN_INFO log level for GPL_CLAIM to downgrade warning messages Signed-off-by: ChunHao Lin Signed-off-by: Revanth Kumar Uppala Signed-off-by: Muteeb Akram Acked-by: Jamie Nguyen Acked-by: Matthew R. Ochs Acked-by: Jacob Martin Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/net/ethernet/realtek/r8127/r8127_n.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8127/r8127_n.c b/drivers/net/ethernet/realtek/r8127/r8127_n.c index 496fec1320d12..9e39016ea2c80 100755 --- a/drivers/net/ethernet/realtek/r8127/r8127_n.c +++ b/drivers/net/ethernet/realtek/r8127/r8127_n.c @@ -14298,7 +14298,7 @@ rtl8127_init_one(struct pci_dev *pdev, rtl8127_sysfs_init(dev); #endif /* ENABLE_R8127_SYSFS */ - printk("%s", GPL_CLAIM); + printk(KERN_INFO "%s", GPL_CLAIM); out: return rc; From 758287b90ee73ec5fc102dec21c2968a1e2699ef Mon Sep 17 00:00:00 2001 From: Akhil R Date: Tue, 16 Dec 2025 10:39:25 +0530 Subject: [PATCH 236/247] NVIDIA: VR: SAUCE: dt-bindings: i3c: Add mipi-i3c-static-method to support SETAASA Add the 'mipi-i3c-static-method' property to specify which discovery method an I3C device supports during bus initialization. The property will be used specifically if an I3C device requires SETAASA for device discovery and address assignment. ENTDAA and SETDASA will be supported by default if this property is absent. This also removes the requirement that all I3C devices should support Dynamic Address Assignment (DAA). Hence remove statements that mentions otherwise. 
Signed-off-by: Akhil R Signed-off-by: Nirmoy Das --- .../devicetree/bindings/i3c/i3c.yaml | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/Documentation/devicetree/bindings/i3c/i3c.yaml b/Documentation/devicetree/bindings/i3c/i3c.yaml index e25fa72fd7857..7fa0cf490f209 100644 --- a/Documentation/devicetree/bindings/i3c/i3c.yaml +++ b/Documentation/devicetree/bindings/i3c/i3c.yaml @@ -24,15 +24,14 @@ properties: description: | Each I2C device connected to the bus should be described in a subnode. - All I3C devices are supposed to support DAA (Dynamic Address Assignment), - and are thus discoverable. So, by default, I3C devices do not have to be - described in the device tree. This being said, one might want to attach - extra resources to these devices, and those resources may have to be - described in the device tree, which in turn means we have to describe - I3C devices. - - Another use case for describing an I3C device in the device tree is when - this I3C device has a static I2C address and we want to assign it a + By default, I3C devices do not have to be described in the device tree. + This being said, one might want to attach extra resources to these + devices, and those resources may have to be described in the device tree, + which in turn means we have to describe I3C devices. + + I3C child would have to be described in the device tree if the I3C device + uses SETAASA for its discovery and needs to be assigned a static + address, or if it uses a static I2C address and we want to assign a specific I3C dynamic address before the DAA takes place (so that other devices on the bus can't take this dynamic address). @@ -147,6 +146,17 @@ patternProperties: through SETDASA. If static address is not present, this address is assigned through SETNEWDA after assigning a temporary address via ENTDAA. 
+ mipi-i3c-static-method: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [ 1, 2 ] + description: | + Bitmap (Bit(0) = ENTDAA, Bit(1) = SETAASA) that indicates the static + address method used for the device discovery. This property is mandatory + for I3C devices that require to use SETAASA instead of ENTDAA to assign + a static address. The static address will be the one encoded in reg[0] + if SETAASA is used. ENTDAA will remain as the default method even if + this property is not present. + required: - reg From e2c2d264192e070b7d6dcdfc9fd6ab9875d96106 Mon Sep 17 00:00:00 2001 From: Akhil R Date: Tue, 13 Jan 2026 10:53:51 +0530 Subject: [PATCH 237/247] NVIDIA: VR: SAUCE: ACPICA: Read LVR from the I2C resource descriptor ACPI 6.3 specifies byte 8 of I2C Serial Bus Connection descriptor to be used for Legacy Virtual Register (LVR) data as specified in the MIPI I3C Specification for an I2C device connected to an I3C Host Controller. Update the rsconvert_info to include this field. For I2C devices on an I2C bus, this field is Reserved and unused. 
Signed-off-by: Akhil R Signed-off-by: Nirmoy Das --- drivers/acpi/acpica/rsserial.c | 6 +++++- include/acpi/acrestyp.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpica/rsserial.c b/drivers/acpi/acpica/rsserial.c index 279bfa27da94d..1119c64795a77 100644 --- a/drivers/acpi/acpica/rsserial.c +++ b/drivers/acpi/acpica/rsserial.c @@ -315,7 +315,7 @@ struct acpi_rsconvert_info acpi_rs_convert_csi2_serial_bus[14] = { * ******************************************************************************/ -struct acpi_rsconvert_info acpi_rs_convert_i2c_serial_bus[17] = { +struct acpi_rsconvert_info acpi_rs_convert_i2c_serial_bus[18] = { {ACPI_RSC_INITGET, ACPI_RESOURCE_TYPE_SERIAL_BUS, ACPI_RS_SIZE(struct acpi_resource_i2c_serialbus), ACPI_RSC_TABLE_SIZE(acpi_rs_convert_i2c_serial_bus)}, @@ -391,6 +391,10 @@ struct acpi_rsconvert_info acpi_rs_convert_i2c_serial_bus[17] = { AML_OFFSET(i2c_serial_bus.type_specific_flags), 0}, + {ACPI_RSC_MOVE8, ACPI_RS_OFFSET(data.i2c_serial_bus.lvr), + AML_OFFSET(i2c_serial_bus.type_specific_flags) + 1, + 0}, + {ACPI_RSC_MOVE32, ACPI_RS_OFFSET(data.i2c_serial_bus.connection_speed), AML_OFFSET(i2c_serial_bus.connection_speed), 1}, diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h index 842f932e2c2bc..2be59da4fe4fb 100644 --- a/include/acpi/acrestyp.h +++ b/include/acpi/acrestyp.h @@ -421,6 +421,7 @@ ACPI_RESOURCE_SERIAL_COMMON}; struct acpi_resource_i2c_serialbus { ACPI_RESOURCE_SERIAL_COMMON u8 access_mode; + u8 lvr; u16 slave_address; u32 connection_speed; }; From 382e28992b57a21e2ad0f31309e7bd2ad2b7206b Mon Sep 17 00:00:00 2001 From: Akhil R Date: Tue, 14 Oct 2025 11:18:01 +0530 Subject: [PATCH 238/247] NVIDIA: VR: SAUCE: i3c: master: Use unified device property interface Replace all OF specific functions with unified device property functions as a pre-requisite to support both ACPI and device tree. 
Signed-off-by: Akhil R Signed-off-by: Nirmoy Das --- drivers/i3c/master.c | 103 ++++++++++++++++++++----------------- include/linux/i3c/master.h | 5 +- 2 files changed, 59 insertions(+), 49 deletions(-) diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 67a18e437f831..3764f2e4ecded 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -14,9 +14,12 @@ #include #include #include +#include #include #include #include +#include +#include #include "internals.h" @@ -466,7 +469,7 @@ static void i3c_bus_cleanup(struct i3c_bus *i3cbus) mutex_unlock(&i3c_core_lock); } -static int i3c_bus_init(struct i3c_bus *i3cbus, struct device_node *np) +static int i3c_bus_init(struct i3c_bus *i3cbus, struct fwnode_handle *fwnode) { int ret, start, end, id = -1; @@ -476,8 +479,8 @@ static int i3c_bus_init(struct i3c_bus *i3cbus, struct device_node *np) i3c_bus_init_addrslots(i3cbus); i3cbus->mode = I3C_BUS_MODE_PURE; - if (np) - id = of_alias_get_id(np, "i3c"); + if (fwnode && is_of_node(fwnode)) + id = of_alias_get_id(to_of_node(fwnode), "i3c"); mutex_lock(&i3c_core_lock); if (id >= 0) { @@ -710,7 +713,7 @@ static void i3c_masterdev_release(struct device *dev) WARN_ON(!list_empty(&bus->devs.i2c) || !list_empty(&bus->devs.i3c)); i3c_bus_cleanup(bus); - of_node_put(dev->of_node); + fwnode_handle_put(dev->fwnode); } static const struct device_type i3c_masterdev_type = { @@ -894,7 +897,7 @@ static void i3c_device_release(struct device *dev) WARN_ON(i3cdev->desc); - of_node_put(i3cdev->dev.of_node); + fwnode_handle_put(dev->fwnode); kfree(i3cdev); } @@ -1682,7 +1685,7 @@ i3c_master_register_new_i3c_devs(struct i3c_master_controller *master) desc->info.pid); if (desc->boardinfo) - desc->dev->dev.of_node = desc->boardinfo->of_node; + device_set_node(&desc->dev->dev, desc->boardinfo->fwnode); ret = device_register(&desc->dev->dev); if (ret) { @@ -2188,20 +2191,25 @@ EXPORT_SYMBOL_GPL(i3c_master_add_i3c_dev_locked); #define OF_I3C_REG1_IS_I2C_DEV BIT(31) static int 
-of_i3c_master_add_i2c_boardinfo(struct i3c_master_controller *master, - struct device_node *node, u32 *reg) +i3c_master_add_i2c_boardinfo(struct i3c_master_controller *master, + struct fwnode_handle *fwnode, u32 *reg) { struct i2c_dev_boardinfo *boardinfo; struct device *dev = &master->dev; - int ret; + struct acpi_device *adev; + int ret = -EINVAL; boardinfo = devm_kzalloc(dev, sizeof(*boardinfo), GFP_KERNEL); if (!boardinfo) return -ENOMEM; - ret = of_i2c_get_board_info(dev, node, &boardinfo->base); - if (ret) - return ret; + if (is_of_node(fwnode)) { + ret = of_i2c_get_board_info(dev, to_of_node(fwnode), &boardinfo->base); + if (ret) + return ret; + } else { + return -EINVAL; + } /* * The I3C Specification does not clearly say I2C devices with 10-bit @@ -2217,14 +2225,14 @@ of_i3c_master_add_i2c_boardinfo(struct i3c_master_controller *master, boardinfo->lvr = reg[2]; list_add_tail(&boardinfo->node, &master->boardinfo.i2c); - of_node_get(node); + fwnode_handle_get(fwnode); return 0; } static int -of_i3c_master_add_i3c_boardinfo(struct i3c_master_controller *master, - struct device_node *node, u32 *reg) +i3c_master_add_i3c_boardinfo(struct i3c_master_controller *master, + struct fwnode_handle *fwnode, u32 *reg) { struct i3c_dev_boardinfo *boardinfo; struct device *dev = &master->dev; @@ -2247,7 +2255,7 @@ of_i3c_master_add_i3c_boardinfo(struct i3c_master_controller *master, boardinfo->static_addr = reg[0]; - if (!of_property_read_u32(node, "assigned-address", &init_dyn_addr)) { + if (!fwnode_property_read_u32(fwnode, "assigned-address", &init_dyn_addr)) { if (init_dyn_addr > I3C_MAX_ADDR) return -EINVAL; @@ -2264,22 +2272,19 @@ of_i3c_master_add_i3c_boardinfo(struct i3c_master_controller *master, return -EINVAL; boardinfo->init_dyn_addr = init_dyn_addr; - boardinfo->of_node = of_node_get(node); + boardinfo->fwnode = fwnode_handle_get(fwnode); list_add_tail(&boardinfo->node, &master->boardinfo.i3c); return 0; } -static int of_i3c_master_add_dev(struct 
i3c_master_controller *master, - struct device_node *node) +static int i3c_master_add_dev(struct i3c_master_controller *master, + struct fwnode_handle *fwnode) { u32 reg[3]; int ret; - if (!master) - return -EINVAL; - - ret = of_property_read_u32_array(node, "reg", reg, ARRAY_SIZE(reg)); + ret = fwnode_property_read_u32_array(fwnode, "reg", reg, ARRAY_SIZE(reg)); if (ret) return ret; @@ -2288,30 +2293,28 @@ static int of_i3c_master_add_dev(struct i3c_master_controller *master, * dealing with an I2C device. */ if (!reg[1]) - ret = of_i3c_master_add_i2c_boardinfo(master, node, reg); + ret = i3c_master_add_i2c_boardinfo(master, fwnode, reg); else - ret = of_i3c_master_add_i3c_boardinfo(master, node, reg); + ret = i3c_master_add_i3c_boardinfo(master, fwnode, reg); return ret; } -static int of_populate_i3c_bus(struct i3c_master_controller *master) +static int fwnode_populate_i3c_bus(struct i3c_master_controller *master) { struct device *dev = &master->dev; - struct device_node *i3cbus_np = dev->of_node; - struct device_node *node; + struct fwnode_handle *fwnode = dev_fwnode(dev); + struct fwnode_handle *child; int ret; u32 val; - if (!i3cbus_np) + if (!fwnode) return 0; - for_each_available_child_of_node(i3cbus_np, node) { - ret = of_i3c_master_add_dev(master, node); - if (ret) { - of_node_put(node); + fwnode_for_each_available_child_node(fwnode, child) { + ret = i3c_master_add_dev(master, child); + if (ret) return ret; - } } /* @@ -2319,10 +2322,10 @@ static int of_populate_i3c_bus(struct i3c_master_controller *master) * on the bus are not supporting typical rates, or if the bus topology * prevents it from using max possible rate. 
*/ - if (!of_property_read_u32(i3cbus_np, "i2c-scl-hz", &val)) + if (!device_property_read_u32(dev, "i2c-scl-hz", &val)) master->bus.scl_rate.i2c = val; - if (!of_property_read_u32(i3cbus_np, "i3c-scl-hz", &val)) + if (!device_property_read_u32(dev, "i3c-scl-hz", &val)) master->bus.scl_rate.i3c = val; return 0; @@ -2371,7 +2374,7 @@ static u8 i3c_master_i2c_get_lvr(struct i2c_client *client) u8 lvr = I3C_LVR_I2C_INDEX(2) | I3C_LVR_I2C_FM_MODE; u32 reg[3]; - if (!of_property_read_u32_array(client->dev.of_node, "reg", reg, ARRAY_SIZE(reg))) + if (!fwnode_property_read_u32_array(client->dev.fwnode, "reg", reg, ARRAY_SIZE(reg))) lvr = reg[2]; return lvr; @@ -2484,7 +2487,8 @@ static int i3c_master_i2c_adapter_init(struct i3c_master_controller *master) struct i2c_adapter *adap = i3c_master_to_i2c_adapter(master); struct i2c_dev_desc *i2cdev; struct i2c_dev_boardinfo *i2cboardinfo; - int ret, id; + struct fwnode_handle *fwnode = dev_fwnode(&master->dev); + int ret, id = -1; adap->dev.parent = master->dev.parent; adap->owner = master->dev.parent->driver->owner; @@ -2495,7 +2499,9 @@ static int i3c_master_i2c_adapter_init(struct i3c_master_controller *master) adap->timeout = HZ; adap->retries = 3; - id = of_alias_get_id(master->dev.of_node, "i2c"); + if (fwnode && is_of_node(fwnode)) + id = of_alias_get_id(to_of_node(fwnode), "i2c"); + if (id >= 0) { adap->nr = id; ret = i2c_add_numbered_adapter(adap); @@ -2802,7 +2808,7 @@ int i3c_master_register(struct i3c_master_controller *master, return ret; master->dev.parent = parent; - master->dev.of_node = of_node_get(parent->of_node); + device_set_node(&master->dev, fwnode_handle_get(dev_fwnode(parent))); master->dev.bus = &i3c_bus_type; master->dev.type = &i3c_masterdev_type; master->dev.release = i3c_masterdev_release; @@ -2811,7 +2817,7 @@ int i3c_master_register(struct i3c_master_controller *master, INIT_LIST_HEAD(&master->boardinfo.i2c); INIT_LIST_HEAD(&master->boardinfo.i3c); - ret = i3c_bus_init(i3cbus, 
master->dev.of_node); + ret = i3c_bus_init(i3cbus, dev_fwnode(&master->dev)); if (ret) return ret; @@ -2822,7 +2828,7 @@ int i3c_master_register(struct i3c_master_controller *master, master->dev.coherent_dma_mask = parent->coherent_dma_mask; master->dev.dma_parms = parent->dma_parms; - ret = of_populate_i3c_bus(master); + ret = fwnode_populate_i3c_bus(master); if (ret) goto err_put_dev; @@ -3062,11 +3068,14 @@ static int __init i3c_init(void) { int res; - res = of_alias_get_highest_id("i3c"); - if (res >= 0) { - mutex_lock(&i3c_core_lock); - __i3c_first_dynamic_bus_num = res + 1; - mutex_unlock(&i3c_core_lock); + /* of_alias_get_highest_id is DT-specific, only call for DT systems */ + if (IS_ENABLED(CONFIG_OF)) { + res = of_alias_get_highest_id("i3c"); + if (res >= 0) { + mutex_lock(&i3c_core_lock); + __i3c_first_dynamic_bus_num = res + 1; + mutex_unlock(&i3c_core_lock); + } } res = bus_register_notifier(&i2c_bus_type, &i2cdev_notifier); diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 043f5c7ff398f..5b2d13b229a91 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -177,7 +177,8 @@ struct i3c_device_ibi_info { * @pid: I3C Provisioned ID exposed by the device. This is a unique identifier * that may be used to attach boardinfo to i3c_dev_desc when the device * does not have a static address - * @of_node: optional DT node in case the device has been described in the DT + * @fwnode: Firmware node (DT or ACPI) in case the device has been + * described in firmware * * This structure is used to attach board-level information to an I3C device. * Not all I3C devices connected on the bus will have a boardinfo. 
It's only @@ -189,7 +190,7 @@ struct i3c_dev_boardinfo { u8 init_dyn_addr; u8 static_addr; u64 pid; - struct device_node *of_node; + struct fwnode_handle *fwnode; }; /** From 80b9967598ad76a0090d100fad11f83e5ead479b Mon Sep 17 00:00:00 2001 From: Akhil R Date: Wed, 7 Jan 2026 17:52:13 +0530 Subject: [PATCH 239/247] NVIDIA: VR: SAUCE: i3c: master: Follow MIPI DISCO for ACPI enumeration Use _ADR and mipi-i3c-static-address properties to follow the MIPI I3C Discovery and Configuration Specification [1] and get the I2C device from the ACPI I2C resource descriptor. [1] https://www.mipi.org/specifications/disco Signed-off-by: Akhil R Signed-off-by: Nirmoy Das --- drivers/i3c/master.c | 94 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 11 deletions(-) diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 3764f2e4ecded..73071f6ea087e 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -2190,6 +2190,23 @@ EXPORT_SYMBOL_GPL(i3c_master_add_i3c_dev_locked); #define OF_I3C_REG1_IS_I2C_DEV BIT(31) +static int i3c_acpi_get_i2c_resource(struct acpi_resource *ares, void *data) +{ + struct i2c_dev_boardinfo *boardinfo = data; + struct acpi_resource_i2c_serialbus *sb; + + if (!i2c_acpi_get_i2c_resource(ares, &sb)) + return 1; + + boardinfo->base.addr = sb->slave_address; + if (sb->access_mode == ACPI_I2C_10BIT_MODE) + boardinfo->base.flags |= I2C_CLIENT_TEN; + + boardinfo->lvr = sb->lvr; + + return 1; +} + static int i3c_master_add_i2c_boardinfo(struct i3c_master_controller *master, struct fwnode_handle *fwnode, u32 *reg) @@ -2197,6 +2214,7 @@ i3c_master_add_i2c_boardinfo(struct i3c_master_controller *master, struct i2c_dev_boardinfo *boardinfo; struct device *dev = &master->dev; struct acpi_device *adev; + LIST_HEAD(resources); int ret = -EINVAL; boardinfo = devm_kzalloc(dev, sizeof(*boardinfo), GFP_KERNEL); @@ -2207,8 +2225,22 @@ i3c_master_add_i2c_boardinfo(struct i3c_master_controller *master, ret = of_i2c_get_board_info(dev, 
to_of_node(fwnode), &boardinfo->base); if (ret) return ret; - } else { - return -EINVAL; + + /* LVR is encoded in reg[2] for Device Tree. */ + boardinfo->lvr = reg[2]; + } else if (is_acpi_device_node(fwnode)) { + adev = to_acpi_device_node(fwnode); + if (!adev) + return -ENODEV; + + boardinfo->base.fwnode = acpi_fwnode_handle(adev); + ret = acpi_dev_get_resources(adev, &resources, + i3c_acpi_get_i2c_resource, boardinfo); + + if (ACPI_FAILURE(ret) || !boardinfo->base.addr) + return -EINVAL; + + acpi_dev_free_resource_list(&resources); } /* @@ -2221,9 +2253,6 @@ i3c_master_add_i2c_boardinfo(struct i3c_master_controller *master, return -EOPNOTSUPP; } - /* LVR is encoded in reg[2]. */ - boardinfo->lvr = reg[2]; - list_add_tail(&boardinfo->node, &master->boardinfo.i2c); fwnode_handle_get(fwnode); @@ -2278,8 +2307,8 @@ i3c_master_add_i3c_boardinfo(struct i3c_master_controller *master, return 0; } -static int i3c_master_add_dev(struct i3c_master_controller *master, - struct fwnode_handle *fwnode) +static int i3c_master_add_of_dev(struct i3c_master_controller *master, + struct fwnode_handle *fwnode) { u32 reg[3]; int ret; @@ -2300,19 +2329,44 @@ static int i3c_master_add_dev(struct i3c_master_controller *master, return ret; } +static int i3c_master_add_acpi_dev(struct i3c_master_controller *master, + struct fwnode_handle *fwnode) +{ + struct acpi_device *adev = to_acpi_device_node(fwnode); + u32 reg[3], adr; + + /* I2C device on an I3C bus should not have _ADR property as per spec */ + if (!acpi_has_method(adev->handle, "_ADR")) + return i3c_master_add_i2c_boardinfo(master, adev->handle, reg); + + adr = acpi_device_adr(adev); + + /* _ADR will have the 48 bit PID of the device */ + reg[1] = lower_32_bits(adr); + reg[2] = upper_32_bits(adr); + + fwnode_property_read_u32(fwnode, "mipi-i3c-static-address", ®[0]); + + return i3c_master_add_i3c_boardinfo(master, fwnode, reg); +} + static int fwnode_populate_i3c_bus(struct i3c_master_controller *master) { struct device *dev = 
&master->dev; struct fwnode_handle *fwnode = dev_fwnode(dev); struct fwnode_handle *child; - int ret; + int ret = -ENODEV; u32 val; if (!fwnode) return 0; fwnode_for_each_available_child_node(fwnode, child) { - ret = i3c_master_add_dev(master, child); + if (is_of_node(child)) + ret = i3c_master_add_of_dev(master, child); + else if (is_acpi_device_node(child)) + ret = i3c_master_add_acpi_dev(master, child); + if (ret) return ret; } @@ -2372,10 +2426,28 @@ static u8 i3c_master_i2c_get_lvr(struct i2c_client *client) { /* Fall back to no spike filters and FM bus mode. */ u8 lvr = I3C_LVR_I2C_INDEX(2) | I3C_LVR_I2C_FM_MODE; + struct i2c_dev_boardinfo boardinfo; + struct acpi_device *adev; + LIST_HEAD(resources); u32 reg[3]; - if (!fwnode_property_read_u32_array(client->dev.fwnode, "reg", reg, ARRAY_SIZE(reg))) - lvr = reg[2]; + if (is_of_node(client->dev.fwnode)) { + if (!fwnode_property_read_u32_array(client->dev.fwnode, "reg", + reg, ARRAY_SIZE(reg))) + lvr = reg[2]; + } else if (is_acpi_device_node(client->dev.fwnode)) { + adev = to_acpi_device_node(client->dev.fwnode); + if (adev) { + memset(&boardinfo, 0, sizeof(boardinfo)); + acpi_dev_get_resources(adev, &resources, + i3c_acpi_get_i2c_resource, &boardinfo); + + if (boardinfo.base.addr) + lvr = boardinfo.lvr; + + acpi_dev_free_resource_list(&resources); + } + } return lvr; } From d9d3d771fb535a0a0bd6acbb420d9c1944db00d5 Mon Sep 17 00:00:00 2001 From: Akhil R Date: Fri, 14 Nov 2025 12:36:04 +0530 Subject: [PATCH 240/247] NVIDIA: VR: SAUCE: i3c: master: Add support for devices using SETAASA Add device discovery support for SPD5118, SPD5108 and similar devices attached to DDR5 memory modules via I3C bus which uses SETAASA instead of ENTDAA. Follow the guidelines proposed by the MIPI Discovery and Configuration Specification [1] for discovering such devices. These devices differ from typical I3C devices in their initialization requirements. 
Unlike standard I3C devices that receive dynamic addresses through ENTDAA (Enter Dynamic Address Assignment), SPD devices require the SETAASA (Set All Addresses to Static Address) CCC command to map their static addresses to dynamic addresses. Additionally, it is not mandatory for these devices to implement standard CCC commands like GETPID, GETDCR, or BCR retrieval. [1] https://www.mipi.org/specifications/disco Signed-off-by: Akhil R Signed-off-by: Nirmoy Das --- drivers/i3c/master.c | 64 +++++++++++++++++++++++++++++++++++++- include/linux/i3c/ccc.h | 1 + include/linux/i3c/master.h | 20 ++++++++++++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 73071f6ea087e..34b46534deefd 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -948,6 +948,43 @@ static int i3c_master_rstdaa_locked(struct i3c_master_controller *master, return ret; } +/** + * i3c_master_setaasa_locked() - start a SETAASA procedure (Set All Addresses to Static Address) + * @master: I3C master object + * + * Send a SETAASA CCC command to set all attached I3C devices' dynamic addresses to + * their static address. + * + * This function must be called with the bus lock held in write mode. + * + * First, the SETHID CCC command is sent, followed by the SETAASA CCC. + * + * Return: 0 in case of success, a positive I3C error code if the error is + * one of the official Mx error codes, and a negative error code otherwise. 
+ */ +static int i3c_master_setaasa_locked(struct i3c_master_controller *master) +{ + struct i3c_ccc_cmd_dest dest; + struct i3c_ccc_cmd cmd; + int ret; + + /* Send SETHID CCC command */ + i3c_ccc_cmd_dest_init(&dest, I3C_BROADCAST_ADDR, 0); + i3c_ccc_cmd_init(&cmd, false, I3C_CCC_VENDOR(0, true), &dest, 1); + ret = i3c_master_send_ccc_cmd_locked(master, &cmd); + i3c_ccc_cmd_dest_cleanup(&dest); + if (ret) + return ret; + + /* Send SETAASA CCC command */ + i3c_ccc_cmd_dest_init(&dest, I3C_BROADCAST_ADDR, 0); + i3c_ccc_cmd_init(&cmd, false, I3C_CCC_SETAASA, &dest, 1); + ret = i3c_master_send_ccc_cmd_locked(master, &cmd); + i3c_ccc_cmd_dest_cleanup(&dest); + + return ret; +} + /** * i3c_master_entdaa_locked() - start a DAA (Dynamic Address Assignment) * procedure @@ -1632,6 +1669,18 @@ static int i3c_master_early_i3c_dev_add(struct i3c_master_controller *master, if (ret) goto err_free_dev; + /* + * For devices using SETAASA instead of ENTDAA, the address is statically + * assigned. Update the dynamic address to the provided static address. + * Reattaching the I3C device is not useful. It is also not mandatory + * for such devices to implement CCC commands like GETPID, GETDCR etc. + * Hence, we can return here. + */ + if (i3cdev->boardinfo->static_addr_method & BIT(1)) { + i3cdev->info.dyn_addr = i3cdev->boardinfo->static_addr; + return 0; + } + ret = i3c_master_setdasa_locked(master, i3cdev->info.static_addr, i3cdev->boardinfo->init_dyn_addr); if (ret) @@ -1927,6 +1976,12 @@ static int i3c_master_bus_init(struct i3c_master_controller *master) goto err_bus_cleanup; } + if (master->addr_method & BIT(1)) { + ret = i3c_master_setaasa_locked(master); + if (ret) + goto err_bus_cleanup; + } + /* Disable all slave events before starting DAA. 
*/ ret = i3c_master_disec_locked(master, I3C_BROADCAST_ADDR, I3C_CCC_EVENT_SIR | I3C_CCC_EVENT_MR | @@ -2266,7 +2321,7 @@ i3c_master_add_i3c_boardinfo(struct i3c_master_controller *master, struct i3c_dev_boardinfo *boardinfo; struct device *dev = &master->dev; enum i3c_addr_slot_status addrstatus; - u32 init_dyn_addr = 0; + u32 init_dyn_addr = 0, static_addr_method = 0; boardinfo = devm_kzalloc(dev, sizeof(*boardinfo), GFP_KERNEL); if (!boardinfo) @@ -2294,6 +2349,12 @@ i3c_master_add_i3c_boardinfo(struct i3c_master_controller *master, return -EINVAL; } + if (!fwnode_property_read_u32(fwnode, "mipi-i3c-static-method", &static_addr_method)) + boardinfo->static_addr_method = static_addr_method; + + /* Update the address methods required for device discovery */ + master->addr_method |= boardinfo->static_addr_method; + boardinfo->pid = ((u64)reg[1] << 32) | reg[2]; if ((boardinfo->pid & GENMASK_ULL(63, 48)) || @@ -2886,6 +2947,7 @@ int i3c_master_register(struct i3c_master_controller *master, master->dev.release = i3c_masterdev_release; master->ops = ops; master->secondary = secondary; + master->addr_method = BIT(0); INIT_LIST_HEAD(&master->boardinfo.i2c); INIT_LIST_HEAD(&master->boardinfo.i3c); diff --git a/include/linux/i3c/ccc.h b/include/linux/i3c/ccc.h index ad59a4ae60d12..a145d766ab6f7 100644 --- a/include/linux/i3c/ccc.h +++ b/include/linux/i3c/ccc.h @@ -32,6 +32,7 @@ #define I3C_CCC_DEFSLVS I3C_CCC_ID(0x8, true) #define I3C_CCC_ENTTM I3C_CCC_ID(0xb, true) #define I3C_CCC_ENTHDR(x) I3C_CCC_ID(0x20 + (x), true) +#define I3C_CCC_SETAASA I3C_CCC_ID(0x29, true) /* Unicast-only commands */ #define I3C_CCC_SETDASA I3C_CCC_ID(0x7, false) diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 5b2d13b229a91..636f79badd036 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -174,6 +174,15 @@ struct i3c_device_ibi_info { * assigned a dynamic address by the master. 
Will be used during * bus initialization to assign it a specific dynamic address * before starting DAA (Dynamic Address Assignment) + * @static_addr_method: Bitmap describing which methods of Dynamic Address + * Assignment from a Static Address are supported by this I3C Target. + * A value of 1'b1 in a bit position indicates that the Bus Controller + * supports that method, and a value of 1'b0 indicates that the Bus + * Controller does not support that method. + * Bit 0: SETDASA CCC (Direct) + * Bit 1: SETAASA CCC (Broadcast) + * Bit 2: Other CCC (vendor / standards extension) + * All other bits are reserved. * @pid: I3C Provisioned ID exposed by the device. This is a unique identifier * that may be used to attach boardinfo to i3c_dev_desc when the device * does not have a static address @@ -189,6 +198,7 @@ struct i3c_dev_boardinfo { struct list_head node; u8 init_dyn_addr; u8 static_addr; + u8 static_addr_method; u64 pid; struct fwnode_handle *fwnode; }; @@ -510,6 +520,15 @@ struct i3c_master_controller_ops { * @boardinfo.i2c: list of I2C boardinfo objects * @boardinfo: board-level information attached to devices connected on the bus * @bus: I3C bus exposed by this master + * @addr_method: Bitmap describing which methods of Address Assignment required + * to be run for discovering all the devices on the bus. + * A value of 1'b1 in a bit position indicates that the Bus Controller + * supports that method, and a value of 1'b0 indicates that the Bus + * Controller does not support that method. + * Bit 0: SETDASA CCC (Direct) + * Bit 1: SETAASA CCC (Broadcast) + * Bit 2: Other CCC (vendor / standards extension) + * All other bits are reserved. * @wq: workqueue which can be used by master * drivers if they need to postpone operations that need to take place * in a thread context. 
Typical examples are Hot Join processing which @@ -534,6 +553,7 @@ struct i3c_master_controller { struct list_head i2c; } boardinfo; struct i3c_bus bus; + u8 addr_method; struct workqueue_struct *wq; }; From 37d253b7b3c6b0e22348d93aba41e75ca81ef504 Mon Sep 17 00:00:00 2001 From: Akhil R Date: Mon, 15 Dec 2025 19:43:03 +0530 Subject: [PATCH 241/247] NVIDIA: VR: SAUCE: i3c: master: Add support for devices without PID Devices using SETAASA for address assignment are not required to have a 48-bit PID according to the I3C specification. Allow such devices to register and use the static address where PID was required. Signed-off-by: Akhil R Signed-off-by: Nirmoy Das --- drivers/i3c/master.c | 48 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 34b46534deefd..4482057d6741f 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -1730,8 +1730,17 @@ i3c_master_register_new_i3c_devs(struct i3c_master_controller *master) desc->dev->dev.type = &i3c_device_type; desc->dev->dev.bus = &i3c_bus_type; desc->dev->dev.release = i3c_device_release; - dev_set_name(&desc->dev->dev, "%d-%llx", master->bus.id, - desc->info.pid); + + /* + * For devices without PID (e.g., SETAASA devices), use + * static address for naming instead. + */ + if (desc->info.pid) + dev_set_name(&desc->dev->dev, "%d-%llx", master->bus.id, + desc->info.pid); + else + dev_set_name(&desc->dev->dev, "%d-sa%02x", master->bus.id, + desc->info.static_addr); if (desc->boardinfo) device_set_node(&desc->dev->dev, desc->boardinfo->fwnode); @@ -2063,8 +2072,18 @@ static void i3c_master_attach_boardinfo(struct i3c_dev_desc *i3cdev) struct i3c_dev_boardinfo *i3cboardinfo; list_for_each_entry(i3cboardinfo, &master->boardinfo.i3c, node) { - if (i3cdev->info.pid != i3cboardinfo->pid) - continue; + /* + * For devices without PID (e.g., SETAASA devices), match by + * static address. 
For devices with PID, match by PID. + */ + if (i3cboardinfo->pid) { + if (i3cdev->info.pid != i3cboardinfo->pid) + continue; + } else { + if (!i3cboardinfo->static_addr || + i3cdev->info.static_addr != i3cboardinfo->static_addr) + continue; + } i3cdev->boardinfo = i3cboardinfo; i3cdev->info.static_addr = i3cboardinfo->static_addr; @@ -2078,8 +2097,12 @@ i3c_master_search_i3c_dev_duplicate(struct i3c_dev_desc *refdev) struct i3c_master_controller *master = i3c_dev_get_master(refdev); struct i3c_dev_desc *i3cdev; + if (!refdev->info.pid) + return NULL; + i3c_bus_for_each_i3cdev(&master->bus, i3cdev) { - if (i3cdev != refdev && i3cdev->info.pid == refdev->info.pid) + if (i3cdev != refdev && i3cdev->info.pid && + i3cdev->info.pid == refdev->info.pid) return i3cdev; } @@ -2357,9 +2380,12 @@ i3c_master_add_i3c_boardinfo(struct i3c_master_controller *master, boardinfo->pid = ((u64)reg[1] << 32) | reg[2]; - if ((boardinfo->pid & GENMASK_ULL(63, 48)) || - I3C_PID_RND_LOWER_32BITS(boardinfo->pid)) - return -EINVAL; + /* Skip PID validation for SETAASA devices */ + if (!(boardinfo->static_addr_method & BIT(1))) { + if ((boardinfo->pid & GENMASK_ULL(63, 48)) || + I3C_PID_RND_LOWER_32BITS(boardinfo->pid)) + return -EINVAL; + } boardinfo->init_dyn_addr = init_dyn_addr; boardinfo->fwnode = fwnode_handle_get(fwnode); @@ -2379,10 +2405,10 @@ static int i3c_master_add_of_dev(struct i3c_master_controller *master, return ret; /* - * The manufacturer ID can't be 0. If reg[1] == 0 that means we're - * dealing with an I2C device. + * I3C device should have either the manufacturer ID specified or the + * address discovery method specified. Else treat it as an I2C device. 
*/ - if (!reg[1]) + if (!(reg[1] || fwnode_property_present(fwnode, "mipi-i3c-static-method"))) ret = i3c_master_add_i2c_boardinfo(master, fwnode, reg); else ret = i3c_master_add_i3c_boardinfo(master, fwnode, reg); From e71b1c2ead65cdd38af09030ceba7784293613ce Mon Sep 17 00:00:00 2001 From: Akhil R Date: Mon, 17 Nov 2025 21:34:44 +0530 Subject: [PATCH 242/247] NVIDIA: VR: SAUCE: i3c: dw-i3c-master: Add SETAASA as supported CCC Add SETAASA and SETHID to the supported CCC commands to support SPD devices which requires SETAASA for device enumeration. Signed-off-by: Akhil R Signed-off-by: Nirmoy Das --- drivers/i3c/master/dw-i3c-master.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index 9ceedf09c3b6a..4609bf34af647 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -283,6 +283,8 @@ static bool dw_i3c_master_supports_ccc_cmd(struct i3c_master_controller *m, case I3C_CCC_GETSTATUS: case I3C_CCC_GETMXDS: case I3C_CCC_GETHDRCAP: + case I3C_CCC_SETAASA: + case I3C_CCC_VENDOR(0, true): /* SETHID */ return true; default: return false; From ead269c00defc062efab8cd0562a43cb00e07479 Mon Sep 17 00:00:00 2001 From: Akhil R Date: Wed, 3 Dec 2025 11:51:32 +0530 Subject: [PATCH 243/247] NVIDIA: VR: SAUCE: i3c: dw-i3c-master: Do not use clk and reset functions for ACPI Clock and reset functions do not support ACPI based enumeration. Do not call them when using the ACPI. Also add a device property to read the clock frequency from ACPI table. 
Signed-off-by: Akhil R Signed-off-by: Nirmoy Das --- drivers/i3c/master/dw-i3c-master.c | 54 +++++++++++++++++++----------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index 4609bf34af647..e75a8ebe805b8 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -543,13 +543,20 @@ static void dw_i3c_master_set_intr_regs(struct dw_i3c_master *master) static int dw_i3c_clk_cfg(struct dw_i3c_master *master) { - unsigned long core_rate, core_period; + unsigned int core_rate, core_period; u32 scl_timing; u8 hcnt, lcnt; + int ret = 0; - core_rate = clk_get_rate(master->core_clk); - if (!core_rate) - return -EINVAL; + if (ACPI_HANDLE(master->dev)) { + ret = device_property_read_u32(master->dev, "clock-frequency", &core_rate); + if (ret) + return ret; + } else { + core_rate = clk_get_rate(master->core_clk); + if (!core_rate) + return -EINVAL; + } core_period = DIV_ROUND_UP(1000000000, core_rate); @@ -596,13 +603,20 @@ static int dw_i3c_clk_cfg(struct dw_i3c_master *master) static int dw_i2c_clk_cfg(struct dw_i3c_master *master) { - unsigned long core_rate, core_period; + unsigned int core_rate, core_period; u16 hcnt, lcnt; u32 scl_timing; + int ret = 0; - core_rate = clk_get_rate(master->core_clk); - if (!core_rate) - return -EINVAL; + if (ACPI_HANDLE(master->dev)) { + ret = device_property_read_u32(master->dev, "clock-frequency", &core_rate); + if (ret) + return ret; + } else { + core_rate = clk_get_rate(master->core_clk); + if (!core_rate) + return -EINVAL; + } core_period = DIV_ROUND_UP(1000000000, core_rate); @@ -1547,20 +1561,22 @@ int dw_i3c_common_probe(struct dw_i3c_master *master, if (IS_ERR(master->regs)) return PTR_ERR(master->regs); - master->core_clk = devm_clk_get_enabled(&pdev->dev, NULL); - if (IS_ERR(master->core_clk)) - return PTR_ERR(master->core_clk); + if (!ACPI_HANDLE(&pdev->dev)) { + master->core_clk = 
devm_clk_get_enabled(&pdev->dev, NULL); + if (IS_ERR(master->core_clk)) + return PTR_ERR(master->core_clk); - master->pclk = devm_clk_get_optional_enabled(&pdev->dev, "pclk"); - if (IS_ERR(master->pclk)) - return PTR_ERR(master->pclk); + master->pclk = devm_clk_get_optional_enabled(&pdev->dev, "pclk"); + if (IS_ERR(master->pclk)) + return PTR_ERR(master->pclk); - master->core_rst = devm_reset_control_get_optional_exclusive(&pdev->dev, - "core_rst"); - if (IS_ERR(master->core_rst)) - return PTR_ERR(master->core_rst); + master->core_rst = devm_reset_control_get_optional_exclusive(&pdev->dev, + "core_rst"); + if (IS_ERR(master->core_rst)) + return PTR_ERR(master->core_rst); - reset_control_deassert(master->core_rst); + reset_control_deassert(master->core_rst); + } spin_lock_init(&master->xferqueue.lock); INIT_LIST_HEAD(&master->xferqueue.list); From ee34538a29efda941f48487e241486fb3eba98f8 Mon Sep 17 00:00:00 2001 From: Akhil R Date: Tue, 13 Jan 2026 15:36:39 +0530 Subject: [PATCH 244/247] NVIDIA: VR: SAUCE: i3c: dw-i3c-master: Add ACPI ID for Tegra410 Update variable names to generic names and add Tegra410 ACPI ID to support the I3C controller in Tegra410. 
Signed-off-by: Akhil R Signed-off-by: Nirmoy Das --- drivers/i3c/master/dw-i3c-master.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index e75a8ebe805b8..a13ade81cae8d 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -1783,11 +1783,12 @@ static const struct of_device_id dw_i3c_master_of_match[] = { }; MODULE_DEVICE_TABLE(of, dw_i3c_master_of_match); -static const struct acpi_device_id amd_i3c_device_match[] = { +static const struct acpi_device_id dw_i3c_master_acpi_match[] = { { "AMDI0015", AMD_I3C_OD_PP_TIMING }, + { "NVDA2018" }, { } }; -MODULE_DEVICE_TABLE(acpi, amd_i3c_device_match); +MODULE_DEVICE_TABLE(acpi, dw_i3c_master_acpi_match); static struct platform_driver dw_i3c_driver = { .probe = dw_i3c_probe, @@ -1796,7 +1797,7 @@ static struct platform_driver dw_i3c_driver = { .driver = { .name = "dw-i3c-master", .of_match_table = dw_i3c_master_of_match, - .acpi_match_table = amd_i3c_device_match, + .acpi_match_table = dw_i3c_master_acpi_match, .pm = &dw_i3c_pm_ops, }, }; From 0156d88358c97dbc927fce0672995b85a083ec5e Mon Sep 17 00:00:00 2001 From: Akhil R Date: Wed, 3 Dec 2025 11:53:52 +0530 Subject: [PATCH 245/247] NVIDIA: VR: SAUCE: hwmon: spd5118: Add I3C support Add support for I3C based communication to SPD5118 devices. 
Signed-off-by: Akhil R Signed-off-by: Nirmoy Das --- drivers/hwmon/Kconfig | 12 +++++--- drivers/hwmon/spd5118.c | 63 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 70 insertions(+), 5 deletions(-) diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig index 597538bfe2f83..2ce65c46c0cb3 100644 --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig @@ -2288,12 +2288,16 @@ config SENSORS_INA3221 config SENSORS_SPD5118 tristate "SPD5118 Compliant Temperature Sensors" - depends on I2C - select REGMAP_I2C + depends on I2C || I3C + select REGMAP_I2C if I2C + select REGMAP_I3C if I3C help If you say yes here you get support for SPD5118 (JEDEC JESD300) - compliant temperature sensors. Such sensors are found on DDR5 memory - modules. + compliant temperature sensors using I2C or I3C bus interface. + Such sensors are found on DDR5 memory modules. + + This driver supports both I2C and I3C interfaces. I3C devices + use 16-bit register addressing mode as specified in JESD300-5B. This driver can also be built as a module. If so, the module will be called spd5118. diff --git a/drivers/hwmon/spd5118.c b/drivers/hwmon/spd5118.c index 5da44571b6a0c..531ef3ce32a70 100644 --- a/drivers/hwmon/spd5118.c +++ b/drivers/hwmon/spd5118.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -770,7 +771,67 @@ static struct i2c_driver spd5118_i2c_driver = { .address_list = IS_ENABLED(CONFIG_SENSORS_SPD5118_DETECT) ? normal_i2c : NULL, }; -module_i2c_driver(spd5118_i2c_driver); +/* I3C */ + +static int spd5118_i3c_probe(struct i3c_device *i3cdev) +{ + struct device *dev = i3cdev_to_dev(i3cdev); + struct regmap *regmap; + unsigned int regval; + int err; + + /* + * I3C devices use 16-bit register addressing. + * Per SPD5118 specification section 7.2, I3C interface uses + * 16-bit register address mode. 
+ */ + regmap = devm_regmap_init_i3c(i3cdev, &spd5118_regmap8_config); + if (IS_ERR(regmap)) + return dev_err_probe(dev, PTR_ERR(regmap), "regmap init failed\n"); + + /* Verify this is a SPD5118 device */ + err = regmap_read(regmap, SPD5118_REG_TYPE, ®val); + if (err) + return err; + + /* Check device type - should be 0x51 in first register */ + if (regval != 0x51) + return -ENODEV; + + err = regmap_read(regmap, SPD5118_REG_TYPE + 1, ®val); + if (err) + return err; + + /* Second register should be 0x18 (combined: 0x5118) */ + if (regval != 0x18) + return -ENODEV; + + /* I3C devices always use 16-bit addressing */ + return spd5118_common_probe(dev, regmap, true); +} + +/* + * SPD5118 does not have a manufacturer/part ID defined in the + * JESD specification for I3C. We use a generic match for now. + * Devices should be instantiated via device tree or ACPI. + */ +static const struct i3c_device_id spd5118_i3c_ids[] = { + I3C_CLASS(I3C_DCR_GENERIC_DEVICE, NULL), + { } +}; +MODULE_DEVICE_TABLE(i3c, spd5118_i3c_ids); + +static struct i3c_driver spd5118_i3c_driver = { + .driver = { + .name = "spd5118_i3c", + .of_match_table = spd5118_of_ids, + .pm = pm_sleep_ptr(&spd5118_pm_ops), + }, + .probe = spd5118_i3c_probe, + .id_table = spd5118_i3c_ids, +}; + +module_i3c_i2c_driver(spd5118_i3c_driver, &spd5118_i2c_driver) MODULE_AUTHOR("RenĂ© Rebe "); MODULE_AUTHOR("Guenter Roeck "); From 13aa65472b6e9c4943273e35707bfa122b5ddabf Mon Sep 17 00:00:00 2001 From: Akhil R Date: Wed, 3 Dec 2025 13:52:25 +0530 Subject: [PATCH 246/247] NVIDIA: VR: SAUCE: arm64: defconfig: Enable I3C and SPD5118 hwmon Add I3C and SPD5118 device as module to the defconfig Signed-off-by: Akhil R Signed-off-by: Nirmoy Das --- arch/arm64/configs/defconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 4d8c720c881f4..279db0f8965c0 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -555,6 +555,8 @@ 
CONFIG_I2C_TEGRA=y CONFIG_I2C_UNIPHIER_F=y CONFIG_I2C_RCAR=y CONFIG_I2C_CROS_EC_TUNNEL=y +CONFIG_I3C=m +CONFIG_DW_I3C_MASTER=m CONFIG_SPI=y CONFIG_SPI_ARMADA_3700=y CONFIG_SPI_BCM2835=m @@ -713,6 +715,7 @@ CONFIG_SENSORS_RASPBERRYPI_HWMON=m CONFIG_SENSORS_SL28CPLD=m CONFIG_SENSORS_INA2XX=m CONFIG_SENSORS_INA3221=m +CONFIG_SENSORS_SPD5118=m CONFIG_SENSORS_TMP102=m CONFIG_MISC_RP1=m CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y From 2574018ba8e8dd75983bae3a4fea7ac550332fa5 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Tue, 27 Jan 2026 05:36:17 -0800 Subject: [PATCH 247/247] NVIDIA: VR: SAUCE: [Config] Add I3C and SPD5118 config for Tegra410 Signed-off-by: Nirmoy Das --- debian.nvidia-6.17/config/annotations | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/debian.nvidia-6.17/config/annotations b/debian.nvidia-6.17/config/annotations index e3f884ae0cafb..0dcf99914905c 100644 --- a/debian.nvidia-6.17/config/annotations +++ b/debian.nvidia-6.17/config/annotations @@ -111,6 +111,9 @@ CONFIG_DRM_NOUVEAU_GSP_DEFAULT note<'Disable nouveau for NVIDIA CONFIG_DRM_NOUVEAU_SVM policy<{'amd64': '-', 'arm64': '-'}> CONFIG_DRM_NOUVEAU_SVM note<'Disable nouveau for NVIDIA kernels'> +CONFIG_DW_I3C_MASTER policy<{'amd64': 'n', 'arm64': 'm'}> +CONFIG_DW_I3C_MASTER note<'Enable DesignWare I3C master controller for Tegra410'> + CONFIG_EFI_CAPSULE_LOADER policy<{'amd64': 'm', 'arm64': 'y'}> CONFIG_EFI_CAPSULE_LOADER note<'LP: #2067111'> @@ -120,6 +123,9 @@ CONFIG_ETM4X_IMPDEF_FEATURE note<'Required for Grace enablem CONFIG_GPIO_AAEON policy<{'amd64': '-'}> CONFIG_GPIO_AAEON note<'Disable all Ubuntu ODM drivers'> +CONFIG_I3C policy<{'amd64': 'n', 'arm64': 'm'}> +CONFIG_I3C note<'Enable I3C bus support for Tegra410 and SPD5118 temperature sensors'> + CONFIG_IOMMU_DEFAULT_DMA_LAZY policy<{'amd64': 'y', 'arm64': 'n'}> CONFIG_IOMMU_DEFAULT_DMA_LAZY note<'On Nvidia CPU passthrough mode is recommend so set passthrough mode as default for better performance'> @@ -183,6 +189,9 @@ 
CONFIG_SAMPLE_CORESIGHT_SYSCFG note<'Required for Grace enablem CONFIG_SENSORS_AAEON policy<{'amd64': '-'}> CONFIG_SENSORS_AAEON note<'Disable all Ubuntu ODM drivers'> +CONFIG_SENSORS_SPD5118 policy<{'amd64': 'n', 'arm64': 'm'}> +CONFIG_SENSORS_SPD5118 note<'Enable SPD5118 temperature sensor support for DDR5 memory modules'> + CONFIG_SND_HDA_ACPI policy<{'amd64': 'n', 'arm64': 'm'}> CONFIG_SND_HDA_ACPI note<'Add support for ACPI-enumerated HDA'>