From e496f9a0bbc2892076af7c8e848f86ce5a031c80 Mon Sep 17 00:00:00 2001 From: Jie Zhan Date: Tue, 22 Aug 2023 20:48:37 +0800 Subject: [PATCH 01/63] cpufreq: Support per-policy performance boost mainline inclusion from mainline-v6.6-rc1 commit 218a06a79d9a98a96ef46bb003d4d8adb0962056 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=218a06a79d9a98a96ef46bb003d4d8adb0962056 ---------------------------------------------------------------------- The boost control currently applies to the whole system. However, users may prefer to boost a subset of cores in order to provide prioritized performance to workloads running on the boosted cores. Enable per-policy boost by adding a 'boost' sysfs interface under each policy path. This can be found at: /sys/devices/system/cpu/cpufreq/policy<*>/boost Same to the global boost switch, writing 1/0 to the per-policy 'boost' enables/disables boost on a cpufreq policy respectively. The user view of global and per-policy boost controls should be: 1. Enabling global boost initially enables boost on all policies, and per-policy boost can then be enabled or disabled individually, given that the platform does support so. 2. Disabling global boost makes the per-policy boost interface illegal. Signed-off-by: Jie Zhan Reviewed-by: Wei Xu Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 43 +++++++++++++++++++++++++++++++++++++++ include/linux/cpufreq.h | 3 +++ 2 files changed, 46 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 950e00dfd75d..a32652b395c1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -83,6 +83,7 @@ static void cpufreq_governor_limits(struct cpufreq_policy *policy); static int cpufreq_set_policy(struct cpufreq_policy *policy, struct cpufreq_governor *new_gov, unsigned int new_pol); +static bool cpufreq_boost_supported(void); /* * Two notifier lists: the "policy" list is involved in the @@ -617,6 +618,40 @@ static ssize_t store_boost(struct kobject *kobj, struct kobj_attribute *attr, } define_one_global_rw(boost); +static ssize_t show_local_boost(struct cpufreq_policy *policy, char *buf) +{ + return sysfs_emit(buf, "%d\n", policy->boost_enabled); +} + +static ssize_t store_local_boost(struct cpufreq_policy *policy, + const char *buf, size_t count) +{ + int ret, enable; + + ret = kstrtoint(buf, 10, &enable); + if (ret || enable < 0 || enable > 1) + return -EINVAL; + + if (!cpufreq_driver->boost_enabled) + return -EINVAL; + + if (policy->boost_enabled == enable) + return count; + + cpus_read_lock(); + ret = cpufreq_driver->set_boost(policy, enable); + cpus_read_unlock(); + + if (ret) + return ret; + + policy->boost_enabled = enable; + + return count; +} + +static struct freq_attr local_boost = __ATTR(boost, 0644, show_local_boost, store_local_boost); + static struct cpufreq_governor *find_governor(const char *str_governor) { struct cpufreq_governor *t; @@ -1058,6 +1093,12 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) return ret; } + if (cpufreq_boost_supported()) { + ret = sysfs_create_file(&policy->kobj, &local_boost.attr); + if (ret) + return ret; + } + return 0; } @@ -2645,6 +2686,8 @@ int cpufreq_boost_trigger_state(int state) ret = cpufreq_driver->set_boost(policy, state); if (ret) goto err_reset_state; + + policy->boost_enabled = state; } put_online_cpus(); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index af9e116fa59d..d4a0374617aa 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -134,6 +134,9 @@ struct cpufreq_policy { */ bool dvfs_possible_from_any_cpu; + /* Per policy boost enabled flag. */ + bool boost_enabled; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ unsigned int cached_target_freq; unsigned int cached_resolved_idx; -- Gitee From e99fccc6c0160b66ac9d1bf800eeb708d3c155eb Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Tue, 11 Mar 2025 11:24:03 +0800 Subject: [PATCH 02/63] cpufreq: Fix re-boost issue after hotplugging a CPU mainline inclusion from mainline-v6.7-rc5 commit 1608f0230510489d74a2e24e47054233b7e4678a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1608f0230510489d74a2e24e47054233b7e4678a --------------------------------------------------------------------- It turns out that CPUX will stay on the base frequency after performing these operations: 1. boost all CPUs: echo 1 > /sys/devices/system/cpu/cpufreq/boost 2. offline one CPU: echo 0 > /sys/devices/system/cpu/cpuX/online 3. deboost all CPUs: echo 0 > /sys/devices/system/cpu/cpufreq/boost 4. online CPUX: echo 1 > /sys/devices/system/cpu/cpuX/online 5. boost all CPUs again: echo 1 > /sys/devices/system/cpu/cpufreq/boost This is because max_freq_req of the policy is not updated during the online process, and the value of max_freq_req before the last offline is retained. When the CPU is boosted again, freq_qos_update_request() will do nothing because the old value is the same as the new one. This causes the CPU to stay at the base frequency. Updating max_freq_req in cpufreq_online() will solve this problem. Signed-off-by: Lifeng Zheng Acked-by: Viresh Kumar Link: https://patch.msgid.link/20250117101457.1530653-2-zhenglifeng1@huawei.com [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Xinghai Cen Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index a32652b395c1..761289e31531 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1473,6 +1473,10 @@ static int cpufreq_online(unsigned int cpu) blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_CREATE_POLICY, policy); + } else { + ret = freq_qos_update_request(policy->max_freq_req, policy->max); + if (ret < 0) + goto out_destroy_policy; } if (cpufreq_driver->get && has_target()) { -- Gitee From ecdfe460111fb0ef05e00ad483482cbc65d7a030 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Tue, 11 Mar 2025 11:24:04 +0800 Subject: [PATCH 03/63] cpufreq: Introduce a more generic way to set default per-policy boost flag mainline inclusion from mainline-v6.7-rc5 commit dd016f379ebc2d43a9405742d1a6066577509bd7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dd016f379ebc2d43a9405742d1a6066577509bd7 --------------------------------------------------------------------- In cpufreq_online() of cpufreq.c, the per-policy boost flag is already set to mirror the cpufreq_driver boost during init but using freq_table to judge if the policy has boost frequency. There are two drawbacks to this approach: 1. It doesn't work for the cpufreq drivers that do not use a frequency table. For now, acpi-cpufreq and amd-pstate have to enable boost in policy initialization. And cppc_cpufreq never set policy to boost when going online no matter what the cpufreq_driver boost flag is. 2. If the CPU goes offline when cpufreq_driver boost is enabled and then goes online when cpufreq_driver boost is disabled, the per-policy boost flag will incorrectly remain true. Running set_boost at the end of the online process is a more generic way for all cpufreq drivers. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250117101457.1530653-3-zhenglifeng1@huawei.com Acked-by: Viresh Kumar [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Xinghai Cen Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 761289e31531..d06eed6cb05f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1556,6 +1556,18 @@ static int cpufreq_online(unsigned int cpu) if (cpufreq_thermal_control_enabled(cpufreq_driver)) policy->cdev = of_cpufreq_cooling_register(policy); + /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ + if (policy->boost_enabled != cpufreq_boost_enabled()) { + policy->boost_enabled = cpufreq_boost_enabled(); + ret = cpufreq_driver->set_boost(policy, policy->boost_enabled); + if (ret) { + /* If the set_boost fails, the online operation is not affected */ + pr_info("%s: CPU%d: Cannot %s BOOST\n", __func__, policy->cpu, + policy->boost_enabled ? "enable" : "disable"); + policy->boost_enabled = !policy->boost_enabled; + } + } + pr_debug("initialization complete\n"); return 0; -- Gitee From 82394386dd92e57e66993434681b7db596b75726 Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Thu, 5 Nov 2020 12:55:19 +0000 Subject: [PATCH 04/63] cppc_cpufreq: simplify use of performance capabilities mainline inclusion from mainline-v5.11-rc1 commit bb025fb6c276ac874b718b9d884b7ee1099b2c22 category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=bb025fb6c276ac874b718b9d884b7ee1099b2c22 ---------------------------------------------------------------------- The CPPC performance capabilities are used significantly throughout the driver. Simplify the use of them by introducing a local pointer "caps" to point to cpu_data->perf_caps, in functions that access performance capabilities often. Signed-off-by: Ionela Voinescu Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 40 +++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 01e7cbd910b9..15285b65fd3a 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -201,15 +201,16 @@ static int cppc_verify_policy(struct cpufreq_policy_data *policy) static void cppc_cpufreq_stop_cpu(struct cpufreq_policy *policy) { struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_perf_caps *caps = &cpu_data->perf_caps; unsigned int cpu = policy->cpu; int ret; - cpu_data->perf_ctrls.desired_perf = cpu_data->perf_caps.lowest_perf; + cpu_data->perf_ctrls.desired_perf = caps->lowest_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); if (ret) pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", - cpu_data->perf_caps.lowest_perf, cpu, ret); + caps->lowest_perf, cpu, ret); } /* @@ -258,11 +259,12 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) { struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_perf_caps *caps = &cpu_data->perf_caps; unsigned int cpu = policy->cpu; int ret = 0; cpu_data->cpu = cpu; - ret = cppc_get_perf_caps(policy->cpu, &cpu_data->perf_caps); + ret = cppc_get_perf_caps(cpu, caps); if (ret) { pr_debug("Err reading CPU%d perf capabilities. ret:%d\n", @@ -271,23 +273,27 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) } /* Convert the lowest and nominal freq from MHz to KHz */ - cpu_data->perf_caps.lowest_freq *= 1000; - cpu_data->perf_caps.nominal_freq *= 1000; + caps->lowest_freq *= 1000; + caps->nominal_freq *= 1000; /* * Set min to lowest nonlinear perf to avoid any efficiency penalty (see * Section 8.4.7.1.1.5 of ACPI 6.1 spec) */ - policy->min = cppc_cpufreq_perf_to_khz(cpu_data, cpu_data->perf_caps.lowest_nonlinear_perf); - policy->max = cppc_cpufreq_perf_to_khz(cpu_data, cpu_data->perf_caps.nominal_perf); + policy->min = cppc_cpufreq_perf_to_khz(cpu_data, + caps->lowest_nonlinear_perf); + policy->max = cppc_cpufreq_perf_to_khz(cpu_data, + caps->nominal_perf); /* * Set cpuinfo.min_freq to Lowest to make the full range of performance * available if userspace wants to use any perf between lowest & lowest * nonlinear perf */ - policy->cpuinfo.min_freq = cppc_cpufreq_perf_to_khz(cpu_data, cpu_data->perf_caps.lowest_perf); - policy->cpuinfo.max_freq = cppc_cpufreq_perf_to_khz(cpu_data, cpu_data->perf_caps.nominal_perf); + policy->cpuinfo.min_freq = cppc_cpufreq_perf_to_khz(cpu_data, + caps->lowest_perf); + policy->cpuinfo.max_freq = cppc_cpufreq_perf_to_khz(cpu_data, + caps->nominal_perf); policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu); policy->shared_type = cpu_data->shared_type; @@ -301,7 +307,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) if (unlikely(i == cpu)) continue; - memcpy(&all_cpu_data[i]->perf_caps, &cpu_data->perf_caps, + memcpy(&all_cpu_data[i]->perf_caps, caps, sizeof(cpu_data->perf_caps)); } } else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) { @@ -316,18 +322,17 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) * If 'highest_perf' is greater than 'nominal_perf', we assume CPU Boost * is supported. */ - if (cpu_data->perf_caps.highest_perf > cpu_data->perf_caps.nominal_perf) + if (caps->highest_perf > caps->nominal_perf) boost_supported = true; /* Set policy->cur to norm now. */ - policy->cur = cppc_cpufreq_perf_to_khz(cpu_data, - cpu_data->perf_caps.nominal_perf); - cpu_data->perf_ctrls.desired_perf = cpu_data->perf_caps.nominal_perf; + policy->cur = cppc_cpufreq_perf_to_khz(cpu_data, caps->nominal_perf); + cpu_data->perf_ctrls.desired_perf = caps->nominal_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); if (ret) pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", - cpu_data->perf_caps.nominal_perf, cpu, ret); + caps->nominal_perf, cpu, ret); return ret; } @@ -416,6 +421,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) { struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_perf_caps *caps = &cpu_data->perf_caps; int ret; if (!boost_supported) { @@ -425,10 +431,10 @@ static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) if (state) policy->max = cppc_cpufreq_perf_to_khz(cpu_data, - cpu_data->perf_caps.highest_perf); + caps->highest_perf); else policy->max = cppc_cpufreq_perf_to_khz(cpu_data, - cpu_data->perf_caps.nominal_perf); + caps->nominal_perf); policy->cpuinfo.max_freq = policy->max; ret = freq_qos_update_request(policy->max_freq_req, policy->max); -- Gitee From 88f2baa6d34c0a6692c1ae3ca4e0f78297ec87f9 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Tue, 11 Mar 2025 11:24:05 +0800 Subject: [PATCH 05/63] cpufreq: CPPC: Fix wrong max_freq in policy initialization mainline inclusion from mainline-v6.7-rc5 commit 03d8b4e76266e11662c5e544854b737843173e2d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=03d8b4e76266e11662c5e544854b737843173e2d --------------------------------------------------------------------- In policy initialization, policy->max and policy->cpuinfo.max_freq are always set to the value calculated from caps->nominal_perf. This will cause the frequency stay on base frequency even if the policy is already boosted when a CPU is going online. Fix this by using policy->boost_enabled to determine which value should be set. Signed-off-by: Lifeng Zheng Acked-by: Viresh Kumar Link: https://patch.msgid.link/20250117101457.1530653-4-zhenglifeng1@huawei.com [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Xinghai Cen Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 15285b65fd3a..83c6139ec163 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -282,8 +282,8 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) */ policy->min = cppc_cpufreq_perf_to_khz(cpu_data, caps->lowest_nonlinear_perf); - policy->max = cppc_cpufreq_perf_to_khz(cpu_data, - caps->nominal_perf); + policy->max = cppc_cpufreq_perf_to_khz(cpu_data, policy->boost_enabled ? + caps->highest_perf : caps->nominal_perf); /* * Set cpuinfo.min_freq to Lowest to make the full range of performance @@ -292,8 +292,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) */ policy->cpuinfo.min_freq = cppc_cpufreq_perf_to_khz(cpu_data, caps->lowest_perf); - policy->cpuinfo.max_freq = cppc_cpufreq_perf_to_khz(cpu_data, - caps->nominal_perf); + policy->cpuinfo.max_freq = policy->max; policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu); policy->shared_type = cpu_data->shared_type; -- Gitee From 5597d11fdf3d033194baefd2a9c661826825ca88 Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Tue, 11 Mar 2025 11:24:07 +0800 Subject: [PATCH 06/63] cpufreq: prevent NULL dereference in cpufreq_online() mainline inclusion from mainline-v6.7-rc5 commit 0813fd2e14ca6ecd4e6ba005a9766f08e26020d7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0813fd2e14ca6ecd4e6ba005a9766f08e26020d7 --------------------------------------------------------------------- Ensure cpufreq_driver->set_boost is non-NULL before using it in cpufreq_online() to prevent a potential NULL pointer dereference. Reported-by: Gautam Menghani Closes: https://lore.kernel.org/all/c9e56c5f54cc33338762c94e9bed7b5a0d5de812.camel@linux.ibm.com/ Fixes: da59223d340c ("cpufreq: Introduce a more generic way to set default per-policy boost flag") Suggested-by: Viresh Kumar Signed-off-by: Aboorva Devarajan Link: https://patch.msgid.link/20250205181347.2079272-1-aboorvad@linux.ibm.com [ rjw: Minor edits in the subject and changelog ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Xinghai Cen Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d06eed6cb05f..20cd91468d92 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1557,7 +1557,8 @@ static int cpufreq_online(unsigned int cpu) policy->cdev = of_cpufreq_cooling_register(policy); /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ - if (policy->boost_enabled != cpufreq_boost_enabled()) { + if (cpufreq_driver->set_boost && + policy->boost_enabled != cpufreq_boost_enabled()) { policy->boost_enabled = cpufreq_boost_enabled(); ret = cpufreq_driver->set_boost(policy, policy->boost_enabled); if (ret) { -- Gitee From cfaa53a19ccd291ab13de18b53ad1caf93573151 Mon Sep 17 00:00:00 2001 From: Jie Zhan Date: Mon, 21 Apr 2025 20:34:35 +0800 Subject: [PATCH 07/63] cpufreq: governor: Fix negative 'idle_time' handling in dbs_update() mainline inclusion from mainline-v6.14-rc3 commit 3698dd6b139dc37b35a9ad83d9330c1f99666c02 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3698dd6b139dc37b35a9ad83d9330c1f99666c02 --------------------------------------------------------------------- We observed an issue that the CPU frequency can't raise up with a 100% CPU load when NOHZ is off and the 'conservative' governor is selected. 'idle_time' can be negative if it's obtained from get_cpu_idle_time_jiffy() when NOHZ is off. This was found and explained in commit 9485e4ca0b48 ("cpufreq: governor: Fix handling of special cases in dbs_update()"). However, commit 7592019634f8 ("cpufreq: governors: Fix long idle detection logic in load calculation") introduced a comparison between 'idle_time' and 'samling_rate' to detect a long idle interval. While 'idle_time' is converted to int before comparison, it's actually promoted to unsigned again when compared with an unsigned 'sampling_rate'. Hence, this leads to wrong idle interval detection when it's in fact 100% busy and sets policy_dbs->idle_periods to a very large value. 'conservative' adjusts the frequency to minimum because of the large 'idle_periods', such that the frequency can't raise up. 'Ondemand' doesn't use policy_dbs->idle_periods so it fortunately avoids the issue. Correct negative 'idle_time' to 0 before any use of it in dbs_update(). Fixes: 7592019634f8 ("cpufreq: governors: Fix long idle detection logic in load calculation") Signed-off-by: Jie Zhan Reviewed-by: Chen Yu Link: https://patch.msgid.link/20250213035510.2402076-1-zhanjie9@hisilicon.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Xinghai Cen Signed-off-by: huwentao --- drivers/cpufreq/cpufreq_governor.c | 45 +++++++++++++++--------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 55c80319d268..5981e3ef9ce0 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -145,7 +145,23 @@ unsigned int dbs_update(struct cpufreq_policy *policy) time_elapsed = update_time - j_cdbs->prev_update_time; j_cdbs->prev_update_time = update_time; - idle_time = cur_idle_time - j_cdbs->prev_cpu_idle; + /* + * cur_idle_time could be smaller than j_cdbs->prev_cpu_idle if + * it's obtained from get_cpu_idle_time_jiffy() when NOHZ is + * off, where idle_time is calculated by the difference between + * time elapsed in jiffies and "busy time" obtained from CPU + * statistics. If a CPU is 100% busy, the time elapsed and busy + * time should grow with the same amount in two consecutive + * samples, but in practice there could be a tiny difference, + * making the accumulated idle time decrease sometimes. Hence, + * in this case, idle_time should be regarded as 0 in order to + * make the further process correct. + */ + if (cur_idle_time > j_cdbs->prev_cpu_idle) + idle_time = cur_idle_time - j_cdbs->prev_cpu_idle; + else + idle_time = 0; + j_cdbs->prev_cpu_idle = cur_idle_time; if (ignore_nice) { @@ -162,7 +178,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy) * calls, so the previous load value can be used then. */ load = j_cdbs->prev_load; - } else if (unlikely((int)idle_time > 2 * sampling_rate && + } else if (unlikely(idle_time > 2 * sampling_rate && j_cdbs->prev_load)) { /* * If the CPU had gone completely idle and a task has @@ -189,30 +205,15 @@ unsigned int dbs_update(struct cpufreq_policy *policy) load = j_cdbs->prev_load; j_cdbs->prev_load = 0; } else { - if (time_elapsed >= idle_time) { + if (time_elapsed > idle_time) load = 100 * (time_elapsed - idle_time) / time_elapsed; - } else { - /* - * That can happen if idle_time is returned by - * get_cpu_idle_time_jiffy(). In that case - * idle_time is roughly equal to the difference - * between time_elapsed and "busy time" obtained - * from CPU statistics. Then, the "busy time" - * can end up being greater than time_elapsed - * (for example, if jiffies_64 and the CPU - * statistics are updated by different CPUs), - * so idle_time may in fact be negative. That - * means, though, that the CPU was busy all - * the time (on the rough average) during the - * last sampling interval and 100 can be - * returned as the load. - */ - load = (int)idle_time < 0 ? 100 : 0; - } + else + load = 0; + j_cdbs->prev_load = load; } - if (unlikely((int)idle_time > 2 * sampling_rate)) { + if (unlikely(idle_time > 2 * sampling_rate)) { unsigned int periods = idle_time / sampling_rate; if (periods < idle_periods) -- Gitee From 198cd80a78fc374cce5c7f1c75201cb237622723 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 8 Jan 2021 16:46:51 +0530 Subject: [PATCH 08/63] arm64: topology: Avoid the have_policy check mainline inclusion from mainline-v5.12-rc1 commit 384e5699e101b1ee184590a39ee60f10ec0d36b6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=384e5699e101b1ee184590a39ee60f10ec0d36b6 ---------------------------------------------------------------------- Every time I have stumbled upon this routine, I get confused with the way 'have_policy' is used and I have to dig in to understand why is it so. Here is an attempt to make it easier to understand, and hopefully it is an improvement. The 'have_policy' check was just an optimization to avoid writing to amu_fie_cpus in case we don't have to, but that optimization itself is creating more confusion than the real work. Lets just do that if all the CPUs support AMUs. It is much cleaner that way. Reviewed-by: Ionela Voinescu Signed-off-by: Viresh Kumar Tested-by: Ionela Voinescu Link: https://lore.kernel.org/r/c125766c4be93461772015ac7c9a6ae45d5756f6.1610104461.git.viresh.kumar@linaro.org Signed-off-by: Will Deacon Signed-off-by: huwentao --- arch/arm64/kernel/topology.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 4c7f3687356d..203c93c69f46 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -235,14 +235,14 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) return 0; } -static inline bool +static inline void enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); if (!policy) { pr_debug("CPU%d: No cpufreq policy found.\n", cpu); - return false; + return; } if (cpumask_subset(policy->related_cpus, valid_cpus)) @@ -250,8 +250,6 @@ enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) amu_fie_cpus); cpufreq_cpu_put(policy); - - return true; } static DEFINE_STATIC_KEY_FALSE(amu_fie_key); @@ -260,7 +258,6 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key); static int __init init_amu_fie(void) { cpumask_var_t valid_cpus; - bool have_policy = false; int ret = 0; int cpu; @@ -280,17 +277,12 @@ static int __init init_amu_fie(void) continue; cpumask_set_cpu(cpu, valid_cpus); - have_policy |= enable_policy_freq_counters(cpu, valid_cpus); + enable_policy_freq_counters(cpu, valid_cpus); } - /* - * If we are not restricted by cpufreq policies, we only enable - * the use of the AMU feature for FIE if all CPUs support AMU. - * Otherwise, enable_policy_freq_counters has already enabled - * policy cpus. - */ - if (!have_policy && cpumask_equal(valid_cpus, cpu_present_mask)) - cpumask_or(amu_fie_cpus, amu_fie_cpus, valid_cpus); + /* Overwrite amu_fie_cpus if all CPUs support AMU */ + if (cpumask_equal(valid_cpus, cpu_present_mask)) + cpumask_copy(amu_fie_cpus, cpu_present_mask); if (!cpumask_empty(amu_fie_cpus)) { pr_info("CPUs[%*pbl]: counters will be used for FIE.", -- Gitee From c7e8da0e90e982125983a82c13895e9067dbb086 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 8 Jan 2021 16:46:52 +0530 Subject: [PATCH 09/63] arm64: topology: Reorder init_amu_fie() a bit mainline inclusion from mainline-v5.12-rc1 commit 47b10b737c0794c2fa584d7c8103b485e274ed51 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=47b10b737c0794c2fa584d7c8103b485e274ed51 ---------------------------------------------------------------------- This patch does a couple of optimizations in init_amu_fie(), like early exits from paths where we don't need to continue any further, avoid the enable/disable dance, moving the calls to topology_scale_freq_invariant() just when we need them, instead of at the top of the routine, and avoiding calling it for the third time. Reviewed-by: Ionela Voinescu Signed-off-by: Viresh Kumar Tested-by: Ionela Voinescu Link: https://lore.kernel.org/r/a732e71ab9ec28c354eb28dd898c9b47d490863f.1610104461.git.viresh.kumar@linaro.org Signed-off-by: Will Deacon Signed-off-by: huwentao --- arch/arm64/kernel/topology.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 203c93c69f46..497063f0ba73 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -258,6 +258,7 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key); static int __init init_amu_fie(void) { cpumask_var_t valid_cpus; + bool invariant; int ret = 0; int cpu; @@ -284,18 +285,19 @@ static int __init init_amu_fie(void) if (cpumask_equal(valid_cpus, cpu_present_mask)) cpumask_copy(amu_fie_cpus, cpu_present_mask); - if (!cpumask_empty(amu_fie_cpus)) { - pr_info("CPUs[%*pbl]: counters will be used for FIE.", - cpumask_pr_args(amu_fie_cpus)); - static_branch_enable(&amu_fie_key); - } + if (cpumask_empty(amu_fie_cpus)) + goto free_valid_mask; - /* - * If the system is not fully invariant after AMU init, disable - * partial use of counters for frequency invariance. - */ - if (!topology_scale_freq_invariant()) - static_branch_disable(&amu_fie_key); + invariant = topology_scale_freq_invariant(); + + /* We aren't fully invariant yet */ + if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask)) + goto free_valid_mask; + + static_branch_enable(&amu_fie_key); + + pr_info("CPUs[%*pbl]: counters will be used for FIE.", + cpumask_pr_args(amu_fie_cpus)); free_valid_mask: free_cpumask_var(valid_cpus); -- Gitee From be72994c4b732f8d05e51fac07ffaeb6945a826c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 8 Jan 2021 16:46:53 +0530 Subject: [PATCH 10/63] arm64: topology: Make AMUs work with modular cpufreq drivers mainline inclusion from mainline-v5.12-rc1 commit a5f1b187cd24f7da35eb7a48fc50839c554d52a2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=a5f1b187cd24f7da35eb7a48fc50839c554d52a2 ---------------------------------------------------------------------- The AMU counters won't get used today if the cpufreq driver is built as a module as the amu core requires everything to be ready by late init. Fix that properly by registering for cpufreq policy notifier. Note that the amu core don't have any cpufreq dependency after the first time CPUFREQ_CREATE_POLICY notifier is called for all the CPUs. And so we don't need to do anything on the CPUFREQ_REMOVE_POLICY notifier. And for the same reason we check if the CPUs are already parsed in the beginning of amu_fie_setup() and skip if that is true. Alternatively we can shoot a work from there to unregister the notifier instead, but that seemed too much instead of this simple check. While at it, convert the print message to pr_debug instead of pr_info. Signed-off-by: Viresh Kumar Reviewed-by: Ionela Voinescu Tested-by: Ionela Voinescu Link: https://lore.kernel.org/r/89c1921334443e133c9c8791b4693607d65ed9f5.1610104461.git.viresh.kumar@linaro.org Signed-off-by: Will Deacon Signed-off-by: huwentao --- arch/arm64/kernel/topology.c | 92 +++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 497063f0ba73..fc1293f4fcd1 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -235,76 +235,80 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) return 0; } -static inline void -enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) -{ - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - - if (!policy) { - pr_debug("CPU%d: No cpufreq policy found.\n", cpu); - return; - } - - if (cpumask_subset(policy->related_cpus, valid_cpus)) - cpumask_or(amu_fie_cpus, policy->related_cpus, - amu_fie_cpus); - - cpufreq_cpu_put(policy); -} - static DEFINE_STATIC_KEY_FALSE(amu_fie_key); #define amu_freq_invariant() static_branch_unlikely(&amu_fie_key) -static int __init init_amu_fie(void) +static void amu_fie_setup(const struct cpumask *cpus) { - cpumask_var_t valid_cpus; bool invariant; - int ret = 0; int cpu; - if (!zalloc_cpumask_var(&valid_cpus, GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { - ret = -ENOMEM; - goto free_valid_mask; - } + /* We are already set since the last insmod of cpufreq driver */ + if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + return; - for_each_present_cpu(cpu) { + for_each_cpu(cpu, cpus) { if (!freq_counters_valid(cpu) || freq_inv_set_max_ratio(cpu, cpufreq_get_hw_max_freq(cpu) * 1000ULL, arch_timer_get_rate())) - continue; - - cpumask_set_cpu(cpu, valid_cpus); - enable_policy_freq_counters(cpu, valid_cpus); + return; } - /* Overwrite amu_fie_cpus if all CPUs support AMU */ - if (cpumask_equal(valid_cpus, cpu_present_mask)) - cpumask_copy(amu_fie_cpus, cpu_present_mask); - - if (cpumask_empty(amu_fie_cpus)) - goto free_valid_mask; + cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); invariant = topology_scale_freq_invariant(); /* We aren't fully invariant yet */ if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask)) - goto free_valid_mask; + return; static_branch_enable(&amu_fie_key); - pr_info("CPUs[%*pbl]: counters will be used for FIE.", - cpumask_pr_args(amu_fie_cpus)); + pr_debug("CPUs[%*pbl]: counters will be used for FIE.", + cpumask_pr_args(cpus)); +} -free_valid_mask: - free_cpumask_var(valid_cpus); +static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + + if (val == CPUFREQ_CREATE_POLICY) + amu_fie_setup(policy->related_cpus); + + /* + * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU + * counters don't have any dependency on cpufreq driver once we have + * initialized AMU support and enabled invariance. The AMU counters will + * keep on working just fine in the absence of the cpufreq driver, and + * for the CPUs for which there are no counters available, the last set + * value of freq_scale will remain valid as that is the frequency those + * CPUs are running at. + */ + + return 0; +} + +static struct notifier_block init_amu_fie_notifier = { + .notifier_call = init_amu_fie_callback, +}; + +static int __init init_amu_fie(void) +{ + int ret; + + if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) + return -ENOMEM; + + ret = cpufreq_register_notifier(&init_amu_fie_notifier, + CPUFREQ_POLICY_NOTIFIER); + if (ret) + free_cpumask_var(amu_fie_cpus); return ret; } -late_initcall_sync(init_amu_fie); +core_initcall(init_amu_fie); bool arch_freq_counters_available(const struct cpumask *cpus) { -- Gitee From ca27bd65304bccda470f0e2b4ac4dd0a8126298c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 10 Dec 2020 11:17:40 +0530 Subject: [PATCH 11/63] arm64: topology: Drop the useless update to per-cpu cycles mainline inclusion from mainline-v5.11-rc1 commit 51550a483606e35c379f78d28a7827f50e8fc09c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=51550a483606e35c379f78d28a7827f50e8fc09c ---------------------------------------------------------------------- The previous call to update_freq_counters_refs() has already updated the per-cpu variables, don't overwrite them with the same value again. Fixes: 4b9cf23c179a ("arm64: wrap and generalise counter read functions") Signed-off-by: Viresh Kumar Reviewed-by: Ionela Voinescu Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/7a171f710cdc0f808a2bfbd7db839c0d265527e7.1607579234.git.viresh.kumar@linaro.org Signed-off-by: Catalin Marinas Signed-off-by: huwentao --- arch/arm64/kernel/topology.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index fc1293f4fcd1..6aa3e66291aa 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -338,7 +338,7 @@ void topology_scale_freq_tick(void) if (unlikely(core_cnt <= prev_core_cnt || const_cnt <= prev_const_cnt)) - goto store_and_exit; + return; /* * /\core arch_max_freq_scale @@ -355,10 +355,6 @@ void topology_scale_freq_tick(void) scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); this_cpu_write(freq_scale, (unsigned long)scale); - -store_and_exit: - this_cpu_write(arch_core_cycles_prev, core_cnt); - this_cpu_write(arch_const_cycles_prev, const_cnt); } #ifdef CONFIG_SCHED_SOFT_QUOTA -- Gitee From f72d197e6a65689c27e6294b7e39501ab8cc855a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Mar 2021 08:16:40 +0530 Subject: [PATCH 12/63] arch_topology: Rename freq_scale as arch_freq_scale mainline inclusion from mainline-v5.13-rc1 commit eec73529a9321616ed13cf732cd21a17eb1a2836 category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=eec73529a9321616ed13cf732cd21a17eb1a2836 ---------------------------------------------------------------------- Rename freq_scale to a less generic name, as it will get exported soon for modules. Since x86 already names its own implementation of this as arch_freq_scale, lets stick to that. Suggested-by: Will Deacon Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- arch/arm64/kernel/topology.c | 6 +++--- drivers/base/arch_topology.c | 4 ++-- include/linux/arch_topology.h | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 6aa3e66291aa..d22592bc2d8a 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -283,8 +283,8 @@ static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, * initialized AMU support and enabled invariance. The AMU counters will * keep on working just fine in the absence of the cpufreq driver, and * for the CPUs for which there are no counters available, the last set - * value of freq_scale will remain valid as that is the frequency those - * CPUs are running at. + * value of arch_freq_scale will remain valid as that is the frequency + * those CPUs are running at. */ return 0; @@ -354,7 +354,7 @@ void topology_scale_freq_tick(void) const_cnt - prev_const_cnt); scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); - this_cpu_write(freq_scale, (unsigned long)scale); + this_cpu_write(arch_freq_scale, (unsigned long)scale); } #ifdef CONFIG_SCHED_SOFT_QUOTA diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index ba1c8e1ff0b6..3e3ca9814801 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -31,7 +31,7 @@ __weak bool arch_freq_counters_available(const struct cpumask *cpus) { return false; } -DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq) @@ -53,7 +53,7 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq; for_each_cpu(i, cpus) - per_cpu(freq_scale, i) = scale; + per_cpu(arch_freq_scale, i) = scale; } DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 64a05d38a45f..c81f97a4a9a4 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -27,11 +27,11 @@ static inline unsigned long topology_get_cpu_scale(int cpu) void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); -DECLARE_PER_CPU(unsigned long, freq_scale); +DECLARE_PER_CPU(unsigned long, arch_freq_scale); static inline unsigned long topology_get_freq_scale(int cpu) { - return per_cpu(freq_scale, cpu); + return per_cpu(arch_freq_scale, cpu); } void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, -- Gitee From 400bd16125c5c6ea209062894df20f467c95068e Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Tue, 27 Oct 2020 18:07:11 +0000 Subject: [PATCH 13/63] sched/topology,schedutil: Wrap sched domains rebuild mainline inclusion from mainline-v5.11-rc1 commit 31f6a8c0a471be7d7d05c93eac50fcb729e79b9d category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=31f6a8c0a471be7d7d05c93eac50fcb729e79b9d ---------------------------------------------------------------------- Add the rebuild_sched_domains_energy() function to wrap the functionality that rebuilds the scheduling domains if any of the Energy Aware Scheduling (EAS) initialisation conditions change. This functionality is used when schedutil is added or removed or when EAS is enabled or disabled through the sched_energy_aware sysctl. Therefore, create a single function that is used in both these cases and that can be later reused. Signed-off-by: Ionela Voinescu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Quentin Perret Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/20201027180713.7642-2-ionela.voinescu@arm.com Signed-off-by: huwentao --- include/linux/sched/topology.h | 8 ++++++++ kernel/sched/cpufreq_schedutil.c | 9 +-------- kernel/sched/topology.c | 18 +++++++++++------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 31b9ccf21d5e..431ff2c6bf67 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -272,6 +272,14 @@ static inline bool cpus_share_resources(int this_cpu, int that_cpu) #endif /* !CONFIG_SMP */ +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +extern void rebuild_sched_domains_energy(void); +#else +static inline void rebuild_sched_domains_energy(void) +{ +} +#endif + #ifndef arch_scale_cpu_capacity /** * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU. diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5e39da0ae086..405577e99cb2 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -905,16 +905,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL -extern bool sched_energy_update; -extern struct mutex sched_energy_mutex; - static void rebuild_sd_workfn(struct work_struct *work) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = true; - rebuild_sched_domains(); - sched_energy_update = false; - mutex_unlock(&sched_energy_mutex); + rebuild_sched_domains_energy(); } static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3a8673a1a3fc..62c658d8d022 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -226,6 +226,15 @@ unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +void rebuild_sched_domains_energy(void) +{ + mutex_lock(&sched_energy_mutex); + sched_energy_update = true; + rebuild_sched_domains(); + sched_energy_update = false; + mutex_unlock(&sched_energy_mutex); +} + #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -238,13 +247,8 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { state = static_branch_unlikely(&sched_energy_present); - if (state != sysctl_sched_energy_aware) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = 1; - rebuild_sched_domains(); - sched_energy_update = 0; - mutex_unlock(&sched_energy_mutex); - } + if (state != sysctl_sched_energy_aware) + rebuild_sched_domains_energy(); } return ret; -- Gitee From 4544b1790b89cf72f5e55c1a8c0486f2ca5fc291 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Mar 2021 08:21:04 +0530 Subject: [PATCH 14/63] arch_topology: Allow multiple entities to provide sched_freq_tick() callback mainline inclusion from mainline-v5.13-rc1 commit 01e055c120a46e78650b5f903088badbbdaae9ad category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=01e055c120a46e78650b5f903088badbbdaae9ad ---------------------------------------------------------------------- This patch attempts to make it generic enough so other parts of the kernel can also provide their own implementation of scale_freq_tick() callback, which is called by the scheduler periodically to update the per-cpu arch_freq_scale variable. The implementations now need to provide 'struct scale_freq_data' for the CPUs for which they have hardware counters available, and a callback gets registered for each possible CPU in a per-cpu variable. The arch specific (or ARM AMU) counters are updated to adapt to this and they take the highest priority if they are available, i.e. they will be used instead of CPPC based counters for example. The special code to rebuild the sched domains, in case invariance status change for the system, is moved out of arm64 specific code and is added to arch_topology.c. Note that this also defines SCALE_FREQ_SOURCE_CPUFREQ but doesn't use it and it is added to show that cpufreq is also acts as source of information for FIE and will be used by default if no other counters are supported for a platform. Reviewed-by: Ionela Voinescu Tested-by: Ionela Voinescu Acked-by: Will Deacon # for arm64 Tested-by: Vincent Guittot Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- arch/arm64/include/asm/topology.h | 10 +--- arch/arm64/kernel/topology.c | 96 +++++++++++++------------------ drivers/base/arch_topology.c | 82 ++++++++++++++++++++++++-- include/linux/arch_topology.h | 14 ++++- 4 files changed, 131 insertions(+), 71 deletions(-) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 479a1ee0e161..5ebe51ef6d6c 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -17,17 +17,9 @@ int pcibus_to_node(struct pci_bus *bus); #include void update_freq_counters_refs(void); -void topology_scale_freq_tick(void); - -#ifdef CONFIG_ARM64_AMU_EXTN -/* - * Replace task scheduler's default counter-based - * frequency-invariance scale factor setting. - */ -#define arch_scale_freq_tick topology_scale_freq_tick -#endif /* CONFIG_ARM64_AMU_EXTN */ /* Replace task scheduler's default frequency-invariant accounting */ +#define arch_scale_freq_tick topology_scale_freq_tick #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index d22592bc2d8a..a320b03fcced 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -235,12 +235,47 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) return 0; } -static DEFINE_STATIC_KEY_FALSE(amu_fie_key); -#define amu_freq_invariant() static_branch_unlikely(&amu_fie_key) +static void amu_scale_freq_tick(void) +{ + u64 prev_core_cnt, prev_const_cnt; + u64 core_cnt, const_cnt, scale; + + prev_const_cnt = this_cpu_read(arch_const_cycles_prev); + prev_core_cnt = this_cpu_read(arch_core_cycles_prev); + + update_freq_counters_refs(); + + const_cnt = this_cpu_read(arch_const_cycles_prev); + core_cnt = this_cpu_read(arch_core_cycles_prev); + + if (unlikely(core_cnt <= prev_core_cnt || + const_cnt <= prev_const_cnt)) + return; + + /* + * /\core arch_max_freq_scale + * scale = ------- * -------------------- + * /\const SCHED_CAPACITY_SCALE + * + * See validate_cpu_freq_invariance_counters() for details on + * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT. + */ + scale = core_cnt - prev_core_cnt; + scale *= this_cpu_read(arch_max_freq_scale); + scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT, + const_cnt - prev_const_cnt); + + scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); + this_cpu_write(arch_freq_scale, (unsigned long)scale); +} + +static struct scale_freq_data amu_sfd = { + .source = SCALE_FREQ_SOURCE_ARCH, + .set_freq_scale = amu_scale_freq_tick, +}; static void amu_fie_setup(const struct cpumask *cpus) { - bool invariant; int cpu; /* We are already set since the last insmod of cpufreq driver */ @@ -257,13 +292,7 @@ static void amu_fie_setup(const struct cpumask *cpus) cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); - invariant = topology_scale_freq_invariant(); - - /* We aren't fully invariant yet */ - if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask)) - return; - - static_branch_enable(&amu_fie_key); + topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus); pr_debug("CPUs[%*pbl]: counters will be used for FIE.", cpumask_pr_args(cpus)); @@ -310,53 +339,6 @@ static int __init init_amu_fie(void) } core_initcall(init_amu_fie); -bool arch_freq_counters_available(const struct cpumask *cpus) -{ - return amu_freq_invariant() && - cpumask_subset(cpus, amu_fie_cpus); -} - -void topology_scale_freq_tick(void) -{ - u64 prev_core_cnt, prev_const_cnt; - u64 core_cnt, const_cnt, scale; - int cpu = smp_processor_id(); - - if (!amu_freq_invariant()) - return; - - if (!cpumask_test_cpu(cpu, amu_fie_cpus)) - return; - - prev_const_cnt = this_cpu_read(arch_const_cycles_prev); - prev_core_cnt = this_cpu_read(arch_core_cycles_prev); - - update_freq_counters_refs(); - - const_cnt = this_cpu_read(arch_const_cycles_prev); - core_cnt = this_cpu_read(arch_core_cycles_prev); - - if (unlikely(core_cnt <= prev_core_cnt || - const_cnt <= prev_const_cnt)) - return; - - /* - * /\core arch_max_freq_scale - * scale = ------- * -------------------- - * /\const SCHED_CAPACITY_SCALE - * - * See validate_cpu_freq_invariance_counters() for details on - * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT. - */ - scale = core_cnt - prev_core_cnt; - scale *= this_cpu_read(arch_max_freq_scale); - scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT, - const_cnt - prev_const_cnt); - - scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); - this_cpu_write(arch_freq_scale, (unsigned long)scale); -} - #ifdef CONFIG_SCHED_SOFT_QUOTA static DEFINE_PER_CPU(int, sibling_idle) = 1; diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 3e3ca9814801..be39683e5b77 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -21,16 +21,90 @@ #include #include +static DEFINE_PER_CPU(struct scale_freq_data *, sft_data); +static struct cpumask scale_freq_counters_mask; +static bool scale_freq_invariant; + +static bool supports_scale_freq_counters(const struct cpumask *cpus) +{ + return cpumask_subset(cpus, &scale_freq_counters_mask); +} + bool topology_scale_freq_invariant(void) { return cpufreq_supports_freq_invariance() || - arch_freq_counters_available(cpu_online_mask); + supports_scale_freq_counters(cpu_online_mask); } -__weak bool arch_freq_counters_available(const struct cpumask *cpus) +static void update_scale_freq_invariant(bool status) { - return false; + if (scale_freq_invariant == status) + return; + + /* + * Task scheduler behavior depends on frequency invariance support, + * either cpufreq or counter driven. If the support status changes as + * a result of counter initialisation and use, retrigger the build of + * scheduling domains to ensure the information is propagated properly. + */ + if (topology_scale_freq_invariant() == status) { + scale_freq_invariant = status; + rebuild_sched_domains_energy(); + } +} + +void topology_set_scale_freq_source(struct scale_freq_data *data, + const struct cpumask *cpus) +{ + struct scale_freq_data *sfd; + int cpu; + + /* + * Avoid calling rebuild_sched_domains() unnecessarily if FIE is + * supported by cpufreq. + */ + if (cpumask_empty(&scale_freq_counters_mask)) + scale_freq_invariant = topology_scale_freq_invariant(); + + for_each_cpu(cpu, cpus) { + sfd = per_cpu(sft_data, cpu); + + /* Use ARCH provided counters whenever possible */ + if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) { + per_cpu(sft_data, cpu) = data; + cpumask_set_cpu(cpu, &scale_freq_counters_mask); + } + } + + update_scale_freq_invariant(true); +} + +void topology_clear_scale_freq_source(enum scale_freq_source source, + const struct cpumask *cpus) +{ + struct scale_freq_data *sfd; + int cpu; + + for_each_cpu(cpu, cpus) { + sfd = per_cpu(sft_data, cpu); + + if (sfd && sfd->source == source) { + per_cpu(sft_data, cpu) = NULL; + cpumask_clear_cpu(cpu, &scale_freq_counters_mask); + } + } + + update_scale_freq_invariant(false); } + +void topology_scale_freq_tick(void) +{ + struct scale_freq_data *sfd = *this_cpu_ptr(&sft_data); + + if (sfd) + sfd->set_freq_scale(); +} + DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, @@ -47,7 +121,7 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, * want to update the scale factor with information from CPUFREQ. * Instead the scale factor will be updated from arch_scale_freq_tick. */ - if (arch_freq_counters_available(cpus)) + if (supports_scale_freq_counters(cpus)) return; scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq; diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index c81f97a4a9a4..9dc94841803c 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -38,7 +38,19 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq); bool topology_scale_freq_invariant(void); -bool arch_freq_counters_available(const struct cpumask *cpus); +enum scale_freq_source { + SCALE_FREQ_SOURCE_CPUFREQ = 0, + SCALE_FREQ_SOURCE_ARCH, +}; + +struct scale_freq_data { + enum scale_freq_source source; + void (*set_freq_scale)(void); +}; + +void topology_scale_freq_tick(void); +void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus); +void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus); DECLARE_PER_CPU(unsigned long, thermal_pressure); -- Gitee From 3269802c33aa7bbe8d74323fee790752a2d7b6ba Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Mon, 14 Dec 2020 12:38:21 +0000 Subject: [PATCH 15/63] cppc_cpufreq: clarify support for coordination types mainline inclusion from mainline-v5.11-rc1 commit bf76bb208f2b653306f2fc8f9c2a22f9890702bd category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=bf76bb208f2b653306f2fc8f9c2a22f9890702bd ---------------------------------------------------------------------- The previous coordination type handling in the cppc_cpufreq init code created some confusion: the comment mentioned "Support only SW_ANY for now" while only the SW_ALL/ALL case resulted in a failure. The other coordination types (HW_ALL/HW, NONE) were silently supported. Clarify support for coordination types while describing in comments the intended behavior. Signed-off-by: Ionela Voinescu Acked-by: Viresh Kumar Tested-by: Mian Yousaf Kaukab Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 83c6139ec163..fbee2b3f9492 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -261,7 +261,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; struct cppc_perf_caps *caps = &cpu_data->perf_caps; unsigned int cpu = policy->cpu; - int ret = 0; + int i, ret = 0; cpu_data->cpu = cpu; ret = cppc_get_perf_caps(cpu, caps); @@ -297,9 +297,13 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->transition_delay_us = cppc_cpufreq_get_transition_delay_us(cpu); policy->shared_type = cpu_data->shared_type; - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - int i; - + switch (policy->shared_type) { + case CPUFREQ_SHARED_TYPE_HW: + case CPUFREQ_SHARED_TYPE_NONE: + /* Nothing to be done - we'll have a policy for each CPU */ + break; + case CPUFREQ_SHARED_TYPE_ANY: + /* All CPUs in the domain will share a policy */ cpumask_copy(policy->cpus, cpu_data->shared_cpu_map); for_each_cpu(i, policy->cpus) { @@ -309,9 +313,10 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) memcpy(&all_cpu_data[i]->perf_caps, caps, sizeof(cpu_data->perf_caps)); } - } else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) { - /* Support only SW_ANY for now. */ - pr_debug("Unsupported CPU co-ord type\n"); + break; + default: + pr_debug("Unsupported CPU co-ord type: %d\n", + policy->shared_type); return -EFAULT; } -- Gitee From 78b58248e3b4c03429d0a6be741b4d08b99a6431 Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Mon, 14 Dec 2020 12:38:22 +0000 Subject: [PATCH 16/63] cppc_cpufreq: expose information on frequency domains mainline inclusion from mainline-v5.11-rc1 commit cfdc589f4b5f94bf1a975b4a67d8163d533f6e9b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=cfdc589f4b5f94bf1a975b4a67d8163d533f6e9b ---------------------------------------------------------------------- Use the existing sysfs attribute "freqdomain_cpus" to expose information to userspace about CPUs in the same frequency domain. Signed-off-by: Ionela Voinescu Acked-by: Viresh Kumar Tested-by: Mian Yousaf Kaukab Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- .../ABI/testing/sysfs-devices-system-cpu | 3 ++- drivers/cpufreq/cppc_cpufreq.c | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index bc59db0a75d2..66171d28ea64 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -264,7 +264,8 @@ Description: Discover CPUs in the same CPU frequency coordination domain attribute is useful for user space DVFS controllers to get better power/performance results for platforms using acpi-cpufreq. - This file is only present if the acpi-cpufreq driver is in use. + This file is only present if the acpi-cpufreq or the cppc-cpufreq + drivers are in use. What: /sys/devices/system/cpu/cpuX/cpufreq/auto_act_window Date: October 2024 diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index fbee2b3f9492..f09154e8a5f5 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -547,14 +547,24 @@ static ssize_t store_energy_perf(struct cpufreq_policy *policy, cpufreq_freq_attr_rw(auto_act_window); cpufreq_freq_attr_rw(energy_perf); +#endif // CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE + +static ssize_t show_freqdomain_cpus(struct cpufreq_policy *policy, char *buf) +{ + unsigned int cpu = policy->cpu; + + return cpufreq_show_cpus(all_cpu_data[cpu]->shared_cpu_map, buf); +} +cpufreq_freq_attr_ro(freqdomain_cpus); static struct freq_attr *cppc_cpufreq_attr[] = { + &freqdomain_cpus, +#ifdef CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE &auto_act_window, &energy_perf, +#endif // CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE NULL, }; -#endif // CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE - static struct cpufreq_driver cppc_cpufreq_driver = { .flags = CPUFREQ_CONST_LOOPS, @@ -564,9 +574,7 @@ static struct cpufreq_driver cppc_cpufreq_driver = { .init = cppc_cpufreq_cpu_init, .stop_cpu = cppc_cpufreq_stop_cpu, .set_boost = cppc_cpufreq_set_boost, -#ifdef CONFIG_CPPC_CPUFREQ_SYSFS_INTERFACE .attr = cppc_cpufreq_attr, -#endif .name = "cppc_cpufreq", }; -- Gitee From 1a5a2231259ffcf9251912301a94226f6951fc6d Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Mon, 14 Dec 2020 12:38:23 +0000 Subject: [PATCH 17/63] cppc_cpufreq: replace per-cpu data array with a list mainline inclusion from mainline-v5.11-rc1 commit a28b2bfc099c6b9caa6ef697660408e076a32019 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=a28b2bfc099c6b9caa6ef697660408e076a32019 ---------------------------------------------------------------------- The cppc_cpudata per-cpu storage was inefficient (1) additional to causing functional issues (2) when CPUs are hotplugged out, due to per-cpu data being improperly initialised. (1) The amount of information needed for CPPC performance control in its cpufreq driver depends on the domain (PSD) coordination type: ANY: One set of CPPC control and capability data (e.g desired performance, highest/lowest performance, etc) applies to all CPUs in the domain. ALL: Same as ANY. To be noted that this type is not currently supported. When supported, information about which CPUs belong to a domain is needed in order for frequency change requests to be sent to each of them. HW: It's necessary to store CPPC control and capability information for all the CPUs. HW will then coordinate the performance state based on their limitations and requests. NONE: Same as HW. No HW coordination is expected. Despite this, the previous initialisation code would indiscriminately allocate memory for all CPUs (all_cpu_data) and unnecessarily duplicate performance capabilities and the domain sharing mask and type for each possible CPU. (2) With the current per-cpu structure, when having ANY coordination, the cppc_cpudata cpu information is not initialised (will remain 0) for all CPUs in a policy, other than policy->cpu. When policy->cpu is hotplugged out, the driver will incorrectly use the uninitialised (0) value of the other CPUs when making frequency changes. Additionally, the previous values stored in the perf_ctrls.desired_perf will be lost when policy->cpu changes. Therefore replace the array of per cpu data with a list. The memory for each structure is allocated at policy init, where a single structure can be allocated per policy, not per cpu. In order to accommodate the struct list_head node in the cppc_cpudata structure, the now unused cpu and cur_policy variables are removed. For example, on a arm64 Juno platform with 6 CPUs: (0, 1, 2, 3) in PSD1, (4, 5) in PSD2 - ANY coordination, the memory allocation comparison shows: Before patch: - ANY coordination: total slack req alloc/free caller 0 0 0 0/1 _kernel_size_le_hi32+0x0xffff800008ff7810 0 0 0 0/6 _kernel_size_le_hi32+0x0xffff800008ff7808 128 80 48 1/0 _kernel_size_le_hi32+0x0xffff800008ffc070 768 0 768 6/0 _kernel_size_le_hi32+0x0xffff800008ffc0e4 After patch: - ANY coordination: total slack req alloc/free caller 256 0 256 2/0 _kernel_size_le_hi32+0x0xffff800008fed410 0 0 0 0/2 _kernel_size_le_hi32+0x0xffff800008fed274 Additional notes: - A pointer to the policy's cppc_cpudata is stored in policy->driver_data - Driver registration is skipped if _CPC entries are not present. Signed-off-by: Ionela Voinescu Tested-by: Mian Yousaf Kaukab Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/acpi/cppc_acpi.c | 130 +++++++++---------------- drivers/cpufreq/cppc_cpufreq.c | 173 +++++++++++++++++---------------- include/acpi/cppc_acpi.h | 5 +- 3 files changed, 139 insertions(+), 169 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index ad895e437df9..0ba9838ce64b 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -441,108 +441,72 @@ bool acpi_cpc_valid(void) EXPORT_SYMBOL_GPL(acpi_cpc_valid); /** - * acpi_get_psd_map - Map the CPUs in a common freq domain. - * @all_cpu_data: Ptrs to CPU specific CPPC data including PSD info. + * acpi_get_psd_map - Map the CPUs in the freq domain of a given cpu + * @cpu: Find all CPUs that share a domain with cpu. + * @cpu_data: Pointer to CPU specific CPPC data including PSD info. * * Return: 0 for success or negative value for err. */ -static int __acpi_get_psd_map(struct cppc_cpudata **all_cpu_data, struct cpc_desc **cpc_pptr) +static int __acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data, struct cpc_desc **cpc_pptr) { - int count_target; - int retval = 0; - unsigned int i, j; - cpumask_var_t covered_cpus; - struct cppc_cpudata *pr, *match_pr; - struct acpi_psd_package *pdomain; - struct acpi_psd_package *match_pdomain; struct cpc_desc *cpc_ptr, *match_cpc_ptr; - - if (!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)) - return -ENOMEM; + struct acpi_psd_package *match_pdomain; + struct acpi_psd_package *pdomain; + int count_target, i; /* * Now that we have _PSD data from all CPUs, let's setup P-state * domain info. */ - for_each_possible_cpu(i) { - if (cpumask_test_cpu(i, covered_cpus)) - continue; - - pr = all_cpu_data[i]; - cpc_ptr = cpc_pptr[i]; - if (!cpc_ptr) { - retval = -EFAULT; - goto err_ret; - } + cpc_ptr = cpc_pptr[cpu]; + if (!cpc_ptr) + return -EFAULT; - pdomain = &(cpc_ptr->domain_info); - cpumask_set_cpu(i, pr->shared_cpu_map); - cpumask_set_cpu(i, covered_cpus); - if (pdomain->num_processors <= 1) - continue; + pdomain = &(cpc_ptr->domain_info); + cpumask_set_cpu(cpu, cpu_data->shared_cpu_map); + if (pdomain->num_processors <= 1) + return 0; - /* Validate the Domain info */ - count_target = pdomain->num_processors; - if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) - pr->shared_type = CPUFREQ_SHARED_TYPE_ALL; - else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) - pr->shared_type = CPUFREQ_SHARED_TYPE_HW; - else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) - pr->shared_type = CPUFREQ_SHARED_TYPE_ANY; - - for_each_possible_cpu(j) { - if (i == j) - continue; - - match_cpc_ptr = cpc_pptr[j]; - if (!match_cpc_ptr) { - retval = -EFAULT; - goto err_ret; - } + /* Validate the Domain info */ + count_target = pdomain->num_processors; + if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) + cpu_data->shared_type = CPUFREQ_SHARED_TYPE_ALL; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) + cpu_data->shared_type = CPUFREQ_SHARED_TYPE_HW; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) + cpu_data->shared_type = CPUFREQ_SHARED_TYPE_ANY; - match_pdomain = &(match_cpc_ptr->domain_info); - if (match_pdomain->domain != pdomain->domain) - continue; + for_each_possible_cpu(i) { + if (i == cpu) + continue; - /* Here i and j are in the same domain */ - if (match_pdomain->num_processors != count_target) { - retval = -EFAULT; - goto err_ret; - } + match_cpc_ptr = cpc_pptr[i]; + if (!match_cpc_ptr) + goto err_fault; - if (pdomain->coord_type != match_pdomain->coord_type) { - retval = -EFAULT; - goto err_ret; - } + match_pdomain = &(match_cpc_ptr->domain_info); + if (match_pdomain->domain != pdomain->domain) + continue; - cpumask_set_cpu(j, covered_cpus); - cpumask_set_cpu(j, pr->shared_cpu_map); - } + /* Here i and cpu are in the same domain */ + if (match_pdomain->num_processors != count_target) + goto err_fault; - for_each_cpu(j, pr->shared_cpu_map) { - if (i == j) - continue; + if (pdomain->coord_type != match_pdomain->coord_type) + goto err_fault; - match_pr = all_cpu_data[j]; - match_pr->shared_type = pr->shared_type; - cpumask_copy(match_pr->shared_cpu_map, - pr->shared_cpu_map); - } + cpumask_set_cpu(i, cpu_data->shared_cpu_map); } - goto out; -err_ret: - for_each_possible_cpu(i) { - pr = all_cpu_data[i]; + return 0; - /* Assume no coordination on any error parsing domain info */ - cpumask_clear(pr->shared_cpu_map); - cpumask_set_cpu(i, pr->shared_cpu_map); - pr->shared_type = CPUFREQ_SHARED_TYPE_ALL; - } -out: - free_cpumask_var(covered_cpus); - return retval; +err_fault: + /* Assume no coordination on any error parsing domain info */ + cpumask_clear(cpu_data->shared_cpu_map); + cpumask_set_cpu(cpu, cpu_data->shared_cpu_map); + cpu_data->shared_type = CPUFREQ_SHARED_TYPE_ALL; + + return -EFAULT; } static acpi_status acpi_parse_cpc(acpi_handle handle, u32 lvl, void *data, @@ -586,7 +550,7 @@ static acpi_status acpi_parse_cpc(acpi_handle handle, u32 lvl, void *data, return AE_OK; } -int acpi_get_psd_map(struct cppc_cpudata **all_cpu_data) +int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data) { struct cpc_desc **cpc_pptr, *cpc_ptr; int parsed_core_num = 0; @@ -617,7 +581,7 @@ int acpi_get_psd_map(struct cppc_cpudata **all_cpu_data) goto out; } - ret = __acpi_get_psd_map(all_cpu_data, cpc_pptr); + ret = __acpi_get_psd_map(cpu, cpu_data, cpc_pptr); out: for_each_possible_cpu(i) { diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index f09154e8a5f5..d57fb570a006 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -30,13 +30,13 @@ #define DMI_PROCESSOR_MAX_SPEED 0x14 /* - * These structs contain information parsed from per CPU - * ACPI _CPC structures. - * e.g. For each CPU the highest, lowest supported - * performance capabilities, desired performance level - * requested etc. + * This list contains information parsed from per CPU ACPI _CPC and _PSD + * structures: e.g. the highest and lowest supported performance, capabilities, + * desired performance, level requested etc. Depending on the share_type, not + * all CPUs will have an entry in the list. */ -static struct cppc_cpudata **all_cpu_data; +static LIST_HEAD(cpu_data_list); + static bool boost_supported; struct cppc_workaround_oem_info { @@ -166,7 +166,7 @@ static int cppc_cpufreq_set_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_cpudata *cpu_data = policy->driver_data; unsigned int cpu = policy->cpu; struct cpufreq_freqs freqs; u32 desired_perf; @@ -200,7 +200,7 @@ static int cppc_verify_policy(struct cpufreq_policy_data *policy) static void cppc_cpufreq_stop_cpu(struct cpufreq_policy *policy) { - struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_cpudata *cpu_data = policy->driver_data; struct cppc_perf_caps *caps = &cpu_data->perf_caps; unsigned int cpu = policy->cpu; int ret; @@ -211,6 +211,12 @@ static void cppc_cpufreq_stop_cpu(struct cpufreq_policy *policy) if (ret) pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", caps->lowest_perf, cpu, ret); + + /* Remove CPU node from list and free driver data for policy */ + free_cpumask_var(cpu_data->shared_cpu_map); + list_del(&cpu_data->node); + kfree(policy->driver_data); + policy->driver_data = NULL; } /* @@ -256,25 +262,61 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) } #endif -static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) + +static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) { - struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; - struct cppc_perf_caps *caps = &cpu_data->perf_caps; - unsigned int cpu = policy->cpu; - int i, ret = 0; + struct cppc_cpudata *cpu_data; + int ret; - cpu_data->cpu = cpu; - ret = cppc_get_perf_caps(cpu, caps); + cpu_data = kzalloc(sizeof(struct cppc_cpudata), GFP_KERNEL); + if (!cpu_data) + goto out; + + if (!zalloc_cpumask_var(&cpu_data->shared_cpu_map, GFP_KERNEL)) + goto free_cpu; + ret = acpi_get_psd_map(cpu, cpu_data); if (ret) { - pr_debug("Err reading CPU%d perf capabilities. ret:%d\n", - cpu, ret); - return ret; + pr_debug("Err parsing CPU%d PSD data: ret:%d\n", cpu, ret); + goto free_mask; + } + + ret = cppc_get_perf_caps(cpu, &cpu_data->perf_caps); + if (ret) { + pr_debug("Err reading CPU%d perf caps: ret:%d\n", cpu, ret); + goto free_mask; } /* Convert the lowest and nominal freq from MHz to KHz */ - caps->lowest_freq *= 1000; - caps->nominal_freq *= 1000; + cpu_data->perf_caps.lowest_freq *= 1000; + cpu_data->perf_caps.nominal_freq *= 1000; + + list_add(&cpu_data->node, &cpu_data_list); + + return cpu_data; + +free_mask: + free_cpumask_var(cpu_data->shared_cpu_map); +free_cpu: + kfree(cpu_data); +out: + return NULL; +} + +static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + struct cppc_cpudata *cpu_data; + struct cppc_perf_caps *caps; + int ret; + + cpu_data = cppc_cpufreq_get_cpu_data(cpu); + if (!cpu_data) { + pr_err("Error in acquiring _CPC/_PSD data for CPU%d.\n", cpu); + return -ENODEV; + } + caps = &cpu_data->perf_caps; + policy->driver_data = cpu_data; /* * Set min to lowest nonlinear perf to avoid any efficiency penalty (see @@ -303,16 +345,12 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) /* Nothing to be done - we'll have a policy for each CPU */ break; case CPUFREQ_SHARED_TYPE_ANY: - /* All CPUs in the domain will share a policy */ + /* + * All CPUs in the domain will share a policy and all cpufreq + * operations will use a single cppc_cpudata structure stored + * in policy->driver_data. + */ cpumask_copy(policy->cpus, cpu_data->shared_cpu_map); - - for_each_cpu(i, policy->cpus) { - if (unlikely(i == cpu)) - continue; - - memcpy(&all_cpu_data[i]->perf_caps, caps, - sizeof(cpu_data->perf_caps)); - } break; default: pr_debug("Unsupported CPU co-ord type: %d\n", @@ -320,8 +358,6 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) return -EFAULT; } - cpu_data->cur_policy = policy; - /* * If 'highest_perf' is greater than 'nominal_perf', we assume CPU Boost * is supported. @@ -405,9 +441,12 @@ static int cppc_get_perf_ctrs_sample(void *val) static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) { struct fb_ctr_pair fb_ctrs = { .cpu = cpu, }; - struct cppc_cpudata *cpu_data = all_cpu_data[cpu]; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct cppc_cpudata *cpu_data = policy->driver_data; int ret; + cpufreq_cpu_put(policy); + if (cpu_has_amu_feat(cpu)) ret = smp_call_on_cpu(cpu, cppc_get_perf_ctrs_sample, &fb_ctrs, false); @@ -424,7 +463,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) { - struct cppc_cpudata *cpu_data = all_cpu_data[policy->cpu]; + struct cppc_cpudata *cpu_data = policy->driver_data; struct cppc_perf_caps *caps = &cpu_data->perf_caps; int ret; @@ -551,9 +590,9 @@ cpufreq_freq_attr_rw(energy_perf); static ssize_t show_freqdomain_cpus(struct cpufreq_policy *policy, char *buf) { - unsigned int cpu = policy->cpu; + struct cppc_cpudata *cpu_data = policy->driver_data; - return cpufreq_show_cpus(all_cpu_data[cpu]->shared_cpu_map, buf); + return cpufreq_show_cpus(cpu_data->shared_cpu_map, buf); } cpufreq_freq_attr_ro(freqdomain_cpus); @@ -586,10 +625,13 @@ static struct cpufreq_driver cppc_cpufreq_driver = { */ static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu) { - struct cppc_cpudata *cpu_data = all_cpu_data[cpu]; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct cppc_cpudata *cpu_data = policy->driver_data; u64 desired_perf; int ret; + cpufreq_cpu_put(policy); + ret = cppc_get_desired_perf(cpu, &desired_perf); if (ret < 0) return -EIO; @@ -622,68 +664,33 @@ static void cppc_check_hisi_workaround(void) static int __init cppc_cpufreq_init(void) { - int i, ret = 0; - struct cppc_cpudata *cpu_data; - - if (acpi_disabled) + if ((acpi_disabled) || !acpi_cpc_valid()) return -ENODEV; - all_cpu_data = kcalloc(num_possible_cpus(), sizeof(void *), - GFP_KERNEL); - if (!all_cpu_data) - return -ENOMEM; - - for_each_possible_cpu(i) { - all_cpu_data[i] = kzalloc(sizeof(struct cppc_cpudata), GFP_KERNEL); - if (!all_cpu_data[i]) - goto out; - - cpu_data = all_cpu_data[i]; - if (!zalloc_cpumask_var(&cpu_data->shared_cpu_map, GFP_KERNEL)) - goto out; - } - - ret = acpi_get_psd_map(all_cpu_data); - if (ret) { - pr_debug("Error parsing PSD data. Aborting cpufreq registration.\n"); - goto out; - } + INIT_LIST_HEAD(&cpu_data_list); cppc_check_hisi_workaround(); - ret = cpufreq_register_driver(&cppc_cpufreq_driver); - if (ret) - goto out; + return cpufreq_register_driver(&cppc_cpufreq_driver); +} - return ret; +static inline void free_cpu_data(void) +{ + struct cppc_cpudata *iter, *tmp; -out: - for_each_possible_cpu(i) { - cpu_data = all_cpu_data[i]; - if (!cpu_data) - break; - free_cpumask_var(cpu_data->shared_cpu_map); - kfree(cpu_data); + list_for_each_entry_safe(iter, tmp, &cpu_data_list, node) { + free_cpumask_var(iter->shared_cpu_map); + list_del(&iter->node); + kfree(iter); } - kfree(all_cpu_data); - return -ENODEV; } static void __exit cppc_cpufreq_exit(void) { - struct cppc_cpudata *cpu_data; - int i; - cpufreq_unregister_driver(&cppc_cpufreq_driver); - for_each_possible_cpu(i) { - cpu_data = all_cpu_data[i]; - free_cpumask_var(cpu_data->shared_cpu_map); - kfree(cpu_data); - } - - kfree(all_cpu_data); + free_cpu_data(); } module_exit(cppc_cpufreq_exit); diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 95c7d27cd788..e20a92ee5493 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -124,11 +124,10 @@ struct cppc_perf_fb_ctrs { /* Per CPU container for runtime CPPC management. */ struct cppc_cpudata { - int cpu; + struct list_head node; struct cppc_perf_caps perf_caps; struct cppc_perf_ctrls perf_ctrls; struct cppc_perf_fb_ctrs perf_fb_ctrs; - struct cpufreq_policy *cur_policy; unsigned int shared_type; cpumask_var_t shared_cpu_map; }; @@ -143,8 +142,8 @@ extern int cppc_set_auto_sel(int cpu, bool enable); extern int cppc_set_epp(int cpu, u64 epp_val); extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); -extern int acpi_get_psd_map(struct cppc_cpudata **); extern bool acpi_cpc_valid(void); +extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); extern unsigned int cppc_get_transition_latency(int cpu); extern bool cpc_ffh_supported(void); extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); -- Gitee From f248dcb7476cd3c2638af7b1a67280449f6fcce7 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Jun 2021 09:54:39 +0530 Subject: [PATCH 18/63] cpufreq: CPPC: Migrate to ->exit() callback instead of ->stop_cpu() mainline inclusion from mainline-v5.14-rc1 commit 9357a380f90a89a168d505561d11f68272e0e768 category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=9357a380f90a89a168d505561d11f68272e0e768 ---------------------------------------------------------------------- Commit 367dc4aa932b ("cpufreq: Add stop CPU callback to cpufreq_driver interface") added the ->stop_cpu() callback to allow the drivers to do clean up before the CPU is completely down and its state can't be modified. At that time the CPU hotplug framework used to call the cpufreq core's registered notifier for different events like CPU_DOWN_PREPARE and CPU_POST_DEAD. The ->stop_cpu() callback was called during the CPU_DOWN_PREPARE event. This is no longer the case, cpuhp_cpufreq_offline() is called only once by the CPU hotplug core now and we don't really need two separate callbacks for cpufreq drivers, i.e. ->stop_cpu() and -exit() callback itself. Migrate to using the ->exit() callback instead of ->stop_cpu(). Signed-off-by: Viresh Kumar [ rjw: Minor edits in the changelog and subject ] Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 46 ++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index d57fb570a006..81c7dcb369fd 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -198,27 +198,6 @@ static int cppc_verify_policy(struct cpufreq_policy_data *policy) return 0; } -static void cppc_cpufreq_stop_cpu(struct cpufreq_policy *policy) -{ - struct cppc_cpudata *cpu_data = policy->driver_data; - struct cppc_perf_caps *caps = &cpu_data->perf_caps; - unsigned int cpu = policy->cpu; - int ret; - - cpu_data->perf_ctrls.desired_perf = caps->lowest_perf; - - ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); - if (ret) - pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", - caps->lowest_perf, cpu, ret); - - /* Remove CPU node from list and free driver data for policy */ - free_cpumask_var(cpu_data->shared_cpu_map); - list_del(&cpu_data->node); - kfree(policy->driver_data); - policy->driver_data = NULL; -} - /* * The PCC subspace describes the rate at which platform can accept commands * on the shared PCC channel (including READs which do not count towards freq @@ -377,6 +356,29 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) return ret; } +static int cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + struct cppc_perf_caps *caps = &cpu_data->perf_caps; + unsigned int cpu = policy->cpu; + int ret; + + cpu_data->perf_ctrls.desired_perf = caps->lowest_perf; + + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); + if (ret) + pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", + caps->lowest_perf, cpu, ret); + + /* Remove CPU node from list and free driver data for policy */ + free_cpumask_var(cpu_data->shared_cpu_map); + list_del(&cpu_data->node); + kfree(policy->driver_data); + policy->driver_data = NULL; + + return 0; +} + static inline u64 get_delta(u64 t1, u64 t0) { if (t1 > t0 || t0 > ~(u32)0) @@ -611,7 +613,7 @@ static struct cpufreq_driver cppc_cpufreq_driver = { .target = cppc_cpufreq_set_target, .get = cppc_cpufreq_get_rate, .init = cppc_cpufreq_cpu_init, - .stop_cpu = cppc_cpufreq_stop_cpu, + .exit = cppc_cpufreq_cpu_exit, .set_boost = cppc_cpufreq_set_boost, .attr = cppc_cpufreq_attr, .name = "cppc_cpufreq", -- Gitee From 3083467c1057dc11db75b65c7ccf84ca069eb2ef Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 18 Jun 2021 13:31:27 +0530 Subject: [PATCH 19/63] cpufreq: CPPC: Fix potential memleak in cppc_cpufreq_cpu_init mainline inclusion from mainline-v5.14-rc1 commit fe2535a44904a77615a3af8e8fd7dafb98fb0e1b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=fe2535a44904a77615a3af8e8fd7dafb98fb0e1b ---------------------------------------------------------------------- It's a classic example of memleak, we allocate something, we fail and never free the resources. Make sure we free all resources on policy ->init() failures. Fixes: a28b2bfc099c ("cppc_cpufreq: replace per-cpu data array with a list") Tested-by: Vincent Guittot Reviewed-by: Ionela Voinescu Tested-by: Qian Cai Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 81c7dcb369fd..f3dfe54c0dfd 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -282,6 +282,16 @@ static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) return NULL; } +static void cppc_cpufreq_put_cpu_data(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + + list_del(&cpu_data->node); + free_cpumask_var(cpu_data->shared_cpu_map); + kfree(cpu_data); + policy->driver_data = NULL; +} + static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; @@ -334,7 +344,8 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) default: pr_debug("Unsupported CPU co-ord type: %d\n", policy->shared_type); - return -EFAULT; + ret = -EFAULT; + goto out; } /* @@ -349,10 +360,16 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) cpu_data->perf_ctrls.desired_perf = caps->nominal_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); - if (ret) + if (ret) { pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", caps->nominal_perf, cpu, ret); + goto out; + } + + return 0; +out: + cppc_cpufreq_put_cpu_data(policy); return ret; } @@ -370,12 +387,7 @@ static int cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy) pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", caps->lowest_perf, cpu, ret); - /* Remove CPU node from list and free driver data for policy */ - free_cpumask_var(cpu_data->shared_cpu_map); - list_del(&cpu_data->node); - kfree(policy->driver_data); - policy->driver_data = NULL; - + cppc_cpufreq_put_cpu_data(policy); return 0; } -- Gitee From 42e886f75e0189a0fc427505cb72804a29602309 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 18 Jun 2021 13:42:23 +0530 Subject: [PATCH 20/63] cpufreq: CPPC: Pass structure instance by reference mainline inclusion from mainline-v5.14-rc1 commit eead1840cbd31e553bf8ccdefbd5b065bf596b71 category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=eead1840cbd31e553bf8ccdefbd5b065bf596b71 ---------------------------------------------------------------------- Don't pass structure instance by value, pass it by reference instead. Tested-by: Vincent Guittot Reviewed-by: Ionela Voinescu Tested-by: Qian Cai Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index f3dfe54c0dfd..36627a0a3c3c 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -400,18 +400,18 @@ static inline u64 get_delta(u64 t1, u64 t0) } static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data, - struct cppc_perf_fb_ctrs fb_ctrs_t0, - struct cppc_perf_fb_ctrs fb_ctrs_t1) + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1) { u64 delta_reference, delta_delivered; u64 reference_perf, delivered_perf; - reference_perf = fb_ctrs_t0.reference_perf; + reference_perf = fb_ctrs_t0->reference_perf; - delta_reference = get_delta(fb_ctrs_t1.reference, - fb_ctrs_t0.reference); - delta_delivered = get_delta(fb_ctrs_t1.delivered, - fb_ctrs_t0.delivered); + delta_reference = get_delta(fb_ctrs_t1->reference, + fb_ctrs_t0->reference); + delta_delivered = get_delta(fb_ctrs_t1->delivered, + fb_ctrs_t0->delivered); /* Check to avoid divide-by zero */ if (delta_reference || delta_delivered) @@ -471,8 +471,8 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) return ret; return cppc_get_rate_from_fbctrs(cpu_data, - fb_ctrs.fb_ctrs_t0, - fb_ctrs.fb_ctrs_t1); + &fb_ctrs.fb_ctrs_t0, + &fb_ctrs.fb_ctrs_t1); } static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) -- Gitee From a50dd12c3fa31a2945e230a63ff23413cb052627 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Mar 2021 08:25:27 +0530 Subject: [PATCH 21/63] arch_topology: Export arch_freq_scale and helpers mainline inclusion from mainline-v5.13-rc1 commit 2f5339582e7b540851bfb37e184d4dee0ab9e387 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=2f5339582e7b540851bfb37e184d4dee0ab9e387 ---------------------------------------------------------------------- It is possible now for other parts of the kernel to provide their own implementation of sched_freq_tick() and they can very well be modules themselves (like CPPC cpufreq driver, which is going to use these in a later commit). Export arch_freq_scale and topology_{set|clear}_scale_freq_source(). Reviewed-by: Ionela Voinescu Tested-by: Ionela Voinescu Tested-by: Vincent Guittot Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/base/arch_topology.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index be39683e5b77..3e7d15e25c22 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -78,6 +78,7 @@ void topology_set_scale_freq_source(struct scale_freq_data *data, update_scale_freq_invariant(true); } +EXPORT_SYMBOL_GPL(topology_set_scale_freq_source); void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus) @@ -96,6 +97,7 @@ void topology_clear_scale_freq_source(enum scale_freq_source source, update_scale_freq_invariant(false); } +EXPORT_SYMBOL_GPL(topology_clear_scale_freq_source); void topology_scale_freq_tick(void) { @@ -106,6 +108,7 @@ void topology_scale_freq_tick(void) } DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; +EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq) -- Gitee From d84eb2ad2679d52d1cb027b90e28240fc49571fb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 23 Jun 2020 15:49:40 +0530 Subject: [PATCH 22/63] cpufreq: CPPC: Add support for frequency invariance mainline inclusion from mainline-v5.14-rc1 commit 1eb5dde674f57b1a1918dab33f09e35cdd64eb07 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=1eb5dde674f57b1a1918dab33f09e35cdd64eb07 ---------------------------------------------------------------------- The Frequency Invariance Engine (FIE) is providing a frequency scaling correction factor that helps achieve more accurate load-tracking. Normally, this scaling factor can be obtained directly with the help of the cpufreq drivers as they know the exact frequency the hardware is running at. But that isn't the case for CPPC cpufreq driver. Another way of obtaining that is using the arch specific counter support, which is already present in kernel, but that hardware is optional for platforms. This patch updates the CPPC driver to register itself with the topology core to provide its own implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which gets called by the scheduler on every tick. Note that the arch specific counters have higher priority than CPPC counters, if available, though the CPPC driver doesn't need to have any special handling for that. On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we reach here from hard-irq context), which then schedules a normal work item and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable based on the counter updates since the last tick. To allow platforms to disable this CPPC counter-based frequency invariance support, this is all done under CONFIG_ACPI_CPPC_CPUFREQ_FIE, which is enabled by default. This also exports sched_setattr_nocheck() as the CPPC driver can be built as a module. Cc: linux-acpi@vger.kernel.org Tested-by: Vincent Guittot Reviewed-by: Ionela Voinescu Tested-by: Qian Cai Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/Kconfig.arm | 10 ++ drivers/cpufreq/cppc_cpufreq.c | 255 +++++++++++++++++++++++++++++++-- include/linux/arch_topology.h | 1 + kernel/sched/core.c | 1 + 4 files changed, 252 insertions(+), 15 deletions(-) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 67e574d0d481..1280a1ec86d6 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -19,6 +19,16 @@ config ACPI_CPPC_CPUFREQ If in doubt, say N. +config ACPI_CPPC_CPUFREQ_FIE + bool "Frequency Invariance support for CPPC cpufreq driver" + depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY + default y + help + This extends frequency invariance support in the CPPC cpufreq driver, + by using CPPC delivered and reference performance counters. + + If in doubt, say N. + config CPPC_CPUFREQ_SYSFS_INTERFACE bool "Enable CPPC CPUFreq sysfs tuning interfaces" depends on ACPI_CPPC_CPUFREQ diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 36627a0a3c3c..e551a8ce3232 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -10,14 +10,18 @@ #define pr_fmt(fmt) "CPPC Cpufreq:" fmt +#include #include #include #include #include #include #include +#include +#include #include #include +#include #include @@ -63,6 +67,216 @@ struct fb_ctr_pair { struct cppc_perf_fb_ctrs fb_ctrs_t1; }; +#ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE + +/* Frequency invariance support */ +struct cppc_freq_invariance { + int cpu; + struct irq_work irq_work; + struct kthread_work work; + struct cppc_perf_fb_ctrs prev_perf_fb_ctrs; + struct cppc_cpudata *cpu_data; +}; + +static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv); +static struct kthread_worker *kworker_fie; + +static struct cpufreq_driver cppc_cpufreq_driver; +static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu); +static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1); + +/** + * cppc_scale_freq_workfn - CPPC arch_freq_scale updater for frequency invariance + * @work: The work item. + * + * The CPPC driver register itself with the topology core to provide its own + * implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which + * gets called by the scheduler on every tick. + * + * Note that the arch specific counters have higher priority than CPPC counters, + * if available, though the CPPC driver doesn't need to have any special + * handling for that. + * + * On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we + * reach here from hard-irq context), which then schedules a normal work item + * and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable + * based on the counter updates since the last tick. + */ +static void cppc_scale_freq_workfn(struct kthread_work *work) +{ + struct cppc_freq_invariance *cppc_fi; + struct cppc_perf_fb_ctrs fb_ctrs = {0}; + struct cppc_cpudata *cpu_data; + unsigned long local_freq_scale; + u64 perf; + + cppc_fi = container_of(work, struct cppc_freq_invariance, work); + cpu_data = cppc_fi->cpu_data; + + if (cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs)) { + pr_warn("%s: failed to read perf counters\n", __func__); + return; + } + + perf = cppc_perf_from_fbctrs(cpu_data, &cppc_fi->prev_perf_fb_ctrs, + &fb_ctrs); + cppc_fi->prev_perf_fb_ctrs = fb_ctrs; + + perf <<= SCHED_CAPACITY_SHIFT; + local_freq_scale = div64_u64(perf, cpu_data->perf_caps.highest_perf); + + /* This can happen due to counter's overflow */ + if (unlikely(local_freq_scale > 1024)) + local_freq_scale = 1024; + + per_cpu(arch_freq_scale, cppc_fi->cpu) = local_freq_scale; +} + +static void cppc_irq_work(struct irq_work *irq_work) +{ + struct cppc_freq_invariance *cppc_fi; + + cppc_fi = container_of(irq_work, struct cppc_freq_invariance, irq_work); + kthread_queue_work(kworker_fie, &cppc_fi->work); +} + +static void cppc_scale_freq_tick(void) +{ + struct cppc_freq_invariance *cppc_fi = &per_cpu(cppc_freq_inv, smp_processor_id()); + + /* + * cppc_get_perf_ctrs() can potentially sleep, call that from the right + * context. + */ + irq_work_queue(&cppc_fi->irq_work); +} + +static struct scale_freq_data cppc_sftd = { + .source = SCALE_FREQ_SOURCE_CPPC, + .set_freq_scale = cppc_scale_freq_tick, +}; + +static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) +{ + struct cppc_freq_invariance *cppc_fi; + int cpu, ret; + + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + for_each_cpu(cpu, policy->cpus) { + cppc_fi = &per_cpu(cppc_freq_inv, cpu); + cppc_fi->cpu = cpu; + cppc_fi->cpu_data = policy->driver_data; + kthread_init_work(&cppc_fi->work, cppc_scale_freq_workfn); + init_irq_work(&cppc_fi->irq_work, cppc_irq_work); + + ret = cppc_get_perf_ctrs(cpu, &cppc_fi->prev_perf_fb_ctrs); + if (ret) { + pr_warn("%s: failed to read perf counters for cpu:%d: %d\n", + __func__, cpu, ret); + + /* + * Don't abort if the CPU was offline while the driver + * was getting registered. + */ + if (cpu_online(cpu)) + return; + } + } + + /* Register for freq-invariance */ + topology_set_scale_freq_source(&cppc_sftd, policy->cpus); +} + +/* + * We free all the resources on policy's removal and not on CPU removal as the + * irq-work are per-cpu and the hotplug core takes care of flushing the pending + * irq-works (hint: smpcfd_dying_cpu()) on CPU hotplug. Even if the kthread-work + * fires on another CPU after the concerned CPU is removed, it won't harm. + * + * We just need to make sure to remove them all on policy->exit(). + */ +static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) +{ + struct cppc_freq_invariance *cppc_fi; + int cpu; + + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + /* policy->cpus will be empty here, use related_cpus instead */ + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, policy->related_cpus); + + for_each_cpu(cpu, policy->related_cpus) { + cppc_fi = &per_cpu(cppc_freq_inv, cpu); + irq_work_sync(&cppc_fi->irq_work); + kthread_cancel_work_sync(&cppc_fi->work); + } +} + +static void __init cppc_freq_invariance_init(void) +{ + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + .sched_policy = SCHED_DEADLINE, + .sched_nice = 0, + .sched_priority = 0, + /* + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. + */ + .sched_runtime = 1000000, + .sched_deadline = 10000000, + .sched_period = 10000000, + }; + int ret; + + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + kworker_fie = kthread_create_worker(0, "cppc_fie"); + if (IS_ERR(kworker_fie)) + return; + + ret = sched_setattr_nocheck(kworker_fie->task, &attr); + if (ret) { + pr_warn("%s: failed to set SCHED_DEADLINE: %d\n", __func__, + ret); + kthread_destroy_worker(kworker_fie); + return; + } +} + +static void cppc_freq_invariance_exit(void) +{ + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + kthread_destroy_worker(kworker_fie); + kworker_fie = NULL; +} + +#else +static inline void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) +{ +} + +static inline void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) +{ +} + +static inline void cppc_freq_invariance_init(void) +{ +} + +static inline void cppc_freq_invariance_exit(void) +{ +} +#endif /* CONFIG_ACPI_CPPC_CPUFREQ_FIE */ + /* Callback function used to retrieve the max frequency from DMI */ static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private) { @@ -366,6 +580,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) goto out; } + cppc_cpufreq_cpu_fie_init(policy); return 0; out: @@ -380,6 +595,8 @@ static int cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy) unsigned int cpu = policy->cpu; int ret; + cppc_cpufreq_cpu_fie_exit(policy); + cpu_data->perf_ctrls.desired_perf = caps->lowest_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); @@ -399,12 +616,12 @@ static inline u64 get_delta(u64 t1, u64 t0) return (u32)t1 - (u32)t0; } -static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data, - struct cppc_perf_fb_ctrs *fb_ctrs_t0, - struct cppc_perf_fb_ctrs *fb_ctrs_t1) +static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1) { u64 delta_reference, delta_delivered; - u64 reference_perf, delivered_perf; + u64 reference_perf; reference_perf = fb_ctrs_t0->reference_perf; @@ -413,14 +630,11 @@ static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data, delta_delivered = get_delta(fb_ctrs_t1->delivered, fb_ctrs_t0->delivered); - /* Check to avoid divide-by zero */ - if (delta_reference || delta_delivered) - delivered_perf = (reference_perf * delta_delivered) / - delta_reference; - else - delivered_perf = cpu_data->perf_ctrls.desired_perf; + /* Check to avoid divide-by zero and invalid delivered_perf */ + if (!delta_reference || !delta_delivered) + return cpu_data->perf_ctrls.desired_perf; - return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf); + return (reference_perf * delta_delivered) / delta_reference; } static int cppc_get_perf_ctrs_sample(void *val) @@ -457,6 +671,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) struct fb_ctr_pair fb_ctrs = { .cpu = cpu, }; struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); struct cppc_cpudata *cpu_data = policy->driver_data; + u64 delivered_perf; int ret; cpufreq_cpu_put(policy); @@ -470,9 +685,11 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) if (ret) return ret; - return cppc_get_rate_from_fbctrs(cpu_data, - &fb_ctrs.fb_ctrs_t0, - &fb_ctrs.fb_ctrs_t1); + delivered_perf = cppc_perf_from_fbctrs(cpu_data, + &fb_ctrs.fb_ctrs_t0, + &fb_ctrs.fb_ctrs_t1); + + return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf); } static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) @@ -678,14 +895,21 @@ static void cppc_check_hisi_workaround(void) static int __init cppc_cpufreq_init(void) { + int ret; + if ((acpi_disabled) || !acpi_cpc_valid()) return -ENODEV; INIT_LIST_HEAD(&cpu_data_list); cppc_check_hisi_workaround(); + cppc_freq_invariance_init(); - return cpufreq_register_driver(&cppc_cpufreq_driver); + ret = cpufreq_register_driver(&cppc_cpufreq_driver); + if (ret) + cppc_freq_invariance_exit(); + + return ret; } static inline void free_cpu_data(void) @@ -703,6 +927,7 @@ static inline void free_cpu_data(void) static void __exit cppc_cpufreq_exit(void) { cpufreq_unregister_driver(&cppc_cpufreq_driver); + cppc_freq_invariance_exit(); free_cpu_data(); } diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 9dc94841803c..dea46128b2da 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -41,6 +41,7 @@ bool topology_scale_freq_invariant(void); enum scale_freq_source { SCALE_FREQ_SOURCE_CPUFREQ = 0, SCALE_FREQ_SOURCE_ARCH, + SCALE_FREQ_SOURCE_CPPC, }; struct scale_freq_data { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5d2937935b47..74423f98ddf9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6639,6 +6639,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -- Gitee From 188087eab3b787d68dea469789de65e93eaadc6f Mon Sep 17 00:00:00 2001 From: Jie Zhan Date: Fri, 13 Jun 2025 15:27:49 +0800 Subject: [PATCH 23/63] cpufreq: CPPC: Don't warn on failing to read perf counters on offline cpus driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U ---------------------------------------------------------------------- Reading perf counters on offline cpus should be expected to fail, e.g. it returns -EFAULT as counters are shown to be 0. Remove the unnecessary warning print on this failure path. Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Jie Zhan Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index e551a8ce3232..c79d6d9c9412 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -174,16 +174,14 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) init_irq_work(&cppc_fi->irq_work, cppc_irq_work); ret = cppc_get_perf_ctrs(cpu, &cppc_fi->prev_perf_fb_ctrs); - if (ret) { - pr_warn("%s: failed to read perf counters for cpu:%d: %d\n", - __func__, cpu, ret); - + if (ret && cpu_online(cpu)) { /* * Don't abort if the CPU was offline while the driver * was getting registered. */ - if (cpu_online(cpu)) - return; + pr_debug("%s: failed to read perf counters for cpu:%d: %d\n", + __func__, cpu, ret); + return; } } -- Gitee From a19c1ab25f230328838a0c17aef011b862b97224 Mon Sep 17 00:00:00 2001 From: Jie Zhan Date: Fri, 13 Jun 2025 15:27:50 +0800 Subject: [PATCH 24/63] cpufreq: CPPC: Fix error handling in cppc_scale_freq_workfn() driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U ---------------------------------------------------------------------- Perf counters could be 0 if the cpu is in a low-power idle state. Just try it again next time and update the frequency scale when the cpu is active and perf counters successfully return. Also, remove the FIE source on an actual failure. Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Jie Zhan Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index c79d6d9c9412..379c7793a38f 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -111,12 +111,23 @@ static void cppc_scale_freq_workfn(struct kthread_work *work) struct cppc_cpudata *cpu_data; unsigned long local_freq_scale; u64 perf; + int ret; cppc_fi = container_of(work, struct cppc_freq_invariance, work); cpu_data = cppc_fi->cpu_data; - if (cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs)) { + ret = cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs); + /* + * Perf counters could be 0 if the cpu is in a low-power idle state. + * Just try it again next time. + */ + if (ret == -EFAULT) + return; + + if (ret) { pr_warn("%s: failed to read perf counters\n", __func__); + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, + cpu_data->shared_cpu_map); return; } -- Gitee From c00a3b8d82186d21aac9f473209f8b611ce63131 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Tue, 8 Oct 2024 11:09:18 +0000 Subject: [PATCH 25/63] cppc_cpufreq: Fix possible null pointer dereference stable inclusion from stable-v6.6.33 commit f84b9b25d045e67a7eee5e73f21278c8ab06713c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f84b9b25d045e67a7eee5e73f21278c8ab06713c -------------------------------- [ Upstream commit cf7de25878a1f4508c69dc9f6819c21ba177dbfe ] cppc_cpufreq_get_rate() and hisi_cppc_cpufreq_get_rate() can be called from different places with various parameters. So cpufreq_cpu_get() can return null as 'policy' in some circumstances. Fix this bug by adding null return check. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: a28b2bfc099c ("cppc_cpufreq: replace per-cpu data array with a list") Signed-off-by: Aleksandr Mishin Signed-off-by: Viresh Kumar Signed-off-by: Sasha Levin Conflicts: drivers/cpufreq/cppc_cpufreq.c [Context conflict] Signed-off-by: ZhangPeng Signed-off-by: Liu Mingrui Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 379c7793a38f..e7a078878d2a 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -679,10 +679,15 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) { struct fb_ctr_pair fb_ctrs = { .cpu = cpu, }; struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - struct cppc_cpudata *cpu_data = policy->driver_data; + struct cppc_cpudata *cpu_data; u64 delivered_perf; int ret; + if (!policy) + return -ENODEV; + + cpu_data = policy->driver_data; + cpufreq_cpu_put(policy); if (cpu_has_amu_feat(cpu)) @@ -866,10 +871,15 @@ static struct cpufreq_driver cppc_cpufreq_driver = { static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - struct cppc_cpudata *cpu_data = policy->driver_data; + struct cppc_cpudata *cpu_data; u64 desired_perf; int ret; + if (!policy) + return -ENODEV; + + cpu_data = policy->driver_data; + cpufreq_cpu_put(policy); ret = cppc_get_desired_perf(cpu, &desired_perf); -- Gitee From 8870795b0e7f19fa8712d7a3f6c64f8e6bc06aac Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 21 Aug 2025 15:15:57 +0800 Subject: [PATCH 26/63] cpufreq: cppc: Fix invalid return value in .get() callback mainline inclusion from mainline-v6.16-rc1 commit 2b8e6b58889c672e1ae3601d9b2b070be4dc2fbc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2b8e6b58889c672e1ae3601d9b2b070be4dc2fbc ---------------------------------------------------------------------- Returning a negative error code in a function with an unsigned return type is a pretty bad idea. It is probably worse when the justification for the change is "our static analisys tool found it". Fixes: cf7de25878a1 ("cppc_cpufreq: Fix possible null pointer dereference") Signed-off-by: Marc Zyngier Cc: "Rafael J. Wysocki" Cc: Viresh Kumar Reviewed-by: Lifeng Zheng Signed-off-by: Viresh Kumar Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index e7a078878d2a..97d30f0be14e 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -684,7 +684,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) int ret; if (!policy) - return -ENODEV; + return 0; cpu_data = policy->driver_data; -- Gitee From 0105755fb4234a3cf16923f39bd6f019e8510d08 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:15:58 +0800 Subject: [PATCH 27/63] cpufreq: cppc: Fix invalid return value in hisi_cppc_cpufreq_get_rate() driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U ---------------------------------------------------------------------- Returning a negative error code in a function with an unsigned return type is a pretty bad idea. Return 0 is enough when something wrong. Fixes: cf7de25878a1 ("cppc_cpufreq: Fix possible null pointer dereference") Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 97d30f0be14e..c3155f546c82 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -876,7 +876,7 @@ static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu) int ret; if (!policy) - return -ENODEV; + return 0; cpu_data = policy->driver_data; @@ -884,7 +884,7 @@ static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu) ret = cppc_get_desired_perf(cpu, &desired_perf); if (ret < 0) - return -EIO; + return 0; return cppc_cpufreq_perf_to_khz(cpu_data, desired_perf); } -- Gitee From e3aa26daaf7b50f2ba8ce0e5f0a6e8c04705011a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 16 Mar 2021 16:54:03 +0100 Subject: [PATCH 28/63] ACPI: CPPC: Add emtpy stubs of functions for CONFIG_ACPI_CPPC_LIB unset mainline inclusion from mainline-v5.13-rc1 commit 8a02d99876362f35bc918097440445de18e3c47c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=8a02d99876362f35bc918097440445de18e3c47c ---------------------------------------------------------------------- For convenience, add empty stubs of library functions defined in cppc_acpi.c for the CONFIG_ACPI_CPPC_LIB unset case. Because one of them needs to return CPUFREQ_ETERNAL, include linux/cpufreq.h into the CPPC library header file and drop the direct inclusion of it from cppc_acpi.c. Signed-off-by: Rafael J. Wysocki Tested-by: Chen Yu Signed-off-by: huwentao --- drivers/acpi/cppc_acpi.c | 1 - include/acpi/cppc_acpi.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 0ba9838ce64b..7c4b9f63ae1e 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -33,7 +33,6 @@ #define pr_fmt(fmt) "ACPI CPPC: " fmt -#include #include #include #include diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index e20a92ee5493..54c5d21e2827 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -11,6 +11,7 @@ #define _CPPC_ACPI_H #include +#include #include #include @@ -132,6 +133,7 @@ struct cppc_cpudata { cpumask_var_t shared_cpu_map; }; +#ifdef CONFIG_ACPI_CPPC_LIB extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf); @@ -148,5 +150,43 @@ extern unsigned int cppc_get_transition_latency(int cpu); extern bool cpc_ffh_supported(void); extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val); +#else /* !CONFIG_ACPI_CPPC_LIB */ +static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf) +{ + return -ENOTSUPP; +} +static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) +{ + return -ENOTSUPP; +} +static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) +{ + return -ENOTSUPP; +} +static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) +{ + return -ENOTSUPP; +} +static inline bool acpi_cpc_valid(void) +{ + return false; +} +static inline unsigned int cppc_get_transition_latency(int cpu) +{ + return CPUFREQ_ETERNAL; +} +static inline bool cpc_ffh_supported(void) +{ + return false; +} +static inline int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) +{ + return -ENOTSUPP; +} +static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + return -ENOTSUPP; +} +#endif /* !CONFIG_ACPI_CPPC_LIB */ #endif /* _CPPC_ACPI_H*/ -- Gitee From cdac1221d33cf146a453ce5d66d480d38f8b5e5b Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Mon, 12 Sep 2022 15:37:22 -0500 Subject: [PATCH 29/63] ACPI: CPPC: Disable FIE if registers in PCC regions mainline inclusion from mainline-v6.1-rc1 commit ae2df912d1a557a3548be83da20851ac55f42ab3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=ae2df912d1a557a3548be83da20851ac55f42ab3 ---------------------------------------------------------------------- PCC regions utilize a mailbox to set/retrieve register values used by the CPPC code. This is fine as long as the operations are infrequent. With the FIE code enabled though the overhead can range from 2-11% of system CPU overhead (ex: as measured by top) on Arm based machines. So, before enabling FIE assure none of the registers used by cppc_get_perf_ctrs() are in the PCC region. Finally, add a module parameter which can override the PCC region detection at boot or module reload. Signed-off-by: Jeremy Linton Acked-by: Viresh Kumar Reviewed-by: Ionela Voinescu Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/acpi/cppc_acpi.c | 42 ++++++++++++++++++++++++++++++++++ drivers/cpufreq/cppc_cpufreq.c | 25 ++++++++++++++++---- include/acpi/cppc_acpi.h | 5 ++++ 3 files changed, 68 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 7c4b9f63ae1e..c2477bd58efc 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1404,6 +1404,48 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps) } EXPORT_SYMBOL_GPL(cppc_get_perf_caps); +/** + * cppc_perf_ctrs_in_pcc - Check if any perf counters are in a PCC region. + * + * CPPC has flexibility about how CPU performance counters are accessed. + * One of the choices is PCC regions, which can have a high access latency. This + * routine allows callers of cppc_get_perf_ctrs() to know this ahead of time. + * + * Return: true if any of the counters are in PCC regions, false otherwise + */ +bool cppc_perf_ctrs_in_pcc(void) +{ + int cpu; + + for_each_present_cpu(cpu) { + struct cpc_register_resource *ref_perf_reg; + struct cpc_desc *cpc_desc; + + cpc_desc = per_cpu(cpc_desc_ptr, cpu); + + if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) || + CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) || + CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME])) + return true; + + + ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF]; + + /* + * If reference perf register is not supported then we should + * use the nominal perf value + */ + if (!CPC_SUPPORTED(ref_perf_reg)) + ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF]; + + if (CPC_IN_PCC(ref_perf_reg)) + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc); + /** * cppc_get_perf_ctrs - Read a CPU's performance feedback counters. * @cpunum: CPU from which to read counters. diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index c3155f546c82..8274f8dd1054 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -67,7 +67,15 @@ struct fb_ctr_pair { struct cppc_perf_fb_ctrs fb_ctrs_t1; }; +static enum { + FIE_UNSET = -1, + FIE_ENABLED, + FIE_DISABLED +} fie_disabled = FIE_UNSET; + #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE +module_param(fie_disabled, int, 0444); +MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); /* Frequency invariance support */ struct cppc_freq_invariance { @@ -174,7 +182,7 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) struct cppc_freq_invariance *cppc_fi; int cpu, ret; - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + if (fie_disabled) return; for_each_cpu(cpu, policy->cpus) { @@ -213,7 +221,7 @@ static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) struct cppc_freq_invariance *cppc_fi; int cpu; - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + if (fie_disabled) return; /* policy->cpus will be empty here, use related_cpus instead */ @@ -243,7 +251,15 @@ static void __init cppc_freq_invariance_init(void) }; int ret; - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + if (fie_disabled != FIE_ENABLED && fie_disabled != FIE_DISABLED) { + fie_disabled = FIE_ENABLED; + if (cppc_perf_ctrs_in_pcc()) { + pr_info("FIE not enabled on systems with registers in PCC\n"); + fie_disabled = FIE_DISABLED; + } + } + + if (fie_disabled) return; kworker_fie = kthread_create_worker(0, "cppc_fie"); @@ -261,7 +277,7 @@ static void __init cppc_freq_invariance_init(void) static void cppc_freq_invariance_exit(void) { - if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + if (fie_disabled) return; kthread_destroy_worker(kworker_fie); @@ -905,6 +921,7 @@ static void cppc_check_hisi_workaround(void) wa_info[i].oem_revision == tbl->oem_revision) { /* Overwrite the get() callback */ cppc_cpufreq_driver.get = hisi_cppc_cpufreq_get_rate; + fie_disabled = FIE_DISABLED; break; } } diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 54c5d21e2827..f1532779ba16 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -144,6 +144,7 @@ extern int cppc_set_auto_sel(int cpu, bool enable); extern int cppc_set_epp(int cpu, u64 epp_val); extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); +extern bool cppc_perf_ctrs_in_pcc(void); extern bool acpi_cpc_valid(void); extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); extern unsigned int cppc_get_transition_latency(int cpu); @@ -167,6 +168,10 @@ static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) { return -ENOTSUPP; } +static inline bool cppc_perf_ctrs_in_pcc(void) +{ + return false; +} static inline bool acpi_cpc_valid(void) { return false; -- Gitee From 58d98182c760d754a2d9410bf8fe4f8514462d1b Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 15 Aug 2021 21:07:29 +0800 Subject: [PATCH 30/63] cpufreq: remove useless INIT_LIST_HEAD() mainline inclusion from mainline-v5.16-rc1 commit 6065a672679f0502ede024a03db82c03534a5e00 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=6065a672679f0502ede024a03db82c03534a5e00 ---------------------------------------------------------------------- list cpu_data_list has been inited staticly through LIST_HEAD, so there's no need to call another INIT_LIST_HEAD. Simply remove it from cppc_cpufreq_init. Signed-off-by: Han Wang Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 8274f8dd1054..f3cf3cdfbe3f 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -936,8 +936,6 @@ static int __init cppc_cpufreq_init(void) if ((acpi_disabled) || !acpi_cpc_valid()) return -ENODEV; - INIT_LIST_HEAD(&cpu_data_list); - cppc_check_hisi_workaround(); cppc_freq_invariance_init(); -- Gitee From 086d7de26ef7cf5baec565927b3d2f656435851d Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Mon, 25 Apr 2022 14:38:07 +0200 Subject: [PATCH 31/63] cpufreq: CPPC: Add per_cpu efficiency_class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.19-rc1 commit d3c3db41df7e1bdefc9c68073070b62ce3b260bd category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=d3c3db41df7e1bdefc9c68073070b62ce3b260bd ---------------------------------------------------------------------- In ACPI, describing power efficiency of CPUs can be done through the following arm specific field: ACPI 6.4, s5.2.12.14 'GIC CPU Interface (GICC) Structure', 'Processor Power Efficiency Class field': Describes the relative power efficiency of the associated pro- cessor. Lower efficiency class numbers are more efficient than higher ones (e.g. efficiency class 0 should be treated as more efficient than efficiency class 1). However, absolute values of this number have no meaning: 2 isn’t necessarily half as efficient as 1. The efficiency_class field is stored in the GicC structure of the ACPI MADT table and it's currently supported in Linux for arm64 only. Thus, this new functionality is introduced for arm64 only. To allow the cppc_cpufreq driver to know and preprocess the efficiency_class values of all the CPUs, add a per_cpu efficiency_class variable to store them. At least 2 different efficiency classes must be present, otherwise there is no use in creating an Energy Model. The efficiency_class values are squeezed in [0:#efficiency_class-1] while conserving the order. For instance, efficiency classes of: [111, 212, 250] will be mapped to: [0 (was 111), 1 (was 212), 2 (was 250)]. Each policy being independently registered in the driver, populating the per_cpu efficiency_class is done only once at the driver initialization. This prevents from having each policy re-searching the efficiency_class values of other CPUs. The EM will be registered in a following patch. The patch also exports acpi_cpu_get_madt_gicc() to fetch the GicC structure of the ACPI MADT table for each CPU. Acked-by: Catalin Marinas Signed-off-by: Pierre Gondois Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- arch/arm64/kernel/smp.c | 1 + drivers/cpufreq/cppc_cpufreq.c | 42 ++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 23707812f87a..84704529569b 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -574,6 +574,7 @@ struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu) { return &cpu_madt_gicc[cpu]; } +EXPORT_SYMBOL_GPL(acpi_cpu_get_madt_gicc); /* * acpi_map_gic_cpu_interface - parse processor MADT entry diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index f3cf3cdfbe3f..d525c3c92def 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -472,12 +472,53 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) return delay_us; } +static DEFINE_PER_CPU(unsigned int, efficiency_class); + +static int populate_efficiency_class(void) +{ + struct acpi_madt_generic_interrupt *gicc; + DECLARE_BITMAP(used_classes, 256) = {}; + int class, cpu, index; + + for_each_possible_cpu(cpu) { + gicc = acpi_cpu_get_madt_gicc(cpu); + class = gicc->efficiency_class; + bitmap_set(used_classes, class, 1); + } + + if (bitmap_weight(used_classes, 256) <= 1) { + pr_debug("Efficiency classes are all equal (=%d). " + "No EM registered", class); + return -EINVAL; + } + + /* + * Squeeze efficiency class values on [0:#efficiency_class-1]. + * Values are per spec in [0:255]. + */ + index = 0; + for_each_set_bit(class, used_classes, 256) { + for_each_possible_cpu(cpu) { + gicc = acpi_cpu_get_madt_gicc(cpu); + if (gicc->efficiency_class == class) + per_cpu(efficiency_class, cpu) = index; + } + index++; + } + + return 0; +} + #else static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) { return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; } +static int populate_efficiency_class(void) +{ + return 0; +} #endif @@ -938,6 +979,7 @@ static int __init cppc_cpufreq_init(void) cppc_check_hisi_workaround(); cppc_freq_invariance_init(); + populate_efficiency_class(); ret = cpufreq_register_driver(&cppc_cpufreq_driver); if (ret) -- Gitee From baa73831b23d86cf55e42d8b2dc4091ebb6439ed Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 21 Mar 2022 09:57:22 +0000 Subject: [PATCH 32/63] PM: EM: Add .get_cost() callback mainline inclusion from mainline-v5.19-rc1 commit bdc21a4d286c6917fe966fd765de99411095b1b4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=bdc21a4d286c6917fe966fd765de99411095b1b4 ---------------------------------------------------------------------- The Energy Model (EM) supports devices which report abstract power scale, not only real Watts. The primary goal for EM is to enable the Energy Aware Scheduler (EAS) for a given platform. Some of the platforms might not be able to deliver proper power values. The only information that they might have is the relative efficiency between CPU types. Thus, it makes sense to remove some restrictions in the EM framework and introduce a mechanism which would support those platforms. What is crucial for EAS to operate is the 'cost' field in the EM. The 'cost' is calculated internally in EM framework based on knowledge from 'power' values. The 'cost' values must be strictly increasing. The existing API with its 'power' value size restrictions cannot guarantee that the 'cost' will meet this requirement. Since the platform is missing this detailed information, but has only efficiency details, introduce a new custom callback in the EM framework. The new callback would allow to provide the 'cost' values which reflect efficiency of the CPUs. This would allow to provide EAS information which has different relation than what would be forced by the EM internal formulas calculating 'cost' values. Thanks to this new callback it is possible to create a system view for EAS which has no overlapping performance states across many Performance Domains. Signed-off-by: Lukasz Luba Reviewed-by: Ionela Voinescu Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- include/linux/energy_model.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 5f04a2b35e80..97b0c9c1c31c 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -89,8 +89,29 @@ struct em_data_callback { */ int (*active_power)(unsigned long *power, unsigned long *freq, struct device *dev); + + /** + * get_cost() - Provide the cost at the given performance state of + * a device + * @dev : Device for which we do this operation (can be a CPU) + * @freq : Frequency at the performance state in kHz + * @cost : The cost value for the performance state + * (modified) + * + * In case of CPUs, the cost is the one of a single CPU in the domain. + * It is expected to fit in the [0, EM_MAX_POWER] range due to internal + * usage in EAS calculation. + * + * Return 0 on success, or appropriate error value in case of failure. + */ + int (*get_cost)(struct device *dev, unsigned long freq, + unsigned long *cost); }; -#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb } +#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) \ + { .active_power = _active_power_cb, \ + .get_cost = _cost_cb } +#define EM_DATA_CB(_active_power_cb) \ + EM_ADV_DATA_CB(_active_power_cb, NULL) struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); @@ -198,6 +219,7 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) #else struct em_data_callback {}; +#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) { } #define EM_DATA_CB(_active_power_cb) { } static inline -- Gitee From 7e43efae588111f0f913f6470fd9f7950afc4865 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 10 Aug 2021 12:04:33 +0530 Subject: [PATCH 33/63] cpufreq: Add callback to register with energy model mainline inclusion from mainline-v5.15-rc1 commit c17495b01b72b53bd290f442d39b060e015c7aea category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=c17495b01b72b53bd290f442d39b060e015c7aea ---------------------------------------------------------------------- Many cpufreq drivers register with the energy model for each policy and do exactly the same thing. Follow the footsteps of thermal-cooling, to get it done from the cpufreq core itself. Provide a new callback, which will be called, if present, by the cpufreq core at the right moment (more on that in the code's comment). Also provide a generic implementation that uses dev_pm_opp_of_register_em(). This also allows us to register with the EM at a later point of time, compared to ->init(), from where the EM core can access cpufreq policy directly using cpufreq_cpu_get() type of helpers and perform other work, like marking few frequencies inefficient, this will be done separately. Reviewed-by: Quentin Perret Reviewed-by: Lukasz Luba Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 13 +++++++++++++ include/linux/cpufreq.h | 14 ++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 20cd91468d92..dfcb6fc75758 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1536,6 +1536,19 @@ static int cpufreq_online(unsigned int cpu) write_lock_irqsave(&cpufreq_driver_lock, flags); list_add(&policy->policy_list, &cpufreq_policy_list); write_unlock_irqrestore(&cpufreq_driver_lock, flags); + + /* + * Register with the energy model before + * sched_cpufreq_governor_change() is called, which will result + * in rebuilding of the sched domains, which should only be done + * once the energy model is properly initialized for the policy + * first. + * + * Also, this should be called before the policy is registered + * with cooling framework. + */ + if (cpufreq_driver->register_em) + cpufreq_driver->register_em(policy); } ret = cpufreq_init_policy(policy); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d4a0374617aa..547cddaab2fb 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -9,10 +9,12 @@ #define _LINUX_CPUFREQ_H #include +#include #include #include #include #include +#include #include #include #include @@ -385,6 +387,12 @@ struct cpufreq_driver { /* platform specific boost support code */ bool boost_enabled; int (*set_boost)(struct cpufreq_policy *policy, int state); + + /* + * Set by drivers that want to register with the energy model after the + * policy is properly initialized, but before the governor is started. + */ + void (*register_em)(struct cpufreq_policy *policy); }; /* flags */ @@ -1072,4 +1080,10 @@ unsigned int cpufreq_generic_get(unsigned int cpu); void cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +static inline void cpufreq_register_em_with_opp(struct cpufreq_policy *policy) +{ + dev_pm_opp_of_register_em(get_cpu_device(policy->cpu), + policy->related_cpus); +} #endif /* _LINUX_CPUFREQ_H */ -- Gitee From 016a61b58eb346176d47d66ce263dd00334b26ea Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 21 Mar 2022 09:57:25 +0000 Subject: [PATCH 34/63] PM: EM: Change the order of arguments in the .active_power() callback mainline inclusion from mainline-v5.19-rc1 commit 75a3a99a5a9886af13be44e640cb415ebda80db2 category: cleanup bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=75a3a99a5a9886af13be44e640cb415ebda80db2 ---------------------------------------------------------------------- The .active_power() callback passes the device pointer when it's called. Aligned with a convetion present in other subsystems and pass the 'dev' as a first argument. It looks more cleaner. Adjust all affected drivers which implement that API callback. Suggested-by: Ionela Voinescu Signed-off-by: Lukasz Luba Reviewed-by: Ionela Voinescu Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- Documentation/power/energy-model.rst | 4 ++-- drivers/cpufreq/scmi-cpufreq.c | 4 ++-- drivers/opp/of.c | 4 ++-- include/linux/energy_model.h | 6 +++--- kernel/power/energy_model.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst index a6fb986abe3c..5f03b6f47d9e 100644 --- a/Documentation/power/energy-model.rst +++ b/Documentation/power/energy-model.rst @@ -117,8 +117,8 @@ EM framework:: -> drivers/cpufreq/foo_cpufreq.c - 01 static int est_power(unsigned long *mW, unsigned long *KHz, - 02 struct device *dev) + 01 static int est_power(struct device *dev, unsigned long *mW, + 02 unsigned long *KHz) 03 { 04 long freq, power; 05 diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index bb1389f276d7..4b86b074a874 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -96,8 +96,8 @@ scmi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) } static int __maybe_unused -scmi_get_cpu_power(unsigned long *power, unsigned long *KHz, - struct device *cpu_dev) +scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power, + unsigned long *KHz) { unsigned long Hz; int ret, domain; diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 3d7adc0de128..6873085e11e0 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1240,8 +1240,8 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node); * Returns -EINVAL if the power calculation failed because of missing * parameters, 0 otherwise. */ -static int __maybe_unused _get_power(unsigned long *mW, unsigned long *kHz, - struct device *dev) +static int __maybe_unused _get_power(struct device *dev, unsigned long *mW, + unsigned long *kHz) { struct dev_pm_opp *opp; struct device_node *np; diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 97b0c9c1c31c..47506c394841 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -71,11 +71,11 @@ struct em_data_callback { /** * active_power() - Provide power at the next performance state of * a device + * @dev : Device for which we do this operation (can be a CPU) * @power : Active power at the performance state in mW * (modified) * @freq : Frequency at the performance state in kHz * (modified) - * @dev : Device for which we do this operation (can be a CPU) * * active_power() must find the lowest performance state of 'dev' above * 'freq' and update 'power' and 'freq' to the matching active power @@ -87,8 +87,8 @@ struct em_data_callback { * * Return 0 on success. */ - int (*active_power)(unsigned long *power, unsigned long *freq, - struct device *dev); + int (*active_power)(struct device *dev, unsigned long *power, + unsigned long *freq); /** * get_cost() - Provide the cost at the given performance state of diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 334173fe6940..0dab668f8c9e 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -107,7 +107,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, dev); + ret = cb->active_power(dev, &power, &freq); if (ret) { dev_err(dev, "EM: invalid perf. state: %d\n", ret); -- Gitee From 74b5fe4a6b0cf770ccb4bb11ff5224aaf1317432 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Mon, 25 Apr 2022 14:38:08 +0200 Subject: [PATCH 35/63] cpufreq: CPPC: Register EM based on efficiency class information mainline inclusion from mainline-v5.19-rc1 commit 740fcdc2c20ecf855b36b919d7fa1b872b5a7eae category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=740fcdc2c20ecf855b36b919d7fa1b872b5a7eae ---------------------------------------------------------------------- Performance states and energy consumption values are not advertised in ACPI. In the GicC structure of the MADT table, the "Processor Power Efficiency Class field" (called efficiency class from now) allows to describe the relative energy efficiency of CPUs. To leverage the EM and EAS, the CPPC driver creates a set of artificial performance states and registers them in the Energy Model (EM), such as: - Every 20 capacity unit, a performance state is created. - The energy cost of each performance state gradually increases. No power value is generated as only the cost is used in the EM. During task placement, a task can raise the frequency of its whole pd. This can make EAS place a task on a pd with CPUs that are individually less energy efficient. As cost values are artificial, and to place tasks on CPUs with the lower efficiency class, a gap in cost values is generated for adjacent efficiency classes. E.g.: - efficiency class = 0, capacity is in [0-1024], so cost values are in [0: 51] (one performance state every 20 capacity unit) - efficiency class = 1, capacity is in [0-1024], cost values are in [1*gap+0: 1*gap+51]. The value of the cost gap is chosen to absorb a the energy of 4 CPUs at their maximum capacity. This means that between: 1- a pd of 4 CPUs, each of them being used at almost their full capacity. Their efficiency class is N. 2- a CPU using almost none of its capacity. Its efficiency class is N+1 EAS will choose the first option. This patch also populates the (struct cpufreq_driver).register_em callback if the valid efficiency_class ACPI values are provided. Signed-off-by: Pierre Gondois Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/base/arch_topology.c | 1 + drivers/cpufreq/cppc_cpufreq.c | 144 +++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 3e7d15e25c22..7f947e5734c2 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -134,6 +134,7 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, } DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; +EXPORT_PER_CPU_SYMBOL_GPL(cpu_scale); void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) { diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index d525c3c92def..e358178d26ef 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -473,6 +473,134 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) } static DEFINE_PER_CPU(unsigned int, efficiency_class); +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy); + +/* Create an artificial performance state every CPPC_EM_CAP_STEP capacity unit. */ +#define CPPC_EM_CAP_STEP (20) +/* Increase the cost value by CPPC_EM_COST_STEP every performance state. */ +#define CPPC_EM_COST_STEP (1) +/* Add a cost gap correspnding to the energy of 4 CPUs. */ +#define CPPC_EM_COST_GAP (4 * SCHED_CAPACITY_SCALE * CPPC_EM_COST_STEP \ + / CPPC_EM_CAP_STEP) + +static unsigned int get_perf_level_count(struct cpufreq_policy *policy) +{ + struct cppc_perf_caps *perf_caps; + unsigned int min_cap, max_cap; + struct cppc_cpudata *cpu_data; + int cpu = policy->cpu; + + cpu_data = policy->driver_data; + perf_caps = &cpu_data->perf_caps; + max_cap = arch_scale_cpu_capacity(cpu); + min_cap = div_u64(max_cap * perf_caps->lowest_perf, perf_caps->highest_perf); + if ((min_cap == 0) || (max_cap < min_cap)) + return 0; + return 1 + max_cap / CPPC_EM_CAP_STEP - min_cap / CPPC_EM_CAP_STEP; +} + +/* + * The cost is defined as: + * cost = power * max_frequency / frequency + */ +static inline unsigned long compute_cost(int cpu, int step) +{ + return CPPC_EM_COST_GAP * per_cpu(efficiency_class, cpu) + + step * CPPC_EM_COST_STEP; +} + +static int cppc_get_cpu_power(struct device *cpu_dev, + unsigned long *power, unsigned long *KHz) +{ + unsigned long perf_step, perf_prev, perf, perf_check; + unsigned int min_step, max_step, step, step_check; + unsigned long prev_freq = *KHz; + unsigned int min_cap, max_cap; + struct cpufreq_policy *policy; + + struct cppc_perf_caps *perf_caps; + struct cppc_cpudata *cpu_data; + + policy = cpufreq_cpu_get_raw(cpu_dev->id); + cpu_data = policy->driver_data; + perf_caps = &cpu_data->perf_caps; + max_cap = arch_scale_cpu_capacity(cpu_dev->id); + min_cap = div_u64(max_cap * perf_caps->lowest_perf, + perf_caps->highest_perf); + + perf_step = CPPC_EM_CAP_STEP * perf_caps->highest_perf / max_cap; + min_step = min_cap / CPPC_EM_CAP_STEP; + max_step = max_cap / CPPC_EM_CAP_STEP; + + perf_prev = cppc_cpufreq_khz_to_perf(cpu_data, *KHz); + step = perf_prev / perf_step; + + if (step > max_step) + return -EINVAL; + + if (min_step == max_step) { + step = max_step; + perf = perf_caps->highest_perf; + } else if (step < min_step) { + step = min_step; + perf = perf_caps->lowest_perf; + } else { + step++; + if (step == max_step) + perf = perf_caps->highest_perf; + else + perf = step * perf_step; + } + + *KHz = cppc_cpufreq_perf_to_khz(cpu_data, perf); + perf_check = cppc_cpufreq_khz_to_perf(cpu_data, *KHz); + step_check = perf_check / perf_step; + + /* + * To avoid bad integer approximation, check that new frequency value + * increased and that the new frequency will be converted to the + * desired step value. + */ + while ((*KHz == prev_freq) || (step_check != step)) { + perf++; + *KHz = cppc_cpufreq_perf_to_khz(cpu_data, perf); + perf_check = cppc_cpufreq_khz_to_perf(cpu_data, *KHz); + step_check = perf_check / perf_step; + } + + /* + * With an artificial EM, only the cost value is used. Still the power + * is populated such as 0 < power < EM_MAX_POWER. This allows to add + * more sense to the artificial performance states. + */ + *power = compute_cost(cpu_dev->id, step); + + return 0; +} + +static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, + unsigned long *cost) +{ + unsigned long perf_step, perf_prev; + struct cppc_perf_caps *perf_caps; + struct cpufreq_policy *policy; + struct cppc_cpudata *cpu_data; + unsigned int max_cap; + int step; + + policy = cpufreq_cpu_get_raw(cpu_dev->id); + cpu_data = policy->driver_data; + perf_caps = &cpu_data->perf_caps; + max_cap = arch_scale_cpu_capacity(cpu_dev->id); + + perf_prev = cppc_cpufreq_khz_to_perf(cpu_data, KHz); + perf_step = CPPC_EM_CAP_STEP * perf_caps->highest_perf / max_cap; + step = perf_prev / perf_step; + + *cost = compute_cost(cpu_dev->id, step); + + return 0; +} static int populate_efficiency_class(void) { @@ -505,10 +633,23 @@ static int populate_efficiency_class(void) } index++; } + cppc_cpufreq_driver.register_em = cppc_cpufreq_register_em; return 0; } +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data; + struct em_data_callback em_cb = + EM_ADV_DATA_CB(cppc_get_cpu_power, cppc_get_cpu_cost); + + cpu_data = policy->driver_data; + em_dev_register_perf_domain(get_cpu_device(policy->cpu), + get_perf_level_count(policy), &em_cb, + cpu_data->shared_cpu_map); +} + #else static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) @@ -519,6 +660,9 @@ static int populate_efficiency_class(void) { return 0; } +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) +{ +} #endif -- Gitee From a0a7222d8d215b33ef10138554c86abc84529cb9 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:15:59 +0800 Subject: [PATCH 36/63] cpufreq: CPPC: Remove forward declaration of hisi_cppc_cpufreq_get_rate() driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U ---------------------------------------------------------------------- After commit ae2df912d1a5 ("ACPI: CPPC: Disable FIE if registers in PCC regions"), the only place uses hisi_cppc_cpufreq_get_rate() is cppc_check_hisi_workaround(), which is after the implementation of hisi_cppc_cpufreq_get_rate(). A forward declaration of hisi_cppc_cpufreq_get_rate() is unnecessarily. Fixes: ae2df912d1a5 ("ACPI: CPPC: Disable FIE if registers in PCC regions") Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index e358178d26ef..238c1c75f2f2 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -90,7 +90,6 @@ static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv); static struct kthread_worker *kworker_fie; static struct cpufreq_driver cppc_cpufreq_driver; -static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu); static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, struct cppc_perf_fb_ctrs *fb_ctrs_t0, struct cppc_perf_fb_ctrs *fb_ctrs_t1); -- Gitee From 09958e2ec0014b0b9852ee331d11b1ab7f8220b6 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:00 +0800 Subject: [PATCH 37/63] cpufreq: CPPC: Remove cpu_data_list mainline inclusion from mainline-v6.17-rc2 commit d80a75624051b817043431f847470fb4680f2582 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d80a75624051b817043431f847470fb4680f2582 ---------------------------------------------------------------------- After commit a28b2bfc099c ("cppc_cpufreq: replace per-cpu data array with a list"), cpu_data can be got from policy->driver_data, so cpu_data_list is not actually needed and can be removed. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250526113057.3086513-2-zhenglifeng1@huawei.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 25 ------------------------- include/acpi/cppc_acpi.h | 1 - 2 files changed, 26 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 238c1c75f2f2..e2de648951d2 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -33,14 +33,6 @@ /* Offest in the DMI processor structure for the max frequency */ #define DMI_PROCESSOR_MAX_SPEED 0x14 -/* - * This list contains information parsed from per CPU ACPI _CPC and _PSD - * structures: e.g. the highest and lowest supported performance, capabilities, - * desired performance, level requested etc. Depending on the share_type, not - * all CPUs will have an entry in the list. - */ -static LIST_HEAD(cpu_data_list); - static bool boost_supported; struct cppc_workaround_oem_info { @@ -693,8 +685,6 @@ static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) cpu_data->perf_caps.lowest_freq *= 1000; cpu_data->perf_caps.nominal_freq *= 1000; - list_add(&cpu_data->node, &cpu_data_list); - return cpu_data; free_mask: @@ -709,7 +699,6 @@ static void cppc_cpufreq_put_cpu_data(struct cpufreq_policy *policy) { struct cppc_cpudata *cpu_data = policy->driver_data; - list_del(&cpu_data->node); free_cpumask_var(cpu_data->shared_cpu_map); kfree(cpu_data); policy->driver_data = NULL; @@ -1131,24 +1120,10 @@ static int __init cppc_cpufreq_init(void) return ret; } -static inline void free_cpu_data(void) -{ - struct cppc_cpudata *iter, *tmp; - - list_for_each_entry_safe(iter, tmp, &cpu_data_list, node) { - free_cpumask_var(iter->shared_cpu_map); - list_del(&iter->node); - kfree(iter); - } - -} - static void __exit cppc_cpufreq_exit(void) { cpufreq_unregister_driver(&cppc_cpufreq_driver); cppc_freq_invariance_exit(); - - free_cpu_data(); } module_exit(cppc_cpufreq_exit); diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index f1532779ba16..5d24633e3177 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -125,7 +125,6 @@ struct cppc_perf_fb_ctrs { /* Per CPU container for runtime CPPC management. */ struct cppc_cpudata { - struct list_head node; struct cppc_perf_caps perf_caps; struct cppc_perf_ctrls perf_ctrls; struct cppc_perf_fb_ctrs perf_fb_ctrs; -- Gitee From 16ff035285a16e431a4079770f022acc646f9d98 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:01 +0800 Subject: [PATCH 38/63] cpufreq: CPPC: Do not return a value from populate_efficiency_class() mainline inclusion from mainline-v6.17-rc2 commit 3d5978ea6cbc4df192d0ea1800ef5d55b28b965e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3d5978ea6cbc4df192d0ea1800ef5d55b28b965e ---------------------------------------------------------------------- The return value of populate_efficiency_class() is never needed and the result of it doesn't affect the initialization of cppc_cpufreq. It makes more sense to change it into a void function. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250526113057.3086513-3-zhenglifeng1@huawei.com [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index e2de648951d2..6b19a66b29db 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -593,7 +593,7 @@ static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, return 0; } -static int populate_efficiency_class(void) +static void populate_efficiency_class(void) { struct acpi_madt_generic_interrupt *gicc; DECLARE_BITMAP(used_classes, 256) = {}; @@ -608,7 +608,7 @@ static int populate_efficiency_class(void) if (bitmap_weight(used_classes, 256) <= 1) { pr_debug("Efficiency classes are all equal (=%d). " "No EM registered", class); - return -EINVAL; + return; } /* @@ -625,8 +625,6 @@ static int populate_efficiency_class(void) index++; } cppc_cpufreq_driver.register_em = cppc_cpufreq_register_em; - - return 0; } static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) @@ -647,9 +645,8 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) { return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; } -static int populate_efficiency_class(void) +static void populate_efficiency_class(void) { - return 0; } static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) { -- Gitee From 0c1b694fdf30dff6f314d483a2f3f7db295d65e5 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:02 +0800 Subject: [PATCH 39/63] cpufreq: CPPC: Remove forward declaration of cppc_cpufreq_register_em() mainline inclusion from mainline-v6.17-rc2 commit c83a92df2fc60bf0b3130cdf0bc2104d61750317 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c83a92df2fc60bf0b3130cdf0bc2104d61750317 ---------------------------------------------------------------------- cppc_cpufreq_register_em() is only used in populate_efficiency_class(). A forward declaration of it is not necessary. Move cppc_cpufreq_register_em() in front of populate_efficiency_class() and remove the forward declaration of cppc_cpufreq_register_em(). No functional change. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250526113057.3086513-4-zhenglifeng1@huawei.com [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 6b19a66b29db..9ad7d2ac2049 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -464,7 +464,6 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) } static DEFINE_PER_CPU(unsigned int, efficiency_class); -static void cppc_cpufreq_register_em(struct cpufreq_policy *policy); /* Create an artificial performance state every CPPC_EM_CAP_STEP capacity unit. */ #define CPPC_EM_CAP_STEP (20) @@ -593,6 +592,18 @@ static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, return 0; } +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data; + struct em_data_callback em_cb = + EM_ADV_DATA_CB(cppc_get_cpu_power, cppc_get_cpu_cost); + + cpu_data = policy->driver_data; + em_dev_register_perf_domain(get_cpu_device(policy->cpu), + get_perf_level_count(policy), &em_cb, + cpu_data->shared_cpu_map); +} + static void populate_efficiency_class(void) { struct acpi_madt_generic_interrupt *gicc; @@ -627,18 +638,6 @@ static void populate_efficiency_class(void) cppc_cpufreq_driver.register_em = cppc_cpufreq_register_em; } -static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) -{ - struct cppc_cpudata *cpu_data; - struct em_data_callback em_cb = - EM_ADV_DATA_CB(cppc_get_cpu_power, cppc_get_cpu_cost); - - cpu_data = policy->driver_data; - em_dev_register_perf_domain(get_cpu_device(policy->cpu), - get_perf_level_count(policy), &em_cb, - cpu_data->shared_cpu_map); -} - #else static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) -- Gitee From 64a82d0ce17d2a8cafab08289cfb585c031d6427 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:03 +0800 Subject: [PATCH 40/63] cpufreq: Contain scaling_cur_freq.attr in cpufreq_attrs mainline inclusion from mainline-v6.17-rc2 commit 2e554cfa259fe07085a4fcff7d2ec4b7041bbd9c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2e554cfa259fe07085a4fcff7d2ec4b7041bbd9c ---------------------------------------------------------------------- After commit c034b02e213d ("cpufreq: expose scaling_cur_freq sysfs file for set_policy() drivers"), the file scaling_cur_freq is exposed to all drivers. No need to create this file separately. It's better to be contained in cpufreq_attrs. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250623133402.3120230-4-zhenglifeng1@huawei.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index dfcb6fc75758..d4c6196d0b45 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -965,6 +965,7 @@ static struct attribute *default_attrs[] = { &cpuinfo_min_freq.attr, &cpuinfo_max_freq.attr, &cpuinfo_transition_latency.attr, + &scaling_cur_freq.attr, &scaling_min_freq.attr, &scaling_max_freq.attr, &affected_cpus.attr, @@ -1083,10 +1084,6 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) return ret; } - ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); - if (ret) - return ret; - if (cpufreq_driver->bios_limit) { ret = sysfs_create_file(&policy->kobj, &bios_limit.attr); if (ret) -- Gitee From 0d0db8ce66a3d2750e1fb61c3858b8641bff31bb Mon Sep 17 00:00:00 2001 From: Yanteng Si Date: Tue, 6 Apr 2021 15:02:33 +0800 Subject: [PATCH 41/63] docs/zh_CN: add cpu-freq cpu-drivers.rst translation mainline inclusion from mainline-v5.13-rc1 commit 8b6d5ae8a9968c0a3807587d33b5952cdf71de1b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=8b6d5ae8a9968c0a3807587d33b5952cdf71de1b ---------------------------------------------------------------------- This patch translates Documention/cpu-freq/cpu-drivers.rst into Chinese. Signed-off-by: Yanteng Si Reviewed-by: Alex Shi Link: https://lore.kernel.org/r/20210406070239.19910-3-siyanteng@loongson.cn Signed-off-by: Jonathan Corbet Signed-off-by: huwentao --- .../zh_CN/cpu-freq/cpu-drivers.rst | 259 ++++++++++++++++++ 1 file changed, 259 insertions(+) create mode 100644 Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst new file mode 100644 index 000000000000..0ca2cb646666 --- /dev/null +++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst @@ -0,0 +1,259 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: ../disclaimer-zh_CN.rst + +:Original: :doc:`../../../cpu-freq/cpu-drivers` +:Translator: Yanteng Si + +.. _cn_cpu-drivers.rst: + + +======================================= +如何实现一个新的CPUFreq处理器驱动程序? +======================================= + +作者: + + + - Dominik Brodowski + - Rafael J. Wysocki + - Viresh Kumar + +.. Contents + + 1. 怎么做? + 1.1 初始化 + 1.2 Per-CPU 初始化 + 1.3 验证 + 1.4 target/target_index 或 setpolicy? + 1.5 target/target_index + 1.6 setpolicy + 1.7 get_intermediate 与 target_intermediate + 2. 频率表助手 + + + +1. 怎么做? +=========== + +如此,你刚刚得到了一个全新的CPU/芯片组及其数据手册,并希望为这个CPU/芯片组添加cpufreq +支持?很好,这里有一些至关重要的提示: + + +1.1 初始化 +---------- + +首先,在__initcall_level_7 (module_init())或更靠后的函数中检查这个内核是否 +运行在正确的CPU和正确的芯片组上。如果是,则使用cpufreq_register_driver()向 +CPUfreq核心层注册一个cpufreq_driver结构体。 + +结构体cpufreq_driver应该包含什么成员? + + .name - 驱动的名字。 + + .init - 一个指向per-policy初始化函数的指针。 + + .verify - 一个指向"verification"函数的指针。 + + .setpolicy 或 .fast_switch 或 .target 或 .target_index - 差异见 + 下文。 + +并且可选择 + + .flags - cpufreq核的提示。 + + .driver_data - cpufreq驱动程序的特定数据。 + + .resolve_freq - 返回最适合目标频率的频率。不过并不能改变频率。 + + .get_intermediate 和 target_intermediate - 用于在改变CPU频率时切换到稳定 + 的频率。 + + .get - 返回CPU的当前频率。 + + .bios_limit - 返回HW/BIOS对CPU的最大频率限制值。 + + .exit - 一个指向per-policy清理函数的指针,该函数在cpu热插拔过程的CPU_POST_DEAD + 阶段被调用。 + + .stop_cpu - 一个指向per-policy停止函数的指针,该函数在cpu热插拔过程的CPU_DOWN_PREPARE + 阶段被调用。 + + .suspend - 一个指向per-policy暂停函数的指针,该函数在关中断且在该策略的调节器停止 + 后被调用。 + + .resume - 一个指向per-policy恢复函数的指针,该函数在关中断且在调节器再一次开始前被 + 调用。 + + .ready - 一个指向per-policy准备函数的指针,该函数在策略完全初始化之后被调用。 + + .attr - 一个指向NULL结尾的"struct freq_attr"列表的指针,该函数允许导出值到 + sysfs。 + + .boost_enabled - 如果设置,则启用提升(boost)频率。 + + .set_boost - 一个指向per-policy函数的指针,该函数用来开启/关闭提升(boost)频率功能。 + + +1.2 Per-CPU 初始化 +------------------ + +每当一个新的CPU被注册到设备模型中,或者在cpufreq驱动注册自己之后,如果此CPU的cpufreq策 +略不存在,则会调用per-policy的初始化函数cpufreq_driver.init。请注意,.init()和.exit()程序 +只对策略调用一次,而不是对策略管理的每个CPU调用一次。它需要一个 ``struct cpufreq_policy +*policy`` 作为参数。现在该怎么做呢? + +如果有必要,请在你的CPU上激活CPUfreq功能支持。 + +然后,驱动程序必须填写以下数值: + ++-----------------------------------+--------------------------------------+ +|policy->cpuinfo.min_freq 和 | | +|policy->cpuinfo.max_freq | 该CPU支持的最低和最高频率(kHz) | +| | | +| | | ++-----------------------------------+--------------------------------------+ +|policy->cpuinfo.transition_latency | | +| | CPU在两个频率之间切换所需的时间,以 | +| | 纳秒为单位(如适用,否则指定 | +| | CPUFREQ_ETERNAL) | ++-----------------------------------+--------------------------------------+ +|policy->cur | 该CPU当前的工作频率(如适用) | +| | | ++-----------------------------------+--------------------------------------+ +|policy->min, | | +|policy->max, | | +|policy->policy and, if necessary, | | +|policy->governor | 必须包含该cpu的 “默认策略”。稍后 | +| | 会用这些值调用 | +| | cpufreq_driver.verify and either | +| | cpufreq_driver.setpolicy or | +| | cpufreq_driver.target/target_index | +| | | ++-----------------------------------+--------------------------------------+ +|policy->cpus | 用与这个CPU一起做DVFS的(在线+离线) | +| | CPU(即与它共享时钟/电压轨)的掩码更新 | +| | 这个 | +| | | ++-----------------------------------+--------------------------------------+ + +对于设置其中的一些值(cpuinfo.min[max]_freq, policy->min[max]),频率表助手可能会有帮 +助。关于它们的更多信息,请参见第2节。 + + +1.3 验证 +-------- + +当用户决定设置一个新的策略(由 “policy,governor,min,max组成”)时,必须对这个策略进行验证, +以便纠正不兼容的值。为了验证这些值,cpufreq_verify_within_limits(``struct cpufreq_policy +*policy``, ``unsigned int min_freq``, ``unsigned int max_freq``)函数可能会有帮助。 +关于频率表助手的详细内容请参见第2节。 + +您需要确保至少有一个有效频率(或工作范围)在 policy->min 和 policy->max 范围内。如果有必 +要,先增加policy->max,只有在没有办法的情况下,才减少policy->min。 + + +1.4 target 或 target_index 或 setpolicy 或 fast_switch? +------------------------------------------------------- + +大多数cpufreq驱动甚至大多数cpu频率升降算法只允许将CPU频率设置为预定义的固定值。对于这些,你 +可以使用->target(),->target_index()或->fast_switch()回调。 + +有些cpufreq功能的处理器可以自己在某些限制之间切换频率。这些应使用->setpolicy()回调。 + + +1.5. target/target_index +------------------------ + +target_index调用有两个参数:``struct cpufreq_policy * policy``和``unsigned int`` +索引(于列出的频率表)。 + +当调用这里时,CPUfreq驱动必须设置新的频率。实际频率必须由freq_table[index].frequency决定。 + +它应该总是在错误的情况下恢复到之前的频率(即policy->restore_freq),即使我们之前切换到中间频率。 + +已弃用 +---------- +目标调用有三个参数。``struct cpufreq_policy * policy``, unsigned int target_frequency, +unsigned int relation. + +CPUfreq驱动在调用这里时必须设置新的频率。实际的频率必须使用以下规则来确定。 + +- 紧跟 "目标频率"。 +- policy->min <= new_freq <= policy->max (这必须是有效的!!!) +- 如果 relation==CPUFREQ_REL_L,尝试选择一个高于或等于 target_freq 的 new_freq。("L代表 + 最低,但不能低于") +- 如果 relation==CPUFREQ_REL_H,尝试选择一个低于或等于 target_freq 的 new_freq。("H代表 + 最高,但不能高于") + +这里,频率表助手可能会帮助你--详见第2节。 + +1.6. fast_switch +---------------- + +这个函数用于从调度器的上下文进行频率切换。并非所有的驱动都要实现它,因为不允许在这个回调中睡眠。这 +个回调必须经过高度优化,以尽可能快地进行切换。 + +这个函数有两个参数: ``struct cpufreq_policy *policy`` 和 ``unsigned int target_frequency``。 + + +1.7 setpolicy +------------- + +setpolicy调用只需要一个``struct cpufreq_policy * policy``作为参数。需要将处理器内或芯片组内动态频 +率切换的下限设置为policy->min,上限设置为policy->max,如果支持的话,当policy->policy为 +CPUFREQ_POLICY_PERFORMANCE时选择面向性能的设置,当CPUFREQ_POLICY_POWERSAVE时选择面向省电的设置。 +也可以查看drivers/cpufreq/longrun.c中的参考实现。 + +1.8 get_intermediate 和 target_intermediate +-------------------------------------------- + +仅适用于 target_index() 和 CPUFREQ_ASYNC_NOTIFICATION 未设置的驱动。 + +get_intermediate应该返回一个平台想要切换到的稳定的中间频率,target_intermediate()应该将CPU设置为 +该频率,然后再跳转到'index'对应的频率。核心会负责发送通知,驱动不必在target_intermediate()或 +target_index()中处理。 + +在驱动程序不想因为某个目标频率切换到中间频率的情况下,它们可以从get_intermediate()中返回'0'。在这种情况 +下,核心将直接调用->target_index()。 + +注意:->target_index()应该在失败的情况下恢复到policy->restore_freq,因为core会为此发送通知。 + + +2. 频率表助手 +============= + +由于大多数cpufreq处理器只允许被设置为几个特定的频率,因此,一个带有一些函数的 “频率表”可能会辅助处理器驱动 +程序的一些工作。这样的 "频率表" 由一个cpufreq_frequency_table条目构成的数组组成,"driver_data" 中包 +含了驱动程序的具体数值,"frequency" 中包含了相应的频率,并设置了标志。在表的最后,需要添加一个 +cpufreq_frequency_table条目,频率设置为CPUFREQ_TABLE_END。而如果想跳过表中的一个条目,则将频率设置为 +CPUFREQ_ENTRY_INVALID。这些条目不需要按照任何特定的顺序排序,但如果它们是cpufreq 核心会对它们进行快速的DVFS, +因为搜索最佳匹配会更快。 + +如果策略在其policy->freq_table字段中包含一个有效的指针,cpufreq表就会被核心自动验证。 + +cpufreq_frequency_table_verify()保证至少有一个有效的频率在policy->min和policy->max范围内,并且所有其他 +标准都被满足。这对->verify调用很有帮助。 + +cpufreq_frequency_table_target()是对应于->target阶段的频率表助手。只要把数值传递给这个函数,这个函数就会返 +回包含CPU要设置的频率的频率表条目。 + +以下宏可以作为cpufreq_frequency_table的迭代器。 + +cpufreq_for_each_entry(pos, table) - 遍历频率表的所有条目。 + +cpufreq_for_each_valid_entry(pos, table) - 该函数遍历所有条目,不包括CPUFREQ_ENTRY_INVALID频率。 +使用参数 "pos"-一个``cpufreq_frequency_table * `` 作为循环变量,使用参数 "table"-作为你想迭代 +的``cpufreq_frequency_table * `` 。 + +例如:: + + struct cpufreq_frequency_table *pos, *driver_freq_table; + + cpufreq_for_each_entry(pos, driver_freq_table) { + /* Do something with pos */ + pos->frequency = ... + } + +如果你需要在driver_freq_table中处理pos的位置,不要减去指针,因为它的代价相当高。相反,使用宏 +cpufreq_for_each_entry_idx() 和 cpufreq_for_each_valid_entry_idx() 。 -- Gitee From da38a61f8a159764d9e389f7c05816bba16bb9ee Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Jun 2021 11:31:14 +0530 Subject: [PATCH 42/63] cpufreq: powernv: Migrate to ->exit() callback instead of ->stop_cpu() mainline inclusion from mainline-v5.14-rc1 commit 952da0c9ab5b047665442dc239cee36d5c9edb98 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=952da0c9ab5b047665442dc239cee36d5c9edb98 ---------------------------------------------------------------------- Commit 367dc4aa932b ("cpufreq: Add stop CPU callback to cpufreq_driver interface") added the ->stop_cpu() callback to allow the drivers to do clean up before the CPU is completely down and its state can't be modified. At that time the CPU hotplug framework used to call the cpufreq core's registered notifier for different events like CPU_DOWN_PREPARE and CPU_POST_DEAD. The ->stop_cpu() callback was called during the CPU_DOWN_PREPARE event. This is no longer the case, cpuhp_cpufreq_offline() is called only once by the CPU hotplug core now and we don't really need two separate callbacks for cpufreq drivers, i.e. ->stop_cpu() and ->exit(), as everything can be done from the ->exit() callback itself. Migrate to using the ->exit() callback instead of ->stop_cpu(). Signed-off-by: Viresh Kumar [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/powernv-cpufreq.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 8977e4de5915..6fbb46b2f6da 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -876,7 +876,15 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy) { - /* timer is deleted in cpufreq_cpu_stop() */ + struct powernv_smp_call_data freq_data; + struct global_pstate_info *gpstates = policy->driver_data; + + freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min); + freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min); + smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1); + if (gpstates) + del_timer_sync(&gpstates->timer); + kfree(policy->driver_data); return 0; @@ -1008,18 +1016,6 @@ static struct notifier_block powernv_cpufreq_opal_nb = { .priority = 0, }; -static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy) -{ - struct powernv_smp_call_data freq_data; - struct global_pstate_info *gpstates = policy->driver_data; - - freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min); - freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min); - smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1); - if (gpstates) - del_timer_sync(&gpstates->timer); -} - static unsigned int powernv_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq) { @@ -1043,7 +1039,6 @@ static struct cpufreq_driver powernv_cpufreq_driver = { .target_index = powernv_cpufreq_target_index, .fast_switch = powernv_fast_switch, .get = powernv_cpufreq_get, - .stop_cpu = powernv_cpufreq_stop_cpu, .attr = powernv_cpu_freq_attr, }; -- Gitee From b4ec4fcd7ee01caf1737e4a4351194574345e365 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 30 Jun 2021 18:44:46 +0200 Subject: [PATCH 43/63] cpufreq: intel_pstate: Combine ->stop_cpu() and ->offline() mainline inclusion from mainline-v5.14-rc1 commit 49d6feef94c9f47ac4030563058f8a36267597b0 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=49d6feef94c9f47ac4030563058f8a36267597b0 ---------------------------------------------------------------------- Combine the ->stop_cpu() and ->offline() callback routines for intel_pstate in the active mode so as to avoid setting the ->stop_cpu callback pointer which is going to be dropped from the framework. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Signed-off-by: huwentao --- drivers/cpufreq/intel_pstate.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 0cec5148b69f..36ce05e4c983 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2490,7 +2490,7 @@ static int intel_pstate_verify_policy(struct cpufreq_policy_data *policy) return 0; } -static int intel_pstate_cpu_offline(struct cpufreq_policy *policy) +static int intel_cpufreq_cpu_offline(struct cpufreq_policy *policy) { struct cpudata *cpu = all_cpu_data[policy->cpu]; @@ -2535,11 +2535,11 @@ static int intel_pstate_cpu_online(struct cpufreq_policy *policy) return 0; } -static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) +static int intel_pstate_cpu_offline(struct cpufreq_policy *policy) { - pr_debug("CPU %d stopping\n", policy->cpu); - intel_pstate_clear_update_util_hook(policy->cpu); + + return intel_cpufreq_cpu_offline(policy); } static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) @@ -2612,7 +2612,6 @@ static struct cpufreq_driver intel_pstate = { .resume = intel_pstate_resume, .init = intel_pstate_cpu_init, .exit = intel_pstate_cpu_exit, - .stop_cpu = intel_pstate_stop_cpu, .offline = intel_pstate_cpu_offline, .online = intel_pstate_cpu_online, .update_limits = intel_pstate_update_limits, @@ -2870,7 +2869,7 @@ static struct cpufreq_driver intel_cpufreq = { .fast_switch = intel_cpufreq_fast_switch, .init = intel_cpufreq_cpu_init, .exit = intel_cpufreq_cpu_exit, - .offline = intel_pstate_cpu_offline, + .offline = intel_cpufreq_cpu_offline, .online = intel_pstate_cpu_online, .suspend = intel_pstate_suspend, .resume = intel_pstate_resume, -- Gitee From 73af5520fd5aa00e474fc10ef919dcfa4712b64b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Jun 2021 09:54:42 +0530 Subject: [PATCH 44/63] cpufreq: Remove the ->stop_cpu() driver callback mainline inclusion from mainline-v5.14-rc1 commit 3e0f897fd92662f0ff21ca1759d724a9ad574858 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=3e0f897fd92662f0ff21ca1759d724a9ad574858 ---------------------------------------------------------------------- Now that all users of ->stop_cpu() have been migrated to using other callbacks, drop it from the core. Signed-off-by: Viresh Kumar [ rjw: Minor edits in the subject and changelog ] Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- Documentation/cpu-freq/cpu-drivers.rst | 3 --- Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst | 3 --- drivers/cpufreq/cpufreq.c | 3 --- include/linux/cpufreq.h | 1 - 4 files changed, 10 deletions(-) diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst index a697278ce190..74fac797c396 100644 --- a/Documentation/cpu-freq/cpu-drivers.rst +++ b/Documentation/cpu-freq/cpu-drivers.rst @@ -71,9 +71,6 @@ And optionally .exit - A pointer to a per-policy cleanup function called during CPU_POST_DEAD phase of cpu hotplug process. - .stop_cpu - A pointer to a per-policy stop function called during - CPU_DOWN_PREPARE phase of cpu hotplug process. - .suspend - A pointer to a per-policy suspend function which is called with interrupts disabled and _after_ the governor is stopped for the policy. diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst index 0ca2cb646666..9570e9c9e939 100644 --- a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst @@ -76,9 +76,6 @@ CPUfreq核心层注册一个cpufreq_driver结构体。 .exit - 一个指向per-policy清理函数的指针,该函数在cpu热插拔过程的CPU_POST_DEAD 阶段被调用。 - .stop_cpu - 一个指向per-policy停止函数的指针,该函数在cpu热插拔过程的CPU_DOWN_PREPARE - 阶段被调用。 - .suspend - 一个指向per-policy暂停函数的指针,该函数在关中断且在该策略的调节器停止 后被调用。 diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d4c6196d0b45..9ad4b21dcf35 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1666,9 +1666,6 @@ static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy) policy->cdev = NULL; } - if (cpufreq_driver->stop_cpu) - cpufreq_driver->stop_cpu(policy); - if (has_target()) cpufreq_exit_governor(policy); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 547cddaab2fb..be1a36076941 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -375,7 +375,6 @@ struct cpufreq_driver { int (*online)(struct cpufreq_policy *policy); int (*offline)(struct cpufreq_policy *policy); int (*exit)(struct cpufreq_policy *policy); - void (*stop_cpu)(struct cpufreq_policy *policy); int (*suspend)(struct cpufreq_policy *policy); int (*resume)(struct cpufreq_policy *policy); -- Gitee From 2bb460f6c2a55fbc4b1f986981df3769e3caacf8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 7 May 2024 14:25:11 +0800 Subject: [PATCH 45/63] cpufreq: Don't unregister cpufreq cooling on CPU hotplug stable inclusion from stable-v6.6.27 commit 3ba4aceb68f013f36fbfd0a6aab5df439b4198d4 bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=3ba4aceb68f013f36fbfd0a6aab5df439b4198d4 -------------------------------- [ Upstream commit c4d61a529db788d2e52654f5b02c8d1de4952c5b ] Offlining a CPU and bringing it back online is a common operation and it happens frequently during system suspend/resume, where the non-boot CPUs are hotplugged out during suspend and brought back at resume. The cpufreq core already tries to make this path as fast as possible as the changes are only temporary in nature and full cleanup of resources isn't required in this case. For example the drivers can implement online()/offline() callbacks to avoid a lot of tear down of resources. On similar lines, there is no need to unregister the cpufreq cooling device during suspend / resume, but only while the policy is getting removed. Moreover, unregistering the cpufreq cooling device is resulting in an unwanted outcome, where the system suspend is eventually aborted in the process. Currently, during system suspend the cpufreq core unregisters the cooling device, which in turn removes a kobject using device_del() and that generates a notification to the userspace via uevent broadcast. This causes system suspend to abort in some setups. This was also earlier reported (indirectly) by Roman [1]. Maybe there is another way around to fixing that problem properly, but this change makes sense anyways. Move the registering and unregistering of the cooling device to policy creation and removal times onlyy. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218521 Reported-by: Manaf Meethalavalappu Pallikunhi Reported-by: Roman Stratiienko Link: https://patchwork.kernel.org/project/linux-pm/patch/20220710164026.541466-1-r.stratiienko@gmail.com/ [1] Tested-by: Manaf Meethalavalappu Pallikunhi Signed-off-by: Viresh Kumar Reviewed-by: Dhruva Gole Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin Signed-off-by: ZhangPeng Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 9ad4b21dcf35..e47817b6d785 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1563,7 +1563,8 @@ static int cpufreq_online(unsigned int cpu) if (cpufreq_driver->ready) cpufreq_driver->ready(policy); - if (cpufreq_thermal_control_enabled(cpufreq_driver)) + /* Register cpufreq cooling only for a new policy */ + if (new_policy && cpufreq_thermal_control_enabled(cpufreq_driver)) policy->cdev = of_cpufreq_cooling_register(policy); /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ @@ -1661,11 +1662,6 @@ static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy) else policy->last_policy = policy->policy; - if (cpufreq_thermal_control_enabled(cpufreq_driver)) { - cpufreq_cooling_unregister(policy->cdev); - policy->cdev = NULL; - } - if (has_target()) cpufreq_exit_governor(policy); @@ -1730,6 +1726,15 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) return; } + /* + * Unregister cpufreq cooling once all the CPUs of the policy are + * removed. + */ + if (cpufreq_thermal_control_enabled(cpufreq_driver)) { + cpufreq_cooling_unregister(policy->cdev); + policy->cdev = NULL; + } + /* We did light-weight exit earlier, do full tear down now */ if (cpufreq_driver->offline && cpufreq_driver->exit) cpufreq_driver->exit(policy); -- Gitee From 658ad5c46d26cc3478025aea32e3f4d2a40749e6 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:04 +0800 Subject: [PATCH 46/63] cpufreq: Remove duplicate check in __cpufreq_offline() mainline inclusion from mainline-v6.17-rc2 commit 5d6ecaaa922611ec3ca067723ccefafb543010ee category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5d6ecaaa922611ec3ca067723ccefafb543010ee ---------------------------------------------------------------------- The has_target() checks in __cpufreq_offline() are duplicate. Remove one of them and put the operations of exiting governor together with storing last governor's name. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250623133402.3120230-5-zhenglifeng1@huawei.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e47817b6d785..975c5c0e5814 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1656,14 +1656,13 @@ static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy) return; } - if (has_target()) + if (has_target()) { strncpy(policy->last_governor, policy->governor->name, CPUFREQ_NAME_LEN); - else - policy->last_policy = policy->policy; - - if (has_target()) cpufreq_exit_governor(policy); + } else { + policy->last_policy = policy->policy; + } /* * Perform the ->offline() during light-weight tear-down, as -- Gitee From 71e0c225615567f825eceba2ea8a21c4efd33500 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:05 +0800 Subject: [PATCH 47/63] cpufreq: Hold cpufreq_driver_lock when assigning cpufreq_driver->set_boost driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U ---------------------------------------------------------------------- Hold the lock to avoid concurrency problems in cpufreq_enable_boost_support() when assigning cpufreq_driver->set_boost. Fixes: 7a6c79f2fe53 ("cpufreq: Simplify core code related to boost support") Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 975c5c0e5814..f0a53880561d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2759,13 +2759,17 @@ static void remove_boost_sysfs_file(void) int cpufreq_enable_boost_support(void) { + unsigned long flags; + if (!cpufreq_driver) return -EINVAL; if (cpufreq_boost_supported()) return 0; + write_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver->set_boost = cpufreq_boost_set_sw; + write_unlock_irqrestore(&cpufreq_driver_lock, flags); /* This will get removed on driver unregister */ return create_boost_sysfs_file(); -- Gitee From 5db0748bc065c0884fcd6022c70434a7037deff5 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:06 +0800 Subject: [PATCH 48/63] cpufreq: Initialize cpufreq-based frequency-invariance later mainline inclusion from mainline-v6.17-rc2 commit 2a6c727387062a2ea79eb6cf5004820cb1b0afe2 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2a6c727387062a2ea79eb6cf5004820cb1b0afe2 ---------------------------------------------------------------------- The cpufreq-based invariance is enabled in cpufreq_register_driver(), but never disabled after registration fails. Move the invariance initialization to where all other initializations have been successfully done to solve this problem. Fixes: 874f63531064 ("cpufreq: report whether cpufreq supports Frequency Invariance (FI)") Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250709104145.2348017-2-zhenglifeng1@huawei.com [ rjw: New subject ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index f0a53880561d..059c1b5c57a0 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -3082,15 +3082,6 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) cpufreq_driver = driver_data; write_unlock_irqrestore(&cpufreq_driver_lock, flags); - /* - * Mark support for the scheduler's frequency invariance engine for - * drivers that implement target(), target_index() or fast_switch(). - */ - if (!cpufreq_driver->setpolicy) { - static_branch_enable_cpuslocked(&cpufreq_freq_invariance); - pr_debug("supports frequency invariance"); - } - if (driver_data->setpolicy) driver_data->flags |= CPUFREQ_CONST_LOOPS; @@ -3122,6 +3113,15 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) hp_online = ret; ret = 0; + /* + * Mark support for the scheduler's frequency invariance engine for + * drivers that implement target(), target_index() or fast_switch(). + */ + if (!cpufreq_driver->setpolicy) { + static_branch_enable_cpuslocked(&cpufreq_freq_invariance); + pr_debug("supports frequency invariance"); + } + pr_debug("driver %s up and running\n", driver_data->name); goto out; -- Gitee From b3f98de02312c41d3f643dbc44e9f2dd82e54280 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:08 +0800 Subject: [PATCH 49/63] cpufreq: Move the check of cpufreq_driver->get into cpufreq_verify_current_freq() mainline inclusion from mainline-v6.17-rc2 commit 908981d85f86c5e2b39dfe0b2267c6d44d9c48f7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=908981d85f86c5e2b39dfe0b2267c6d44d9c48f7 ---------------------------------------------------------------------- Move the check of cpufreq_driver->get into cpufreq_verify_current_freq() in case of calling it without check. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250709104145.2348017-4-zhenglifeng1@huawei.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 059c1b5c57a0..49cec8b57a7e 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1771,6 +1771,9 @@ static unsigned int cpufreq_verify_current_freq(struct cpufreq_policy *policy, b { unsigned int new_freq; + if (!cpufreq_driver->get) + return 0; + new_freq = cpufreq_driver->get(policy->cpu); if (!new_freq) return 0; @@ -1885,8 +1888,7 @@ unsigned int cpufreq_get(unsigned int cpu) if (policy) { down_read(&policy->rwsem); - if (cpufreq_driver->get) - ret_freq = __cpufreq_get(policy); + ret_freq = __cpufreq_get(policy); up_read(&policy->rwsem); cpufreq_cpu_put(policy); @@ -2408,8 +2410,7 @@ int cpufreq_start_governor(struct cpufreq_policy *policy) pr_debug("%s: for CPU %u\n", __func__, policy->cpu); - if (cpufreq_driver->get) - cpufreq_verify_current_freq(policy, false); + cpufreq_verify_current_freq(policy, false); if (policy->governor->start) { ret = policy->governor->start(policy); -- Gitee From bf717edbb8af64633c905afca61632922a7a3fa0 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Thu, 21 Aug 2025 15:16:09 +0800 Subject: [PATCH 50/63] cpufreq: Exit governor when failed to start old governor mainline inclusion from mainline-v6.17-rc2 commit 0ae204405095abfbc2d694ee0fbb49bcbbe55c57 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0ae204405095abfbc2d694ee0fbb49bcbbe55c57 ---------------------------------------------------------------------- Detect the result of starting old governor in cpufreq_set_policy(). If it fails, exit the governor and clear policy->governor. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250709104145.2348017-5-zhenglifeng1@huawei.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Hongye Lin Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 49cec8b57a7e..d9494e1d6a80 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2618,10 +2618,12 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, pr_debug("starting governor %s failed\n", policy->governor->name); if (old_gov) { policy->governor = old_gov; - if (cpufreq_init_governor(policy)) + if (cpufreq_init_governor(policy)) { policy->governor = NULL; - else - cpufreq_start_governor(policy); + } else if (cpufreq_start_governor(policy)) { + cpufreq_exit_governor(policy); + policy->governor = NULL; + } } return ret; -- Gitee From b5b9d202dd9598549b4c676a691b2fe1c07e648c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Jun 2021 14:27:50 +0530 Subject: [PATCH 51/63] arch_topology: Avoid use-after-free for scale_freq_data mainline inclusion from mainline-v5.14-rc1 commit 83150f5d05f065fb5c12c612f119015cabdcc124 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=83150f5d05f065fb5c12c612f119015cabdcc124 ---------------------------------------------------------------------- Currently topology_scale_freq_tick() (which gets called from scheduler_tick()) may end up using a pointer to "struct scale_freq_data", which was previously cleared by topology_clear_scale_freq_source(), as there is no protection in place here. The users of topology_clear_scale_freq_source() though needs a guarantee that the previously cleared scale_freq_data isn't used anymore, so they can free the related resources. Since topology_scale_freq_tick() is called from scheduler tick, we don't want to add locking in there. Use the RCU update mechanism instead (which is already used by the scheduler's utilization update path) to guarantee race free updates here. synchronize_rcu() makes sure that all RCU critical sections that started before it is called, will finish before it returns. And so the callers of topology_clear_scale_freq_source() don't need to worry about their callback getting called anymore. Cc: Paul E. McKenney Fixes: 01e055c120a4 ("arch_topology: Allow multiple entities to provide sched_freq_tick() callback") Tested-by: Vincent Guittot Reviewed-by: Ionela Voinescu Tested-by: Qian Cai Signed-off-by: Viresh Kumar Signed-off-by: huwentao --- drivers/base/arch_topology.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 7f947e5734c2..ca6e77ae43d0 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -18,10 +18,11 @@ #include #include #include +#include #include #include -static DEFINE_PER_CPU(struct scale_freq_data *, sft_data); +static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; @@ -66,16 +67,20 @@ void topology_set_scale_freq_source(struct scale_freq_data *data, if (cpumask_empty(&scale_freq_counters_mask)) scale_freq_invariant = topology_scale_freq_invariant(); + rcu_read_lock(); + for_each_cpu(cpu, cpus) { - sfd = per_cpu(sft_data, cpu); + sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu)); /* Use ARCH provided counters whenever possible */ if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) { - per_cpu(sft_data, cpu) = data; + rcu_assign_pointer(per_cpu(sft_data, cpu), data); cpumask_set_cpu(cpu, &scale_freq_counters_mask); } } + rcu_read_unlock(); + update_scale_freq_invariant(true); } EXPORT_SYMBOL_GPL(topology_set_scale_freq_source); @@ -86,22 +91,32 @@ void topology_clear_scale_freq_source(enum scale_freq_source source, struct scale_freq_data *sfd; int cpu; + rcu_read_lock(); + for_each_cpu(cpu, cpus) { - sfd = per_cpu(sft_data, cpu); + sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu)); if (sfd && sfd->source == source) { - per_cpu(sft_data, cpu) = NULL; + rcu_assign_pointer(per_cpu(sft_data, cpu), NULL); cpumask_clear_cpu(cpu, &scale_freq_counters_mask); } } + rcu_read_unlock(); + + /* + * Make sure all references to previous sft_data are dropped to avoid + * use-after-free races. + */ + synchronize_rcu(); + update_scale_freq_invariant(false); } EXPORT_SYMBOL_GPL(topology_clear_scale_freq_source); void topology_scale_freq_tick(void) { - struct scale_freq_data *sfd = *this_cpu_ptr(&sft_data); + struct scale_freq_data *sfd = rcu_dereference_sched(*this_cpu_ptr(&sft_data)); if (sfd) sfd->set_freq_scale(); -- Gitee From 27540d87f65758681b490e19b79bd2047bf5f586 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 2 Nov 2021 18:01:43 +0000 Subject: [PATCH 52/63] Documentation: power: Add description about new callback for EM registration mainline inclusion from mainline-v5.16-rc2 commit d704aa0d44ade12660d4d7220b2a8d785b7b4247 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=d704aa0d44ade12660d4d7220b2a8d785b7b4247 ---------------------------------------------------------------------- The Energy Model (EM) registration for CPUs should now be done using a dedicated callback added recently into CPUFreq framework and drivers. Commit c17495b01b72 ("cpufreq: Add callback to register with energy model") The callback guaranties that the EM registration is called at the right time during driver setup. To avoid mistakes update the documentation to align with the existing code implementation. Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- Documentation/power/energy-model.rst | 29 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst index 5f03b6f47d9e..8c9e784d684f 100644 --- a/Documentation/power/energy-model.rst +++ b/Documentation/power/energy-model.rst @@ -110,6 +110,10 @@ More details about the above APIs can be found in include/linux/energy_model.h. 3. Example driver ----------------- +The CPUFreq framework supports dedicated callback for registering +the EM for a given CPU(s) 'policy' object: cpufreq_driver::register_em(). +That callback has to be implemented properly for a given driver, +because the framework would call it at the right time during setup. This section provides a simple example of a CPUFreq driver registering a performance domain in the Energy Model framework using the (fake) 'foo' protocol. The driver implements an est_power() function to be provided to the @@ -139,24 +143,21 @@ EM framework:: 20 return 0; 21 } 22 - 23 static int foo_cpufreq_init(struct cpufreq_policy *policy) + 23 static void foo_cpufreq_register_em(struct cpufreq_policy *policy) 24 { 25 struct em_data_callback em_cb = EM_DATA_CB(est_power); 26 struct device *cpu_dev; - 27 int nr_opp, ret; + 27 int nr_opp; 28 29 cpu_dev = get_cpu_device(cpumask_first(policy->cpus)); 30 - 31 /* Do the actual CPUFreq init work ... */ - 32 ret = do_foo_cpufreq_init(policy); - 33 if (ret) - 34 return ret; - 35 - 36 /* Find the number of OPPs for this policy */ - 37 nr_opp = foo_get_nr_opp(policy); + 31 /* Find the number of OPPs for this policy */ + 32 nr_opp = foo_get_nr_opp(policy); + 33 + 34 /* And register the new performance domain */ + 35 em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus); + 37 } 38 - 39 /* And register the new performance domain */ - 40 em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus); - 41 - 42 return 0; - 43 } + 39 static struct cpufreq_driver foo_cpufreq_driver = { + 40 .register_em = foo_cpufreq_register_em, + 41 }; -- Gitee From 0042a7107bd5fa47adcda1e0a4682fe1047d29be Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Tue, 7 May 2024 14:13:19 +0800 Subject: [PATCH 53/63] cpufreq: Fix per-policy boost behavior on SoCs using cpufreq_boost_set_sw() stable inclusion from stable-v6.6.23 commit 9d47d2e7f82dec7cab09996bdb9062786eb827f6 bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9d47d2e7f82dec7cab09996bdb9062786eb827f6 -------------------------------- [ Upstream commit f37a4d6b4a2c77414e8b9d25dd5ee31537ce9b00 ] In the existing code, per-policy flags don't have any impact i.e. if cpufreq_driver boost is enabled and boost is disabled for one or more of the policies, the cpufreq driver will behave as if boost is enabled. Fix this by incorporating per-policy boost flag in the policy->max computation used in cpufreq_frequency_table_cpuinfo and setting the default per-policy boost to mirror the cpufreq_driver boost flag. Fixes: 218a06a79d9a ("cpufreq: Support per-policy performance boost") Reported-by: Dietmar Eggemann Reviewed-by: Viresh Kumar Reviewed-by: Dhruva Gole Signed-off-by: Sibi Sankar Tested-by:Yipeng Zou Reviewed-by: Yipeng Zou Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin Signed-off-by: ZhangPeng Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 18 ++++++++++++------ drivers/cpufreq/freq_table.c | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d9494e1d6a80..29b58b143a42 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -638,14 +638,16 @@ static ssize_t store_local_boost(struct cpufreq_policy *policy, if (policy->boost_enabled == enable) return count; + policy->boost_enabled = enable; + cpus_read_lock(); ret = cpufreq_driver->set_boost(policy, enable); cpus_read_unlock(); - if (ret) + if (ret) { + policy->boost_enabled = !policy->boost_enabled; return ret; - - policy->boost_enabled = enable; + } return count; } @@ -1410,6 +1412,9 @@ static int cpufreq_online(unsigned int cpu) goto out_free_policy; } + /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ + policy->boost_enabled = cpufreq_boost_enabled() && policy_has_boost_freq(policy); + /* * The initialization has succeeded and the policy is online. * If there is a problem with its frequency table, take it @@ -2714,11 +2719,12 @@ int cpufreq_boost_trigger_state(int state) get_online_cpus(); for_each_active_policy(policy) { + policy->boost_enabled = state; ret = cpufreq_driver->set_boost(policy, state); - if (ret) + if (ret) { + policy->boost_enabled = !policy->boost_enabled; goto err_reset_state; - - policy->boost_enabled = state; + } } put_online_cpus(); diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index d3f756f7b5a0..fe9c087afe01 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -40,7 +40,7 @@ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, cpufreq_for_each_valid_entry(pos, table) { freq = pos->frequency; - if (!cpufreq_boost_enabled() + if ((!cpufreq_boost_enabled() || !policy->boost_enabled) && (pos->flags & CPUFREQ_BOOST_FREQ)) continue; -- Gitee From ad0ed5db8623bf542fc28044b7ae67f854c6d0e6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 1 Sep 2021 14:41:55 +0530 Subject: [PATCH 54/63] cpufreq: acpi: Remove acpi_cpufreq_cpu_ready() mainline inclusion from mainline-v5.15-rc1 commit 692a3b9a89947b27fc76d40b2613b33286a1690b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=692a3b9a89947b27fc76d40b2613b33286a1690b ---------------------------------------------------------------------- The ready() callback was implemented earlier for acpi-cpufreq driver as we wanted to use policy->cpuinfo.max_freq for which the policy was required to be verified. That is no longer the case and we can do the pr_warn() right from ->init() callback now. Remove acpi_cpufreq_cpu_ready(). Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/acpi-cpufreq.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 3f9150909126..d71c5af338ad 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -889,6 +889,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->fast_switch_possible = !acpi_pstate_strict && !(policy_is_shared(policy) && policy->shared_type != CPUFREQ_SHARED_TYPE_ANY); + if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency) + pr_warn(FW_WARN "P-state 0 is not max freq\n"); + return result; err_unreg: @@ -918,16 +921,6 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) return 0; } -static void acpi_cpufreq_cpu_ready(struct cpufreq_policy *policy) -{ - struct acpi_processor_performance *perf = per_cpu_ptr(acpi_perf_data, - policy->cpu); - unsigned int freq = policy->freq_table[0].frequency; - - if (perf->states[0].core_frequency * 1000 != freq) - pr_warn(FW_WARN "P-state 0 is not max freq\n"); -} - static int acpi_cpufreq_resume(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data = policy->driver_data; @@ -955,7 +948,6 @@ static struct cpufreq_driver acpi_cpufreq_driver = { .bios_limit = acpi_processor_get_bios_limit, .init = acpi_cpufreq_cpu_init, .exit = acpi_cpufreq_cpu_exit, - .ready = acpi_cpufreq_cpu_ready, .resume = acpi_cpufreq_resume, .name = "acpi-cpufreq", .attr = acpi_cpufreq_attr, -- Gitee From d0e0b5fcbc25c59b0ace32c299f0522925c6cf24 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Wed, 2 Nov 2022 14:59:57 -0500 Subject: [PATCH 55/63] cpufreq: ACPI: Defer setting boost MSRs mainline inclusion from mainline-v6.2-rc1 commit 13fdbc8b8da6a2325cad3359c9a70504b0ff2f93 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=13fdbc8b8da6a2325cad3359c9a70504b0ff2f93 ---------------------------------------------------------------------- When acpi-cpufreq is loaded, boost is enabled on every CPU (by setting an MSR) before the driver is registered with cpufreq. This can be very time consuming, because it is done with a CPU hotplug startup callback, and cpuhp_setup_state() schedules the callback (cpufreq_boost_online()) to run on each CPU one at a time, waiting for each to run before calling the next. If cpufreq_register_driver() fails--if, for example, there are no ACPI P-states present--this is wasted time. Since cpufreq already sets up a CPU hotplug startup callback if and when acpi-cpufreq is registered, set the boost MSRs in acpi_cpufreq_cpu_init(), which is called by the cpufreq cpuhp callback. This allows acpi-cpufreq to exit quickly if it is loaded but not needed. On one system with 192 CPUs, this patch speeds up boot by about 30 seconds. Signed-off-by: Stuart Hayes Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/acpi-cpufreq.c | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index d71c5af338ad..b99de0999560 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -534,15 +534,6 @@ static void free_acpi_perf_data(void) free_percpu(acpi_perf_data); } -static int cpufreq_boost_online(unsigned int cpu) -{ - /* - * On the CPU_UP path we simply keep the boost-disable flag - * in sync with the current global state. - */ - return boost_set_msr(acpi_cpufreq_driver.boost_enabled); -} - static int cpufreq_boost_down_prep(unsigned int cpu) { /* @@ -892,6 +883,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency) pr_warn(FW_WARN "P-state 0 is not max freq\n"); + set_boost(policy, acpi_cpufreq_driver.boost_enabled); + return result; err_unreg: @@ -911,6 +904,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) pr_debug("%s\n", __func__); + cpufreq_boost_down_prep(policy->cpu); policy->fast_switch_possible = false; policy->driver_data = NULL; acpi_processor_unregister_performance(data->acpi_perf_cpu); @@ -967,25 +961,9 @@ static void __init acpi_cpufreq_boost_init(void) acpi_cpufreq_driver.set_boost = set_boost; acpi_cpufreq_driver.boost_enabled = boost_state(0); - /* - * This calls the online callback on all online cpu and forces all - * MSRs to the same value. - */ - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cpufreq/acpi:online", - cpufreq_boost_online, cpufreq_boost_down_prep); - if (ret < 0) { - pr_err("acpi_cpufreq: failed to register hotplug callbacks\n"); - return; - } acpi_cpufreq_online = ret; } -static void acpi_cpufreq_boost_exit(void) -{ - if (acpi_cpufreq_online > 0) - cpuhp_remove_state_nocalls(acpi_cpufreq_online); -} - static int __init acpi_cpufreq_init(void) { int ret; @@ -1027,7 +1005,6 @@ static int __init acpi_cpufreq_init(void) ret = cpufreq_register_driver(&acpi_cpufreq_driver); if (ret) { free_acpi_perf_data(); - acpi_cpufreq_boost_exit(); } return ret; } @@ -1036,8 +1013,6 @@ static void __exit acpi_cpufreq_exit(void) { pr_debug("%s\n", __func__); - acpi_cpufreq_boost_exit(); - cpufreq_unregister_driver(&acpi_cpufreq_driver); free_acpi_perf_data(); -- Gitee From c9920c880c85c850741d396946e6be090ba98c62 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Mon, 5 Dec 2022 11:57:44 -0600 Subject: [PATCH 56/63] cpufreq: ACPI: Only set boost MSRs on supported CPUs mainline inclusion from mainline-v6.2-rc1 commit 442046328f27e30ce2c830759af89f42e7169bc1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=442046328f27e30ce2c830759af89f42e7169bc1 ---------------------------------------------------------------------- Stop trying to set boost MSRs on CPUs that don't support boost. This corrects a bug in the recent patch "Defer setting boost MSRs". Fixes: 13fdbc8b8da6 ("cpufreq: ACPI: Defer setting boost MSRs") Signed-off-by: Stuart Hayes Reported-by: Borislav Petkov (AMD) Tested-by: Borislav Petkov (AMD) Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/acpi-cpufreq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index b99de0999560..7d74a21f7c8f 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -883,7 +883,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency) pr_warn(FW_WARN "P-state 0 is not max freq\n"); - set_boost(policy, acpi_cpufreq_driver.boost_enabled); + if (acpi_cpufreq_driver.set_boost) + set_boost(policy, acpi_cpufreq_driver.boost_enabled); return result; -- Gitee From c9cc6fbcf85453b6cb0737cb06fe22c629457ccb Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 26 Jun 2024 15:47:23 -0500 Subject: [PATCH 57/63] cpufreq: ACPI: Mark boost policy as enabled when setting boost stable inclusion from stable-v6.6.41 commit 2ca2fd474d862c9b144ea4ed100ee4591f52d574 bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=2ca2fd474d862c9b144ea4ed100ee4591f52d574 -------------------------------- commit d92467ad9d9ee63a700934b9228a989ef671d511 upstream. When boost is set for CPUs using acpi-cpufreq, the policy is not updated which can cause boost to be incorrectly not reported. Fixes: 218a06a79d9a ("cpufreq: Support per-policy performance boost") Link: https://patch.msgid.link/20240626204723.6237-2-mario.limonciello@amd.com Suggested-by: Viresh Kumar Suggested-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello Reviewed-by: Gautham R. Shenoy Acked-by: Viresh Kumar Cc: All applicable Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman Signed-off-by: ZhangPeng Signed-off-by: huwentao --- drivers/cpufreq/acpi-cpufreq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 7d74a21f7c8f..41a3c81614d4 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -883,8 +883,10 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency) pr_warn(FW_WARN "P-state 0 is not max freq\n"); - if (acpi_cpufreq_driver.set_boost) + if (acpi_cpufreq_driver.set_boost) { set_boost(policy, acpi_cpufreq_driver.boost_enabled); + policy->boost_enabled = acpi_cpufreq_driver.boost_enabled; + } return result; -- Gitee From a9c5ca6c5abdc12292953fafdd5d357787715dcf Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Mon, 13 Oct 2025 03:14:35 +0000 Subject: [PATCH 58/63] cpufreq: Initialize cpufreq-based invariance before subsys mainline inclusion from mainline-v6.17-rc4 commit 8ffe28b4e8d8b18cb2f2933410322c24f039d5d6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8ffe28b4e8d8b18cb2f2933410322c24f039d5d6 ---------------------------------------------------------------------- commit 2a6c72738706 ("cpufreq: Initialize cpufreq-based frequency-invariance later") postponed the frequency invariance initialization to avoid disabling it in the error case. This isn't locking safe, instead move the initialization up before the subsys interface is registered (which will rebuild the sched_domains) and add the corresponding disable on the error path. Observed lockdep without this patch: [ 0.989686] ====================================================== [ 0.989688] WARNING: possible circular locking dependency detected [ 0.989690] 6.17.0-rc4-cix-build+ #31 Tainted: G S [ 0.989691] ------------------------------------------------------ [ 0.989692] swapper/0/1 is trying to acquire lock: [ 0.989693] ffff800082ada7f8 (sched_energy_mutex){+.+.}-{4:4}, at: rebuild_sched_domains_energy+0x30/0x58 [ 0.989705] but task is already holding lock: [ 0.989706] ffff000088c89bc8 (&policy->rwsem){+.+.}-{4:4}, at: cpufreq_online+0x7f8/0xbe0 [ 0.989713] which lock already depends on the new lock. Fixes: 2a6c72738706 ("cpufreq: Initialize cpufreq-based frequency-invariance later") Signed-off-by: Christian Loehle Signed-off-by: Rafael J. Wysocki Signed-off-by: Xinyu Zheng Signed-off-by: huwentao --- drivers/cpufreq/cpufreq.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 29b58b143a42..de1cc7a3f26c 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -3100,6 +3100,15 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) goto err_null_driver; } + /* + * Mark support for the scheduler's frequency invariance engine for + * drivers that implement target(), target_index() or fast_switch(). + */ + if (!cpufreq_driver->setpolicy) { + static_branch_enable_cpuslocked(&cpufreq_freq_invariance); + pr_debug("cpufreq: supports frequency invariance\n"); + } + ret = subsys_interface_register(&cpufreq_interface); if (ret) goto err_boost_unreg; @@ -3122,21 +3131,14 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) hp_online = ret; ret = 0; - /* - * Mark support for the scheduler's frequency invariance engine for - * drivers that implement target(), target_index() or fast_switch(). - */ - if (!cpufreq_driver->setpolicy) { - static_branch_enable_cpuslocked(&cpufreq_freq_invariance); - pr_debug("supports frequency invariance"); - } - pr_debug("driver %s up and running\n", driver_data->name); goto out; err_if_unreg: subsys_interface_unregister(&cpufreq_interface); err_boost_unreg: + if (!cpufreq_driver->setpolicy) + static_branch_disable_cpuslocked(&cpufreq_freq_invariance); remove_boost_sysfs_file(); err_null_driver: write_lock_irqsave(&cpufreq_driver_lock, flags); -- Gitee From dc2908e43018869e8d7341aab55f12f1f4566841 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Sat, 28 Dec 2024 06:49:45 +0000 Subject: [PATCH 59/63] cpufreq: CPPC: Fix possible null-ptr-deref for cppc_get_cpu_cost() stable inclusion from stable-v6.6.64 commit f05ef81db63889f6f14eb77fd140dac6cedb6f7f category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/ID1B9U CVE: CVE-2024-53230 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=f05ef81db63889f6f14eb77fd140dac6cedb6f7f -------------------------------- [ Upstream commit 1a1374bb8c5926674973d849feed500bc61ad535 ] cpufreq_cpu_get_raw() may return NULL if the cpu is not in policy->cpus cpu mask and it will cause null pointer dereference, so check NULL for cppc_get_cpu_cost(). Fixes: 740fcdc2c20e ("cpufreq: CPPC: Register EM based on efficiency class information") Signed-off-by: Jinjie Ruan Signed-off-by: Viresh Kumar Signed-off-by: Sasha Levin Signed-off-by: Jinjie Ruan Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 9ad7d2ac2049..37f78688a099 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -579,6 +579,9 @@ static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, int step; policy = cpufreq_cpu_get_raw(cpu_dev->id); + if (!policy) + return 0; + cpu_data = policy->driver_data; perf_caps = &cpu_data->perf_caps; max_cap = arch_scale_cpu_capacity(cpu_dev->id); -- Gitee From 0e7c2cdcae5ab2a8b75bafeaaadcc5f8d754b1a1 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 8 Jan 2025 08:43:35 +0000 Subject: [PATCH 60/63] cpufreq: CPPC: Fix possible null-ptr-deref for cpufreq_cpu_get_raw() stable inclusion from stable-v6.6.64 commit e07570a8f2cfc51260c6266cb8e1bd4777a610d6 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/ID1B9U CVE: CVE-2024-53231 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=e07570a8f2cfc51260c6266cb8e1bd4777a610d6 -------------------------------- cpufreq_cpu_get_raw() may return NULL if the cpu is not in policy->cpus cpu mask and it will cause null pointer dereference. Fixes: 740fcdc2c20e ("cpufreq: CPPC: Register EM based on efficiency class information") Signed-off-by: Jinjie Ruan Signed-off-by: Viresh Kumar Signed-off-by: Yuntao Liu Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 37f78688a099..e5a1c7939fc9 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -512,6 +512,9 @@ static int cppc_get_cpu_power(struct device *cpu_dev, struct cppc_cpudata *cpu_data; policy = cpufreq_cpu_get_raw(cpu_dev->id); + if (!policy) + return 0; + cpu_data = policy->driver_data; perf_caps = &cpu_data->perf_caps; max_cap = arch_scale_cpu_capacity(cpu_dev->id); -- Gitee From 7bcee5d0e7c1920b5e7230767bb9a4732a6e2018 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Mon, 30 May 2022 12:04:24 +0200 Subject: [PATCH 61/63] cpufreq: CPPC: Fix unused-function warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.19-rc1 commit da4363457f777906d49d765398f5227657c82ef9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=da4363457f777906d49d765398f5227657c82ef9 ---------------------------------------------------------------------- Building the cppc_cpufreq driver with for arm64 with CONFIG_ENERGY_MODEL=n triggers the following warnings: drivers/cpufreq/cppc_cpufreq.c:550:12: error: ‘cppc_get_cpu_cost’ defined but not used [-Werror=unused-function] 550 | static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, | ^~~~~~~~~~~~~~~~~ drivers/cpufreq/cppc_cpufreq.c:481:12: error: ‘cppc_get_cpu_power’ defined but not used [-Werror=unused-function] 481 | static int cppc_get_cpu_power(struct device *cpu_dev, | ^~~~~~~~~~~~~~~~~~ Move the Energy Model related functions into specific guards. This allows to fix the warning and prevent doing extra work when the Energy Model is not present. Fixes: 740fcdc2c20e ("cpufreq: CPPC: Register EM based on efficiency class information") Reported-by: Shaokun Zhang Signed-off-by: Pierre Gondois Tested-by: Shaokun Zhang Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index e5a1c7939fc9..824f205e4428 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -462,6 +462,14 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) return delay_us; } +#else +static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) +{ + return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; +} +#endif + +#if defined(CONFIG_ARM64) && defined(CONFIG_ENERGY_MODEL) static DEFINE_PER_CPU(unsigned int, efficiency_class); @@ -645,20 +653,11 @@ static void populate_efficiency_class(void) } #else - -static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) -{ - return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; -} static void populate_efficiency_class(void) { } -static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) -{ -} #endif - static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) { struct cppc_cpudata *cpu_data; -- Gitee From ca818808b91020d5aa29b5332650c7bb760b767d Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Sat, 21 May 2022 11:24:38 +0800 Subject: [PATCH 62/63] cpufreq: CPPC: Fix build error without CONFIG_ACPI_CPPC_CPUFREQ_FIE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v5.19-rc1 commit a3f083e04a87662559bbadddc02f4739b78e743a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=a3f083e04a87662559bbadddc02f4739b78e743a ---------------------------------------------------------------------- If CONFIG_ACPI_CPPC_CPUFREQ_FIE is not set, building fails: drivers/cpufreq/cppc_cpufreq.c: In function ‘populate_efficiency_class’: drivers/cpufreq/cppc_cpufreq.c:584:2: error: ‘cppc_cpufreq_driver’ undeclared (first use in this function); did you mean ‘cpufreq_driver’? cppc_cpufreq_driver.register_em = cppc_cpufreq_register_em; ^~~~~~~~~~~~~~~~~~~ cpufreq_driver Make declare of cppc_cpufreq_driver out of CONFIG_ACPI_CPPC_CPUFREQ_FIE to fix this. Fixes: 740fcdc2c20e ("cpufreq: CPPC: Register EM based on efficiency class information") Signed-off-by: Zheng Bin Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/cpufreq/cppc_cpufreq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 824f205e4428..adf2a2bdacea 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -65,6 +65,8 @@ static enum { FIE_DISABLED } fie_disabled = FIE_UNSET; +static struct cpufreq_driver cppc_cpufreq_driver; + #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE module_param(fie_disabled, int, 0444); MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)"); @@ -81,7 +83,6 @@ struct cppc_freq_invariance { static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv); static struct kthread_worker *kworker_fie; -static struct cpufreq_driver cppc_cpufreq_driver; static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, struct cppc_perf_fb_ctrs *fb_ctrs_t0, struct cppc_perf_fb_ctrs *fb_ctrs_t1); -- Gitee From 30f4a188d0a9fc8364eb8292ebcb210851a50e51 Mon Sep 17 00:00:00 2001 From: huwentao Date: Fri, 17 Oct 2025 18:15:56 +0800 Subject: [PATCH 63/63] openeuler_defconfig: Enable CONFIG_ACPI_CPPC_CPUFREQ_FIE hulk inclusion category: featrue bugzilla: https://gitee.com/openeuler/kernel/issues/ID1B9U -------------------------------- Enable CONFIG_ACPI_CPPC_CPUFREQ_FIE in openeuler_defconfig. Signed-off-by: huwentao --- arch/arm64/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 9d58c2d8eb5e..b5c7f7ec7b6c 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -669,6 +669,7 @@ CONFIG_CPU_FREQ_GOV_SEEP=m # # CONFIG_CPUFREQ_DT is not set CONFIG_ACPI_CPPC_CPUFREQ=y +CONFIG_ACPI_CPPC_CPUFREQ_FIE=y CONFIG_ARM_SCPI_CPUFREQ=m # CONFIG_ARM_QCOM_CPUFREQ_HW is not set # end of CPU Frequency scaling -- Gitee