Re: [PATCH 1/2] platform/x86: Add support for Uncore frequency control
From: Andy Shevchenko
Date: Wed Dec 18 2019 - 11:51:18 EST
On Sat, Dec 14, 2019 at 2:15 AM Srinivas Pandruvada
<srinivas.pandruvada@xxxxxxxxxxxxxxx> wrote:
>
> Some server users set limits on the uncore frequency using MSR 620H while
> running latency-sensitive workloads. Here, the uncore frequency controls
> the RING/LLC (last-level cache) clocks.
>
> But MSR control is not always possible from user space, so this driver
> provides a sysfs interface to set the max and min frequency limits. MSR
> 620H is die-scoped on multi-die systems and package-scoped on non-multi-die
> systems.
>
> When this driver is loaded, a new directory is created under
> /sys/devices/system/cpu.
>
> For example on a two package Skylake server:
> $cd /sys/devices/system/cpu/intel_uncore_frequency
>
> $ls
> package_00_die_00 package_01_die_00
>
> $ls package_00_die_00
> max_freq_khz min_freq_khz power_up_max_freq_khz
> power_up_min_freq_khz
>
> $grep . *
> max_freq_khz:2400000
> min_freq_khz:1200000
> power_up_max_freq_khz:2400000
> power_up_min_freq_khz:1200000
>
> Here, power_up_max_freq_khz and power_up_min_freq_khz are read-only
> attributes that show the power-up values of the max and min frequencies,
> respectively. The other attributes are read-write, so users can modify them.
>
Acked-by: Andy Shevchenko <andy.shevchenko@xxxxxxxxx>
in case it goes through a different (doc?) tree.
> Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@xxxxxxxxxxxxxxx>
> ---
> drivers/platform/x86/Kconfig | 11 +
> drivers/platform/x86/Makefile | 1 +
> drivers/platform/x86/intel-uncore-frequency.c | 434 ++++++++++++++++++
> 3 files changed, 446 insertions(+)
> create mode 100644 drivers/platform/x86/intel-uncore-frequency.c
>
> diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
> index 27d5b40fb717..6013c3b96cfd 100644
> --- a/drivers/platform/x86/Kconfig
> +++ b/drivers/platform/x86/Kconfig
> @@ -1337,6 +1337,17 @@ config PCENGINES_APU2
> To compile this driver as a module, choose M here: the module
> will be called pcengines-apuv2.
>
> +config INTEL_UNCORE_FREQ_CONTROL
> + tristate "Intel Uncore frequency control driver"
> + depends on X86_64
> + help
> + This driver allows control of uncore frequency limits on
> + supported server platforms.
> + Uncore frequency controls RING/LLC (last-level cache) clocks.
> +
> + To compile this driver as a module, choose M here: the module
> + will be called intel-uncore-frequency.
> +
> source "drivers/platform/x86/intel_speed_select_if/Kconfig"
>
> config SYSTEM76_ACPI
> diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
> index 42d85a00be4e..3747b1f07cf1 100644
> --- a/drivers/platform/x86/Makefile
> +++ b/drivers/platform/x86/Makefile
> @@ -105,3 +105,4 @@ obj-$(CONFIG_INTEL_ATOMISP2_PM) += intel_atomisp2_pm.o
> obj-$(CONFIG_PCENGINES_APU2) += pcengines-apuv2.o
> obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += intel_speed_select_if/
> obj-$(CONFIG_SYSTEM76_ACPI) += system76_acpi.o
> +obj-$(CONFIG_INTEL_UNCORE_FREQ_CONTROL) += intel-uncore-frequency.o
> diff --git a/drivers/platform/x86/intel-uncore-frequency.c b/drivers/platform/x86/intel-uncore-frequency.c
> new file mode 100644
> index 000000000000..82ee5a3107cf
> --- /dev/null
> +++ b/drivers/platform/x86/intel-uncore-frequency.c
> @@ -0,0 +1,434 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Intel Uncore Frequency Setting
> + * Copyright (c) 2019, Intel Corporation.
> + * All rights reserved.
> + *
> + * Provide an interface to set MSR 620 at per-die granularity. On CPU online,
> + * one control CPU is identified per die to read/write the limits. This control
> + * CPU is changed if it goes offline. When the last CPU in a die goes offline,
> + * the sysfs object for that die is removed.
> + * The majority of the actual code is related to creating sysfs entries and
> + * reading/writing attributes.
> + *
> + * Author: Srinivas Pandruvada <srinivas.pandruvada@xxxxxxxxxxxxxxx>
> + */
> +
> +#include <linux/cpu.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/suspend.h>
> +#include <asm/cpu_device_id.h>
> +#include <asm/intel-family.h>
> +
> +#define MSR_UNCORE_RATIO_LIMIT 0x620
> +#define UNCORE_FREQ_KHZ_MULTIPLIER 100000
> +
> +/**
> + * struct uncore_data - Encapsulate all uncore data
> + * @stored_uncore_data: Last user changed MSR 620 value, which will be restored
> + * on system resume.
> + * @power_up_min_freq_khz: Sampled minimum uncore frequency at driver init
> + * @power_up_max_freq_khz: Sampled maximum uncore frequency at driver init
> + * @control_cpu: Designated CPU for a die to read/write
> + * @valid: Mark the data valid/invalid
> + *
> + * This structure is used to encapsulate all data related to uncore sysfs
> + * settings for a die/package.
> + */
> +struct uncore_data {
> + struct kobject kobj;
> + u64 stored_uncore_data;
> + u32 power_up_min_freq_khz;
> + u32 power_up_max_freq_khz;
> + int control_cpu;
> + bool valid;
> +};
> +
> +#define to_uncore_data(a) container_of(a, struct uncore_data, kobj)
> +
> +/* Max instances for uncore data, one for each die */
> +static int uncore_max_entries __read_mostly;
> +/* Storage for uncore data for all instances */
> +static struct uncore_data *uncore_instances;
> +/* Root of all the uncore sysfs kobjs */
> +struct kobject uncore_root_kobj;
> +/* Stores the CPU mask of the target CPUs to use during uncore read/write */
> +static cpumask_t uncore_cpu_mask;
> +/* CPU online callback register instance */
> +static enum cpuhp_state uncore_hp_state __read_mostly;
> +/* Mutex to control all mutual exclusions */
> +static DEFINE_MUTEX(uncore_lock);
> +
> +struct uncore_attr {
> + struct attribute attr;
> + ssize_t (*show)(struct kobject *kobj,
> + struct attribute *attr, char *buf);
> + ssize_t (*store)(struct kobject *kobj,
> + struct attribute *attr, const char *c, ssize_t count);
> +};
> +
> +#define define_one_uncore_ro(_name) \
> +static struct uncore_attr _name = \
> +__ATTR(_name, 0444, show_##_name, NULL)
> +
> +#define define_one_uncore_rw(_name) \
> +static struct uncore_attr _name = \
> +__ATTR(_name, 0644, show_##_name, store_##_name)
> +
> +#define show_uncore_data(member_name) \
> + static ssize_t show_##member_name(struct kobject *kobj, \
> + struct attribute *attr, \
> + char *buf) \
> + { \
> + struct uncore_data *data = to_uncore_data(kobj); \
> + return scnprintf(buf, PAGE_SIZE, "%u\n", \
> + data->member_name); \
> + } \
> + define_one_uncore_ro(member_name)
> +
> +show_uncore_data(power_up_min_freq_khz);
> +show_uncore_data(power_up_max_freq_khz);
> +
> +/* Common function to read MSR 0x620 and extract the min/max values */
> +static int uncore_read_ratio(struct uncore_data *data, unsigned int *min,
> + unsigned int *max)
> +{
> + u64 cap;
> + int ret;
> +
> + ret = rdmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, &cap);
> + if (ret)
> + return ret;
> +
> + *max = (cap & 0x7F) * UNCORE_FREQ_KHZ_MULTIPLIER;
> + *min = ((cap & GENMASK(14, 8)) >> 8) * UNCORE_FREQ_KHZ_MULTIPLIER;
> +
> + return 0;
> +}
> +
> +/* Common function to set min/max ratios to be used by sysfs callbacks */
> +static int uncore_write_ratio(struct uncore_data *data, unsigned int input,
> + int set_max)
> +{
> + int ret;
> + u64 cap;
> +
> + mutex_lock(&uncore_lock);
> +
> + input /= UNCORE_FREQ_KHZ_MULTIPLIER;
> + if (!input || input > 0x7F) {
> + ret = -EINVAL;
> + goto finish_write;
> + }
> +
> + rdmsrl(MSR_UNCORE_RATIO_LIMIT, cap);
> + if (set_max) {
> + cap &= ~0x7F;
> + cap |= input;
> + } else {
> + cap &= ~GENMASK(14, 8);
> + cap |= (input << 8);
> + }
> +
> + ret = wrmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, cap);
> + if (ret)
> + goto finish_write;
> +
> + data->stored_uncore_data = cap;
> +
> +finish_write:
> + mutex_unlock(&uncore_lock);
> +
> + return ret;
> +}
> +
> +static ssize_t store_min_max_freq_khz(struct kobject *kobj,
> + struct attribute *attr,
> + const char *buf, ssize_t count,
> + int min_max)
> +{
> + struct uncore_data *data = to_uncore_data(kobj);
> + unsigned int input;
> +
> + if (kstrtouint(buf, 10, &input))
> + return -EINVAL;
> +
> + uncore_write_ratio(data, input, min_max);
> +
> + return count;
> +}
> +
> +static ssize_t show_min_max_freq_khz(struct kobject *kobj,
> + struct attribute *attr,
> + char *buf, int min_max)
> +{
> + struct uncore_data *data = to_uncore_data(kobj);
> + unsigned int min, max;
> + int ret;
> +
> + mutex_lock(&uncore_lock);
> + ret = uncore_read_ratio(data, &min, &max);
> + mutex_unlock(&uncore_lock);
> + if (ret)
> + return ret;
> +
> + if (min_max)
> + return sprintf(buf, "%u\n", max);
> +
> + return sprintf(buf, "%u\n", min);
> +}
> +
> +#define store_uncore_min_max(name, min_max) \
> + static ssize_t store_##name(struct kobject *kobj, \
> + struct attribute *attr, \
> + const char *buf, ssize_t count) \
> + { \
> + \
> + return store_min_max_freq_khz(kobj, attr, buf, count, \
> + min_max); \
> + }
> +
> +#define show_uncore_min_max(name, min_max) \
> + static ssize_t show_##name(struct kobject *kobj, \
> + struct attribute *attr, char *buf) \
> + { \
> + \
> + return show_min_max_freq_khz(kobj, attr, buf, min_max); \
> + }
> +
> +store_uncore_min_max(min_freq_khz, 0);
> +store_uncore_min_max(max_freq_khz, 1);
> +
> +show_uncore_min_max(min_freq_khz, 0);
> +show_uncore_min_max(max_freq_khz, 1);
> +
> +define_one_uncore_rw(min_freq_khz);
> +define_one_uncore_rw(max_freq_khz);
> +
> +static struct attribute *uncore_attrs[] = {
> + &power_up_min_freq_khz.attr,
> + &power_up_max_freq_khz.attr,
> + &max_freq_khz.attr,
> + &min_freq_khz.attr,
> + NULL
> +};
> +
> +static struct kobj_type uncore_ktype = {
> + .sysfs_ops = &kobj_sysfs_ops,
> + .default_attrs = uncore_attrs,
> +};
> +
> +static struct kobj_type uncore_root_ktype = {
> + .sysfs_ops = &kobj_sysfs_ops,
> +};
> +
> +/* Caller provides protection */
> +static struct uncore_data *uncore_get_instance(unsigned int cpu)
> +{
> + int id = topology_logical_die_id(cpu);
> +
> + if (id >= 0 && id < uncore_max_entries)
> + return &uncore_instances[id];
> +
> + return NULL;
> +}
> +
> +static void uncore_add_die_entry(int cpu)
> +{
> + struct uncore_data *data;
> +
> + mutex_lock(&uncore_lock);
> + data = uncore_get_instance(cpu);
> + if (!data) {
> + mutex_unlock(&uncore_lock);
> + return;
> + }
> +
> + if (data->valid) {
> + /* control cpu changed */
> + data->control_cpu = cpu;
> + } else {
> + char str[64];
> + int ret;
> +
> + memset(data, 0, sizeof(*data));
> + sprintf(str, "package_%02d_die_%02d",
> + topology_physical_package_id(cpu),
> + topology_die_id(cpu));
> +
> + uncore_read_ratio(data, &data->power_up_min_freq_khz,
> + &data->power_up_max_freq_khz);
> +
> + ret = kobject_init_and_add(&data->kobj, &uncore_ktype,
> + &uncore_root_kobj, str);
> + if (!ret) {
> + data->control_cpu = cpu;
> + data->valid = true;
> + }
> + }
> + mutex_unlock(&uncore_lock);
> +}
> +
> +/* Last CPU in this die is offline, so remove sysfs entries */
> +static void uncore_remove_die_entry(int cpu)
> +{
> + struct uncore_data *data;
> +
> + mutex_lock(&uncore_lock);
> + data = uncore_get_instance(cpu);
> + if (data) {
> + kobject_put(&data->kobj);
> + data->control_cpu = -1;
> + data->valid = false;
> + }
> + mutex_unlock(&uncore_lock);
> +}
> +
> +static int uncore_event_cpu_online(unsigned int cpu)
> +{
> + int target;
> +
> + /* Check if a control CPU is already assigned for this die */
> + target = cpumask_any_and(&uncore_cpu_mask, topology_die_cpumask(cpu));
> + if (target < nr_cpu_ids)
> + return 0;
> +
> + /* Use this CPU on this die as a control CPU */
> + cpumask_set_cpu(cpu, &uncore_cpu_mask);
> + uncore_add_die_entry(cpu);
> +
> + return 0;
> +}
> +
> +static int uncore_event_cpu_offline(unsigned int cpu)
> +{
> + int target;
> +
> + /* Check if this CPU is the one used for uncore MSR access */
> + if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
> + return 0;
> +
> + /* Find a new cpu to set uncore MSR */
> + target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
> +
> + if (target < nr_cpu_ids) {
> + cpumask_set_cpu(target, &uncore_cpu_mask);
> + uncore_add_die_entry(target);
> + } else {
> + uncore_remove_die_entry(cpu);
> + }
> +
> + return 0;
> +}
> +
> +static int uncore_pm_notify(struct notifier_block *nb, unsigned long mode,
> + void *_unused)
> +{
> + int cpu;
> +
> + switch (mode) {
> + case PM_POST_HIBERNATION:
> + case PM_POST_RESTORE:
> + case PM_POST_SUSPEND:
> + for_each_cpu(cpu, &uncore_cpu_mask) {
> + struct uncore_data *data;
> + int ret;
> +
> + data = uncore_get_instance(cpu);
> + if (!data || !data->valid || !data->stored_uncore_data)
> + continue;
> +
> + ret = wrmsrl_on_cpu(cpu, MSR_UNCORE_RATIO_LIMIT,
> + data->stored_uncore_data);
> + if (ret)
> + return ret;
> + }
> + break;
> + default:
> + break;
> + }
> + return 0;
> +}
> +
> +static struct notifier_block uncore_pm_nb = {
> + .notifier_call = uncore_pm_notify,
> +};
> +
> +#define ICPU(model) { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, }
> +
> +static const struct x86_cpu_id intel_uncore_cpu_ids[] = {
> + ICPU(INTEL_FAM6_BROADWELL_G),
> + ICPU(INTEL_FAM6_BROADWELL_X),
> + ICPU(INTEL_FAM6_BROADWELL_D),
> + ICPU(INTEL_FAM6_SKYLAKE_X),
> + ICPU(INTEL_FAM6_ICELAKE_X),
> + ICPU(INTEL_FAM6_ICELAKE_D),
> + {}
> +};
> +
> +static int __init intel_uncore_init(void)
> +{
> + const struct x86_cpu_id *id;
> + int ret;
> +
> + id = x86_match_cpu(intel_uncore_cpu_ids);
> + if (!id)
> + return -ENODEV;
> +
> + uncore_max_entries = topology_max_packages() *
> + topology_max_die_per_package();
> + uncore_instances = kcalloc(uncore_max_entries,
> + sizeof(*uncore_instances), GFP_KERNEL);
> + if (!uncore_instances)
> + return -ENOMEM;
> +
> + ret = kobject_init_and_add(&uncore_root_kobj, &uncore_root_ktype,
> + &cpu_subsys.dev_root->kobj,
> + "intel_uncore_frequency");
> + if (ret)
> + goto err_free;
> +
> + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> + "platform/x86/uncore-freq:online",
> + uncore_event_cpu_online,
> + uncore_event_cpu_offline);
> + if (ret < 0)
> + goto err_rem_kobj;
> +
> + uncore_hp_state = ret;
> +
> + ret = register_pm_notifier(&uncore_pm_nb);
> + if (ret)
> + goto err_rem_state;
> +
> + return 0;
> +
> +err_rem_state:
> + cpuhp_remove_state(uncore_hp_state);
> +err_rem_kobj:
> + kobject_put(&uncore_root_kobj);
> +err_free:
> + kfree(uncore_instances);
> +
> + return ret;
> +}
> +module_init(intel_uncore_init)
> +
> +static void __exit intel_uncore_exit(void)
> +{
> + int i;
> +
> + unregister_pm_notifier(&uncore_pm_nb);
> + cpuhp_remove_state(uncore_hp_state);
> + for (i = 0; i < uncore_max_entries; ++i) {
> + if (uncore_instances[i].valid)
> + kobject_put(&uncore_instances[i].kobj);
> + }
> + kobject_put(&uncore_root_kobj);
> + kfree(uncore_instances);
> +}
> +module_exit(intel_uncore_exit)
> +
> +MODULE_LICENSE("GPL v2");
> +MODULE_DESCRIPTION("Intel Uncore Frequency Limits Driver");
> --
> 2.17.2
>
--
With Best Regards,
Andy Shevchenko