Re: [linux, dev-4.10, 6/6] drivers/hwmon: Add a driver for a generic PECI hwmon

From: Jae Hyun Yoo
Date: Thu Jan 11 2018 - 14:47:13 EST


On 1/10/2018 1:47 PM, Guenter Roeck wrote:
On Tue, Jan 09, 2018 at 02:31:26PM -0800, Jae Hyun Yoo wrote:
This commit adds driver implementation for a generic PECI hwmon.

Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo@xxxxxxxxxxxxxxx>
---
drivers/hwmon/Kconfig | 6 +
drivers/hwmon/Makefile | 1 +
drivers/hwmon/peci-hwmon.c | 953 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 960 insertions(+)
create mode 100644 drivers/hwmon/peci-hwmon.c

diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 9256dd0..3a62c60 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -1234,6 +1234,12 @@ config SENSORS_NCT7904
This driver can also be built as a module. If so, the module
will be called nct7904.
+config SENSORS_PECI_HWMON
+ tristate "PECI hwmon support"
+ depends on ASPEED_PECI
+ help
+ If you say yes here you get support for the generic PECI hwmon driver.
+
config SENSORS_NSA320
tristate "ZyXEL NSA320 and compatible fan speed and temperature sensors"
depends on GPIOLIB && OF
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index 98000fc..41d43a5 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -131,6 +131,7 @@ obj-$(CONFIG_SENSORS_NCT7802) += nct7802.o
obj-$(CONFIG_SENSORS_NCT7904) += nct7904.o
obj-$(CONFIG_SENSORS_NSA320) += nsa320-hwmon.o
obj-$(CONFIG_SENSORS_NTC_THERMISTOR) += ntc_thermistor.o
+obj-$(CONFIG_SENSORS_PECI_HWMON) += peci-hwmon.o
obj-$(CONFIG_SENSORS_PC87360) += pc87360.o
obj-$(CONFIG_SENSORS_PC87427) += pc87427.o
obj-$(CONFIG_SENSORS_PCF8591) += pcf8591.o
diff --git a/drivers/hwmon/peci-hwmon.c b/drivers/hwmon/peci-hwmon.c
new file mode 100644
index 0000000..2d2a288
--- /dev/null
+++ b/drivers/hwmon/peci-hwmon.c
@@ -0,0 +1,953 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2017 Intel Corporation
+
+#include <linux/delay.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of_platform.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/syscalls.h>
+#include <misc/peci.h>

misc, not linux ? That seems wrong.


You are right. I'll fix it.

+
+#define DEVICE_NAME "peci-hwmon"
+#define HWMON_NAME "peci_hwmon"
+
+#define CPU_ID_MAX 8 /* Max CPU number configured by socket ID */
+#define DIMM_NUMS_MAX 16 /* Max DIMM numbers (channel ranks x 2) */
+#define CORE_NUMS_MAX 28 /* Max core numbers (max on SKX Platinum) */

I won't insist, but it would be better if this were dynamic,
otherwise we'll end up having to increase the defines in the future.


Right. As you said, these values should be manually adjusted in the future if CPU architecture has been changed so better implement it as dynamic. I will check again a way of getting these values from client CPU thru PECI connection.

+#define TEMP_TYPE_PECI 6 /* Sensor type 6: Intel PECI */
+#define CORE_INDEX_OFFSET 100 /* sysfs filename start offset for core temp */
+#define DIMM_INDEX_OFFSET 200 /* sysfs filename start offset for DIMM temp */

Did you test with the "sensors" command to ensure that this works,
with the large gaps in index values ?

Overall, I am not very happy with the indexing. Since each sensor as
a label, it might be better to just make it dynamic.


Okay, that makes sense. Since all attributes has its own label, indexing gap wouldn't be needed even in case of CPU architecture change happens. I'll remove the indexing gap.

+#define TEMP_NAME_HEADER_LEN 4 /* sysfs temp type header length */
+#define OF_DIMM_NUMS_DEFAULT 16 /* default dimm-nums setting */
+
+#define CORE_TEMP_ATTRS 5
+#define DIMM_TEMP_ATTRS 2
+#define ATTR_NAME_LEN 24
+
+#define UPDATE_INTERVAL_MIN HZ
+
+enum sign_t {
+ POS,
+ NEG
+};
+
+struct cpuinfo_t {
+ bool valid;
+ u32 dib;
+ u8 cpuid;
+ u8 platform_id;
+ u32 microcode;
+ u8 logical_thread_nums;
+};
+
+struct temp_data_t {
+ bool valid;
+ s32 value;
+ unsigned long last_updated;
+};
+
+struct temp_group_t {
+ struct temp_data_t tjmax;
+ struct temp_data_t tcontrol;
+ struct temp_data_t tthrottle;
+ struct temp_data_t dts_margin;
+ struct temp_data_t die;
+ struct temp_data_t core[CORE_NUMS_MAX];
+ struct temp_data_t dimm[DIMM_NUMS_MAX];
+};
+
+struct core_temp_attr_group_t {
+ struct sensor_device_attribute sd_attrs[CORE_NUMS_MAX][CORE_TEMP_ATTRS];
+ char attr_name[CORE_NUMS_MAX][CORE_TEMP_ATTRS][ATTR_NAME_LEN];
+ struct attribute *attrs[CORE_NUMS_MAX][CORE_TEMP_ATTRS + 1];
+ struct attribute_group attr_group[CORE_NUMS_MAX];
+};
+
+struct dimm_temp_attr_group_t {
+ struct sensor_device_attribute sd_attrs[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS];
+ char attr_name[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS][ATTR_NAME_LEN];
+ struct attribute *attrs[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS + 1];
+ struct attribute_group attr_group[DIMM_NUMS_MAX];
+};
+
+struct peci_hwmon {
+ struct device *dev;
+ struct device *hwmon_dev;
+ char name[NAME_MAX];
+ const struct attribute_group **groups;
+ struct cpuinfo_t cpuinfo;
+ struct temp_group_t temp;
+ u32 cpu_id;
+ bool show_core;
+ u32 core_nums;
+ u32 dimm_nums;
+ atomic_t core_group_created;
+ struct core_temp_attr_group_t core;
+ struct dimm_temp_attr_group_t dimm;
+};
+
+enum label_t {
+ L_DIE,
+ L_DTS,
+ L_TCONTROL,
+ L_TTHROTTLE,
+ L_MAX
+};
+
+static const char *peci_label[L_MAX] = {
+ "Die temperature\n",
+ "DTS thermal margin to Tcontrol\n",
+ "Tcontrol temperature\n",
+ "Tthrottle temperature\n",

"temperature" is redundant for a temperature label.


Agreed. Will remove the redundant string.

+};
+
+static DEFINE_MUTEX(peci_hwmon_lock);
+
+static int create_core_temp_group(struct peci_hwmon *priv, int core_no);

Please avoid forward declarations.


Will fix it.

+
+

Please run your patches throuch checkpatch --strict and fix what it reports,
or provide a reason why you don't.


Thanks for the tip. I didn't know about the --strict option before. Checked that the option reports more check points such as this multiple blank line case. I will run all patches again using --strict option.

+static int xfer_peci_msg(int cmd, void *pmsg)
+{
+ int rc;
+
+ mutex_lock(&peci_hwmon_lock);
+ rc = peci_ioctl(NULL, cmd, (unsigned long)pmsg);
+ mutex_unlock(&peci_hwmon_lock);
+
+ return rc;
+}
+
+static int get_cpuinfo(struct peci_hwmon *priv)
+{
+ struct peci_get_dib_msg dib_msg;
+ struct peci_rd_pkg_cfg_msg cfg_msg;
+ int rc, i;
+
+ if (!priv->cpuinfo.valid) {
+ dib_msg.target = PECI_BASE_ADDR + priv->cpu_id;
+
+ rc = xfer_peci_msg(PECI_IOC_GET_DIB, (void *)&dib_msg);
+ if (rc < 0)
+ return rc;
+
+ priv->cpuinfo.dib = dib_msg.dib;
+
+ cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
+ cfg_msg.index = MBX_INDEX_CPU_ID;
+ cfg_msg.param = 0;
+ cfg_msg.rx_len = 4;
+
+ rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
+ if (rc < 0)
+ return rc;
+
+ priv->cpuinfo.cpuid = cfg_msg.pkg_config[0];
+
+ cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
+ cfg_msg.index = MBX_INDEX_CPU_ID;
+ cfg_msg.param = 1;
+ cfg_msg.rx_len = 4;
+
+ rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
+ if (rc < 0)
+ return rc;
+
+ priv->cpuinfo.platform_id = cfg_msg.pkg_config[0];
+
+ cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
+ cfg_msg.index = MBX_INDEX_CPU_ID;
+ cfg_msg.param = 3;
+ cfg_msg.rx_len = 4;
+
+ rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
+ if (rc < 0)
+ return rc;
+
+ priv->cpuinfo.logical_thread_nums = cfg_msg.pkg_config[0] + 1;
+
+ cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
+ cfg_msg.index = MBX_INDEX_CPU_ID;
+ cfg_msg.param = 4;
+ cfg_msg.rx_len = 4;
+
+ rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
+ if (rc < 0)
+ return rc;
+
+ priv->cpuinfo.microcode = (cfg_msg.pkg_config[3] << 24) |
+ (cfg_msg.pkg_config[2] << 16) |
+ (cfg_msg.pkg_config[1] << 8) |
+ cfg_msg.pkg_config[0];
+
+ priv->core_nums = priv->cpuinfo.logical_thread_nums / 2;

This seems to assume a 1:2 relationship between number of threads and
number of CPUs, which is incorrect.


You are right. This can't cover all CPUs. Will find a proper way.

+
+ if (priv->show_core &&
+ atomic_inc_return(&priv->core_group_created) == 1) {
+ for (i = 0; i < priv->core_nums; i++) {
+ rc = create_core_temp_group(priv, i);

This is messy. Sensor groups should be created before or during
hwmon registration, not at some arbitrary later time.

I don't know the logic behind this, but if it is supposed to track CPUs
coming online and going offline it is the wrong approach.


Agreed. This driver wouldn't make communication with CPUs at all if CPUs are powered down so we don't need to leave this driver as inserted. As you commented below, if a agent does insert/remove this driver module while tracking CPU power state, this delayed creation logic wouldn't be needed. I will rewrite it.

+ if (rc != 0) {
+ dev_err(priv->dev,
+ "Failed to create core temp group\n");
+ for (--i; i >= 0; i--) {
+ sysfs_remove_group(
+ &priv->hwmon_dev->kobj,
+ &priv->core.attr_group[i]);
+ }
+ atomic_set(&priv->core_group_created,
+ 0);
+ return rc;
+ }
+ }
+ }
+
+ priv->cpuinfo.valid = true;
+ }
+
+ return 0;
+}
+
+static int get_tjmax(struct peci_hwmon *priv)
+{
+ struct peci_rd_pkg_cfg_msg msg;
+ int rc;
+
+ rc = get_cpuinfo(priv);
+ if (rc < 0)
+ return rc;
+
+ if (!priv->temp.tjmax.valid) {
+ msg.target = PECI_BASE_ADDR + priv->cpu_id;
+ msg.index = MBX_INDEX_TEMP_TARGET;
+ msg.param = 0;
+ msg.rx_len = 4;
+
+ rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
+ if (rc < 0)
+ return rc;
+
+ priv->temp.tjmax.value = (s32)msg.pkg_config[2] * 1000;
+ priv->temp.tjmax.valid = true;
+ }
+
+ return 0;
+}
+
+static int get_tcontrol(struct peci_hwmon *priv)
+{
+ struct peci_rd_pkg_cfg_msg msg;
+ s32 tcontrol_margin;
+ int rc;
+
+ if (priv->temp.tcontrol.valid &&
+ time_before(jiffies, priv->temp.tcontrol.last_updated +
+ UPDATE_INTERVAL_MIN))
+ return 0;
+

Is the delay necessary ? Otherwise I would suggest to drop it.
It adds a lot of complexity to the driver. Also, if the user polls
values more often, that is presumably on purpose.


I was intended to reduce traffic on PECI bus because it's low speed single wired bus, and temperature values don't change frequently because the value is sampled and averaged in CPU itself. I'll keep this.

+ rc = get_tjmax(priv);
+ if (rc < 0)
+ return rc;
+
+ msg.target = PECI_BASE_ADDR + priv->cpu_id;
+ msg.index = MBX_INDEX_TEMP_TARGET;
+ msg.param = 0;
+ msg.rx_len = 4;
+
+ rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
+ if (rc < 0)
+ return rc;
+
+ tcontrol_margin = msg.pkg_config[1];
+ tcontrol_margin = ((tcontrol_margin ^ 0x80) - 0x80) * 1000;
+
+ priv->temp.tcontrol.value = priv->temp.tjmax.value - tcontrol_margin;
+
+ if (!priv->temp.tcontrol.valid) {
+ priv->temp.tcontrol.last_updated = INITIAL_JIFFIES;
+ priv->temp.tcontrol.valid = true;
+ } else {
+ priv->temp.tcontrol.last_updated = jiffies;
+ }
+
+ return 0;
+}
+
+static int get_tthrottle(struct peci_hwmon *priv)
+{
+ struct peci_rd_pkg_cfg_msg msg;
+ s32 tthrottle_offset;
+ int rc;
+
+ if (priv->temp.tthrottle.valid &&
+ time_before(jiffies, priv->temp.tthrottle.last_updated +
+ UPDATE_INTERVAL_MIN))
+ return 0;
+
+ rc = get_tjmax(priv);
+ if (rc < 0)
+ return rc;
+
+ msg.target = PECI_BASE_ADDR + priv->cpu_id;
+ msg.index = MBX_INDEX_TEMP_TARGET;
+ msg.param = 0;
+ msg.rx_len = 4;
+
+ rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
+ if (rc < 0)
+ return rc;
+
+ tthrottle_offset = (msg.pkg_config[3] & 0x2f) * 1000;
+ priv->temp.tthrottle.value = priv->temp.tjmax.value - tthrottle_offset;
+
+ if (!priv->temp.tthrottle.valid) {
+ priv->temp.tthrottle.last_updated = INITIAL_JIFFIES;
+ priv->temp.tthrottle.valid = true;
+ } else {
+ priv->temp.tthrottle.last_updated = jiffies;
+ }
+
+ return 0;
+}
+
+static int get_die_temp(struct peci_hwmon *priv)
+{
+ struct peci_get_temp_msg msg;
+ int rc;
+
+ if (priv->temp.die.valid &&
+ time_before(jiffies, priv->temp.die.last_updated +
+ UPDATE_INTERVAL_MIN))
+ return 0;
+
+ rc = get_tjmax(priv);
+ if (rc < 0)
+ return rc;
+
+ msg.target = PECI_BASE_ADDR + priv->cpu_id;
+
+ rc = xfer_peci_msg(PECI_IOC_GET_TEMP, (void *)&msg);
+ if (rc < 0)
+ return rc;
+
+ priv->temp.die.value = priv->temp.tjmax.value +
+ ((s32)msg.temp_raw * 1000 / 64);
+
+ if (!priv->temp.die.valid) {
+ priv->temp.die.last_updated = INITIAL_JIFFIES;
+ priv->temp.die.valid = true;
+ } else {
+ priv->temp.die.last_updated = jiffies;
+ }
+
+ return 0;
+}
+
+static int get_dts_margin(struct peci_hwmon *priv)
+{
+ struct peci_rd_pkg_cfg_msg msg;
+ s32 dts_margin;
+ int rc;
+
+ if (priv->temp.dts_margin.valid &&
+ time_before(jiffies, priv->temp.dts_margin.last_updated +
+ UPDATE_INTERVAL_MIN))
+ return 0;
+
Are all those values expected to change dynamically, or are some static ?
Static values do not have to be re-read repeatedly but can be cached
permanently.


All values expected to change dynamically except the Tjmax value which is fused in CPU package, but the Tjmax varies on each CPU package so we need to read the Tjmax at least once. Current implementation uses cached Tjmax value after the first reading on the value.

+ rc = get_cpuinfo(priv);
+ if (rc < 0)
+ return rc;
+
+ msg.target = PECI_BASE_ADDR + priv->cpu_id;
+ msg.index = MBX_INDEX_DTS_MARGIN;
+ msg.param = 0;
+ msg.rx_len = 4;
+
+ rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
+ if (rc < 0)
+ return rc;
+
+ dts_margin = (msg.pkg_config[1] << 8) | msg.pkg_config[0];
+
+ /*
+ * Processors return a value of DTS reading in 10.6 format
+ * (10 bits signed decimal, 6 bits fractional).
+ * Error codes:
+ * 0x8000: General sensor error
+ * 0x8001: Reserved
+ * 0x8002: Underflow on reading value
+ * 0x8003-0x81ff: Reserved
+ */
+ if (dts_margin >= 0x8000 && dts_margin <= 0x81ff)
+ return -1;
+
+ dts_margin = ((dts_margin ^ 0x8000) - 0x8000) * 1000 / 64;
+
The above code is repeated several times. Please consider moving it
into a function to reduce duplication.


Okay. I will move it to a function.

+ priv->temp.dts_margin.value = dts_margin;
+
+ if (!priv->temp.dts_margin.valid) {
+ priv->temp.dts_margin.last_updated = INITIAL_JIFFIES;
+ priv->temp.dts_margin.valid = true;
+ } else {
+ priv->temp.dts_margin.last_updated = jiffies;
+ }
+
+ return 0;
+}
+
+static int get_core_temp(struct peci_hwmon *priv, int core_index)
+{
+ struct peci_rd_pkg_cfg_msg msg;
+ s32 core_dts_margin;
+ int rc;
+
+ if (priv->temp.core[core_index].valid &&
+ time_before(jiffies, priv->temp.core[core_index].last_updated +
+ UPDATE_INTERVAL_MIN))
+ return 0;
+
+ rc = get_tjmax(priv);
+ if (rc < 0)
+ return rc;
+
+ msg.target = PECI_BASE_ADDR + priv->cpu_id;
+ msg.index = MBX_INDEX_PER_CORE_DTS_TEMP;
+ msg.param = core_index;
+ msg.rx_len = 4;
+
+ rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
+ if (rc < 0)
+ return rc;
+
+ core_dts_margin = (msg.pkg_config[1] << 8) | msg.pkg_config[0];
+
+ /*
+ * Processors return a value of the core DTS reading in 10.6 format
+ * (10 bits signed decimal, 6 bits fractional).
+ * Error codes:
+ * 0x8000: General sensor error
+ * 0x8001: Reserved
+ * 0x8002: Underflow on reading value
+ * 0x8003-0x81ff: Reserved
+ */
+ if (core_dts_margin >= 0x8000 && core_dts_margin <= 0x81ff)
+ return -1;
+
+ core_dts_margin = ((core_dts_margin ^ 0x8000) - 0x8000) * 1000 / 64;
+
+ priv->temp.core[core_index].value = priv->temp.tjmax.value +
+ core_dts_margin;
+
+ if (!priv->temp.core[core_index].valid) {
+ priv->temp.core[core_index].last_updated = INITIAL_JIFFIES;
+ priv->temp.core[core_index].valid = true;
+ } else {
+ priv->temp.core[core_index].last_updated = jiffies;
+ }
+
+ return 0;
+}
+
+static int get_dimm_temp(struct peci_hwmon *priv, int dimm_index)
+{
+ struct peci_rd_pkg_cfg_msg msg;
+ int channel_rank = dimm_index / 2;
+ int dimm_order = dimm_index % 2;
+ int rc;
+
+ if (priv->temp.core[dimm_index].valid &&
+ time_before(jiffies, priv->temp.core[dimm_index].last_updated +
+ UPDATE_INTERVAL_MIN))
+ return 0;
+
+ rc = get_cpuinfo(priv);
+ if (rc < 0)
+ return rc;
+
+ msg.target = PECI_BASE_ADDR + priv->cpu_id;
+ msg.index = MBX_INDEX_DDR_DIMM_TEMP;
+ msg.param = channel_rank;
+ msg.rx_len = 4;
+
+ rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
+ if (rc < 0)
+ return rc;
+
+ priv->temp.dimm[dimm_index].value = msg.pkg_config[dimm_order] * 1000;
+
+ if (!priv->temp.dimm[dimm_index].valid) {
+ priv->temp.dimm[dimm_index].last_updated = INITIAL_JIFFIES;
+ priv->temp.dimm[dimm_index].valid = true;
+ } else {
+ priv->temp.dimm[dimm_index].last_updated = jiffies;
+ }
+
+ return 0;
+}
+
+static ssize_t show_info(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct peci_hwmon *priv = dev_get_drvdata(dev);
+ int rc;
+
+ rc = get_cpuinfo(priv);
+ if (rc < 0)
+ return rc;
+
+ return sprintf(buf, "dib : 0x%08x\n"
+ "cpuid : 0x%x\n"
+ "platform id : %d\n"
+ "stepping : %d\n"
+ "microcode : 0x%08x\n"
+ "logical thread nums : %d\n",
+ priv->cpuinfo.dib,
+ priv->cpuinfo.cpuid,
+ priv->cpuinfo.platform_id,
+ priv->cpuinfo.cpuid & 0xf,
+ priv->cpuinfo.microcode,
+ priv->cpuinfo.logical_thread_nums);
+}

Please no non-standard attributes, much less attributes not following sysfs
attribute rules. If you want to display such information, consider using
debugfs. Besides, this information specifically appears to duplicate
the content of /proc/cpuid, which doesn't really add any value at all.


Got it. Will drop this non-standard attribute.

+
+static ssize_t show_tcontrol(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct peci_hwmon *priv = dev_get_drvdata(dev);
+ int rc;
+
+ rc = get_tcontrol(priv);
+ if (rc < 0)
+ return rc;
+
+ return sprintf(buf, "%d\n", priv->temp.tcontrol.value);
+}
+
+static ssize_t show_tcontrol_margin(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct peci_hwmon *priv = dev_get_drvdata(dev);
+ struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+ int rc;
+
+ rc = get_tcontrol(priv);
+ if (rc < 0)
+ return rc;
+
+ return sprintf(buf, "%d\n", sensor_attr->index == POS ?
+ priv->temp.tjmax.value -
+ priv->temp.tcontrol.value :
+ priv->temp.tcontrol.value -
+ priv->temp.tjmax.value);
+}
+
+static ssize_t show_tthrottle(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct peci_hwmon *priv = dev_get_drvdata(dev);
+ int rc;
+
+ rc = get_tthrottle(priv);
+ if (rc < 0)
+ return rc;
+
+ return sprintf(buf, "%d\n", priv->temp.tthrottle.value);
+}
+
+static ssize_t show_tjmax(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct peci_hwmon *priv = dev_get_drvdata(dev);
+ int rc;
+
+ rc = get_tjmax(priv);
+ if (rc < 0)
+ return rc;
+
+ return sprintf(buf, "%d\n", priv->temp.tjmax.value);
+}
+
+static ssize_t show_die_temp(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct peci_hwmon *priv = dev_get_drvdata(dev);
+ int rc;
+
+ rc = get_die_temp(priv);
+ if (rc < 0)
+ return rc;
+
+ return sprintf(buf, "%d\n", priv->temp.die.value);
+}
+
+static ssize_t show_dts_therm_margin(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct peci_hwmon *priv = dev_get_drvdata(dev);
+ int rc;
+
+ rc = get_dts_margin(priv);
+ if (rc < 0)
+ return rc;
+
+ return sprintf(buf, "%d\n", priv->temp.dts_margin.value);
+}
+
+static ssize_t show_core_temp(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct peci_hwmon *priv = dev_get_drvdata(dev);
+ struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+ int core_index = sensor_attr->index;
+ int rc;
+
+ rc = get_core_temp(priv, core_index);
+ if (rc < 0)
+ return rc;
+
+ return sprintf(buf, "%d\n", priv->temp.core[core_index].value);
+}
+
+static ssize_t show_dimm_temp(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct peci_hwmon *priv = dev_get_drvdata(dev);
+ struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+ int dimm_index = sensor_attr->index;
+ int rc;
+
+ rc = get_dimm_temp(priv, dimm_index);
+ if (rc < 0)
+ return rc;
+
+ return sprintf(buf, "%d\n", priv->temp.dimm[dimm_index].value);
+}
+
+static ssize_t show_value(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+
+ return sprintf(buf, "%d\n", sensor_attr->index);
+}
+
+static ssize_t show_label(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+
+ return sprintf(buf, peci_label[sensor_attr->index]);
+}
+
+static ssize_t show_core_label(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+
+ return sprintf(buf, "Core #%d temperature\n", sensor_attr->index);
+}

Your label strings are quite long. How does that look like with the
sensors command ?

Plus, again, "temperature" in a temperature label is redundant.


Agreed. Will remove the redundant string.

+
+static ssize_t show_dimm_label(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
+
+ char channel = 'A' + (sensor_attr->index / 2);
+ int index = sensor_attr->index % 2;
+
+ return sprintf(buf, "Channel Rank %c DDR DIMM #%d temperature\n",
+ channel, index);
+}
+
+/* Die temperature */
+static SENSOR_DEVICE_ATTR(temp1_label, 0444, show_label, NULL, L_DIE);
+static SENSOR_DEVICE_ATTR(temp1_input, 0444, show_die_temp, NULL, 0);
+static SENSOR_DEVICE_ATTR(temp1_max, 0444, show_tcontrol, NULL, 0);
+static SENSOR_DEVICE_ATTR(temp1_crit, 0444, show_tjmax, NULL, 0);
+static SENSOR_DEVICE_ATTR(temp1_crit_hyst, 0444, show_tcontrol_margin, NULL,
+ POS);
+
+static struct attribute *die_temp_attrs[] = {
+ &sensor_dev_attr_temp1_label.dev_attr.attr,
+ &sensor_dev_attr_temp1_input.dev_attr.attr,
+ &sensor_dev_attr_temp1_max.dev_attr.attr,
+ &sensor_dev_attr_temp1_crit.dev_attr.attr,
+ &sensor_dev_attr_temp1_crit_hyst.dev_attr.attr,
+ NULL
+};
+
+static const struct attribute_group die_temp_attr_group = {
+ .attrs = die_temp_attrs,
+};
+
+/* DTS thermal margin temperature */
+static SENSOR_DEVICE_ATTR(temp2_label, 0444, show_label, NULL, L_DTS);
+static SENSOR_DEVICE_ATTR(temp2_input, 0444, show_dts_therm_margin, NULL, 0);
+static SENSOR_DEVICE_ATTR(temp2_min, 0444, show_value, NULL, 0);
+static SENSOR_DEVICE_ATTR(temp2_lcrit, 0444, show_tcontrol_margin, NULL, NEG);
+
+static struct attribute *dts_margin_temp_attrs[] = {
+ &sensor_dev_attr_temp2_label.dev_attr.attr,
+ &sensor_dev_attr_temp2_input.dev_attr.attr,
+ &sensor_dev_attr_temp2_min.dev_attr.attr,
+ &sensor_dev_attr_temp2_lcrit.dev_attr.attr,
+ NULL
+};
+
+static const struct attribute_group dts_margin_temp_attr_group = {
+ .attrs = dts_margin_temp_attrs,
+};
+
+/* Tcontrol temperature */
+static SENSOR_DEVICE_ATTR(temp3_label, 0444, show_label, NULL, L_TCONTROL);
+static SENSOR_DEVICE_ATTR(temp3_input, 0444, show_tcontrol, NULL, 0);
+static SENSOR_DEVICE_ATTR(temp3_crit, 0444, show_tjmax, NULL, 0);
+
+static struct attribute *tcontrol_temp_attrs[] = {
+ &sensor_dev_attr_temp3_label.dev_attr.attr,
+ &sensor_dev_attr_temp3_input.dev_attr.attr,
+ &sensor_dev_attr_temp3_crit.dev_attr.attr,
+ NULL
+};
+
+static const struct attribute_group tcontrol_temp_attr_group = {
+ .attrs = tcontrol_temp_attrs,
+};
+
+/* Tthrottle temperature */
+static SENSOR_DEVICE_ATTR(temp4_label, 0444, show_label, NULL, L_TTHROTTLE);
+static SENSOR_DEVICE_ATTR(temp4_input, 0444, show_tthrottle, NULL, 0);
+
+static struct attribute *tthrottle_temp_attrs[] = {
+ &sensor_dev_attr_temp4_label.dev_attr.attr,
+ &sensor_dev_attr_temp4_input.dev_attr.attr,
+ NULL
+};
+
+static const struct attribute_group tthrottle_temp_attr_group = {
+ .attrs = tthrottle_temp_attrs,
+};
+
+/* CPU info */
+static SENSOR_DEVICE_ATTR(info, 0444, show_info, NULL, 0);
+
+static struct attribute *info_attrs[] = {
+ &sensor_dev_attr_info.dev_attr.attr,
+ NULL
+};
+
+static const struct attribute_group info_attr_group = {
+ .attrs = info_attrs,
+};
+
+const struct attribute_group *peci_hwmon_attr_groups[] = {
+ &info_attr_group,
+ &die_temp_attr_group,
+ &dts_margin_temp_attr_group,
+ &tcontrol_temp_attr_group,
+ &tthrottle_temp_attr_group,
+ NULL
+};
+
+static ssize_t (*const core_show_fn[CORE_TEMP_ATTRS]) (struct device *dev,
+ struct device_attribute *devattr, char *buf) = {
+ show_core_label,
+ show_core_temp,
+ show_tcontrol,
+ show_tjmax,
+ show_tcontrol_margin,
+};
+
+static const char *const core_suffix[CORE_TEMP_ATTRS] = {
+ "label",
+ "input",
+ "max",
+ "crit",
+ "crit_hyst",
+};
+
+static int create_core_temp_group(struct peci_hwmon *priv, int core_no)
+{
+ int i;
+
+ for (i = 0; i < CORE_TEMP_ATTRS; i++) {
+ snprintf(priv->core.attr_name[core_no][i],
+ ATTR_NAME_LEN, "temp%d_%s",
+ CORE_INDEX_OFFSET + core_no, core_suffix[i]);
+ sysfs_attr_init(
+ &priv->core.sd_attrs[core_no][i].dev_attr.attr);
+ priv->core.sd_attrs[core_no][i].dev_attr.attr.name =
+ priv->core.attr_name[core_no][i];
+ priv->core.sd_attrs[core_no][i].dev_attr.attr.mode = 0444;
+ priv->core.sd_attrs[core_no][i].dev_attr.show = core_show_fn[i];
+ if (i == 0 || i == 1) /* label or temp */
+ priv->core.sd_attrs[core_no][i].index = core_no;
+ priv->core.attrs[core_no][i] =
+ &priv->core.sd_attrs[core_no][i].dev_attr.attr;
+ }
+
+ priv->core.attr_group[core_no].attrs = priv->core.attrs[core_no];
+
+ return sysfs_create_group(&priv->hwmon_dev->kobj,
+ &priv->core.attr_group[core_no]);
+}
+
+static ssize_t (*const dimm_show_fn[DIMM_TEMP_ATTRS]) (struct device *dev,
+ struct device_attribute *devattr, char *buf) = {
+ show_dimm_label,
+ show_dimm_temp,
+};
+
+static const char *const dimm_suffix[DIMM_TEMP_ATTRS] = {
+ "label",
+ "input",
+};
+
+static int create_dimm_temp_group(struct peci_hwmon *priv, int dimm_no)
+{
+ int i;
+
+ for (i = 0; i < DIMM_TEMP_ATTRS; i++) {
+ snprintf(priv->dimm.attr_name[dimm_no][i],
+ ATTR_NAME_LEN, "temp%d_%s",
+ DIMM_INDEX_OFFSET + dimm_no, dimm_suffix[i]);
+ sysfs_attr_init(&priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr);
+ priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr.name =
+ priv->dimm.attr_name[dimm_no][i];
+ priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr.mode = 0444;
+ priv->dimm.sd_attrs[dimm_no][i].dev_attr.show = dimm_show_fn[i];
+ priv->dimm.sd_attrs[dimm_no][i].index = dimm_no;
+ priv->dimm.attrs[dimm_no][i] =
+ &priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr;
+ }
+
+ priv->dimm.attr_group[dimm_no].attrs = priv->dimm.attrs[dimm_no];
+
+ return sysfs_create_group(&priv->hwmon_dev->kobj,
+ &priv->dimm.attr_group[dimm_no]);
+}
+
+static int peci_hwmon_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct device_node *np = dev->of_node;
+ struct peci_hwmon *priv;
+ struct device *hwmon;
+ int rc, i;
+
+ priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ dev_set_drvdata(dev, priv);
+ priv->dev = dev;
+
+ rc = of_property_read_u32(np, "cpu-id", &priv->cpu_id);

What entity determines cpu-id ?


CPU ID numbering is determined by hardware SOCKET_ID strap pins. In this driver implementation, cpu-id is being used as CPU client indexing.

+ if (rc || priv->cpu_id >= CPU_ID_MAX) {
+ dev_err(dev, "Invalid cpu-id configuration\n");
+ return rc;
+ }
+
+ rc = of_property_read_u32(np, "dimm-nums", &priv->dimm_nums);

This is an odd devicetree attribute. Normally the number of DIMMs
is dynamic. Isn't there a means to get all that information dynamically
instead of having to set it through devicetree ? What if someone adds
or removes a DIMM ? Who updates the devicetree ?


It means the number of DIMM slots each CPU has, doesn't mean the number of currently installed DIMM components. If a DIMM is inserted a slot, CPU reports its actual temperature but on empty slot, CPU reports 0 instead of reporting an error so it is the reason why this driver enumerates all DIMM slots' attribute.

+ if (rc || priv->dimm_nums > DIMM_NUMS_MAX) {
+ dev_warn(dev, "Invalid dimm-nums : %u. Use default : %u\n",
+ priv->dimm_nums, OF_DIMM_NUMS_DEFAULT);
+ priv->dimm_nums = OF_DIMM_NUMS_DEFAULT;
+ }
+
+ priv->show_core = of_property_read_bool(np, "show-core");

This does not look like an appropriate devicetree attribute.


Okay. I will remove this devicetree attribute and make this driver enable core temperature attributes always.

+
+ priv->groups = peci_hwmon_attr_groups;
+

This assignment (and the ->groups variable) is quite pointless.


Will fix it.

+ snprintf(priv->name, NAME_MAX, HWMON_NAME ".cpu%d", priv->cpu_id);
+
+ hwmon = devm_hwmon_device_register_with_groups(dev,
+ priv->name,
+ priv, priv->groups);

Please rewrite the driver to use devm_hwmon_device_register_with_info(),
and avoid dynamic attributes.


I will rewrite it.

+
+ rc = PTR_ERR_OR_ZERO(hwmon);
+ if (rc != 0) {
+ dev_err(dev, "Failed to register peci hwmon\n");
+ return rc;
+ }
+
+ priv->hwmon_dev = hwmon;

Something is logically wrong if you need to store hwmon_dev in the
private data structure. Specifically, creating attributes dynamically
after hwmon registration is wrong.

+
+ for (i = 0; i < priv->dimm_nums; i++) {
+ rc = create_dimm_temp_group(priv, i);

No. See earlier comments. All attribute groups must be created during
registration (or before, but I am not inclined to accept a new driver
doing that).

+ if (rc != 0) {
+ dev_err(dev, "Failed to create dimm temp group\n");
+ for (--i; i >= 0; i--) {
+ sysfs_remove_group(&priv->hwmon_dev->kobj,
+ &priv->dimm.attr_group[i]);
+ }
+ return rc;
+ }
+ }
+
+ /*
+ * Try to create core temp group now. It will be created if CPU is
+ * curretnly online or it will be created after the first reading of
+ * cpuinfo from the online CPU otherwise.

This is not how CPUs are supposed to be detected, and it does not handle CPUs
taken offline. If the driver is instantiated as a CPU comes online, or as it
goes offline, the driver should use the appropriate kernel interfaces to
trigger that instantiation or removal. However, if so, it may be inappropriate
to associate CPU temperatures with other system temperatures in the same
instance of the driver; after all, those are all independent of each other.

Overall, I suspect that there should be a callback or some other mechanism
in the peci core to trigger instantiation and removal of this driver, and
I am not sure if any of the devicetree properties makes sense at all.

For example, if an instance of this driver is associated with a PECI
agent (with assorted CPU/DIMM temperature reporting), the instantiation
could be triggered as soon as the PECI core detects that the agent is
available, and the PECI core could report what exactly that instance
supports.


Thanks for sharing your detailed comment. In fact, PECI doesn't have any mechanism to get a callback for checking whether CPU is online or not. The only way PECI can do is, polling a CPU using the Ping PECI command. Also, a BMC controller can't make any PECI communication with offline CPU so this implementation uses delayed creation for some attributes which is messy as you said.

Like you suggested, a PECI agent could check CPU power state using some other mechanism and this driver module could be dynamically inserted/removed by the agent according to the CPU power state. This way could make all attributes creation possible at probing time so no need to use the delayed creation.

Also, I'll check feasibility of dynamic checking for maximum supportable numbers on each component type so that we can dynamically set the values as you suggested above.

+ */
+ if (priv->show_core)
+ (void) get_cpuinfo(priv);
+
+ dev_info(dev, "peci hwmon for CPU#%d registered\n", priv->cpu_id);

Is this logging noise necessary ? Besides, some of it is redundant.


No, it isn't necessary. I will remove it.

+
+ return rc;
+}
+
+static int peci_hwmon_remove(struct platform_device *pdev)
+{
+ struct peci_hwmon *priv = dev_get_drvdata(&pdev->dev);
+ int i;
+
+ if (atomic_read(&priv->core_group_created))
+ for (i = 0; i < priv->core_nums; i++) {
+ sysfs_remove_group(&priv->hwmon_dev->kobj,
+ &priv->core.attr_group[i]);
+ }
+
+ for (i = 0; i < priv->dimm_nums; i++) {
+ sysfs_remove_group(&priv->hwmon_dev->kobj,
+ &priv->dimm.attr_group[i]);
+ }

If you need to call sysfs_remove_group from here,
something is conceptually wrong in your driver.


I'll remove it while rewriting the driver.

+
+ return 0;
+}
+
+static const struct of_device_id peci_of_table[] = {
+ { .compatible = "peci-hwmon", },

This does not look like a reference to some piece of hardware.


This driver provides generic PECI hwmon function to which controller has PECI HW such as Aspeed or Nuvoton BMC chip so it's not dependant on a specific hardware. Should I remove this or any suggestion?

+ { }
+};
+MODULE_DEVICE_TABLE(of, peci_of_table);
+
+static struct platform_driver peci_hwmon_driver = {
+ .probe = peci_hwmon_probe,
+ .remove = peci_hwmon_remove,
+ .driver = {
+ .name = DEVICE_NAME,
+ .of_match_table = peci_of_table,
+ },
+};
+
+module_platform_driver(peci_hwmon_driver);
+MODULE_AUTHOR("Jae Hyun Yoo <jae.hyun.yoo@xxxxxxxxxxxxxxx>");
+MODULE_DESCRIPTION("PECI hwmon driver");
+MODULE_LICENSE("GPL v2");