[PATCH 7/7] thermal: stats: add error accounting to thermal zone

From: Eduardo Valentin
Date: Thu May 18 2023 - 23:28:26 EST


From: Eduardo Valentin <eduval@xxxxxxxxxx>

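Add an extra stat that reports how many temperature update
failures were detected. The error count is increased whenever
the thermal driver returns an actual error or when the reported
temperature is non-positive.

Also add a max_temperature stat that reports the highest positive
temperature seen since the stats were last reset.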

Sample:

$ cat /sys/class/thermal/thermal_zone0/stats/error_count
0
$ echo -1 > /sys/class/thermal/thermal_zone0/emul_temp
$ cat /sys/class/thermal/thermal_zone0/stats/error_count
3

Cc: "Rafael J. Wysocki" <rafael@xxxxxxxxxx> (supporter:THERMAL)
Cc: Daniel Lezcano <daniel.lezcano@xxxxxxxxxx> (supporter:THERMAL)
Cc: Amit Kucheria <amitk@xxxxxxxxxx> (reviewer:THERMAL)
Cc: Zhang Rui <rui.zhang@xxxxxxxxx> (reviewer:THERMAL)
Cc: Jonathan Corbet <corbet@xxxxxxx> (maintainer:DOCUMENTATION)
Cc: linux-pm@xxxxxxxxxxxxxxx (open list:THERMAL)
Cc: linux-doc@xxxxxxxxxxxxxxx (open list:DOCUMENTATION)
Cc: linux-kernel@xxxxxxxxxxxxxxx (open list)

Signed-off-by: Eduardo Valentin <eduval@xxxxxxxxxx>
---
drivers/thermal/thermal_core.c | 3 ++
drivers/thermal/thermal_core.h | 7 ++++
drivers/thermal/thermal_sysfs.c | 64 +++++++++++++++++++++++++++++++++
3 files changed, 74 insertions(+)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 2ff7d9c7c973..359e7b2ff0e3 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -389,6 +389,9 @@ static void update_temperature(struct thermal_zone_device *tz)
/* tell the governor its source is hosed */
handle_error_temperature(tz, ret);

+ /* book keeping */
+ thermal_zone_device_error_stats_update(tz, ret);
+
return;
}

diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index ef37b92bbb7c..612f93e6c257 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -141,12 +141,19 @@ ssize_t weight_store(struct device *, struct device_attribute *, const char *,

#ifdef CONFIG_THERMAL_STATISTICS
void thermal_zone_device_stats_update(struct thermal_zone_device *tz);
+void thermal_zone_device_error_stats_update(struct thermal_zone_device *tz,
+ int error);
void thermal_cooling_device_stats_update(struct thermal_cooling_device *cdev,
struct thermal_instance *instance,
unsigned long new_state);
#else
static inline
void thermal_zone_device_stats_update(struct thermal_zone_device *tz) {}
+/* Thermal statistics are compiled out: error accounting is a no-op. */
+static inline void
+thermal_zone_device_error_stats_update(struct thermal_zone_device *tz,
+ int error) {}
static inline void
thermal_cooling_device_stats_update(struct thermal_cooling_device *cdev,
struct thermal_instance *instance,
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index 25851fe073c3..e511042e9dab 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -541,12 +541,21 @@ static void destroy_trip_attrs(struct thermal_zone_device *tz)
/* thermal zone device statistics handling */
struct thermal_zone_device_stats {
spinlock_t lock; /* protects this struct */
+ /* driver errors and non-positive readings counted since the last reset */
+ unsigned int error_count;
+ /* highest positive tz->temperature seen since the last reset */
+ int max_temperature;
s64 max_gradient;
s64 min_gradient;
ktime_t last_time;
ktime_t *time_in_trip;
};

+/*
+ * Record one failed temperature update for @tz.
+ *
+ * @error is accepted from the callers but is not stored; only the number
+ * of failures is tracked. The callers visible in this file invoke this
+ * helper with the statistics lock already held.
+ */
+static void error_stats_update(struct thermal_zone_device *tz, int error)
+{
+ struct thermal_zone_device_stats *zone_stats = tz->stats;
+
+ zone_stats->error_count += 1;
+}
+
#define DELTA_MILLI_C_TO_MICRO_C(t0, t1) (((t0) - (t1)) * 1000)
static void temperature_stats_update(struct thermal_zone_device *tz)
{
@@ -555,6 +564,15 @@ static void temperature_stats_update(struct thermal_zone_device *tz)
s64 cur_gradient, delta_temp;
int i, trip_id = -1;

+ if (tz->temperature <= 0) {
+ /* probably a wrong reading */
+ error_stats_update(tz, tz->temperature);
+ return;
+ }
+
+ if (tz->temperature > stats->max_temperature)
+ stats->max_temperature = tz->temperature;
+
delta = ktime_sub(now, stats->last_time);
stats->last_time = now;

@@ -610,6 +628,31 @@ void thermal_zone_device_stats_update(struct thermal_zone_device *tz)
spin_unlock(&stats->lock);
}

+/**
+ * thermal_zone_device_error_stats_update() - account a failed temperature update
+ * @tz: thermal zone whose statistics are updated
+ * @error: error value reported by the caller, passed on to the accounting helper
+ *
+ * Takes the zone's statistics lock, records one more error and releases
+ * the lock again.
+ */
+void thermal_zone_device_error_stats_update(struct thermal_zone_device *tz,
+ int error)
+{
+ struct thermal_zone_device_stats *zone_stats = tz->stats;
+
+ spin_lock(&zone_stats->lock);
+ error_stats_update(tz, error);
+ spin_unlock(&zone_stats->lock);
+}
+
+/*
+ * sysfs show() handler for the max_temperature statistic.
+ *
+ * Refreshes the zone statistics under the statistics lock, then prints
+ * stats->max_temperature (the highest positive temperature recorded since
+ * the last reset) as a decimal followed by a newline.
+ *
+ * Returns the number of bytes written to @buf.
+ *
+ * NOTE(review): temperature_stats_update() accounts an error whenever
+ * tz->temperature <= 0, so reading this attribute while such a value is
+ * current also increments error_count - confirm that this is intended.
+ */
+static ssize_t max_temperature_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct thermal_zone_device *tz = to_thermal_zone(dev);
+ struct thermal_zone_device_stats *stats = tz->stats;
+ int ret;
+
+ spin_lock(&stats->lock);
+ temperature_stats_update(tz);
+ /* sysfs_emit() is the PAGE_SIZE-aware helper meant for show() callbacks */
+ ret = sysfs_emit(buf, "%d\n", stats->max_temperature);
+ spin_unlock(&stats->lock);
+
+ return ret;
+}
+
static ssize_t max_gradient_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -640,6 +683,21 @@ static ssize_t min_gradient_show(struct device *dev,
return ret;
}

+/*
+ * sysfs show() handler for the error_count statistic.
+ *
+ * Refreshes the zone statistics under the statistics lock, then prints
+ * stats->error_count as an unsigned decimal followed by a newline.
+ *
+ * Returns the number of bytes written to @buf.
+ *
+ * NOTE(review): the refresh below goes through temperature_stats_update(),
+ * which accounts an error whenever tz->temperature <= 0. Reading this
+ * attribute can therefore increase the very counter it reports - confirm
+ * that this is intended before relying on exact values.
+ */
+static ssize_t error_count_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct thermal_zone_device *tz = to_thermal_zone(dev);
+ struct thermal_zone_device_stats *stats = tz->stats;
+ int ret;
+
+ spin_lock(&stats->lock);
+ temperature_stats_update(tz);
+ /* sysfs_emit() is the PAGE_SIZE-aware helper meant for show() callbacks */
+ ret = sysfs_emit(buf, "%u\n", stats->error_count);
+ spin_unlock(&stats->lock);
+
+ return ret;
+}
+
static ssize_t
time_in_trip_ms_show(struct device *dev, struct device_attribute *attr,
char *buf)
@@ -705,6 +763,8 @@ reset_tz_stats_store(struct device *dev, struct device_attribute *attr,

stats->min_gradient = 0;
stats->max_gradient = 0;
+ stats->max_temperature = 0;
+ stats->error_count = 0;
stats->last_time = ktime_get();

for (i = 0; i <= tz->num_trips; i++)
@@ -717,13 +777,17 @@ reset_tz_stats_store(struct device *dev, struct device_attribute *attr,

static DEVICE_ATTR_RO(min_gradient);
static DEVICE_ATTR_RO(max_gradient);
+static DEVICE_ATTR_RO(max_temperature);
static DEVICE_ATTR_RO(time_in_trip_ms);
+static DEVICE_ATTR_RO(error_count);
static DEVICE_ATTR_WO(reset_tz_stats);

+/* NULL-terminated list of the thermal zone statistics sysfs attributes */
static struct attribute *thermal_zone_device_stats_attrs[] = {
&dev_attr_min_gradient.attr,
&dev_attr_max_gradient.attr,
+ &dev_attr_max_temperature.attr,
&dev_attr_time_in_trip_ms.attr,
+ &dev_attr_error_count.attr,
&dev_attr_reset_tz_stats.attr,
NULL
};
--
2.34.1