Re: [PATCH 1/1] powerpc/pseries: Interface to represent PAPR firmware attributes

From: Pratik Sampat
Date: Mon Jun 21 2021 - 11:35:51 EST


Hello Fabiano,

Thank you for your review.

On 19/06/21 2:28 am, Fabiano Rosas wrote:
"Pratik R. Sampat" <psampat@xxxxxxxxxxxxx> writes:

Adds a generic interface to represent the energy and frequency related
PAPR attributes on the system using the new H_CALL
"H_GET_ENERGY_SCALE_INFO".

H_GET_EM_PARMS H_CALL was previously responsible for exporting this
information in the lparcfg, however the H_GET_EM_PARMS H_CALL
will be deprecated P10 onwards.

The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format:
hcall(
uint64 H_GET_ENERGY_SCALE_INFO, // Get energy scale info
uint64 flags, // Per the flag request
uint64 firstAttributeId,// The attribute id
uint64 bufferAddress, // Guest physical address of the output buffer
uint64 bufferSize // The size in bytes of the output buffer
);

This H_CALL can query either all the attributes at once with
firstAttributeId = 0, flags = 0 as well as query only one attribute
at a time with firstAttributeId = id

The output buffer consists of the following
1. number of attributes - 8 bytes
2. array offset to the data location - 8 bytes
3. version info - 1 byte
4. A data array of size num attributes, which contains the following:
a. attribute ID - 8 bytes
b. attribute value in number - 8 bytes
c. attribute name in string - 64 bytes
d. attribute value in string - 64 bytes

The new H_CALL exports information in direct string value format, hence
a new interface has been introduced in
/sys/firmware/papr/energy_scale_info to export this information to
userspace in an extensible pass-through format.

The H_CALL returns the name, numeric value and string value (if exists)

The format of exposing the sysfs information is as follows:
/sys/firmware/papr/energy_scale_info/
|-- <id>/
|-- desc
|-- value
|-- value_desc (if exists)
|-- <id>/
|-- desc
|-- value
|-- value_desc (if exists)
...

The energy information that is exported is useful for userspace tools
such as powerpc-utils. Currently these tools infer the
"power_mode_data" value in the lparcfg, which in turn is obtained from
the to be deprecated H_GET_EM_PARMS H_CALL.
On future platforms, such userspace utilities will have to look at the
data returned from the new H_CALL being populated in this new sysfs
interface and report this information directly without the need of
interpretation.

Signed-off-by: Pratik R. Sampat <psampat@xxxxxxxxxxxxx>
---
.../sysfs-firmware-papr-energy-scale-info | 26 ++
arch/powerpc/include/asm/hvcall.h | 21 +-
arch/powerpc/kvm/trace_hv.h | 1 +
arch/powerpc/platforms/pseries/Makefile | 3 +-
.../pseries/papr_platform_attributes.c | 292 ++++++++++++++++++
5 files changed, 341 insertions(+), 2 deletions(-)
create mode 100644 Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c

diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
new file mode 100644
index 000000000000..499bc1ae173a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
@@ -0,0 +1,26 @@
+What: /sys/firmware/papr/energy_scale_info
+Date: June 2021
+Contact: Linux for PowerPC mailing list <linuxppc-dev@xxxxxxxxxx>
+Description: Director hosting a set of platform attributes on Linux
+ running as a PAPR guest.
This still refers to papr attributes. We want energy/frequency, etc. instead.

Sure, I can mention about energy, frequency info attributes here.

+
+ Each file in a directory contains a platform
+ attribute hierarchy pertaining to performance/
+ energy-savings mode and processor frequency.
+
+What: /sys/firmware/papr/energy_scale_info/<id>
+ /sys/firmware/papr/energy_scale_info/<id>/desc
+ /sys/firmware/papr/energy_scale_info/<id>/value
+ /sys/firmware/papr/energy_scale_info/<id>/value_desc
+Date: June 2021
+Contact: Linux for PowerPC mailing list <linuxppc-dev@xxxxxxxxxx>
+Description: PAPR attributes directory for POWERVM servers
Same here.

ack.

+
+ This directory provides PAPR information. It
And here.

ack.


+ contains below sysfs attributes:
+
+ - desc: File contains the name of attribute <id>
desc: String description of the attribute <id>

Sure, this description is more complete


+
+ - value: Numeric value of attribute <id>
+
+ - value_desc: String value of attribute <id>
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index e3b29eda8074..19a2a8c77a49 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -316,7 +316,8 @@
#define H_SCM_PERFORMANCE_STATS 0x418
#define H_RPT_INVALIDATE 0x448
#define H_SCM_FLUSH 0x44C
-#define MAX_HCALL_OPCODE H_SCM_FLUSH
+#define H_GET_ENERGY_SCALE_INFO 0x450
+#define MAX_HCALL_OPCODE H_GET_ENERGY_SCALE_INFO

/* Scope args for H_SCM_UNBIND_ALL */
#define H_UNBIND_SCOPE_ALL (0x1)
@@ -631,6 +632,24 @@ struct hv_gpci_request_buffer {
uint8_t bytes[HGPCI_MAX_DATA_BYTES];
} __packed;

+#define MAX_EM_ATTRS 10
+#define MAX_EM_DATA_BYTES \
+ (sizeof(struct energy_scale_attributes) * MAX_EM_ATTRS)
EM doesn't really mean anything in the context of this code. Maybe we
could standardize the names with 'energy_scale_info' and 'esi' for
short.

Right, energy_scale_info (esi) is better naming convention and I can change
others to maintain consistency too.


+struct energy_scale_attributes {
s/attributes/attribute/

ack, thanks for pointing out incorrect grammar.


+ __be64 attr_id;
+ __be64 attr_value;
+ unsigned char attr_desc[64];
+ unsigned char attr_value_desc[64];
These attr_ prefixes could be dropped. I will be clear from the context
where they are used.

+} __packed;
+
+struct hv_energy_scale_buffer {
+ __be64 num_attr;
s/num_attr/num_attrs/

ack


+ __be64 array_offset;
+ __u8 data_header_version;
+ unsigned char data[MAX_EM_DATA_BYTES];
+} __packed;
Your code is correct with the current assumptions, but I think we got
the assumptions wrong, see if you agree:

My understanding of the hypercall is that it is designed around a header
+ data concept. If the header is versioned and backwards compatible,
then it means that it could increase in size. The offset to the data is
what ensures backward compatibility so that we can always access the
data, no matter what the header contains. So this leads me to conclude:

1- We should stop aborting in case the version changes. Nothing should
really break if that happens. Both the kernel and userspace would read
less data than the hypercall is returning, but that is it.

With the current design, if the hypervisor changes the header version,
the attributes stop being exposed in sysfs (because this code aborts),
which will break the userspace setup.

2- We cannot have 'data' in the struct. There is no array there. The
array is wherever the offset indicates. If the header increases in size,
then the 'data' would move forward in the buffer.

3- The concept of MAX_EM_ATTRS is misleading. It is the maximum number
of attributes that fit in the buffer *for this version of the hcall*. A
subsequent version could fit fewer attributes and our MAX_EM_ATTRS would
land us out of bounds. So the number of attributes must be defined
exclusively by num_attrs.

If we change the above, we only need to touch this code again if the
header version changes *and* we want to expose the extra information
brought by the change. An unexpected change in version should not cause
this code to fail.

I think I have understand the approach you are describing here.
Basically you mean to say that the header can change in size in the future and
that is why header and data should be their own separate entities.

Although I'm not a 100% sure that the header isn't a fixed entity, I agree with
your proposed approach. It only makes the design more flexible and I don't see
any downsides to it.

Please correct me if I'm wrong, but this is what I'm thinking this means in
rough psuedo-code:

char *esi_buf;
struct h_energy_scale_info_hdr *esi_hdr;
struct energy_scale_attribute *esi_attrs;

plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, ESI_FLAGS_ALL, 0,
virt_to_phys(esi_buf), MAX_BUF_SZ);
esi_hdr = (struct h_energy_scale_info_hdr *) &esi_buf[0];
esi_attrs = (struct energy_scale_attribute *) &esi_buf[be64_to_cpu(esi_hdr->array_offset)];

where "h_energy_scale_info_hdr" is the header that you have defined below, minus the "data".
and "energy_scale_attribute" is pretty much the same minus the "attr" prefix on all the variables

So what I suggest is we keep a 'header' struct:

struct h_energy_scale_info_hdr {
__be64 num_attrs;
__be64 data_offset;
__u8 version;
};

and we define an arbitrary size for the attributes array and allocate
that much more memory:

/*
* Note: we allocate space for 10 attributes, but the HV can move the data
* array further in the buffer, so it could return fewer attributes.
*/

Yes, that really depends where the attribute is, Currently there are only 7
attributes that this HCALL returns that's why I have made the max 10 so that
the offset and even then a few more attributes could be accommodated for.

#define MAX_BUF_SZ (sizeof(struct h_energy_scale_info_hdr) + \
sizeof(struct energy_scale_attributes) * 10)
...
em_buf = kmalloc(MAX_BUF_SZ, GFP_KERNEL);



The suggestions from here on apply even if my analysis above is wrong:

+
+
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_HVCALL_H */
diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
index 830a126e095d..38cd0ed0a617 100644
--- a/arch/powerpc/kvm/trace_hv.h
+++ b/arch/powerpc/kvm/trace_hv.h
@@ -115,6 +115,7 @@
{H_VASI_STATE, "H_VASI_STATE"}, \
{H_ENABLE_CRQ, "H_ENABLE_CRQ"}, \
{H_GET_EM_PARMS, "H_GET_EM_PARMS"}, \
+ {H_GET_ENERGY_SCALE_INFO, "H_GET_ENERGY_SCALE_INFO"}, \
{H_SET_MPP, "H_SET_MPP"}, \
{H_GET_MPP, "H_GET_MPP"}, \
{H_HOME_NODE_ASSOCIATIVITY, "H_HOME_NODE_ASSOCIATIVITY"}, \
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index c8a2b0b05ac0..d14fca89ac25 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -6,7 +6,8 @@ obj-y := lpar.o hvCall.o nvram.o reconfig.o \
of_helpers.o \
setup.o iommu.o event_sources.o ras.o \
firmware.o power.o dlpar.o mobility.o rng.o \
- pci.o pci_dlpar.o eeh_pseries.o msi.o
+ pci.o pci_dlpar.o eeh_pseries.o msi.o \
+ papr_platform_attributes.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_SCANLOG) += scanlog.o
obj-$(CONFIG_KEXEC_CORE) += kexec.o
diff --git a/arch/powerpc/platforms/pseries/papr_platform_attributes.c b/arch/powerpc/platforms/pseries/papr_platform_attributes.c
new file mode 100644
index 000000000000..498c74a5e9ab
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/papr_platform_attributes.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PAPR platform energy attributes driver
+ *
+ * This driver creates a sys file at /sys/firmware/papr/ which contains
+ * files keyword - value pairs that specify energy configuration of the system.
This description needs updating.

ack, I'll update this

+ *
+ * Copyright 2021 IBM Corp.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/hugetlb.h>
+#include <asm/lppaca.h>
+#include <asm/hvcall.h>
+#include <asm/firmware.h>
+#include <asm/time.h>
+#include <asm/prom.h>
+#include <asm/vdso_datapage.h>
+#include <asm/vio.h>
+#include <asm/mmu.h>
+#include <asm/machdep.h>
+#include <asm/drmem.h>
+
+#include "pseries.h"
+
+#define MAX_ATTRS 3
It might be good to link this with ops_info size somehow to make sure we
don't update one without the other.

Sure, I could define it something as follows to determine this relationship
visually.
ops_info[MAX_ATTRS] = { ... }


+#define MAX_NAME_LEN 16
+
+struct papr_attr {
+ u64 id;
+ struct kobj_attribute attr;
We have attr everywhere. I would use kobj_attr here to improve
readability.

+};
+struct papr_group {
+ char name[MAX_NAME_LEN];
+ struct attribute_group pg;
+ struct papr_attr *pgattrs;
+} *pgs;
+
+struct kobject *papr_kobj;
+struct kobject *escale_kobj;
Nitpick: esi_kobj

ack, I'll make the "energy_scale_info (esi)" naming convention coherent everywhere.


+struct hv_energy_scale_buffer *em_buf;
Could replace the "em" here.

+struct energy_scale_attributes *ea;
Nitpick: esi_attrs.

ack.


+
+static ssize_t papr_show_desc(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct papr_attr *pattr = container_of(attr, struct papr_attr, attr);
+ int idx, ret = 0;
+
+ /*
+ * We do not expect the name to change, hence use the old value
+ * and save a HCALL
+ */
+ for (idx = 0; idx < be64_to_cpu(em_buf->num_attr); idx++) {
+ if (pattr->id == be64_to_cpu(ea[idx].attr_id)) {
+ ret = sprintf(buf, "%s\n", ea[idx].attr_desc);
+ if (ret < 0)
+ ret = -EIO;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t papr_show_value(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct papr_attr *pattr = container_of(attr, struct papr_attr, attr);
+ struct hv_energy_scale_buffer *t_buf;
+ struct energy_scale_attributes *t_ea;
+ int data_offset, ret = 0;
+
+ t_buf = kmalloc(sizeof(*t_buf), GFP_KERNEL);
+ if (t_buf == NULL)
+ return -ENOMEM;
+
+ ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, 0,
+ pattr->id, virt_to_phys(t_buf),
+ sizeof(*t_buf));
+
+ if (ret != H_SUCCESS) {
+ pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO");
+ goto out;
+ }
+
+ data_offset = be64_to_cpu(t_buf->array_offset) -
+ (sizeof(t_buf->num_attr) +
+ sizeof(t_buf->array_offset) +
+ sizeof(t_buf->data_header_version));
+
+ t_ea = (struct energy_scale_attributes *) &t_buf->data[data_offset];
t_ea = (struct energy_scale_attributes *)(t_buf + be64_to_cpu(t_buf->array_offset));

Right? If array_offset "points" to data, then data_offset would always
be 0. So there is no point doing this arithmetic.

Yes, the offset now can directly used as it is a different structure altogether
we don't need to account for the header as that is now in the original parsed
buffer too.


+
+ ret = sprintf(buf, "%llu\n", be64_to_cpu(t_ea->attr_value));
+ if (ret < 0)
+ ret = -EIO;
+out:
+ kfree(t_buf);
+
+ return ret;
+}
+
+static ssize_t papr_show_value_desc(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct papr_attr *pattr = container_of(attr, struct papr_attr, attr);
+ struct hv_energy_scale_buffer *t_buf;
+ struct energy_scale_attributes *t_ea;
+ int data_offset, ret = 0;
+
+ t_buf = kmalloc(sizeof(*t_buf), GFP_KERNEL);
+ if (t_buf == NULL)
+ return -ENOMEM;
+
+ ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, 0,
+ pattr->id, virt_to_phys(t_buf),
+ sizeof(*t_buf));
+
+ if (ret != H_SUCCESS) {
+ pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO");
+ goto out;
+ }
+
+ data_offset = be64_to_cpu(t_buf->array_offset) -
+ (sizeof(t_buf->num_attr) +
+ sizeof(t_buf->array_offset) +
+ sizeof(t_buf->data_header_version));
+
+ t_ea = (struct energy_scale_attributes *) &t_buf->data[data_offset];
+
+ ret = sprintf(buf, "%s\n", t_ea->attr_value_desc);
+ if (ret < 0)
+ ret = -EIO;
+out:
+ kfree(t_buf);
+
+ return ret;
+}
+
+static struct papr_ops_info {
+ const char *attr_name;
+ ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf);
+} ops_info[] = {
+ { "desc", papr_show_desc },
+ { "value", papr_show_value },
+ { "value_desc", papr_show_value_desc },
+};
+
+static void add_attr(u64 id, int index, struct papr_attr *attr)
+{
+ attr->id = id;
+ sysfs_attr_init(&attr->attr.attr);
+ attr->attr.attr.name = ops_info[index].attr_name;
+ attr->attr.attr.mode = 0444;
+ attr->attr.show = ops_info[index].show;
+}
+
+static int add_attr_group(u64 id, int len, struct papr_group *pg,
+ bool show_val_desc)
+{
+ int i;
+
+ for (i = 0; i < len; i++) {
+ if (!strcmp(ops_info[i].attr_name, "value_desc") &&
+ !show_val_desc) {
+ continue;
+ }
+ add_attr(id, i, &pg->pgattrs[i]);
+ pg->pg.attrs[i] = &pg->pgattrs[i].attr.attr;
+ }
+
+ return sysfs_create_group(escale_kobj, &pg->pg);
+}
+
+
+static int __init papr_init(void)
+{
+ uint64_t num_attr;
+ int ret, idx, i, data_offset;
+
+ em_buf = kmalloc(sizeof(*em_buf), GFP_KERNEL);
+ if (em_buf == NULL)
+ return -ENOMEM;
+ /*
+ * hcall(
+ * uint64 H_GET_ENERGY_SCALE_INFO, // Get energy scale info
+ * uint64 flags, // Per the flag request
+ * uint64 firstAttributeId, // The attribute id
+ * uint64 bufferAddress, // Guest physical address of the output buffer
+ * uint64 bufferSize); // The size in bytes of the output buffer
+ */
Since the flags are well defined, we could have:
#define ESI_FLAGS_ALL PPC_BIT(0)
#define ESI_FLAGS_SINGLE PPC_BIT(1)

Sure, I can add this macro for abstraction.
I understand this will make things clear looking at the call itself rather then
deciphering the document

I assume the bits are IBM-order. But you get my point...

+ ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, 0, 0,
+ virt_to_phys(em_buf), sizeof(*em_buf));
+
+ if (!firmware_has_feature(FW_FEATURE_LPAR) || ret != H_SUCCESS ||
+ em_buf->data_header_version != 0x1) {
+ pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO");
+ goto out;
+ }
I'd make the FW_FEATURE_LPAR check an early return at the start of the
function.

Right, makes sense. No point doing the HCALL if we don't have the right
firmware feature. Thanks


+
+ num_attr = be64_to_cpu(em_buf->num_attr);
+
+ /*
+ * Typecast the energy buffer to the attribute structure at the offset
+ * specified in the buffer
+ */
+ data_offset = be64_to_cpu(em_buf->array_offset) -
+ (sizeof(em_buf->num_attr) +
+ sizeof(em_buf->array_offset) +
+ sizeof(em_buf->data_header_version));
+
+ ea = (struct energy_scale_attributes *) &em_buf->data[data_offset];
+
+ pgs = kcalloc(num_attr, sizeof(*pgs), GFP_KERNEL);
+ if (!pgs)
+ goto out_pgs;
+
+ papr_kobj = kobject_create_and_add("papr", firmware_kobj);
+ if (!papr_kobj) {
+ pr_warn("kobject_create_and_add papr failed\n");
+ goto out_kobj;
+ }
+
+ escale_kobj = kobject_create_and_add("energy_scale_info", papr_kobj);
+ if (!escale_kobj) {
+ pr_warn("kobject_create_and_add energy_scale_info failed\n");
+ goto out_ekobj;
+ }
+
+ for (idx = 0; idx < num_attr; idx++) {
+ char buf[4];
+ bool show_val_desc = true;
+
+ pgs[idx].pgattrs = kcalloc(MAX_ATTRS,
+ sizeof(*pgs[idx].pgattrs),
+ GFP_KERNEL);
+ if (!pgs[idx].pgattrs)
+ goto out_kobj;
+
+ pgs[idx].pg.attrs = kcalloc(MAX_ATTRS + 1,
+ sizeof(*pgs[idx].pg.attrs),
+ GFP_KERNEL);
+ if (!pgs[idx].pg.attrs) {
+ kfree(pgs[idx].pgattrs);
+ goto out_kobj;
+ }
+
+ sprintf(buf, "%lld", be64_to_cpu(ea[idx].attr_id));
+ pgs[idx].pg.name = buf;
+
+ /* Do not add the value description if it does not exist */
+ if (strlen(ea[idx].attr_value_desc) == 0)
+ show_val_desc = false;
+
+ if (add_attr_group(be64_to_cpu(ea[idx].attr_id),
+ MAX_ATTRS, &pgs[idx], show_val_desc)) {
+ pr_warn("Failed to create papr attribute group %s\n",
+ pgs[idx].pg.name);
+ goto out_pgattrs;
+ }
+ }
+
+ return 0;
+
+out_pgattrs:
+ for (i = 0; i < MAX_ATTRS; i++) {
+ kfree(pgs[i].pgattrs);
+ kfree(pgs[i].pg.attrs);
+ }
+out_ekobj:
+ kobject_put(escale_kobj);
+out_kobj:
+ kobject_put(papr_kobj);
+out_pgs:
+ kfree(pgs);
+out:
+ kfree(em_buf);
+
+ return -ENOMEM;
+}
+
+machine_device_initcall(pseries, papr_init);

Regards,
Pratik