Re: [PATCH V2 25/37] perf script: Add synthesized Intel PT power and ptwrite events

From: Adrian Hunter
Date: Thu Jun 29 2017 - 16:02:39 EST


On 06/28/2017 11:26 PM, Arnaldo Carvalho de Melo wrote:
> Em Wed, Jun 28, 2017 at 08:21:37PM +0000, Hunter, Adrian escreveu:
>> Sorry for the top-post...
>>
>> Yeah, I've now mixed up the variable attribute:
>>
>> https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#Common-Variable-Attributes
>>
>> with the type attribute:
>>
>> https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#Common-Type-Attributes
>>
>> Late here, so maybe it will make more sense tomorrow.
>
> Right, and I've not been able to focus on this, but I think the problem
> is with packed mixed with unnamed unions :-\

Another possibility is to avoid packed altogether e.g.

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index ea8534dd44b6..83cdc0a61fd6 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -1216,10 +1216,10 @@ static void print_sample_pt_spacing(int len)

static void print_sample_synth_ptwrite(struct perf_sample *sample)
{
- struct perf_synth_intel_ptwrite *data = sample->raw_data;
+ struct perf_synth_intel_ptwrite *data = perf_sample__synth_ptr(sample);
int len;

- if (sample->raw_size < sizeof(*data))
+ if (perf_sample__bad_synth_size(sample, *data))
return;

len = printf(" IP: %u payload: %#" PRIx64 " ",
@@ -1229,10 +1229,10 @@ static void print_sample_synth_ptwrite(struct perf_sample *sample)

static void print_sample_synth_mwait(struct perf_sample *sample)
{
- struct perf_synth_intel_mwait *data = sample->raw_data;
+ struct perf_synth_intel_mwait *data = perf_sample__synth_ptr(sample);
int len;

- if (sample->raw_size < sizeof(*data))
+ if (perf_sample__bad_synth_size(sample, *data))
return;

len = printf(" hints: %#x extensions: %#x ",
@@ -1242,10 +1242,10 @@ static void print_sample_synth_mwait(struct perf_sample *sample)

static void print_sample_synth_pwre(struct perf_sample *sample)
{
- struct perf_synth_intel_pwre *data = sample->raw_data;
+ struct perf_synth_intel_pwre *data = perf_sample__synth_ptr(sample);
int len;

- if (sample->raw_size < sizeof(*data))
+ if (perf_sample__bad_synth_size(sample, *data))
return;

len = printf(" hw: %u cstate: %u sub-cstate: %u ",
@@ -1255,10 +1255,10 @@ static void print_sample_synth_pwre(struct perf_sample *sample)

static void print_sample_synth_exstop(struct perf_sample *sample)
{
- struct perf_synth_intel_exstop *data = sample->raw_data;
+ struct perf_synth_intel_exstop *data = perf_sample__synth_ptr(sample);
int len;

- if (sample->raw_size < sizeof(*data))
+ if (perf_sample__bad_synth_size(sample, *data))
return;

len = printf(" IP: %u ", data->ip);
@@ -1267,10 +1267,10 @@ static void print_sample_synth_exstop(struct perf_sample *sample)

static void print_sample_synth_pwrx(struct perf_sample *sample)
{
- struct perf_synth_intel_pwrx *data = sample->raw_data;
+ struct perf_synth_intel_pwrx *data = perf_sample__synth_ptr(sample);
int len;

- if (sample->raw_size < sizeof(*data))
+ if (perf_sample__bad_synth_size(sample, *data))
return;

len = printf(" deepest cstate: %u last cstate: %u wake reason: %#x ",
@@ -1281,11 +1281,11 @@ static void print_sample_synth_pwrx(struct perf_sample *sample)

static void print_sample_synth_cbr(struct perf_sample *sample)
{
- struct perf_synth_intel_cbr *data = sample->raw_data;
+ struct perf_synth_intel_cbr *data = perf_sample__synth_ptr(sample);
unsigned int percent, freq;
int len;

- if (sample->raw_size < sizeof(*data))
+ if (perf_sample__bad_synth_size(sample, *data))
return;

freq = (le32_to_cpu(data->freq) + 500) / 1000;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index d93a6825ce09..9967c87af7a6 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -266,11 +266,16 @@ enum perf_synth_id {
};

/*
- * Raw data formats for synthesized events. Note that raw data plus the raw data
- * size (4 bytes) must align to 8-bytes.
+ * Raw data formats for synthesized events. Note that 4 bytes of padding are
+ * present to match the 'size' member of PERF_SAMPLE_RAW data which is always
+ * 8-byte aligned. That means we must dereference raw_data with an offset of 4.
+ * Refer perf_sample__synth_ptr() and perf_synth__raw_data(). It also means the
+ * structure sizes are 4 bytes bigger than the raw_size, refer
+ * perf_synth__raw_size().
*/

struct perf_synth_intel_ptwrite {
+ u32 padding;
union {
struct {
u32 ip : 1,
@@ -279,9 +284,10 @@ struct perf_synth_intel_ptwrite {
u32 flags;
};
u64 payload;
-} __packed;
+};

struct perf_synth_intel_mwait {
+ u32 padding;
u32 reserved;
union {
struct {
@@ -292,9 +298,10 @@ struct perf_synth_intel_mwait {
};
u64 payload;
};
-} __packed;
+};

struct perf_synth_intel_pwre {
+ u32 padding;
u32 reserved;
union {
struct {
@@ -306,9 +313,10 @@ struct perf_synth_intel_pwre {
};
u64 payload;
};
-} __packed;
+};

struct perf_synth_intel_exstop {
+ u32 padding;
union {
struct {
u32 ip : 1,
@@ -319,6 +327,7 @@ struct perf_synth_intel_exstop {
};

struct perf_synth_intel_pwrx {
+ u32 padding;
u32 reserved;
union {
struct {
@@ -329,9 +338,10 @@ struct perf_synth_intel_pwrx {
};
u64 payload;
};
-} __packed;
+};

struct perf_synth_intel_cbr {
+ u32 padding;
union {
struct {
u32 cbr : 8,
@@ -346,6 +356,24 @@ struct perf_synth_intel_cbr {
};

/*
+ * raw_data is always 4 bytes from an 8-byte boundary, so subtract 4 to get
+ * 8-byte alignment.
+ */
+static inline void *perf_sample__synth_ptr(struct perf_sample *sample)
+{
+ return sample->raw_data - 4;
+}
+
+static inline void *perf_synth__raw_data(void *p)
+{
+ return p + 4;
+}
+
+#define perf_synth__raw_size(d) (sizeof(d) - 4)
+
+#define perf_sample__bad_synth_size(s, d) ((s)->raw_size < sizeof(d) - 4)
+
+/*
* The kernel collects the number of events it couldn't send in a stretch and
* when possible sends this number in a PERF_RECORD_LOST event. The number of
* such "chunks" of lost events is stored in .nr_events[PERF_EVENT_LOST] while
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 754e92ee6c3e..b58f9fd1e2ee 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -1283,8 +1283,8 @@ static int intel_pt_synth_ptwrite_sample(struct intel_pt_queue *ptq)
raw.ip = !!(ptq->state->flags & INTEL_PT_FUP_IP);
raw.payload = cpu_to_le64(ptq->state->ptw_payload);

- sample.raw_size = sizeof(raw);
- sample.raw_data = &raw;
+ sample.raw_size = perf_synth__raw_size(raw);
+ sample.raw_data = perf_synth__raw_data(&raw);

return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
pt->ptwrites_sample_type);
@@ -1311,8 +1311,8 @@ static int intel_pt_synth_cbr_sample(struct intel_pt_queue *ptq)
raw.freq = cpu_to_le32(raw.cbr * pt->cbr2khz);
raw.reserved3 = 0;

- sample.raw_size = sizeof(raw);
- sample.raw_data = &raw;
+ sample.raw_size = perf_synth__raw_size(raw);
+ sample.raw_data = perf_synth__raw_data(&raw);

return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
pt->pwr_events_sample_type);
@@ -1336,8 +1336,8 @@ static int intel_pt_synth_mwait_sample(struct intel_pt_queue *ptq)
raw.reserved = 0;
raw.payload = cpu_to_le64(ptq->state->mwait_payload);

- sample.raw_size = sizeof(raw);
- sample.raw_data = &raw;
+ sample.raw_size = perf_synth__raw_size(raw);
+ sample.raw_data = perf_synth__raw_data(&raw);

return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
pt->pwr_events_sample_type);
@@ -1361,8 +1361,8 @@ static int intel_pt_synth_pwre_sample(struct intel_pt_queue *ptq)
raw.reserved = 0;
raw.payload = cpu_to_le64(ptq->state->pwre_payload);

- sample.raw_size = sizeof(raw);
- sample.raw_data = &raw;
+ sample.raw_size = perf_synth__raw_size(raw);
+ sample.raw_data = perf_synth__raw_data(&raw);

return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
pt->pwr_events_sample_type);
@@ -1386,8 +1386,8 @@ static int intel_pt_synth_exstop_sample(struct intel_pt_queue *ptq)
raw.flags = 0;
raw.ip = !!(ptq->state->flags & INTEL_PT_FUP_IP);

- sample.raw_size = sizeof(raw);
- sample.raw_data = &raw;
+ sample.raw_size = perf_synth__raw_size(raw);
+ sample.raw_data = perf_synth__raw_data(&raw);

return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
pt->pwr_events_sample_type);
@@ -1411,8 +1411,8 @@ static int intel_pt_synth_pwrx_sample(struct intel_pt_queue *ptq)
raw.reserved = 0;
raw.payload = cpu_to_le64(ptq->state->pwrx_payload);

- sample.raw_size = sizeof(raw);
- sample.raw_data = &raw;
+ sample.raw_size = perf_synth__raw_size(raw);
+ sample.raw_data = perf_synth__raw_data(&raw);

return intel_pt_deliver_synth_event(pt, ptq, event, &sample,
pt->pwr_events_sample_type);



>
>> -----Original Message-----
>> From: Arnaldo Carvalho de Melo [mailto:acme@xxxxxxxxxx]
>> Sent: Wednesday, June 28, 2017 9:54 PM
>> To: Hunter, Adrian <adrian.hunter@xxxxxxxxx>
>> Cc: Andi Kleen <ak@xxxxxxxxxxxxxxx>; linux-kernel@xxxxxxxxxxxxxxx
>> Subject: Re: [PATCH V2 25/37] perf script: Add synthesized Intel PT power and ptwrite events
>>
>> Em Wed, Jun 28, 2017 at 08:40:25PM +0300, Adrian Hunter escreveu:
>>> On 06/28/2017 04:04 PM, Arnaldo Carvalho de Melo wrote:
>>>> Em Fri, May 26, 2017 at 11:17:26AM +0300, Adrian Hunter escreveu:
>>>>> Add definitions for synthesized Intel PT events for power and ptwrite.
>>>>
>>>>> +++ b/tools/perf/util/event.h
>>>>> +/*
>>>>> + * Raw data formats for synthesized events. Note that raw data plus the raw data
>>>>> + * size (4 bytes) must align to 8-bytes.
>>>>> + */
>>>>> +
>>>>> +struct perf_synth_intel_ptwrite {
>>>>> + union {
>>>>> + struct {
>>>>> + u32 ip : 1,
>>>>> + reserved : 31;
>>>>> + };
>>>>> + u32 flags;
>>>>> + };
>>>>> + u64 payload;
>>>>> +} __packed;
>>>>
>>>>
>>>> some versions of clang and gcc dislike this __packed here:
>>>>
>>>> In file included from builtin-script.c:5:
>>>> In file included from /git/linux/tools/perf/util/debug.h:8:
>>>> /git/linux/tools/perf/util/event.h:274:2: error: packed attribute is unnecessary for (null) [-Werror,-Wpacked]
>>>> union {
>>>> ^
>>>> /git/linux/tools/perf/util/event.h:285:6: error: packed attribute is unnecessary for 'reserved' [-Werror,-Wpacked]
>>>> u32 reserved;
>>>> ^
>>>> /git/linux/tools/perf/util/event.h:298:6: error: packed attribute is unnecessary for 'reserved' [-Werror,-Wpacked]
>>>> u32 reserved;
>>>> ^
>>>> /git/linux/tools/perf/util/event.h:322:6: error: packed attribute is unnecessary for 'reserved' [-Werror,-Wpacked]
>>>> u32 reserved;
>>>> ^
>>>> 4 errors generated.
>>>> mv: can't rename '/tmp/build/perf/.builtin-script.o.tmp': No such file or directory
>>>>
>>>> /git/linux/tools/build/Makefile.build:101: recipe for target '/tmp/build/perf/builtin-script.o' failed
>>>>
>>>> Failing in various distros:
>>>>
>>>> [root@jouet ~]# waitp 3940 ; time dm
>>>> 1 92.3684147260 alpine:3.4: FAIL
>>>> 2 95.9136365930 alpine:3.5: FAIL
>>>> 3 104.8328303770 alpine:3.6: FAIL
>>>> 4 121.6584964930 alpine:edge: FAIL
>>>> 5 37.2536373490 android-ndk:r12b-arm: Ok
>>>> 6 83.9077612370 archlinux:latest: Ok
>>>> 7 14.7094639200 centos:5: FAIL
>>>> 8 16.6371634320 centos:6: FAIL
>>>>
>>>> Investigating...
>>>
>>> Re-reading the documentation for __packed, it seems like the following
>>> might be better:
>>
>> Humm, can you provide the URL for such docs? I always saw packed as an attribute for a struct, not for a member... For members "aligned" is what I'm used to see:
>>
>> __attribute__ ((aligned (8)))
>>
>> In the kernel sources there are a few such cases as you suggest:
>>
>> [acme@jouet linux]$ find include/ -name "*.h"| xargs grep -w __packed | grep -v } | grep -v "struct __packed" | wc -l
>> 12
>> [acme@jouet linux]$
>>
>> But most are the other way, i.e. tagging the packed attribute to the whole struct, as you originally did :-\
>>
>> - Arnaldo
>>
>>> diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
>>> index c283603f59c7..a7547cb3b760 100644
>>> --- a/tools/perf/util/event.h
>>> +++ b/tools/perf/util/event.h
>>> @@ -278,8 +278,8 @@ struct perf_synth_intel_ptwrite {
>>> };
>>> u32 flags;
>>> };
>>> - u64 payload;
>>> -} __packed;
>>> + u64 payload __packed;
>>> +};
>>>
>>> struct perf_synth_intel_mwait {
>>> u32 reserved;
>>> @@ -291,8 +291,8 @@ struct perf_synth_intel_mwait {
>>> reserved2 : 30;
>>> };
>>> u64 payload;
>>> - };
>>> -} __packed;
>>> + } __packed;
>>> +};
>>>
>>> struct perf_synth_intel_pwre {
>>> u32 reserved;
>>> @@ -305,8 +305,8 @@ struct perf_synth_intel_pwre {
>>> reserved2 : 48;
>>> };
>>> u64 payload;
>>> - };
>>> -} __packed;
>>> + } __packed;
>>> +};
>>>
>>> struct perf_synth_intel_exstop {
>>> union {
>>> @@ -328,8 +328,8 @@ struct perf_synth_intel_pwrx {
>>> reserved1 : 52;
>>> };
>>> u64 payload;
>>> - };
>>> -} __packed;
>>> + } __packed;
>>> +};
>>>
>>> struct perf_synth_intel_cbr {
>>> union {
>