[PATCH 15/17] tools/arch/x86/pmtctl: Add pmtxml2json conversion tool

From: David E. Box

Date: Mon May 25 2026 - 21:50:55 EST

Add a Python converter that turns Intel PMT XML metric definitions into the
pmtctl/perf-style JSON consumed by pmtctl and by the built-in metric
definition generator.

The converter supports two input modes:

Local path: point it at an existing Intel-PMT xml tree (--by-path
/path/to/Intel-PMT/xml) and convert in place.

Fetch: --fetch-pmt-repo clones the upstream Intel-PMT repository
into a cache (default ~/.cache/pmtctl).

--refresh-pmt-repo updates the cache.

Output JSON files are written under --output-dir, one file per metric
group, suitable for direct use with pmtctl -J or as input to
gen_builtin_defs.py for compiled-in definitions.

The document pmtxml2json.md provides usage examples covering the
different workflows.

Assisted-by: GitHub-Copilot:claude-sonnet-4.6
Signed-off-by: David E. Box <david.e.box@xxxxxxxxxxxxxxx>
---
tools/arch/x86/pmtctl/Makefile | 96 +-
tools/arch/x86/pmtctl/scripts/pmtxml2json.md | 158 ++++
tools/arch/x86/pmtctl/scripts/pmtxml2json.py | 883 +++++++++++++++++++
3 files changed, 1129 insertions(+), 8 deletions(-)
create mode 100644 tools/arch/x86/pmtctl/scripts/pmtxml2json.md
create mode 100755 tools/arch/x86/pmtctl/scripts/pmtxml2json.py

diff --git a/tools/arch/x86/pmtctl/Makefile b/tools/arch/x86/pmtctl/Makefile
index 52e50597b5c1..d55819372f79 100644
--- a/tools/arch/x86/pmtctl/Makefile
+++ b/tools/arch/x86/pmtctl/Makefile
@@ -1,6 +1,27 @@
# SPDX-License-Identifier: GPL-2.0-only

+# Remove targets whose recipe exited non-zero so a failed codegen step
+# does not leave a truncated $@ behind that fools the next build.
+.DELETE_ON_ERROR:
+
CC ?= gcc
+PYTHON ?= python3
+
+# Directories for the XML -> JSON -> C codegen pipeline.
+DEFS_DIR ?= defs
+GENERATED_DIR ?= generated
+PMT_CACHE_DIR ?= $(HOME)/.cache/pmtctl
+
+XML2JSON_SCRIPT := scripts/pmtxml2json.py
+GEN_DEFS_SCRIPT := scripts/gen_builtin_defs.py
+
+# JSON sources that define built-in metrics. pmtxml2json.py writes
+# one subdirectory per platform under $(DEFS_DIR)/, so recurse.
+# find(1) lists entries in directory order; $(sort) makes the list stable
+# (and free of duplicates) so the prerequisite list and the arguments
+# handed to the generator do not depend on the filesystem.
+DEFS_JSON ?= $(sort $(shell find $(DEFS_DIR) -name '*.json' 2>/dev/null))
+
+# Stamp marks "the XML->JSON conversion has run". The exact set of
+# generated files is not known up front, so we depend on a single stamp.
+DEFS_JSON_STAMP := $(DEFS_DIR)/.stamp

BUILD ?= release

@@ -35,7 +56,6 @@ TARGET := pmtctl
LIBDIR := lib
LIBPMTCTL_CORE := $(BUILDDIR)/lib/libpmtctl_core.a
LIBPMTCTL_ARTIFACTS := $(LIBPMTCTL_CORE)
-LIBPMTCTL_STAMP := $(BUILDDIR)/lib/.built
SAMPLE_SRC := samples/libpmtctl_sample.c
SAMPLE_TARGET := $(BUILDDIR)/samples/libpmtctl_sample

@@ -50,13 +70,21 @@ SRC := \
OBJ := $(patsubst $(SRCDIR)/%.c,$(BUILDDIR)/%.o,$(SRC))
CLEAN_BUILDS := release debug

-.PHONY: all clean libpmtctl_core sample FORCE
+.PHONY: all clean defs defs-json-fetch defs-json-pull defs-clean \
+ libpmtctl_core sample FORCE

all: $(TARGET)

$(TARGET): $(OBJ) $(LIBPMTCTL_ARTIFACTS)
$(CC) $(CFLAGS) -o $@ $(OBJ) $(LIBPMTCTL_ARTIFACTS) $(LDLIBS)

+# If JSON definitions exist, ensure the generated built-in defs are up to
+# date before the lib sub-make runs. Without this, edits under defs/ would
+# not propagate into pmtctl until the user explicitly ran 'make defs'.
+ifneq ($(DEFS_JSON),)
+$(LIBPMTCTL_CORE): $(GENERATED_DIR)/builtin_defs.c
+endif
+
libpmtctl_core: $(LIBPMTCTL_CORE)

sample: $(SAMPLE_TARGET)
@@ -69,15 +97,58 @@ $(SAMPLE_TARGET): $(SAMPLE_SRC) $(LIBPMTCTL_ARTIFACTS)
@mkdir -p $(dir $@)
$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ $< $(LIBPMTCTL_ARTIFACTS) $(LDLIBS)

-$(LIBPMTCTL_ARTIFACTS): $(LIBPMTCTL_STAMP)
-
-$(LIBPMTCTL_STAMP): FORCE
+# Recurse into lib/ on every invocation. The sub-make is incremental and
+# does nothing when up to date. Because $(LIBPMTCTL_CORE) has its own
+# recipe here, GNU make re-stats it afterwards, so any mtime advance from
+# sub-make correctly propagates to $(TARGET) and triggers a relink.
+$(LIBPMTCTL_CORE): FORCE
$(MAKE) -C $(LIBDIR) BUILD=$(BUILD)
- @mkdir -p $(dir $@)
- @touch $@

FORCE:

+# --- XML -> JSON step (network-bound; opt-in) ---
+#
+# Fetches the Intel-PMT git repo (cached under $(PMT_CACHE_DIR)) and
+# converts every aggregator XML into perf-style JSON under $(DEFS_DIR)/.
+# Not wired into 'all' on purpose: avoid surprise git clones.
+defs-json-fetch: $(DEFS_JSON_STAMP)
+
+$(DEFS_JSON_STAMP): $(XML2JSON_SCRIPT)
+ @echo "defs-json-fetch: git cloning Intel-PMT into $(PMT_CACHE_DIR)"
+ @command -v $(PYTHON) >/dev/null 2>&1 || { \
+ echo "$(PYTHON) is required for $(XML2JSON_SCRIPT)" >&2; exit 1; }
+ @mkdir -p $(DEFS_DIR)
+ $(PYTHON) $(XML2JSON_SCRIPT) \
+ --fetch-pmt-repo \
+ --pmt-cache-dir $(PMT_CACHE_DIR) \
+ --output-dir $(DEFS_DIR)
+ @touch $@
+
+# Run 'git pull' on the cached Intel-PMT repo, then regenerate JSON.
+# The target is phony, so it runs on every request; on success it also
+# refreshes the stamp that defs-json-fetch depends on.
+defs-json-pull: $(XML2JSON_SCRIPT)
+	@echo "defs-json-pull: running 'git pull' on $(PMT_CACHE_DIR)"
+	@command -v $(PYTHON) >/dev/null 2>&1 || { \
+		echo "$(PYTHON) is required for $(XML2JSON_SCRIPT)" >&2; exit 1; }
+	@mkdir -p $(DEFS_DIR)
+	$(PYTHON) $(XML2JSON_SCRIPT) \
+		--fetch-pmt-repo --refresh-pmt-repo \
+		--pmt-cache-dir $(PMT_CACHE_DIR) \
+		--output-dir $(DEFS_DIR)
+	@touch $(DEFS_JSON_STAMP)
+
+# --- JSON -> C step (does NOT build pmtctl) ---
+#
+# DEFS_JSON is expanded at parse time, so 'make defs-json-fetch' must be run
+# in a separate invocation before 'make defs' the first time.
+$(GENERATED_DIR)/builtin_defs.c: $(GEN_DEFS_SCRIPT) $(DEFS_JSON)
+ @mkdir -p $(GENERATED_DIR)
+ @if [ -z "$(DEFS_JSON)" ]; then \
+ echo "No JSON files under $(DEFS_DIR)/. Run 'make defs-json-fetch' first," >&2; \
+ echo "then re-run 'make defs'." >&2; \
+ exit 1; \
+ fi
+ @command -v $(PYTHON) >/dev/null 2>&1 || { \
+ echo "$(PYTHON) is required for $(GEN_DEFS_SCRIPT)" >&2; exit 1; }
+ $(PYTHON) $(GEN_DEFS_SCRIPT) $(DEFS_JSON) > $@
+
# Install settings
PREFIX ?= /usr/local
DESTDIR ?=
@@ -105,6 +176,15 @@ uninstall:
$(MAKE) -C $(LIBDIR) BUILD=$(BUILD) PREFIX=$(PREFIX) DESTDIR=$(DESTDIR) uninstall-headers
$(MAKE) -C $(LIBDIR) BUILD=$(BUILD) PREFIX=$(PREFIX) DESTDIR=$(DESTDIR) uninstall-pkgconfig
@echo "Removed $(DESTDIR)$(PREFIX)/bin/$(TARGET) (if present)"
+defs: $(GENERATED_DIR)/builtin_defs.c
+ @if [ -f $(GENERATED_DIR)/builtin_defs.c ]; then \
+ echo "Generated defs in $(GENERATED_DIR)/builtin_defs.c"; \
+ fi
+
+# Separate from 'clean' so a routine clean does not throw away the
+# (potentially slow) fetched/converted JSON tree.
+defs-clean:
+ rm -rf $(DEFS_DIR) $(GENERATED_DIR)/builtin_defs.c

$(BUILDDIR)/%.o: $(SRCDIR)/%.c
@mkdir -p $(BUILDDIR)
@@ -115,4 +195,4 @@ clean:
$(MAKE) -C $(LIBDIR) BUILD=$$build_type clean; \
rm -rf build/$$build_type; \
done
- rm -rf $(BUILDDIR) $(TARGET)
+ rm -rf $(BUILDDIR) $(TARGET) $(GENERATED_DIR)/builtin_defs.c
diff --git a/tools/arch/x86/pmtctl/scripts/pmtxml2json.md b/tools/arch/x86/pmtctl/scripts/pmtxml2json.md
new file mode 100644
index 000000000000..67eb08a83c86
--- /dev/null
+++ b/tools/arch/x86/pmtctl/scripts/pmtxml2json.md
@@ -0,0 +1,158 @@
+# pmtxml2json: XML → perf JSON conversion
+
+[`pmtxml2json.py`](pmtxml2json.py) converts Intel PMT (Platform Monitoring
+Technology) Aggregator XML files into perf-style JSON event definitions
+consumed by `pmtctl` (via `gen_builtin_defs.py` → `generated/builtin_defs.c`).
+
+This document focuses on the **EventName naming convention** — the rule used
+to derive a perf-style event name from each `<TELC:sample>` element.
+
+## Inputs
+
+For each sample, only two XML inputs participate in naming:
+
+| Input | XML source | Example value |
+| ----------------- | ----------------------------------------- | ------------------------- |
+| `name` | `name=` attribute on `<TELC:sample>` | `IA_SCALABILITY` |
+| `sampleSubGroup` | `<TELC:sampleSubGroup>` child text | `IA_SCALABILITY_CORE7` |
+
+The aggregator's `<TELEM:uniqueid>` (GUID) is used for the output filename
+(`pmt_ep_<guid>.json`), **not** for naming.
+
+`sampleID`, `sampleGroupID`, `lsb`, `msb`, and `productid` are not used to
+build `EventName`. They describe bit layout, packaging, or platform
+identity rather than the metric's identity, and using them would either
+produce names that change when the XML is regenerated or names that
+duplicate information already conveyed by `PMU` / `ConfigCode`.
+
+## Pre-filter: reserved samples
+
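+Before naming, a sample is dropped when any of the following holds
+(all matching is case-insensitive):
+
+- the `name` attribute contains `reserved` or `rsvd` (optionally followed
+  by digits) as a token of its own, i.e. not directly adjacent to another
+  letter or digit (`FOO_RSVD1` matches, `PRESERVED` does not),
+- the `<TELC:sampleSubGroup>` text contains such a token, or
+- the `<TELC:description>` text (the sample's own, otherwise the group's)
+  consists of nothing but `reserved` or `rsvd`, optionally followed by
+  digits.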
+
+Reserved samples never receive an `EventName`.
+
+## Naming rule (lazy prefix)
+
+Within a single aggregator XML, let `N(name)` be the number of non-reserved
+samples sharing the same `name`. For each surviving sample:
+
+1. **Unique name** — `N(name) == 1`:
+ `EventName = sanitize(name)`
+2. **Name collides** and `sampleSubGroup` is non-empty and `sampleSubGroup != name`:
+ `EventName = sanitize(sampleSubGroup) + "." + sanitize(name)`
+3. **Name collides** but `sampleSubGroup` is empty or equals `name`:
+ `EventName = sanitize(name)` (no disambiguation available)
+
+### Why lazy?
+
+`sampleSubGroup` plays two different roles in practice:
+
+- A **metric-instance index** — e.g. `IA_SCALABILITY_CORE7` qualifies a
+ per-core copy of `IA_SCALABILITY`. Prefixing is meaningful and useful.
+- A **container alias** — e.g. `INTEL_VERSION_2` is just an enclosing
+ container around an already-unique `RTL_VERSION`. Prefixing here would
+ produce a confusing label like `intel_version_2.rtl_version`.
+
+The lazy rule borrows `sampleSubGroup` **only when `name` actually collides**,
+yielding clean labels in the common case and disambiguated ones when needed.
+
+### `sanitize()`
+
+`_sanitize_token()` normalizes free-form text into a perf-friendly token:
+
+1. Strip leading/trailing whitespace.
+2. Replace any run of non-alphanumeric characters with a single `_`.
+3. Collapse repeated `_` and trim leading/trailing `_`.
+4. Lowercase.
+
+When concatenating subgroup and name, **each part is sanitized
+separately** and joined with a literal `.`, so the dot is preserved in the
+final `EventName` (e.g. `ia_scalability_core7.ia_scalability`).
+
+## Worked example
+
+Consider an aggregator XML containing three samples:
+
+```xml
+<TELC:sampleGroup sampleID="0x0">
+ <TELC:sample name="RTL_VERSION" ...>
+ <TELC:sampleSubGroup>INTEL_VERSION_2</TELC:sampleSubGroup>
+ <TELC:lsb>0</TELC:lsb><TELC:msb>15</TELC:msb>
+ </TELC:sample>
+</TELC:sampleGroup>
+
+<TELC:sampleGroup sampleID="0x10">
+ <TELC:sample name="IA_SCALABILITY" ...>
+ <TELC:sampleSubGroup>IA_SCALABILITY_CORE0</TELC:sampleSubGroup>
+ <TELC:lsb>0</TELC:lsb><TELC:msb>7</TELC:msb>
+ </TELC:sample>
+</TELC:sampleGroup>
+
+<TELC:sampleGroup sampleID="0x11">
+ <TELC:sample name="IA_SCALABILITY" ...>
+ <TELC:sampleSubGroup>IA_SCALABILITY_CORE7</TELC:sampleSubGroup>
+ <TELC:lsb>0</TELC:lsb><TELC:msb>7</TELC:msb>
+ </TELC:sample>
+</TELC:sampleGroup>
+```
+
+Per-aggregator name counts:
+
+| `name` | count |
+| ---------------- | ----- |
+| `RTL_VERSION` | 1 |
+| `IA_SCALABILITY` | 2 |
+
+Resulting `EventName`s:
+
+| Sample | Rule branch | EventName |
+| ----------------- | -------------------------------- | -------------------------------------- |
+| `RTL_VERSION` | (1) unique | `rtl_version` |
+| `IA_SCALABILITY` (CORE0) | (2) collision + distinct subgroup | `ia_scalability_core0.ia_scalability` |
+| `IA_SCALABILITY` (CORE7) | (2) collision + distinct subgroup | `ia_scalability_core7.ia_scalability` |
+
+Note that `RTL_VERSION` is **not** prefixed with its `INTEL_VERSION_2`
+container, even though `sampleSubGroup` is set — because the name is
+already unique within the aggregator.
+
+## Output shape
+
+For each emitted sample, the JSON object is:
+
+```json
+{
+ "PMU": "pmt_ep_<guid>",
+ "EventName": "<name per rule above>",
+ "BriefDescription": "<sample or group description>",
+ "MetricGroup": "pmt",
+ "ConfigCode": "0x<msb><lsb><sampleID>",
+ "PlatformGroup": "<optional, from --by-path>"
+}
+```
+
+`ConfigCode` packs the perf config bits as:
+
+```
+bits 0..15 sampleID
+bits 16..23 lsb
+bits 24..31 msb
+```
+
+## EventName uniqueness
+
+Within a single aggregator XML the three-rule scheme above resolves most
+collisions. If two samples still share the same `EventName` after
+subgroup-prefix disambiguation (rule 2) — for example because neither has a
+usable `sampleSubGroup` — the converter applies a last-resort ordinal suffix
+`__0`, `__1`, … and emits a `WARN` line to stderr. The double-underscore is
+chosen to be visually distinct from any XML field so it is not mistaken for a
+meaningful part of the metric name.
+
+Across **different** GUIDs, names may repeat — they live in distinct PMUs
+and are disambiguated by the `PMU` field.
diff --git a/tools/arch/x86/pmtctl/scripts/pmtxml2json.py b/tools/arch/x86/pmtctl/scripts/pmtxml2json.py
new file mode 100755
index 000000000000..31995f0fc72e
--- /dev/null
+++ b/tools/arch/x86/pmtctl/scripts/pmtxml2json.py
@@ -0,0 +1,883 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+"""Convert Intel PMT aggregator XML files into perf JSON events.
+
+Provides core XML-to-event conversion plus optional Intel-PMT repository
+fetch/cache support.
+"""
+
+import argparse
+import glob
+import json
+import os
+import re
+import shutil
+import subprocess
+import sys
+import traceback
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+from lxml import etree # pylint: disable=c-extension-no-member
+
+METRIC_GROUP = "pmt"
+INTEL_PMT_REPO_URL = "https://github.com/intel/Intel-PMT"
+
+
+def _expand_path(path: str) -> str:
+    """Expand a leading '~' in path and return the absolute result."""
+    with_home = os.path.expanduser(path)
+    return os.path.abspath(with_home)
+
+
+def _repo_dir_name_from_url(repo_url: str) -> str:
+    """Derive a stable, filesystem-safe cache directory name from a git URL.
+
+    Trailing slashes and a ".git" suffix are dropped, the last path
+    component is kept, and every run of characters outside [0-9A-Za-z._-]
+    becomes a single "-". Falls back to "repo" when nothing usable remains.
+    """
+    suffix = ".git"
+    url = (repo_url or "").rstrip("/")
+    url = url[: len(url) - len(suffix)] if url.endswith(suffix) else url
+
+    leaf = os.path.basename(url)
+    if not leaf:
+        leaf = "repo"
+    safe = re.sub(r"[^0-9A-Za-z._-]+", "-", leaf).strip("-")
+    return safe if safe else "repo"
+
+
+def _fetch_intel_pmt_xml_root(
+    cache_dir: str,
+    refresh: bool = False,
+    debug: bool = False,
+    repo_url: str = INTEL_PMT_REPO_URL,
+) -> Optional[str]:
+    """Ensure Intel-PMT exists in cache and return its xml root path.
+
+    Args:
+        cache_dir: Directory holding cached clones; "~" is expanded and the
+            directory is created when missing.
+        refresh: When the clone already exists, run "git pull --ff-only"
+            on it before use.
+        debug: Print progress lines (prefixed "# fetch:") to stderr.
+        repo_url: Repository to clone; it also determines the name of the
+            clone directory inside cache_dir.
+
+    Returns:
+        Path of the "xml" directory inside the clone, or None after an
+        error message was printed to stderr (git missing, git failure,
+        timeout, or no xml directory in the clone).
+    """
+    cache_root = _expand_path(cache_dir)
+    repo_dir = os.path.join(cache_root, _repo_dir_name_from_url(repo_url))
+
+    os.makedirs(cache_root, exist_ok=True)
+
+    # True only while this call is creating repo_dir through "git clone".
+    # A clone that is killed by the timeout or fails midway can leave
+    # repo_dir behind; without cleanup the next run would find the
+    # directory and treat the broken clone as a valid cache.
+    cloning = False
+
+    try:
+        if not os.path.isdir(repo_dir):
+            if debug:
+                print(
+                    f"# fetch: cloning {repo_url} into {repo_dir}",
+                    file=sys.stderr,
+                )
+            cloning = True
+            subprocess.run(
+                ["git", "clone", "--depth", "1", repo_url, repo_dir],
+                check=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                timeout=300,
+            )
+            cloning = False
+        elif refresh:
+            if debug:
+                print(f"# fetch: refreshing cached repo at {repo_dir}", file=sys.stderr)
+            subprocess.run(
+                ["git", "-C", repo_dir, "pull", "--ff-only"],
+                check=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                timeout=300,
+            )
+        elif debug:
+            print(f"# fetch: using cached repo at {repo_dir}", file=sys.stderr)
+    except FileNotFoundError:
+        # git never ran, so there is nothing to clean up.
+        print("ERROR: git is not installed or not found in PATH.", file=sys.stderr)
+        return None
+    except subprocess.TimeoutExpired:
+        if cloning:
+            shutil.rmtree(repo_dir, ignore_errors=True)
+        print("ERROR: fetching Intel-PMT timed out.", file=sys.stderr)
+        return None
+    except subprocess.CalledProcessError as ex:
+        if cloning:
+            shutil.rmtree(repo_dir, ignore_errors=True)
+        err = (ex.stderr or "").strip()
+        print("ERROR: failed to fetch Intel-PMT repository.", file=sys.stderr)
+        if err:
+            print(f"  git stderr: {err}", file=sys.stderr)
+        return None
+
+    xml_root = os.path.join(repo_dir, "xml")
+    if not os.path.isdir(xml_root):
+        print(
+            (
+                "ERROR: fetched repository does not contain expected xml "
+                f"directory: {xml_root}"
+            ),
+            file=sys.stderr,
+        )
+        return None
+
+    return xml_root
+
+
+def _find_pmt_xml(
+    fetched_xml_root: Optional[str], by_path: Optional[str]
+) -> Optional[str]:
+    """Return the path of the first pmt.xml found, or None.
+
+    Search order: the fetched repository's xml root (when given), then the
+    --by-path directory (or the directory holding a --by-path file)
+    followed by each of its ancestors up to the filesystem root.
+    """
+
+    def _search_dirs():
+        if fetched_xml_root:
+            yield fetched_xml_root
+        if by_path:
+            here = by_path
+            if os.path.isfile(here):
+                here = os.path.dirname(here) or "."
+            here = os.path.abspath(here)
+            while True:
+                yield here
+                above = os.path.dirname(here)
+                if above == here:
+                    return
+                here = above
+
+    for folder in _search_dirs():
+        candidate = os.path.join(folder, "pmt.xml")
+        if os.path.isfile(candidate):
+            return candidate
+    return None
+
+
+# ---------- Reserved/RSVD skipping ----------
+# Name: match 'reserved' or 'rsvd' with optional digits, not embedded in larger tokens
+RESERVED_RX = re.compile(
+ r"(?<![a-z0-9])(reserved|rsvd)(?:\d+)?(?![a-z0-9])", re.IGNORECASE
+)
+# Description: exact 'reserved' or 'rsvd' with optional trailing digits/whitespace
+DESC_RESERVED_RX = re.compile(r"\s*(?:reserved|rsvd)(?:\s*\d+)?\s*$", re.IGNORECASE)
+
+
+@dataclass(frozen=True)
+class SampleDef: # pylint: disable=too-many-instance-attributes
+    """Normalized representation of one PMT sample field definition.
+
+    Instances are immutable (frozen dataclass); parse_samples() builds one
+    per non-reserved <sample> element.
+    """
+
+    # GUID of the aggregator the sample belongs to (from <uniqueid>).
+    guid: int
+    # "name" attribute of the enclosing sample group, or "group_<sample_id>".
+    group_name: str
+    # "sampleID" attribute of the enclosing sample group.
+    sample_id: int
+    # "name" attribute of the sample, or "sample_<sample_id>" when absent.
+    sample_name: str
+    # Lowest and highest bit of the field; 0 <= lsb <= msb < 64.
+    lsb: int
+    msb: int
+    # Data type reference taken from the sample's attribute or child element.
+    datatype_idref: Optional[str]
+    # The sample's own description, otherwise the group's description.
+    description: Optional[str]
+    # Text of the <sampleType> child, when present.
+    sample_type: Optional[str]
+    # Text of the <sampleSubGroup> child, when present.
+    sample_subgroup: Optional[str]
+
+
+@dataclass
+class Counters:
+    """Per-file conversion counters for summary diagnostics.
+
+    All counters start at zero.
+    """
+
+    # NOTE(review): the code that updates these fields is not visible here;
+    # presumably total = samples seen, emitted = events written and
+    # skipped = samples dropped. Confirm against the caller.
+    total: int = 0
+    emitted: int = 0
+    skipped: int = 0
+
+
+def norm(tag: str) -> str:
+    """Return XML tag name without namespace prefix.
+
+    "{namespace}local" becomes "local"; a tag without "}" is returned
+    unchanged. lxml reports comments and processing instructions with a
+    callable ``tag`` instead of a string; for those (and any other
+    non-string value) "" is returned, so callers walking ``root.iter()``
+    skip such nodes instead of failing with a TypeError.
+    """
+    if not isinstance(tag, str):
+        return ""
+    return tag[tag.rfind("}") + 1 :] if "}" in tag else tag
+
+
+def parse_xml(xml_path: str):
+    """Parse and return the XML root element for the given file path.
+
+    The parser is set up to load DTDs, resolve entities and allow network
+    access while doing so.
+    """
+    # NOTE(review): load_dtd/resolve_entities together with no_network=False
+    # lets a parsed document pull in external DTDs or entities, including
+    # over the network. Confirm the PMT XML inputs really need this; if they
+    # do not, no_network=True would be the safer setting.
+    # pylint: disable-next=c-extension-no-member
+    parser = etree.XMLParser(load_dtd=True, resolve_entities=True, no_network=False)
+    # pylint: disable-next=c-extension-no-member
+    root = etree.parse(xml_path, parser).getroot()
+
+    return root
+
+
+def _basedir_to_name(basedir: str) -> str:
+    """Turn a pmt.xml <basedir> value into the per-GUID short name.
+
+    Surrounding whitespace is trimmed, letters are lowercased and each '/'
+    becomes '_'. Everything else (for example the hyphen in 'RMID-EE') is
+    kept. An empty or missing value gives "".
+    """
+    if basedir:
+        trimmed = basedir.strip()
+        return "_".join(trimmed.lower().split("/"))
+    return ""
+
+
+def _parse_mapping_entry(
+    m,
+) -> Optional[Tuple[int, str, str]]:
+    """Extract (guid, name, description) from a <mapping> element.
+
+    The GUID comes from the "guid" attribute: it is parsed with base
+    auto-detection first and as bare hexadecimal second. The name is the
+    normalized <xmlset>/<basedir> text and the description is the
+    <description> text; both default to "".
+
+    Returns None if the mapping has no usable GUID, i.e. the attribute is
+    missing, blank, or not parseable as a number. The unparseable case is
+    reported on stderr so one malformed entry does not abort the whole
+    conversion.
+    """
+    guid_txt = (m.attrib.get("guid") or "").strip()
+    if not guid_txt:
+        return None
+
+    try:
+        guid = int(guid_txt, 0)
+    except ValueError:
+        try:
+            # No recognised prefix: treat the digits as hexadecimal.
+            guid = int(guid_txt, 16)
+        except ValueError:
+            print(
+                f"WARN: ignoring <mapping> with unparseable guid '{guid_txt}'",
+                file=sys.stderr,
+            )
+            return None
+
+    description = ""
+    basedir = ""
+    for ch in m:
+        t = norm(ch.tag).lower()
+        if t == "description":
+            description = (ch.text or "").strip()
+        elif t == "xmlset":
+            for sub in ch:
+                if norm(sub.tag).lower() == "basedir":
+                    basedir = (sub.text or "").strip()
+
+    return guid, _basedir_to_name(basedir), description
+
+
+def _merge_duplicate_mapping(existing: Dict[str, object], name: str) -> None:
+    """Note an alternate name for an already-recorded GUID.
+
+    Appends "(also: <name>)" to existing["description"] unless the name is
+    empty, equals existing["name"], or the note is already present.
+    """
+    if name and name != existing["name"]:
+        note = f"(also: {name})"
+        current = existing["description"]
+        if not current:
+            existing["description"] = note
+        elif note not in current:
+            existing["description"] = f"{current} {note}"
+
+
+def _parse_pmt_xml_guids(pmt_xml_path: str) -> List[Dict[str, object]]:
+    """Read pmt.xml and return one record per distinct <mapping> GUID.
+
+    Every record is {"guid": int, "name": str, "description": str} and the
+    result is ordered by GUID. If a GUID shows up in several <mapping>
+    entries (unrelated platforms occasionally reuse early GUIDs), the
+    first entry is kept and the later names are folded into its
+    description on a best-effort basis, for diagnostics only.
+    """
+    first_seen: Dict[int, Dict[str, object]] = {}
+
+    mappings = (
+        el for el in parse_xml(pmt_xml_path).iter() if norm(el.tag).lower() == "mapping"
+    )
+    for mapping in mappings:
+        parsed = _parse_mapping_entry(mapping)
+        if parsed is None:
+            continue
+
+        guid, name, description = parsed
+        if guid in first_seen:
+            _merge_duplicate_mapping(first_seen[guid], name)
+        else:
+            first_seen[guid] = {
+                "guid": guid,
+                "name": name,
+                "description": description,
+            }
+
+    return sorted(first_seen.values(), key=lambda rec: rec["guid"])
+
+
+def _write_pmt_guids_json(
+    pmt_xml_src: str, output_dir: str, debug: bool = False
+) -> None:
+    """Write the GUID table found in pmt.xml to <output_dir>/pmt_guids.json.
+
+    Each entry carries the GUID as a zero-padded "0x%08x" string plus its
+    name and description. With debug set, a summary line goes to stderr.
+    """
+    serial = []
+    for entry in _parse_pmt_xml_guids(pmt_xml_src):
+        serial.append(
+            {
+                "guid": f"0x{entry['guid']:08x}",
+                "name": entry["name"],
+                "description": entry["description"],
+            }
+        )
+
+    out_path = os.path.join(output_dir, "pmt_guids.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        f.write(json.dumps(serial, indent=2) + "\n")
+
+    if debug:
+        print(f"# wrote {out_path} ({len(serial)} entries)", file=sys.stderr)
+
+
+def get_guid(root) -> int:
+    """Return the telemetry GUID held by the first <uniqueid> element.
+
+    Raises ValueError when no <uniqueid> exists, when the first one is
+    empty, or when its text is not a number.
+    """
+    first = next(
+        (e for e in root.iter() if norm(e.tag).lower() == "uniqueid"), None
+    )
+    text = (first.text or "").strip() if first is not None else ""
+    if not text:
+        raise ValueError("Missing <TELEM:uniqueid>")
+
+    try:
+        # Base 0 honours a "0x" prefix and otherwise reads decimal.
+        return int(text, 0)
+    except ValueError:
+        # No usable prefix: read the digits as hexadecimal.
+        return int(text, 16)
+
+
+# pylint: disable=too-many-locals,too-many-branches,too-many-statements
+def parse_samples(
+    root,
+) -> List[SampleDef]:
+    """Parse SampleGroup/Sample entries into filtered SampleDef records.
+
+    Args:
+        root: Root element of a parsed aggregator XML tree.
+
+    Returns:
+        One SampleDef per <sample> that is not classified as reserved, in
+        the order the elements are visited.
+
+    Raises:
+        ValueError: if <uniqueid> is missing, a sample group has no
+            sampleID, a group <length> is present and not 64, a sample
+            lacks lsb or msb, the bit range is not within 0..63 with
+            lsb <= msb, or a numeric field cannot be parsed.
+    """
+    guid = get_guid(root)
+    out: List[SampleDef] = []
+
+    for sg in root.iter():
+        # Tag names are compared case-insensitively and without namespace.
+        if norm(sg.tag).lower() != "samplegroup":
+            continue
+
+        # Accept both spellings of the attribute.
+        sid_txt = sg.attrib.get("sampleID") or sg.attrib.get("sampleid")
+        if sid_txt is None:
+            raise ValueError("SampleGroup missing sampleID")
+
+        # Base 0: "0x10" is read as hex, "16" as decimal.
+        sample_id = int(sid_txt, 0)
+        group_name = (sg.attrib.get("name") or "").strip() or f"group_{sample_id}"
+        group_len = None
+        group_desc = None
+
+        # Group-level metadata: optional length and description.
+        for child in sg:
+            t = norm(child.tag).lower()
+            if t == "length":
+                try:
+                    group_len = int((child.text or "").strip(), 0)
+                except (TypeError, ValueError):
+                    # An unparseable length is treated as "not specified".
+                    pass
+            elif t == "description":
+                group_desc = (child.text or "").strip()
+
+        # Only 64-bit groups are supported; anything else is an error.
+        if group_len is not None and group_len != 64:
+            raise ValueError(
+                f"{group_name} sampleID={sample_id} length={group_len} (expected 64)"
+            )
+
+        # Only direct <sample> children count; groups without any are skipped.
+        samples = [c for c in sg if norm(c.tag).lower() == "sample"]
+        if not samples:
+            continue
+
+        for s in samples:
+            sname = (s.attrib.get("name") or f"sample_{sample_id}").strip()
+
+            lsb = None
+            msb = None
+            stype = None
+            sdesc = None
+            ssubgroup = None
+            # The attribute form wins over a <dataTypeIDREF> child element.
+            dtype_ref = s.attrib.get("dataTypeIDREF") or s.attrib.get("datatypeIDREF")
+
+            for ch in s:
+                t = norm(ch.tag).lower()
+                if t == "lsb":
+                    lsb = int((ch.text or "").strip(), 0)
+                elif t == "msb":
+                    msb = int((ch.text or "").strip(), 0)
+                elif t == "sampletype":
+                    stype = (ch.text or "").strip()
+                elif t == "description":
+                    sdesc = (ch.text or "").strip()
+                elif t == "samplesubgroup":
+                    ssubgroup = (ch.text or "").strip()
+                elif t == "datatypeidref" and not dtype_ref:
+                    dtype_ref = (ch.text or "").strip()
+
+            if lsb is None or msb is None:
+                raise ValueError(f"{sname} (sampleID={sample_id}): missing lsb/msb")
+
+            # The field must lie inside one 64-bit word.
+            if not 0 <= lsb <= msb < 64:
+                raise ValueError(
+                    f"{sname} (sampleID={sample_id}): invalid bit range {lsb}-{msb}"
+                )
+
+            # A sample without its own description inherits the group's.
+            desc_text = sdesc if sdesc else group_desc
+
+            # Skip reserved/rsvd samples by name, sampleSubGroup, or description.
+            # Name and subgroup are searched for a reserved token; the
+            # description must consist of the reserved marker alone.
+            is_reserved_name = RESERVED_RX.search(sname)
+            is_reserved_sub = ssubgroup and RESERVED_RX.search(ssubgroup)
+            is_reserved_desc = desc_text and DESC_RESERVED_RX.fullmatch(desc_text)
+            if is_reserved_name or is_reserved_sub or is_reserved_desc:
+                continue
+
+            out.append(
+                SampleDef(
+                    guid=guid,
+                    group_name=group_name,
+                    sample_id=sample_id,
+                    sample_name=sname,
+                    lsb=lsb,
+                    msb=msb,
+                    datatype_idref=dtype_ref,
+                    description=desc_text,
+                    sample_type=stype,
+                    sample_subgroup=ssubgroup,
+                )
+            )
+
+    return out
+
+
+def pack_config(sample_id: int, lsb: int, msb: int) -> int:
+    """Encode a sample location as a perf ConfigCode value.
+
+    Layout: bits 0..15 sample_id, bits 16..23 lsb, bits 24..31 msb.
+    Each field is masked to its width before the fields are combined.
+    """
+    id_field = sample_id & 0xFFFF
+    lsb_field = (lsb & 0xFF) << 16
+    msb_field = (msb & 0xFF) << 24
+    return id_field | lsb_field | msb_field
+
+
+def _sanitize_token(s: str) -> str:
+    """Reduce free-form text to a lowercase, underscore-separated token.
+
+    Runs of ASCII letters and digits are kept and joined with a single
+    '_'; everything else acts as a separator, so the result never starts
+    or ends with '_' and never contains two in a row.
+    """
+    pieces = re.split(r"[^0-9a-zA-Z]+", s.strip())
+    return "_".join(piece for piece in pieces if piece).lower()
+
+
+def brief_desc(s: SampleDef) -> str:
+    """Return the BriefDescription text for a sample.
+
+    With a description: its whitespace runs collapsed to single spaces,
+    cut to 240 characters. Without one: the sample name in title case
+    followed by the field width in bits, e.g. "Ia Scalability (8b)".
+    """
+    text = s.description
+    if not text:
+        bits = s.msb - s.lsb + 1
+        label = s.sample_name.replace("_", " ").title()
+        return f"{label} ({bits}b)"
+
+    return re.sub(r"\s+", " ", text)[:240]
+
+
+def make_event(
+    s: SampleDef,
+    pmu_name: str,
+    name_counts: Dict[str, int],
+    platform_group: Optional[str] = None,
+) -> Dict[str, str]:
+    """Build the perf event dictionary for one sample.
+
+    EventName follows the lazy-prefix rule: the sanitized sample name on
+    its own, unless name_counts says the name occurs more than once in
+    this aggregator and the sample has a sampleSubGroup different from
+    its name; then "<subgroup>.<name>" is used. sampleSubGroup is
+    sometimes a metric-instance index (e.g. IA_SCALABILITY_CORE7) and
+    sometimes a container alias (e.g. INTEL_VERSION_2), so prefixing
+    unconditionally would give confusing labels in the latter case.
+
+    A colliding name with no usable subgroup keeps the bare name; the
+    caller is expected to resolve the resulting duplicate through
+    _resolve_duplicate_event_names().
+    """
+    plain = _sanitize_token(s.sample_name)
+    collides = name_counts.get(s.sample_name, 0) > 1
+    sub = s.sample_subgroup
+
+    if collides and sub and sub != s.sample_name:
+        evname = f"{_sanitize_token(sub)}.{plain}"
+    else:
+        evname = plain
+
+    cfg = pack_config(s.sample_id, s.lsb, s.msb)
+    event = {
+        "PMU": pmu_name,
+        "EventName": evname,
+        "BriefDescription": brief_desc(s),
+        "MetricGroup": METRIC_GROUP,
+        "ConfigCode": f"0x{cfg:08x}",
+    }
+    # PlatformGroup is optional and only emitted when a value was given.
+    if platform_group:
+        event["PlatformGroup"] = platform_group
+
+    return event
+
+
+def _resolve_duplicate_event_names(
+    out: List[Dict[str, str]], pmu_name: str, agg_xml: str
+) -> None:
+    """Rename events that share an EventName to name__0, name__1, ...
+
+    Last-resort safety net behind the subgroup-prefix rule in make_event,
+    for names that could not be told apart there (for instance because no
+    usable sampleSubGroup exists). The events are modified in place, in
+    list order, and one WARN line per colliding name goes to stderr. The
+    double underscore is deliberately unlike any XML field so it is not
+    read as a meaningful part of the metric name.
+    """
+    positions: Dict[str, List[int]] = {}
+    for pos, event in enumerate(out):
+        key = event["EventName"]
+        if key not in positions:
+            positions[key] = []
+        positions[key].append(pos)
+
+    clashes = [(key, where) for key, where in positions.items() if len(where) > 1]
+    for name, where in clashes:
+        count = len(where)
+        print(
+            f"WARN: {agg_xml}: PMU={pmu_name}: "
+            f"EventName '{name}' collision ({count} entries); "
+            f"renaming as {name}__0 .. {name}__{count - 1}",
+            file=sys.stderr,
+        )
+        for ordinal, pos in enumerate(where):
+            out[pos]["EventName"] = f"{name}__{ordinal}"
+
+
+# ------------------------------
+# main()
+# ------------------------------
+# pylint: disable=too-many-locals,too-many-branches,too-many-statements
+def main(
+ argv: List[str],
+) -> int:
+ """CLI entry point: discover XML files, convert, and write JSON outputs."""
+ ap = argparse.ArgumentParser(
+ description="Convert Intel PMT Aggregator XML to perf JSON (intel_pmt only)"
+ )
+ ap.add_argument(
+ "xml",
+ nargs="?",
+ help="Input PMT Aggregator XML file (optional when using --by-path)",
+ )
+ ap.add_argument(
+ "--by-path",
+ default=None,
+ help=(
+ "Directory to auto-discover PMT XMLs. When used without a "
+ "positional XML, processes all *_aggregator.xml recursively "
+ "and emits one output per directory."
+ ),
+ )
+ ap.add_argument(
+ "--output-dir",
+ default=None,
+ help=(
+ "Directory where JSON output files will be placed. Used "
+ "verbatim (files are written flat by GUID). If omitted, "
+ "the deepest folder name from --by-path is used (lowercased)."
+ ),
+ )
+ ap.add_argument(
+ "--fetch-pmt-repo",
+ action="store_true",
+ help=(
+ "Fetch Intel-PMT repository and use its xml folder when "
+ "local xml/by-path inputs are not provided."
+ ),
+ )
+ ap.add_argument(
+ "--pmt-cache-dir",
+ default="~/.cache/pmtctl",
+ help=(
+ "Cache directory for Intel-PMT repository clone "
+ "(default: ~/.cache/pmtctl)."
+ ),
+ )
+ ap.add_argument(
+ "--refresh-pmt-repo",
+ action="store_true",
+ help=(
+ "Refresh cached Intel-PMT repository before conversion "
+ "(used with --fetch-pmt-repo)."
+ ),
+ )
+ ap.add_argument(
+ "--pmt-repo-url",
+ default=INTEL_PMT_REPO_URL,
+ help=argparse.SUPPRESS,
+ )
+ ap.add_argument("--debug", action="store_true")
+ args = ap.parse_args(argv)
+
+ if args.refresh_pmt_repo and not args.fetch_pmt_repo:
+ print(
+ "ERROR: --refresh-pmt-repo requires --fetch-pmt-repo.",
+ file=sys.stderr,
+ )
+ return 2
+
+ fetched_xml_root: Optional[str] = None
+ if args.fetch_pmt_repo:
+ fetched_xml_root = _fetch_intel_pmt_xml_root(
+ cache_dir=args.pmt_cache_dir,
+ refresh=args.refresh_pmt_repo,
+ debug=args.debug,
+ repo_url=args.pmt_repo_url,
+ )
+ if fetched_xml_root is None:
+ return 2
+ if args.debug:
+ print(f"# fetch: xml root={fetched_xml_root}", file=sys.stderr)
+ if args.by_path is None and args.xml is None:
+ args.by_path = fetched_xml_root
+
+ # ------------------------------
+ # Auto-discovery helpers
+ # ------------------------------
+ def _pick_one(cands, label):
+ """Pick deterministically: shortest path first, then alphabetical."""
+ if not cands:
+ return None
+
+ cands = sorted(cands, key=lambda p: (len(p), p))
+ if args.debug and len(cands) > 1:
+ print(
+ (
+ f"# by-path: multiple {label} matches, choosing: {cands[0]} ; "
+ f"others: {cands[1:]}"
+ ),
+ file=sys.stderr,
+ )
+
+ return cands[0]
+
+ def _discover_xmls_by_path(p):
+ """Return (aggregator, common) or (None, None) if not found."""
+ if not p:
+ return (None, None)
+
+ base = p
+ if os.path.isfile(base):
+ base = os.path.dirname(base) or "."
+
+ # First, non-recursive search
+ agg = glob.glob(os.path.join(base, "*_aggregator.xml"))
+ com = glob.glob(os.path.join(base, "*_common.xml"))
+
+ # If any missing, try recursive
+ if not agg or not com:
+ agg = agg or glob.glob(
+ os.path.join(base, "**", "*_aggregator.xml"), recursive=True
+ )
+ com = com or glob.glob(
+ os.path.join(base, "**", "*_common.xml"), recursive=True
+ )
+
+ return (
+ _pick_one(agg, "aggregator"),
+ _pick_one(com, "common"),
+ )
+
+ def _rel_parts_from_root(d: str, root: str) -> List[str]:
+ """Return sanitized relative path segments from root to d."""
+ try:
+ rel = os.path.relpath(os.path.normpath(d), os.path.normpath(root))
+ except ValueError:
+ return []
+ if rel in (".", "") or rel.startswith(".."):
+ return []
+
+ def _seg(s: str) -> str:
+ s = (s or "").strip().lower()
+ s = re.sub(r"[^0-9a-z]+", "-", s)
+ return re.sub(r"-+", "-", s).strip("-")
+
+ out: List[str] = []
+ for seg in rel.split(os.sep):
+ if seg in (".", ".."):
+ continue
+ s = _seg(seg)
+ if s:
+ out.append(s)
+ return out
+
+    def _discover_all_xml_sets_by_path(
+        p: str,
+    ) -> List[Tuple[str, List[str]]]:
+        """Build one (aggregator, rel_parts) work item per directory under p.
+
+        When ``p`` names a file, its containing directory is searched
+        instead. All ``*_aggregator.xml`` matches are grouped by the
+        directory holding them; for each directory, in sorted order,
+        ``_pick_one`` chooses the aggregator and ``_rel_parts_from_root``
+        supplies the segments relative to the search root. Directories for
+        which ``_pick_one`` returns a falsy value are left out.
+
+        Returns an empty list when ``p`` is empty or nothing matches.
+        """
+        if not p:
+            return []
+
+        search_root = (os.path.dirname(p) or ".") if os.path.isfile(p) else p
+        pattern = "*_aggregator.xml"
+
+        # Recursive scan first; fall back to the search root itself.
+        found = glob.glob(os.path.join(search_root, "**", pattern), recursive=True)
+        if not found:
+            found = glob.glob(os.path.join(search_root, pattern))
+
+        # Group the matches by the directory that contains them.
+        by_dir: Dict[str, List[str]] = {}
+        for path in found:
+            parent = os.path.dirname(path) or "."
+            if parent not in by_dir:
+                by_dir[parent] = []
+            by_dir[parent].append(path)
+
+        items: List[Tuple[str, List[str]]] = []
+        for parent in sorted(by_dir):
+            chosen = _pick_one(by_dir[parent], "aggregator")
+            if chosen:
+                items.append((chosen, _rel_parts_from_root(parent, search_root)))
+        return items
+
+ # Determine work items
+ work_items: List[Tuple[str, List[str]]] = []
+
+ if args.by_path and args.xml is None:
+ # Recursive multi-mode
+ work_items = _discover_all_xml_sets_by_path(args.by_path)
+ if args.debug:
+ print(
+ f"# by-path discovered {len(work_items)} aggregator directory(ies)",
+ file=sys.stderr,
+ )
+ else:
+ # Single-mode (backwards compatible): by-path can auto-fill missing files
+ if args.by_path:
+ a_auto, _ = _discover_xmls_by_path(args.by_path)
+ if args.xml is None:
+ args.xml = a_auto
+ if args.debug:
+ print(
+ f"# by-path resolved: xml={args.xml}",
+ file=sys.stderr,
+ )
+
+ if args.xml:
+ rel_parts: List[str] = []
+ if args.by_path:
+ rel_parts = _rel_parts_from_root(
+ os.path.dirname(args.xml) or ".", args.by_path
+ )
+ work_items = [(args.xml, rel_parts)]
+
+ # Sanity check: we must have at least one aggregator XML to process
+ if not work_items:
+ print(
+ (
+ "ERROR: No aggregator XML specified or discovered. "
+ "Provide a file or use --by-path."
+ ),
+ file=sys.stderr,
+ )
+ return 2
+
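+ # Determine output directory.
+ #
+ # If --output-dir is given, use it verbatim; outputs are written flat
+ # by GUID (pmt_ep_<guid>.json). If omitted, fall back to a folder
+ # named after the deepest --by-path directory (lowercased), or to
+ # "jsons" when that directory is itself named "xml". With neither
+ # option set, files are written to the current directory.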
+ output_dir: Optional[str] = None
+ if args.output_dir:
+ output_dir = args.output_dir
+ elif args.by_path:
+ by_path = args.by_path
+ if os.path.isfile(by_path):
+ by_path = os.path.dirname(by_path) or "."
+ deepest_folder = os.path.basename(os.path.normpath(by_path))
+ output_dir = (
+ "jsons" if deepest_folder.lower() == "xml" else deepest_folder.lower()
+ )
+
+ # Create output directory if specified
+ if output_dir:
+ os.makedirs(output_dir, exist_ok=True)
+ if args.debug:
+ print(f"# output directory: {output_dir}", file=sys.stderr)
+
+ # Copy pmt.xml from the Intel-PMT xml/ folder into the output directory
+ if output_dir:
+ pmt_xml_src = _find_pmt_xml(fetched_xml_root, args.by_path)
+ if pmt_xml_src:
+ pmt_xml_dst = os.path.join(output_dir, "pmt.xml")
+ shutil.copyfile(pmt_xml_src, pmt_xml_dst)
+ if args.debug:
+ print(
+ f"# copied {pmt_xml_src} -> {pmt_xml_dst}",
+ file=sys.stderr,
+ )
+ _write_pmt_guids_json(pmt_xml_src, output_dir, debug=args.debug)
+ elif args.debug:
+ print("# pmt.xml not found; skipping copy", file=sys.stderr)
+
+ # Process each discovered set
+ any_failed = False
+ written_by_guid: Dict[int, str] = {}
+
+ for agg_xml, rel_parts in work_items:
+ try:
+ # Load the main aggregator XML
+ root = parse_xml(agg_xml)
+ guid = get_guid(root)
+
+ pmu_name = f"pmt_ep_{guid:08x}"
+ base_filename = f"{pmu_name}.json"
+ out_filename = (
+ os.path.join(output_dir, base_filename) if output_dir else base_filename
+ )
+
+ # GUIDs are globally unique to a telemetry layout; a duplicate
+ # across aggregators indicates a source-data bug, not something
+ # to silently paper over by namespacing the output.
+ prior = written_by_guid.get(guid)
+ if prior is not None:
+ raise ValueError(
+ f"duplicate GUID 0x{guid:08x} from {agg_xml}; "
+ f"previously emitted by {prior}"
+ )
+
+ samples = parse_samples(root)
+ ctr = Counters(total=len(samples))
+
+ # Per-aggregator platform group derived from its location under
+ # the discovery root (e.g. "alderlake-s"). Falls back to the
+ # by-path basename for the single-mode case.
+ platform_group: Optional[str] = None
+ if rel_parts:
+ platform_group = rel_parts[0].upper()
+ elif args.by_path:
+ by_path = args.by_path
+ if os.path.isfile(by_path):
+ by_path = os.path.dirname(by_path) or "."
+ deepest_folder = os.path.basename(os.path.normpath(by_path))
+ if deepest_folder and deepest_folder.lower() != "xml":
+ platform_group = deepest_folder.upper()
+
+ out = []
+
+ # Pre-pass: count bare sample-name occurrences within this
+ # aggregator so make_event can apply lazy-prefix disambiguation.
+ name_counts: Dict[str, int] = {}
+ for s in samples:
+ name_counts[s.sample_name] = name_counts.get(s.sample_name, 0) + 1
+
+ for s in samples:
+ try:
+ # Build event
+ e = make_event(
+ s,
+ pmu_name,
+ name_counts,
+ platform_group=platform_group,
+ )
+
+ out.append(e)
+ ctr.emitted += 1
+ except Exception as ex: # pylint: disable=broad-exception-caught
+ ctr.skipped += 1
+ print(
+ (
+ f"WARN: skipping {s.sample_name} "
+ f"(sampleID={s.sample_id}): {ex}"
+ ),
+ file=sys.stderr,
+ )
+ traceback.print_exc()
+
+ # Last-resort: detect and rename any duplicate EventNames that
+ # subgroup-prefix disambiguation could not resolve.
+ _resolve_duplicate_event_names(out, pmu_name, agg_xml)
+
+ # Write events JSON
+ with open(out_filename, "w", encoding="utf-8") as f:
+ json.dump(out, f, indent=2)
+ f.write("\n")
+
+ written_by_guid[guid] = agg_xml
+ print(f"# wrote {out_filename}", file=sys.stderr)
+ print(
+ (
+ f"# PMU={pmu_name} total={ctr.total} "
+ f"emitted={ctr.emitted} skipped={ctr.skipped}"
+ ),
+ file=sys.stderr,
+ )
+
+ except Exception: # pylint: disable=broad-exception-caught
+ any_failed = True
+ print(f"ERROR: failed processing aggregator={agg_xml}", file=sys.stderr)
+
+ return 1 if any_failed else 0
+
+
+# Script entry point: hand the command-line arguments (without the program
+# name) to main() and use its return value as the process exit status.
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
--
2.43.0