[PATCH 1/2] x86/CPU/AMD: Present package as die instead of socket

From: Suravee Suthikulpanit
Date: Tue Jun 27 2017 - 02:41:41 EST


According to the Documentation/x86/topology.txt, AMD nomenclature for
package is NUMA node (or die). However, this is not the case on AMD
family17h multi-die processor platforms, which can have up to 4 dies
per socket as shown in the following system topology.

Die (Dx) View :
----------------------------
C0 | T0 T1 | || | T0 T1 | C4
--------| || |--------
C1 | T0 T1 | L3 || L3 | T0 T1 | C5
--------| || |--------
C2 | T0 T1 | #0 || #1 | T0 T1 | C6
--------| || |--------
C3 | T0 T1 | || | T0 T1 | C7
----------------------------

System View (with 2 sockets) :
--------------------
| -------------|------
| | | |
------------ ------------
| D1 -- D0 | | D7 -- D6 |
| | \/ | | | | \/ | |
SOCKET0 | | /\ | | | | /\ | | SOCKET1
| D2 -- D3 | | D4 -- D5 |
------------ ------------
| | | |
------|------------| |
--------------------

Current logic interprets package as socket (i.e. phys_proc_id is
socket id), which results in setting x86_has_numa_in_package, and omits
the DIE schedule domain. However, NUMA schedule domains are derived from
SRAT/SLIT, which assume that a NUMA node is a die, and build NUMA schedule
domains on top of NUMA nodes. This results in incomplete schedule domains
as follows:
domain 0: SMT
domain 1: MC /* core complex w/ shared L3*/
---- Missing DIE level domain ----
domain 2: NUMA /* socket */
domain 3: NUMA /* platform */

Presenting package-as-die does not set x86_has_numa_in_package.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@xxxxxxx>
Signed-off-by: Leo Duran <leo.duran@xxxxxxx>
Signed-off-by: Yazen Ghannam <yazen.ghannam@xxxxxxx>
Cc: <stable@xxxxxxxxxxxxxxx> # v4.10+
---
arch/x86/kernel/cpu/amd.c | 189 +++++++++++++++++++++++++++-------------------
1 file changed, 112 insertions(+), 77 deletions(-)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bb5abe8..2f5869c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) "x86/AMD: " fmt
+
#include <linux/export.h>
#include <linux/bitops.h>
#include <linux/elf.h>
@@ -32,6 +34,12 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
*/
static u32 nodes_per_socket = 1;

+/*
+ * l3_num_threads_sharing: Stores the number of threads sharing L3 cache.
+ * Refer to CPUID_Fn8000001D_EAX_x03 [Cache Properties (L3)] NumSharingCache.
+ */
+static u32 l3_num_threads_sharing;
+
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -296,96 +304,122 @@ static int nearby_node(int apicid)
}
#endif

+#ifdef CONFIG_SMP
+
/*
- * Fixup core topology information for
- * (1) AMD multi-node processors
+ * Per Documentation/x86/topology.txt, the kernel works with
+ * {packages, cores, threads}, and we will map:
+ *
+ * thread = core in compute-unit (CMT), or thread in core (SMT)
+ * core = compute-unit (CMT), or core (SMT)
+ * package = node (die)
+ *
+ * Discover topology based on available information from CPUID first,
+ * and only derive them as needed.
+ *
+ * (1) phys_proc_id is die ID in AMD multi-die processors.
* Assumption: Number of cores in each internal node is the same.
- * (2) AMD processors supporting compute units
+ * (2) cpu_core_id is derived from either CPUID topology extension
+ * or initial APIC_ID.
+ * (3) cpu_llc_id is either L3 or per-node
*/
-#ifdef CONFIG_SMP
static void amd_get_topology(struct cpuinfo_x86 *c)
{
- u8 node_id;
int cpu = smp_processor_id();

- /* get information required for multi-node processors */
if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
u32 eax, ebx, ecx, edx;

cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);

- node_id = ecx & 0xff;
+ c->phys_proc_id = ecx & 0xff;
smp_num_siblings = ((ebx >> 8) & 0xff) + 1;

- if (c->x86 == 0x15)
- c->cu_id = ebx & 0xff;
-
- if (c->x86 >= 0x17) {
- c->cpu_core_id = ebx & 0xff;
-
- if (smp_num_siblings > 1)
- c->x86_max_cores /= smp_num_siblings;
- }
+ /* Adjustment to get core per die */
+ c->x86_max_cores /= smp_num_siblings;

/*
- * We may have multiple LLCs if L3 caches exist, so check if we
- * have an L3 cache by looking at the L3 cache CPUID leaf.
+ * For family15h/16h, this is ComputeUnitId per socket
+ * For family17h, this is CoreId per socket
*/
+ c->cpu_core_id = (ebx & 0xff);
+
if (cpuid_edx(0x80000006)) {
- if (c->x86 == 0x17) {
+ cpuid_count(0x8000001d, 3, &eax, &ebx, &ecx, &edx);
+ l3_num_threads_sharing = ((eax >> 14) & 0xfff) + 1;
+ }
+
+ if (c->x86 == 0x17) {
+ /*
+ * In family 17h, the CPUID_Fn8000001E_EBX[7:0] (CoreId)
+ * is non-contiguous in down-coring and non-SMT cases.
+ * This logic fixes up the cpu_core_id to be contiguous
+ * for cores within the die.
+ */
+ u32 tmp = c->cpu_core_id;
+ u32 die_offset, ccx_offset, cpu_offset;
+
+ if (smp_num_siblings == 1) {
/*
- * LLC is at the core complex level.
- * Core complex id is ApicId[3].
+ * For SMT-disabled case, the CoreId bit-encoding is
+ * [7:4] : die
+ * [3] : ccx
+ * [2:0] : core
*/
- per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ die_offset = ((tmp >> 4) & 0xf) * c->x86_max_cores;
+ ccx_offset = ((tmp >> 3) & 1) * l3_num_threads_sharing;
+ cpu_offset = tmp & 7;
} else {
- /* LLC is at the node level. */
- per_cpu(cpu_llc_id, cpu) = node_id;
+ /*
+ * For SMT-enabled case, the CoreId bit-encoding is
+ * [7:3] : die
+ * [2] : ccx
+ * [1:0] : core
+ */
+ die_offset = ((tmp >> 3) & 0x1f) * c->x86_max_cores;
+ ccx_offset = ((tmp >> 2) & 1) * l3_num_threads_sharing / smp_num_siblings;
+ cpu_offset = tmp & 3;
}
+ c->cpu_core_id = die_offset + ccx_offset + cpu_offset;
+ pr_debug("Fixup CoreId:%#x to cpu_core_id:%#x\n", tmp, c->cpu_core_id);
}
- } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
- u64 value;
+ } else {
+ if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
+ u64 value;

- rdmsrl(MSR_FAM10H_NODE_ID, value);
- node_id = value & 7;
-
- per_cpu(cpu_llc_id, cpu) = node_id;
- } else
- return;
-
- /* fixup multi-node processor information */
- if (nodes_per_socket > 1) {
- u32 cus_per_node;
-
- set_cpu_cap(c, X86_FEATURE_AMD_DCM);
- cus_per_node = c->x86_max_cores / nodes_per_socket;
+ /* Use MSR provided node ID */
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ c->phys_proc_id = value & 7;
+ } else {
+ /*
+			 * On older AMD dual core setups the lower
+ * bits of the APIC id distinguish the cores.
+ * Assumes number of cores is a power of two.
+ */
+ c->phys_proc_id = c->initial_apicid >> c->x86_coreid_bits;
+ }

- /* core id has to be in the [0 .. cores_per_node - 1] range */
- c->cpu_core_id %= cus_per_node;
+ /* Get core id from APIC */
+ c->cpu_core_id = c->initial_apicid & ((1 << c->x86_coreid_bits) - 1);
}
-}
-#endif

-/*
- * On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
- * Assumes number of cores is a power of two.
- */
-static void amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned bits;
- int cpu = smp_processor_id();
+ /* core id has to be in the [0 .. cores_per_die - 1] range */
+ c->cpu_core_id %= c->x86_max_cores;

- bits = c->x86_coreid_bits;
- /* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
- /* use socket ID also for last level cache */
+ /* Default LLC is at the die level. */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
- amd_get_topology(c);
-#endif
+
+ /*
+ * We may have multiple LLCs if L3 caches exist, so check if we
+ * have an L3 cache by looking at the L3 cache CPUID leaf.
+ * For family17h, LLC is at the core complex level.
+ * Core complex id is ApicId[3].
+ */
+ if (cpuid_edx(0x80000006) && c->x86 == 0x17)
+ per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+
}
+#endif

u16 amd_get_nb_id(int cpu)
{
@@ -412,7 +446,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)

node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
- node = per_cpu(cpu_llc_id, cpu);
+ node = c->phys_proc_id;

/*
* On multi-fabric platform (e.g. Numascale NumaChip) a
@@ -457,26 +491,23 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
static void early_init_amd_mc(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
- unsigned bits, ecx;
+ u32 threads_per_socket;

/* Multi core CPU? */
if (c->extended_cpuid_level < 0x80000008)
return;

- ecx = cpuid_ecx(0x80000008);
-
- c->x86_max_cores = (ecx & 0xff) + 1;
-
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
-
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
- }
+ /* Threads per socket */
+ threads_per_socket = (cpuid_ecx(0x80000008) & 0xff) + 1;
+ /* Thread per die */
+ c->x86_max_cores = threads_per_socket / nodes_per_socket;

- c->x86_coreid_bits = bits;
+ /*
+ * This is per socket, and should only be used to decode APIC ID,
+ * which is needed on older systems where X86_FEATURE_TOPOEXT
+ * is not supported.
+ */
+ c->x86_coreid_bits = get_count_order(threads_per_socket);
#endif
}

@@ -765,11 +796,15 @@ static void init_amd(struct cpuinfo_x86 *c)

cpu_detect_cache_sizes(c);

- /* Multi core CPU? */
+#ifdef CONFIG_SMP
if (c->extended_cpuid_level >= 0x80000008) {
- amd_detect_cmp(c);
+ amd_get_topology(c);
srat_detect_node(c);
}
+#endif
+ /* Multi-die? */
+ if (nodes_per_socket > 1)
+ set_cpu_cap(c, X86_FEATURE_AMD_DCM);

#ifdef CONFIG_X86_32
detect_ht(c);
--
2.7.4