Re: [PATCH] sched/topology: Improve load balancing on AMD EPYC

From: Matt Fleming
Date: Tue Jun 18 2019 - 06:48:24 EST


On Tue, 11 Jun, at 05:22:21PM, Lendacky, Thomas wrote:
> On 6/10/19 4:26 PM, Matt Fleming wrote:
> > On Wed, 05 Jun, at 08:00:35PM, Peter Zijlstra wrote:
> >>
> >> And then we had two magic values :/
> >>
> >> Should we not 'fix' RECLAIM_DISTANCE for EPYC or something? Because
> >> surely, if we want to load-balance agressively over 30, then so too
> >> should we do node_reclaim() I'm thikning.
> >
> > Yeah we can fix it just for EPYC, Mel suggested that approach originally.
> >
> > Suravee, Tom, what's the best way to detect these EPYC machines that need to
> > override RECLAIM_DISTANCE?
>
> You should be able to do it based on the family. There's an init_amd_zn()
> function in arch/x86/kernel/cpu/amd.c. You can add something there or,
> since init_amd_zn() sets X86_FEATURE_ZEN, you could check for that if you
> prefer putting it in a different location.

This works for me under all my tests. Thoughts?

--->8---

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 80a405c2048a..4db4e9e7654b 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/random.h>
+#include <linux/topology.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cacheinfo.h>
@@ -824,6 +825,8 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
{
set_cpu_cap(c, X86_FEATURE_ZEN);

+ node_reclaim_distance = 32;
+
/* Fix erratum 1076: CPB feature bit not being set in CPUID. */
if (!cpu_has(c, X86_FEATURE_CPB))
set_cpu_cap(c, X86_FEATURE_CPB);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index cb0775e1ee4b..74b484354ac9 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -59,6 +59,9 @@ int arch_update_cpu_topology(void);
*/
#define RECLAIM_DISTANCE 30
#endif
+
+extern int __read_mostly node_reclaim_distance;
+
#ifndef PENALTY_FOR_NODE_WITH_CPUS
#define PENALTY_FOR_NODE_WITH_CPUS (1)
#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f53f89df837d..20f0f8f6792b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1284,6 +1284,7 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
#endif

/*
@@ -1410,7 +1411,7 @@ sd_init(struct sched_domain_topology_level *tl,

sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
- if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
sd->flags &= ~(SD_BALANCE_EXEC |
SD_BALANCE_FORK |
SD_WAKE_AFFINE);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a335f7c1fac4..67f5f09d70ed 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -710,7 +710,7 @@ static bool khugepaged_scan_abort(int nid)
for (i = 0; i < MAX_NUMNODES; i++) {
if (!khugepaged_node_load[i])
continue;
- if (node_distance(nid, i) > RECLAIM_DISTANCE)
+ if (node_distance(nid, i) > node_reclaim_distance)
return true;
}
return false;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d66bc8abe0af..8ccaaf3a47f2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3450,7 +3450,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
- RECLAIM_DISTANCE;
+ node_reclaim_distance;
}
#else /* CONFIG_NUMA */
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)