[PATCH 5/5] writeback, memcg: Implement foreign dirty flushing

From: Tejun Heo
Date: Mon Aug 26 2019 - 12:07:22 EST


There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.

Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.

This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.

A reproducer follows.

write-range.c::

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>

static const char *usage = "write-range FILE START SIZE\n";

int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];

if (argc < 4) {
fprintf(stderr, usage);
return 1;
}

fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}

start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}

size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}

end = start + size;

while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;

if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}

bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;

while (bwritten < bread) {
long this;

this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}

bwritten += this;
pos += bwritten;
}
}
}
}

repro.sh::

#!/bin/bash

set -e
set -x

sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches

TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B

mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high

rm -f testfile
touch testfile
fallocate -l 4G testfile

echo "Starting B"

(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &

echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"

(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))

v2: Added comments explaining why the specific intervals are being used.

v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.

v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.

Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
---
include/linux/backing-dev-defs.h | 1 +
include/linux/memcontrol.h | 39 +++++++++
mm/memcontrol.c | 134 +++++++++++++++++++++++++++++++
mm/page-writeback.c | 4 +
4 files changed, 178 insertions(+)

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 1075f2552cfc..4fc87dee005a 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -63,6 +63,7 @@ enum wb_reason {
* so it has a mismatch name.
*/
WB_REASON_FORKER_THREAD,
+ WB_REASON_FOREIGN_FLUSH,

WB_REASON_MAX,
};
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2cd4359cb38c..ad8f1a397ae4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -183,6 +183,23 @@ struct memcg_padding {
#define MEMCG_PADDING(name)
#endif

+/*
+ * Remember four most recent foreign writebacks with dirty pages in this
+ * cgroup. Inode sharing is expected to be uncommon and, even if we miss
+ * one in a given round, we're likely to catch it later if it keeps
+ * foreign-dirtying, so a fairly low count should be enough.
+ *
+ * See mem_cgroup_track_foreign_dirty_slowpath() for details.
+ */
+#define MEMCG_CGWB_FRN_CNT 4
+
+struct memcg_cgwb_frn {
+ u64 bdi_id; /* bdi->id of the foreign inode */
+ int memcg_id; /* memcg->css.id of foreign inode */
+ u64 at; /* jiffies_64 at the time of dirtying */
+ struct wb_completion done; /* tracks in-flight foreign writebacks */
+};
+
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
@@ -307,6 +324,7 @@ struct mem_cgroup {
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list;
struct wb_domain cgwb_domain;
+ struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif

/* List of events which userspace want to receive */
@@ -1237,6 +1255,18 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
unsigned long *pheadroom, unsigned long *pdirty,
unsigned long *pwriteback);

+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+ struct bdi_writeback *wb);
+
+static inline void mem_cgroup_track_foreign_dirty(struct page *page,
+ struct bdi_writeback *wb)
+{
+ if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
+ mem_cgroup_track_foreign_dirty_slowpath(page, wb);
+}
+
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
+
#else /* CONFIG_CGROUP_WRITEBACK */

static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
@@ -1252,6 +1282,15 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
{
}

+static inline void mem_cgroup_track_foreign_dirty(struct page *page,
+ struct bdi_writeback *wb)
+{
+}
+
+static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */

struct sock;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 26e2999af608..eb626a290d93 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -87,6 +87,10 @@ int do_swap_account __read_mostly;
#define do_swap_account 0
#endif

+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
+
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -4238,6 +4242,127 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
}
}

+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback. The former
+ * trackes ownership per-page while the latter per-inode. This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases. For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode. B owns the inode and
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback. A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underyling IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction. However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such mechanism. When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+ struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+ struct memcg_cgwb_frn *frn;
+ u64 now = get_jiffies_64();
+ u64 oldest_at = now;
+ int oldest = -1;
+ int i;
+
+ /*
+ * Pick the slot to use. If there is already a slot for @wb, keep
+ * using it. If not replace the oldest one which isn't being
+ * written out.
+ */
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ frn = &memcg->cgwb_frn[i];
+ if (frn->bdi_id == wb->bdi->id &&
+ frn->memcg_id == wb->memcg_css->id)
+ break;
+ if (time_before64(frn->at, oldest_at) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ oldest = i;
+ oldest_at = frn->at;
+ }
+ }
+
+ if (i < MEMCG_CGWB_FRN_CNT) {
+ /*
+ * Re-using an existing one. Update timestamp lazily to
+ * avoid making the cacheline hot. We want them to be
+ * reasonably up-to-date and significantly shorter than
+ * dirty_expire_interval as that's what expires the record.
+ * Use the shorter of 1s and dirty_expire_interval / 8.
+ */
+ unsigned long update_intv =
+ min_t(unsigned long, HZ,
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+ if (time_before64(frn->at, now - update_intv))
+ frn->at = now;
+ } else if (oldest >= 0) {
+ /* replace the oldest free one */
+ frn = &memcg->cgwb_frn[oldest];
+ frn->bdi_id = wb->bdi->id;
+ frn->memcg_id = wb->memcg_css->id;
+ frn->at = now;
+ }
+}
+
+/* issue foreign writeback flushes for recorded foreign dirtying events */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+ u64 now = jiffies_64;
+ int i;
+
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+ /*
+ * If the record is older than dirty_expire_interval,
+ * writeback on it has already started. No need to kick it
+ * off again. Also, don't start a new one if there's
+ * already one in flight.
+ */
+ if (time_after64(frn->at, now - intv) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ frn->at = 0;
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ WB_REASON_FOREIGN_FLUSH,
+ &frn->done);
+ }
+ }
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */

static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@ -4760,6 +4885,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
struct mem_cgroup *memcg;
unsigned int size;
int node;
+ int __maybe_unused i;

size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -4803,6 +4929,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ memcg->cgwb_frn[i].done =
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
@@ -4932,7 +5061,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ int __maybe_unused i;

+#ifdef CONFIG_CGROUP_WRITEBACK
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1804f64ff43c..50055d2e4ea8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
if (unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb);

+ mem_cgroup_flush_foreign(wb);
+
/*
* Calculate global domain's pos_ratio and select the
* global dtc by default.
@@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
task_io_account_write(PAGE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
+
+ mem_cgroup_track_foreign_dirty(page, wb);
}
}

--
2.17.1