Re: [PATCH 5/5] writeback, memcg: Implement foreign dirty flushing
From: Jan Kara
Date: Tue Aug 27 2019 - 10:47:11 EST
On Mon 26-08-19 09:06:56, Tejun Heo wrote:
> There's an inherent mismatch between memcg and writeback. The former
> trackes ownership per-page while the latter per-inode. This was a
> deliberate design decision because honoring per-page ownership in the
> writeback path is complicated, may lead to higher CPU and IO overheads
> and deemed unnecessary given that write-sharing an inode across
> different cgroups isn't a common use-case.
>
> Combined with inode majority-writer ownership switching, this works
> well enough in most cases but there are some pathological cases. For
> example, let's say there are two cgroups A and B which keep writing to
> different but confined parts of the same inode. B owns the inode and
> A's memory is limited far below B's. A's dirty ratio can rise enough
> to trigger balance_dirty_pages() sleeps but B's can be low enough to
> avoid triggering background writeback. A will be slowed down without
> a way to make writeback of the dirty pages happen.
>
> This patch implements foreign dirty recording and foreign mechanism so
> that when a memcg encounters a condition as above it can trigger
> flushes on bdi_writebacks which can clean its pages. Please see the
> comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
> details.
>
> A reproducer follows.
>
> write-range.c::
>
> #include <stdio.h>
> #include <stdlib.h>
> #include <unistd.h>
> #include <fcntl.h>
> #include <sys/types.h>
>
> static const char *usage = "write-range FILE START SIZE\n";
>
> int main(int argc, char **argv)
> {
> int fd;
> unsigned long start, size, end, pos;
> char *endp;
> char buf[4096];
>
> if (argc < 4) {
> fprintf(stderr, usage);
> return 1;
> }
>
> fd = open(argv[1], O_WRONLY);
> if (fd < 0) {
> perror("open");
> return 1;
> }
>
> start = strtoul(argv[2], &endp, 0);
> if (*endp != '\0') {
> fprintf(stderr, usage);
> return 1;
> }
>
> size = strtoul(argv[3], &endp, 0);
> if (*endp != '\0') {
> fprintf(stderr, usage);
> return 1;
> }
>
> end = start + size;
>
> while (1) {
> for (pos = start; pos < end; ) {
> long bread, bwritten = 0;
>
> if (lseek(fd, pos, SEEK_SET) < 0) {
> perror("lseek");
> return 1;
> }
>
> bread = read(0, buf, sizeof(buf) < end - pos ?
> sizeof(buf) : end - pos);
> if (bread < 0) {
> perror("read");
> return 1;
> }
> if (bread == 0)
> return 0;
>
> while (bwritten < bread) {
> long this;
>
> this = write(fd, buf + bwritten,
> bread - bwritten);
> if (this < 0) {
> perror("write");
> return 1;
> }
>
> bwritten += this;
> pos += bwritten;
> }
> }
> }
> }
>
> repro.sh::
>
> #!/bin/bash
>
> set -e
> set -x
>
> sysctl -w vm.dirty_expire_centisecs=300000
> sysctl -w vm.dirty_writeback_centisecs=300000
> sysctl -w vm.dirtytime_expire_seconds=300000
> echo 3 > /proc/sys/vm/drop_caches
>
> TEST=/sys/fs/cgroup/test
> A=$TEST/A
> B=$TEST/B
>
> mkdir -p $A $B
> echo "+memory +io" > $TEST/cgroup.subtree_control
> echo $((1<<30)) > $A/memory.high
> echo $((32<<30)) > $B/memory.high
>
> rm -f testfile
> touch testfile
> fallocate -l 4G testfile
>
> echo "Starting B"
>
> (echo $BASHPID > $B/cgroup.procs
> pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
>
> echo "Waiting 10s to ensure B claims the testfile inode"
> sleep 5
> sync
> sleep 5
> sync
> echo "Starting A"
>
> (echo $BASHPID > $A/cgroup.procs
> pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
>
> v2: Added comments explaining why the specific intervals are being used.
>
> v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
> flushing while avoding possible livelocks.
>
> v4: Use get_jiffies_64() and time_before/after64() instead of raw
> jiffies_64 and arthimetic comparisons as suggested by Jan.
>
> Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
The patch looks good to me. You can add:
Reviewed-by: Jan Kara <jack@xxxxxxx>
Honza
> ---
> include/linux/backing-dev-defs.h | 1 +
> include/linux/memcontrol.h | 39 +++++++++
> mm/memcontrol.c | 134 +++++++++++++++++++++++++++++++
> mm/page-writeback.c | 4 +
> 4 files changed, 178 insertions(+)
>
> diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
> index 1075f2552cfc..4fc87dee005a 100644
> --- a/include/linux/backing-dev-defs.h
> +++ b/include/linux/backing-dev-defs.h
> @@ -63,6 +63,7 @@ enum wb_reason {
> * so it has a mismatch name.
> */
> WB_REASON_FORKER_THREAD,
> + WB_REASON_FOREIGN_FLUSH,
>
> WB_REASON_MAX,
> };
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 2cd4359cb38c..ad8f1a397ae4 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -183,6 +183,23 @@ struct memcg_padding {
> #define MEMCG_PADDING(name)
> #endif
>
> +/*
> + * Remember four most recent foreign writebacks with dirty pages in this
> + * cgroup. Inode sharing is expected to be uncommon and, even if we miss
> + * one in a given round, we're likely to catch it later if it keeps
> + * foreign-dirtying, so a fairly low count should be enough.
> + *
> + * See mem_cgroup_track_foreign_dirty_slowpath() for details.
> + */
> +#define MEMCG_CGWB_FRN_CNT 4
> +
> +struct memcg_cgwb_frn {
> + u64 bdi_id; /* bdi->id of the foreign inode */
> + int memcg_id; /* memcg->css.id of foreign inode */
> + u64 at; /* jiffies_64 at the time of dirtying */
> + struct wb_completion done; /* tracks in-flight foreign writebacks */
> +};
> +
> /*
> * The memory controller data structure. The memory controller controls both
> * page cache and RSS per cgroup. We would eventually like to provide
> @@ -307,6 +324,7 @@ struct mem_cgroup {
> #ifdef CONFIG_CGROUP_WRITEBACK
> struct list_head cgwb_list;
> struct wb_domain cgwb_domain;
> + struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
> #endif
>
> /* List of events which userspace want to receive */
> @@ -1237,6 +1255,18 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> unsigned long *pheadroom, unsigned long *pdirty,
> unsigned long *pwriteback);
>
> +void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
> + struct bdi_writeback *wb);
> +
> +static inline void mem_cgroup_track_foreign_dirty(struct page *page,
> + struct bdi_writeback *wb)
> +{
> + if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
> + mem_cgroup_track_foreign_dirty_slowpath(page, wb);
> +}
> +
> +void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
> +
> #else /* CONFIG_CGROUP_WRITEBACK */
>
> static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
> @@ -1252,6 +1282,15 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
> {
> }
>
> +static inline void mem_cgroup_track_foreign_dirty(struct page *page,
> + struct bdi_writeback *wb)
> +{
> +}
> +
> +static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
> +{
> +}
> +
> #endif /* CONFIG_CGROUP_WRITEBACK */
>
> struct sock;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 26e2999af608..eb626a290d93 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -87,6 +87,10 @@ int do_swap_account __read_mostly;
> #define do_swap_account 0
> #endif
>
> +#ifdef CONFIG_CGROUP_WRITEBACK
> +static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
> +#endif
> +
> /* Whether legacy memory+swap accounting is active */
> static bool do_memsw_account(void)
> {
> @@ -4238,6 +4242,127 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> }
> }
>
> +/*
> + * Foreign dirty flushing
> + *
> + * There's an inherent mismatch between memcg and writeback. The former
> + * trackes ownership per-page while the latter per-inode. This was a
> + * deliberate design decision because honoring per-page ownership in the
> + * writeback path is complicated, may lead to higher CPU and IO overheads
> + * and deemed unnecessary given that write-sharing an inode across
> + * different cgroups isn't a common use-case.
> + *
> + * Combined with inode majority-writer ownership switching, this works well
> + * enough in most cases but there are some pathological cases. For
> + * example, let's say there are two cgroups A and B which keep writing to
> + * different but confined parts of the same inode. B owns the inode and
> + * A's memory is limited far below B's. A's dirty ratio can rise enough to
> + * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
> + * triggering background writeback. A will be slowed down without a way to
> + * make writeback of the dirty pages happen.
> + *
> + * Conditions like the above can lead to a cgroup getting repatedly and
> + * severely throttled after making some progress after each
> + * dirty_expire_interval while the underyling IO device is almost
> + * completely idle.
> + *
> + * Solving this problem completely requires matching the ownership tracking
> + * granularities between memcg and writeback in either direction. However,
> + * the more egregious behaviors can be avoided by simply remembering the
> + * most recent foreign dirtying events and initiating remote flushes on
> + * them when local writeback isn't enough to keep the memory clean enough.
> + *
> + * The following two functions implement such mechanism. When a foreign
> + * page - a page whose memcg and writeback ownerships don't match - is
> + * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
> + * bdi_writeback on the page owning memcg. When balance_dirty_pages()
> + * decides that the memcg needs to sleep due to high dirty ratio, it calls
> + * mem_cgroup_flush_foreign() which queues writeback on the recorded
> + * foreign bdi_writebacks which haven't expired. Both the numbers of
> + * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
> + * limited to MEMCG_CGWB_FRN_CNT.
> + *
> + * The mechanism only remembers IDs and doesn't hold any object references.
> + * As being wrong occasionally doesn't matter, updates and accesses to the
> + * records are lockless and racy.
> + */
> +void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
> + struct bdi_writeback *wb)
> +{
> + struct mem_cgroup *memcg = page->mem_cgroup;
> + struct memcg_cgwb_frn *frn;
> + u64 now = get_jiffies_64();
> + u64 oldest_at = now;
> + int oldest = -1;
> + int i;
> +
> + /*
> + * Pick the slot to use. If there is already a slot for @wb, keep
> + * using it. If not replace the oldest one which isn't being
> + * written out.
> + */
> + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
> + frn = &memcg->cgwb_frn[i];
> + if (frn->bdi_id == wb->bdi->id &&
> + frn->memcg_id == wb->memcg_css->id)
> + break;
> + if (time_before64(frn->at, oldest_at) &&
> + atomic_read(&frn->done.cnt) == 1) {
> + oldest = i;
> + oldest_at = frn->at;
> + }
> + }
> +
> + if (i < MEMCG_CGWB_FRN_CNT) {
> + /*
> + * Re-using an existing one. Update timestamp lazily to
> + * avoid making the cacheline hot. We want them to be
> + * reasonably up-to-date and significantly shorter than
> + * dirty_expire_interval as that's what expires the record.
> + * Use the shorter of 1s and dirty_expire_interval / 8.
> + */
> + unsigned long update_intv =
> + min_t(unsigned long, HZ,
> + msecs_to_jiffies(dirty_expire_interval * 10) / 8);
> +
> + if (time_before64(frn->at, now - update_intv))
> + frn->at = now;
> + } else if (oldest >= 0) {
> + /* replace the oldest free one */
> + frn = &memcg->cgwb_frn[oldest];
> + frn->bdi_id = wb->bdi->id;
> + frn->memcg_id = wb->memcg_css->id;
> + frn->at = now;
> + }
> +}
> +
> +/* issue foreign writeback flushes for recorded foreign dirtying events */
> +void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
> + unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
> + u64 now = jiffies_64;
> + int i;
> +
> + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
> + struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
> +
> + /*
> + * If the record is older than dirty_expire_interval,
> + * writeback on it has already started. No need to kick it
> + * off again. Also, don't start a new one if there's
> + * already one in flight.
> + */
> + if (time_after64(frn->at, now - intv) &&
> + atomic_read(&frn->done.cnt) == 1) {
> + frn->at = 0;
> + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
> + WB_REASON_FOREIGN_FLUSH,
> + &frn->done);
> + }
> + }
> +}
> +
> #else /* CONFIG_CGROUP_WRITEBACK */
>
> static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
> @@ -4760,6 +4885,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
> struct mem_cgroup *memcg;
> unsigned int size;
> int node;
> + int __maybe_unused i;
>
> size = sizeof(struct mem_cgroup);
> size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
> @@ -4803,6 +4929,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
> #endif
> #ifdef CONFIG_CGROUP_WRITEBACK
> INIT_LIST_HEAD(&memcg->cgwb_list);
> + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
> + memcg->cgwb_frn[i].done =
> + __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
> #endif
> idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
> return memcg;
> @@ -4932,7 +5061,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
> static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> + int __maybe_unused i;
>
> +#ifdef CONFIG_CGROUP_WRITEBACK
> + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
> + wb_wait_for_completion(&memcg->cgwb_frn[i].done);
> +#endif
> if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
> static_branch_dec(&memcg_sockets_enabled_key);
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 1804f64ff43c..50055d2e4ea8 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
> if (unlikely(!writeback_in_progress(wb)))
> wb_start_background_writeback(wb);
>
> + mem_cgroup_flush_foreign(wb);
> +
> /*
> * Calculate global domain's pos_ratio and select the
> * global dtc by default.
> @@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
> task_io_account_write(PAGE_SIZE);
> current->nr_dirtied++;
> this_cpu_inc(bdp_ratelimits);
> +
> + mem_cgroup_track_foreign_dirty(page, wb);
> }
> }
>
> --
> 2.17.1
>
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR