[PATCH -mm] mm: fine-grained dirty_ratio_pcm and dirty_background_ratio_pcm (v2)

From: Andrea Righi
Date: Mon Nov 10 2008 - 15:58:54 EST


The current granularity of 5% of dirtyable memory for dirty pages writeback is
too coarse for large-memory machines, and this will get worse as the
memory-size/disk-speed ratio continues to increase.

These large writebacks can be unpleasant for desktop or latency-sensitive
environments, where the time to complete each writeback can be perceived as a
lack of responsiveness by the whole system.

What follows is a solution similar to the one discussed in [1], but somewhat
simplified: it provides the same functionality (in particular, it avoids
backward-compatibility problems) while reducing the amount of code that would
otherwise be needed to implement an in-kernel parser for percentages with
decimal digits.

The kernel provides the following parameters:
- dirty_ratio, dirty_background_ratio in percentage (1 ... 100)
- dirty_ratio_pcm, dirty_background_ratio_pcm in units of percent mille (1 ... 100,000)

Both dirty_ratio and dirty_ratio_pcm refer to the same vm_dirty_ratio variable;
only the interface used to read/write this value is different. The same holds
for dirty_background_ratio and dirty_background_ratio_pcm, which share the
dirty_background_ratio variable.

In this way it's possible to provide a fine-grained interface to configure the
writeback policy and at the same time preserve the compatibility with the old
dirty_ratio / dirty_background_ratio users.

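Examples (reading dirty_ratio reports whole percents, so any fractional part of
the stored pcm value is truncated):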
# echo 5 > /proc/sys/vm/dirty_ratio
# cat /proc/sys/vm/dirty_ratio
5
# cat /proc/sys/vm/dirty_ratio_pcm
5000

# echo 500 > /proc/sys/vm/dirty_ratio_pcm
# cat /proc/sys/vm/dirty_ratio
0
# cat /proc/sys/vm/dirty_ratio_pcm
500

# echo 5500 > /proc/sys/vm/dirty_ratio_pcm
# cat /proc/sys/vm/dirty_ratio
5
# cat /proc/sys/vm/dirty_ratio_pcm
5500

Changelog: (v1 -> v2)

* fix overflow on 32-bit systems (calc_period_shift needs a u64)
* rebased onto (and tested on) 2.6.28-rc2-mm1

[1] http://lkml.org/lkml/2008/10/7/230

Signed-off-by: Andrea Righi <righi.andrea@xxxxxxxxx>
---
Documentation/filesystems/proc.txt | 20 +++++++++
include/linux/sysctl.h | 7 +++
kernel/sysctl.c | 80 +++++++++++++++++++++++++++++++++--
kernel/sysctl_check.c | 3 +
mm/page-writeback.c | 31 +++++++++++---
5 files changed, 129 insertions(+), 12 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index bcceb99..38ed5bf 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1389,6 +1389,16 @@ pages + file cache, not including locked pages and HugePages), the number of
pages at which the pdflush background writeback daemon will start writing out
dirty data.

+dirty_background_ratio_pcm
+--------------------------
+
+A fine-grained interface to configure dirty_background_ratio.
+
+Contains, in units of pcm (percent mille, i.e. 1/1000 of a percent) of the
+dirtyable system memory (free pages + mapped pages + file cache, not including
+locked pages and HugePages), the number of pages at which the pdflush
+background writeback daemon will start writing out dirty data.
+
dirty_ratio
-----------------

@@ -1397,6 +1407,16 @@ pages + file cache, not including locked pages and HugePages), the number of
pages at which a process which is generating disk writes will itself start
writing out dirty data.

+dirty_ratio_pcm
+---------------
+
+A fine-grained interface to configure dirty_ratio.
+
+Contains, in units of pcm (percent mille, i.e. 1/1000 of a percent) of the
+dirtyable system memory (free pages + mapped pages + file cache, not including
+locked pages and HugePages), the number of pages at which a process which is
+generating disk writes will itself start writing out dirty data.
+
dirty_writeback_centisecs
-------------------------

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 39d471d..799594b 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -32,6 +32,9 @@
struct file;
struct completion;

+#define PERCENT_PCM 1000
+#define ONE_HUNDRED_PCM (100 * PERCENT_PCM)
+
#define CTL_MAXNAME 10 /* how many path components do we allow in a
call to sysctl? In other words, what is
the largest acceptable value for the nlen
@@ -205,6 +208,8 @@ enum
VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */
+ VM_DIRTY_BACKGROUND_PCM = 36, /* fine-grained dirty_background_ratio */
+ VM_DIRTY_RATIO_PCM = 37, /* fine-grained dirty_ratio */
};


@@ -991,6 +996,8 @@ extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
+extern int proc_dointvec_pcm_minmax(struct ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);
extern int proc_doulongvec_minmax(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d14953a..06ba902 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -88,9 +88,7 @@ extern int rcutorture_runnable;
#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */

/* Constants used for minimum and maximum */
-#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
static int one = 1;
-#endif

#ifdef CONFIG_DETECT_SOFTLOCKUP
static int sixty = 60;
@@ -103,6 +101,7 @@ static int two = 2;

static int zero;
static int one_hundred = 100;
+static int one_hundred_pcm = ONE_HUNDRED_PCM;

/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
@@ -926,12 +925,23 @@ static struct ctl_table vm_table[] = {
.data = &dirty_background_ratio,
.maxlen = sizeof(dirty_background_ratio),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
+ .proc_handler = &proc_dointvec_pcm_minmax,
.strategy = &sysctl_intvec,
- .extra1 = &zero,
+ .extra1 = &one,
.extra2 = &one_hundred,
},
{
+ .ctl_name = VM_DIRTY_BACKGROUND_PCM,
+ .procname = "dirty_background_ratio_pcm",
+ .data = &dirty_background_ratio,
+ .maxlen = sizeof(dirty_background_ratio),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &one,
+ .extra2 = &one_hundred_pcm,
+ },
+ {
.ctl_name = VM_DIRTY_RATIO,
.procname = "dirty_ratio",
.data = &vm_dirty_ratio,
@@ -939,10 +949,21 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &dirty_ratio_handler,
.strategy = &sysctl_intvec,
- .extra1 = &zero,
+ .extra1 = &one,
.extra2 = &one_hundred,
},
{
+ .ctl_name = VM_DIRTY_RATIO_PCM,
+ .procname = "dirty_ratio_pcm",
+ .data = &vm_dirty_ratio,
+ .maxlen = sizeof(vm_dirty_ratio),
+ .mode = 0644,
+ .proc_handler = &dirty_ratio_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &one,
+ .extra2 = &one_hundred_pcm,
+ },
+ {
.procname = "dirty_writeback_centisecs",
.data = &dirty_writeback_interval,
.maxlen = sizeof(dirty_writeback_interval),
@@ -2525,6 +2546,35 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
lenp, ppos, HZ, 1000l);
}

+static int do_proc_dointvec_pcm_minmax_conv(int *negp, unsigned long *lvalp,
+ int *valp, int write, void *data)
+{
+ struct do_proc_dointvec_minmax_conv_param *param = data;
+ int val;
+
+ if (write) {
+ if (*lvalp > LONG_MAX / PERCENT_PCM)
+ return -EINVAL;
+ val = *negp ? -*lvalp : *lvalp;
+ if ((param->min && *param->min > val) ||
+ (param->max && *param->max < val))
+ return -EINVAL;
+ *valp = val * PERCENT_PCM;
+ } else {
+ unsigned long lval;
+
+ val = *valp;
+ if (val < 0) {
+ *negp = -1;
+ lval = (unsigned long)-val;
+ } else {
+ *negp = 0;
+ lval = (unsigned long)val;
+ }
+ *lvalp = lval / PERCENT_PCM;
+ }
+ return 0;
+}

static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
int *valp,
@@ -2663,6 +2713,19 @@ int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *fi
do_proc_dointvec_ms_jiffies_conv, NULL);
}

+int proc_dointvec_pcm_minmax(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct do_proc_dointvec_minmax_conv_param param = {
+ .min = (int *)table->extra1,
+ .max = (int *)table->extra2,
+ };
+
+ return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+ do_proc_dointvec_pcm_minmax_conv, &param);
+}
+
static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2711,6 +2774,13 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
return -ENOSYS;
}

+int proc_dointvec_pcm_minmax(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c35da23..83934a8 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,7 +111,9 @@ static const struct trans_ctl_table trans_vm_table[] = {
{ VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
{ VM_PAGE_CLUSTER, "page-cluster" },
{ VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
+ { VM_DIRTY_BACKGROUND_PCM, "dirty_background_ratio_pcm" },
{ VM_DIRTY_RATIO, "dirty_ratio" },
+ { VM_DIRTY_RATIO_PCM, "dirty_ratio_pcm" },
{ VM_DIRTY_WB_CS, "dirty_writeback_centisecs" },
{ VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" },
{ VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
@@ -1494,6 +1496,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
(table->proc_handler == proc_dostring) ||
(table->proc_handler == proc_dointvec) ||
(table->proc_handler == proc_dointvec_minmax) ||
+ (table->proc_handler == proc_dointvec_pcm_minmax) ||
(table->proc_handler == proc_dointvec_jiffies) ||
(table->proc_handler == proc_dointvec_userhz_jiffies) ||
(table->proc_handler == proc_dointvec_ms_jiffies) ||
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b3584bf..e010a39 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -66,7 +66,7 @@ static inline long sync_writeback_pages(void)
/*
* Start background writeback (via pdflush) at this percentage
*/
-int dirty_background_ratio = 5;
+int dirty_background_ratio = 5 * PERCENT_PCM;

/*
* free highmem will not be subtracted from the total free memory
@@ -77,7 +77,7 @@ int vm_highmem_is_dirtyable;
/*
* The generator of dirty data starts writeback at this percentage
*/
-int vm_dirty_ratio = 10;
+int vm_dirty_ratio = 10 * PERCENT_PCM;

/*
* The interval between `kupdate'-style writebacks, in jiffies
@@ -133,9 +133,10 @@ static struct prop_descriptor vm_dirties;
*/
static int calc_period_shift(void)
{
- unsigned long dirty_total;
+ u64 dirty_total;

- dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+ dirty_total = (vm_dirty_ratio * determine_dirtyable_memory())
+ / ONE_HUNDRED_PCM;
return 2 + ilog2(dirty_total - 1);
}

@@ -147,7 +148,23 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
loff_t *ppos)
{
int old_ratio = vm_dirty_ratio;
- int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ int ret;
+
+ switch (table->ctl_name) {
+ case VM_DIRTY_RATIO:
+ ret = proc_dointvec_pcm_minmax(table, write, filp, buffer,
+ lenp, ppos);
+ break;
+ case VM_DIRTY_RATIO_PCM:
+ ret = proc_dointvec_minmax(table, write, filp, buffer,
+ lenp, ppos);
+ break;
+ default:
+ ret = -EINVAL;
+ WARN_ON(1);
+ break;
+ }
+
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
int shift = calc_period_shift();
prop_change_shift(&vm_completions, shift);
@@ -380,8 +397,8 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
if (background_ratio >= dirty_ratio)
background_ratio = dirty_ratio / 2;

- background = (background_ratio * available_memory) / 100;
- dirty = (dirty_ratio * available_memory) / 100;
+ background = (background_ratio * available_memory) / ONE_HUNDRED_PCM;
+ dirty = (dirty_ratio * available_memory) / ONE_HUNDRED_PCM;
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
background += background / 4;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/