RE: [PATCH 00/23] per device dirty throttling -v9
From: Peter Zijlstra
Date: Thu Aug 23 2007 - 13:42:29 EST
On Thu, 2007-08-23 at 08:59 -0700, Martin Knoblauch wrote:
> --- Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> wrote:
>
> > On Thu, 2007-08-16 at 05:49 -0700, Martin Knoblauch wrote:
> >
> > > Peter,
> > >
> > > any chance to get a rollup against 2.6.22-stable?
> > >
> > > The 2.6.23 series may not be usable for me due to the
> > > nosharedcache changes for NFS (the new default will massively
> > > disturb the user-space automounter).
> >
> > I'll see what I can do, bit busy with other stuff atm, hopefully
> > after
> > the weekend.
> >
> Hi Peter,
>
> any progress on a version against 2.6.22.5? I have seen the very
> positive report from Jeffrey W. Baker and would really love to test
> your patch. But as I said, anything newer than 2.6.22.x might not be an
> option due to the NFS changes.
Mindless port; it seems to compile and boot on my test box, YMMV.
I think .5 should present nothing worse than trivial rejects, if
anything at all. But I'm not keeping -stable in my git remotes, so I
can't say for sure.
Index: linux-2.6/fs/nfs/write.c
===================================================================
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -237,10 +237,8 @@ static void nfs_end_page_writeback(struc
struct nfs_server *nfss = NFS_SERVER(inode);
end_page_writeback(page);
- if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
+ if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
clear_bdi_congested(&nfss->backing_dev_info, WRITE);
- congestion_end(WRITE);
- }
}
/*
@@ -466,6 +464,7 @@ nfs_mark_request_commit(struct nfs_page
set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
spin_unlock(&nfsi->req_lock);
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
@@ -552,6 +551,8 @@ static void nfs_cancel_commit_list(struc
while(!list_empty(head)) {
req = nfs_list_entry(head->next);
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
nfs_list_remove_request(req);
clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
nfs_inode_remove_request(req);
@@ -1207,6 +1208,8 @@ nfs_commit_list(struct inode *inode, str
nfs_list_remove_request(req);
nfs_mark_request_commit(req);
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
nfs_clear_page_writeback(req);
}
return -ENOMEM;
@@ -1232,6 +1235,8 @@ static void nfs_commit_done(struct rpc_t
nfs_list_remove_request(req);
clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+ dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
dprintk("NFS: commit (%s/%Ld %d@%Ld)",
req->wb_context->dentry->d_inode->i_sb->s_id,
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -8,6 +8,9 @@
#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H
+#include <linux/percpu_counter.h>
+#include <linux/log2.h>
+#include <linux/proportions.h>
#include <asm/atomic.h>
struct page;
@@ -24,6 +27,14 @@ enum bdi_state {
typedef int (congested_fn)(void *, int);
+enum bdi_stat_item {
+ BDI_RECLAIMABLE,
+ BDI_WRITEBACK,
+ NR_BDI_STAT_ITEMS
+};
+
+#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
struct backing_dev_info {
unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
unsigned long state; /* Always use atomic bitops on this */
@@ -32,8 +43,90 @@ struct backing_dev_info {
void *congested_data; /* Pointer to aux data for congested func */
void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
void *unplug_io_data;
+
+ struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
+
+ struct prop_local_percpu completions;
+ int dirty_exceeded;
};
+int bdi_init(struct backing_dev_info *bdi);
+void bdi_destroy(struct backing_dev_info *bdi);
+
+static inline void __mod_bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item, s64 amount)
+{
+ __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH);
+}
+
+static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ __mod_bdi_stat(bdi, item, 1);
+}
+
+static inline void inc_bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __inc_bdi_stat(bdi, item);
+ local_irq_restore(flags);
+}
+
+static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ __mod_bdi_stat(bdi, item, -1);
+}
+
+static inline void dec_bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __dec_bdi_stat(bdi, item);
+ local_irq_restore(flags);
+}
+
+static inline s64 bdi_stat(struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ return percpu_counter_read_positive(&bdi->bdi_stat[item]);
+}
+
+static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ return percpu_counter_sum_positive(&bdi->bdi_stat[item]);
+}
+
+static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
+ enum bdi_stat_item item)
+{
+ s64 sum;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sum = __bdi_stat_sum(bdi, item);
+ local_irq_restore(flags);
+
+ return sum;
+}
+
+/*
+ * maximal error of a stat counter.
+ */
+static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
+{
+#ifdef CONFIG_SMP
+ return nr_cpu_ids * BDI_STAT_BATCH;
+#else
+ return 1;
+#endif
+}
/*
* Flags in backing_dev_info::capability
@@ -94,7 +187,6 @@ void clear_bdi_congested(struct backing_
void set_bdi_congested(struct backing_dev_info *bdi, int rw);
long congestion_wait(int rw, long timeout);
long congestion_wait_interruptible(int rw, long timeout);
-void congestion_end(int rw);
#define bdi_cap_writeback_dirty(bdi) \
(!((bdi)->capabilities & BDI_CAP_NO_WRITEBACK))
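(Aside, to illustrate the intended use of the new counters; this sketch is
not part of the patch, and 'bdi', 'mapping', 'nr' and 'bdi_thresh' are just
placeholders. It mirrors what the NFS and page-writeback hunks below do:)

	/* a page became reclaimably dirty on this backing device */
	inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);

	/*
	 * bdi_stat() is a cheap read that may be off by up to
	 * bdi_stat_error(); near a threshold, pay for the exact
	 * per-cpu sum instead.
	 */
	if (bdi_thresh < 2 * bdi_stat_error(bdi))
		nr = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
	else
		nr = bdi_stat(bdi, BDI_RECLAIMABLE);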
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c
+++ linux-2.6/mm/backing-dev.c
@@ -5,6 +5,41 @@
#include <linux/sched.h>
#include <linux/module.h>
+int bdi_init(struct backing_dev_info *bdi)
+{
+ int i, j;
+ int err;
+
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
+ err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+ if (err)
+ goto err;
+ }
+
+ bdi->dirty_exceeded = 0;
+ err = prop_local_init_percpu(&bdi->completions);
+
+ if (err) {
+err:
+ for (j = 0; j < i; j++)
+ percpu_counter_destroy(&bdi->bdi_stat[j]);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(bdi_init);
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+ int i;
+
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+ percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+ prop_local_destroy_percpu(&bdi->completions);
+}
+EXPORT_SYMBOL(bdi_destroy);
+
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -70,16 +105,3 @@ long congestion_wait_interruptible(int r
return ret;
}
EXPORT_SYMBOL(congestion_wait_interruptible);
-
-/**
- * congestion_end - wake up sleepers on a congested backing_dev_info
- * @rw: READ or WRITE
- */
-void congestion_end(int rw)
-{
- wait_queue_head_t *wqh = &congestion_wqh[rw];
-
- if (waitqueue_active(wqh))
- wake_up(wqh);
-}
-EXPORT_SYMBOL(congestion_end);
Index: linux-2.6/fs/ext2/balloc.c
===================================================================
--- linux-2.6.orig/fs/ext2/balloc.c
+++ linux-2.6/fs/ext2/balloc.c
@@ -124,7 +124,7 @@ static int reserve_blocks(struct super_b
return 0;
}
- percpu_counter_mod(&sbi->s_freeblocks_counter, -count);
+ percpu_counter_sub(&sbi->s_freeblocks_counter, count);
sb->s_dirt = 1;
return count;
}
@@ -134,7 +134,7 @@ static void release_blocks(struct super_
if (count) {
struct ext2_sb_info *sbi = EXT2_SB(sb);
- percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+ percpu_counter_add(&sbi->s_freeblocks_counter, count);
sb->s_dirt = 1;
}
}
Index: linux-2.6/fs/ext2/ialloc.c
===================================================================
--- linux-2.6.orig/fs/ext2/ialloc.c
+++ linux-2.6/fs/ext2/ialloc.c
@@ -542,7 +542,7 @@ got:
goto fail;
}
- percpu_counter_mod(&sbi->s_freeinodes_counter, -1);
+ percpu_counter_add(&sbi->s_freeinodes_counter, -1);
if (S_ISDIR(mode))
percpu_counter_inc(&sbi->s_dirs_counter);
Index: linux-2.6/fs/ext3/balloc.c
===================================================================
--- linux-2.6.orig/fs/ext3/balloc.c
+++ linux-2.6/fs/ext3/balloc.c
@@ -570,7 +570,7 @@ do_more:
cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
group_freed);
spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+ percpu_counter_add(&sbi->s_freeblocks_counter, count);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -1633,7 +1633,7 @@ allocated:
gdp->bg_free_blocks_count =
cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext3_journal_dirty_metadata(handle, gdp_bh);
Index: linux-2.6/fs/ext3/resize.c
===================================================================
--- linux-2.6.orig/fs/ext3/resize.c
+++ linux-2.6/fs/ext3/resize.c
@@ -884,9 +884,9 @@ int ext3_group_add(struct super_block *s
input->reserved_blocks);
/* Update the free space counts */
- percpu_counter_mod(&sbi->s_freeblocks_counter,
+ percpu_counter_add(&sbi->s_freeblocks_counter,
input->free_blocks_count);
- percpu_counter_mod(&sbi->s_freeinodes_counter,
+ percpu_counter_add(&sbi->s_freeinodes_counter,
EXT3_INODES_PER_GROUP(sb));
ext3_journal_dirty_metadata(handle, sbi->s_sbh);
Index: linux-2.6/fs/ext4/balloc.c
===================================================================
--- linux-2.6.orig/fs/ext4/balloc.c
+++ linux-2.6/fs/ext4/balloc.c
@@ -587,7 +587,7 @@ do_more:
cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
group_freed);
spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+ percpu_counter_add(&sbi->s_freeblocks_counter, count);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -1647,7 +1647,7 @@ allocated:
gdp->bg_free_blocks_count =
cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext4_journal_dirty_metadata(handle, gdp_bh);
Index: linux-2.6/fs/ext4/resize.c
===================================================================
--- linux-2.6.orig/fs/ext4/resize.c
+++ linux-2.6/fs/ext4/resize.c
@@ -893,9 +893,9 @@ int ext4_group_add(struct super_block *s
input->reserved_blocks);
/* Update the free space counts */
- percpu_counter_mod(&sbi->s_freeblocks_counter,
+ percpu_counter_add(&sbi->s_freeblocks_counter,
input->free_blocks_count);
- percpu_counter_mod(&sbi->s_freeinodes_counter,
+ percpu_counter_add(&sbi->s_freeinodes_counter,
EXT4_INODES_PER_GROUP(sb));
ext4_journal_dirty_metadata(handle, sbi->s_sbh);
Index: linux-2.6/include/linux/percpu_counter.h
===================================================================
--- linux-2.6.orig/include/linux/percpu_counter.h
+++ linux-2.6/include/linux/percpu_counter.h
@@ -26,20 +26,43 @@ struct percpu_counter {
#define FBC_BATCH (NR_CPUS*4)
#endif
-static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+static inline
+int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
{
spin_lock_init(&fbc->lock);
fbc->count = amount;
fbc->counters = alloc_percpu(s32);
+ if (!fbc->counters)
+ return -ENOMEM;
+ return 0;
}
+int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
+
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
free_percpu(fbc->counters);
}
-void percpu_counter_mod(struct percpu_counter *fbc, s32 amount);
-s64 percpu_counter_sum(struct percpu_counter *fbc);
+void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
+void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
+s64 __percpu_counter_sum(struct percpu_counter *fbc);
+
+static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
+{
+ __percpu_counter_add(fbc, amount, FBC_BATCH);
+}
+
+static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
+{
+ s64 ret = __percpu_counter_sum(fbc);
+ return ret < 0 ? 0 : ret;
+}
+
+static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
+{
+ return __percpu_counter_sum(fbc);
+}
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
@@ -67,17 +90,28 @@ struct percpu_counter {
s64 count;
};
-static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
{
fbc->count = amount;
+ return 0;
}
+#define percpu_counter_init_irq percpu_counter_init
+
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
}
+static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
+{
+ fbc->count = amount;
+}
+
+#define __percpu_counter_add(fbc, amount, batch) \
+ percpu_counter_add(fbc, amount)
+
static inline void
-percpu_counter_mod(struct percpu_counter *fbc, s32 amount)
+percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
preempt_disable();
fbc->count += amount;
@@ -94,21 +128,31 @@ static inline s64 percpu_counter_read_po
return fbc->count;
}
-static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
+static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
return percpu_counter_read_positive(fbc);
}
+static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
+{
+ return percpu_counter_read(fbc);
+}
+
#endif /* CONFIG_SMP */
static inline void percpu_counter_inc(struct percpu_counter *fbc)
{
- percpu_counter_mod(fbc, 1);
+ percpu_counter_add(fbc, 1);
}
static inline void percpu_counter_dec(struct percpu_counter *fbc)
{
- percpu_counter_mod(fbc, -1);
+ percpu_counter_add(fbc, -1);
+}
+
+static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
+{
+ percpu_counter_add(fbc, -amount);
}
#endif /* _LINUX_PERCPU_COUNTER_H */
Index: linux-2.6/lib/percpu_counter.c
===================================================================
--- linux-2.6.orig/lib/percpu_counter.c
+++ linux-2.6/lib/percpu_counter.c
@@ -5,15 +5,41 @@
#include <linux/percpu_counter.h>
#include <linux/module.h>
-void percpu_counter_mod(struct percpu_counter *fbc, s32 amount)
+void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
- long count;
+ int cpu;
+
+ spin_lock(&fbc->lock);
+ for_each_possible_cpu(cpu) {
+ s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
+ *pcount = 0;
+ }
+ fbc->count = amount;
+ spin_unlock(&fbc->lock);
+}
+EXPORT_SYMBOL(percpu_counter_set);
+
+static struct lock_class_key percpu_counter_irqsafe;
+
+int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount)
+{
+ int err;
+
+ err = percpu_counter_init(fbc, amount);
+ if (!err)
+ lockdep_set_class(&fbc->lock, &percpu_counter_irqsafe);
+ return err;
+}
+
+void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
+{
+ s64 count;
s32 *pcount;
int cpu = get_cpu();
pcount = per_cpu_ptr(fbc->counters, cpu);
count = *pcount + amount;
- if (count >= FBC_BATCH || count <= -FBC_BATCH) {
+ if (count >= batch || count <= -batch) {
spin_lock(&fbc->lock);
fbc->count += count;
*pcount = 0;
@@ -23,13 +49,13 @@ void percpu_counter_mod(struct percpu_co
}
put_cpu();
}
-EXPORT_SYMBOL(percpu_counter_mod);
+EXPORT_SYMBOL(__percpu_counter_add);
/*
* Add up all the per-cpu counts, return the result. This is a more accurate
* but much slower version of percpu_counter_read_positive()
*/
-s64 percpu_counter_sum(struct percpu_counter *fbc)
+s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
s64 ret;
int cpu;
@@ -41,6 +67,6 @@ s64 percpu_counter_sum(struct percpu_cou
ret += *pcount;
}
spin_unlock(&fbc->lock);
- return ret < 0 ? 0 : ret;
+ return ret;
}
-EXPORT_SYMBOL(percpu_counter_sum);
+EXPORT_SYMBOL(__percpu_counter_sum);
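(For reference, a minimal sketch of the converted percpu_counter API; this
is not part of the patch and 'nr_foo' is a placeholder. Note that init can
now fail, _mod is gone, and the plain _sum may return a negative value:)

	struct percpu_counter nr_foo;

	if (percpu_counter_init(&nr_foo, 0))	/* now returns -ENOMEM on failure */
		return -ENOMEM;

	percpu_counter_add(&nr_foo, 64);	/* was percpu_counter_mod(&nr_foo, 64) */
	percpu_counter_sub(&nr_foo, 32);	/* was percpu_counter_mod(&nr_foo, -32) */

	/* percpu_counter_sum_positive() clamps at 0 like the old _sum() did */
	WARN_ON(percpu_counter_sum_positive(&nr_foo) != 32);

	percpu_counter_destroy(&nr_foo);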
Index: linux-2.6/fs/ext3/super.c
===================================================================
--- linux-2.6.orig/fs/ext3/super.c
+++ linux-2.6/fs/ext3/super.c
@@ -1406,6 +1406,7 @@ static int ext3_fill_super (struct super
int i;
int needs_recovery;
__le32 features;
+ int err;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -1665,12 +1666,16 @@ static int ext3_fill_super (struct super
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
- percpu_counter_init(&sbi->s_freeblocks_counter,
+ err = percpu_counter_init(&sbi->s_freeblocks_counter,
ext3_count_free_blocks(sb));
- percpu_counter_init(&sbi->s_freeinodes_counter,
+ err |= percpu_counter_init(&sbi->s_freeinodes_counter,
ext3_count_free_inodes(sb));
- percpu_counter_init(&sbi->s_dirs_counter,
+ err |= percpu_counter_init(&sbi->s_dirs_counter,
ext3_count_dirs(sb));
+ if (err) {
+ printk(KERN_ERR "EXT3-fs: insufficient memory\n");
+ goto failed_mount3;
+ }
/* per fileystem reservation list head & lock */
spin_lock_init(&sbi->s_rsv_window_lock);
@@ -2448,12 +2453,12 @@ static int ext3_statfs (struct dentry *
buf->f_type = EXT3_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
- buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
+ buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
- buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
+ buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
buf->f_namelen = EXT3_NAME_LEN;
fsid = le64_to_cpup((void *)es->s_uuid) ^
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
Index: linux-2.6/fs/ext4/super.c
===================================================================
--- linux-2.6.orig/fs/ext4/super.c
+++ linux-2.6/fs/ext4/super.c
@@ -1465,6 +1465,7 @@ static int ext4_fill_super (struct super
int needs_recovery;
__le32 features;
__u64 blocks_count;
+ int err;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -1737,12 +1738,16 @@ static int ext4_fill_super (struct super
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
- percpu_counter_init(&sbi->s_freeblocks_counter,
+ err = percpu_counter_init(&sbi->s_freeblocks_counter,
ext4_count_free_blocks(sb));
- percpu_counter_init(&sbi->s_freeinodes_counter,
+ err |= percpu_counter_init(&sbi->s_freeinodes_counter,
ext4_count_free_inodes(sb));
- percpu_counter_init(&sbi->s_dirs_counter,
+ err |= percpu_counter_init(&sbi->s_dirs_counter,
ext4_count_dirs(sb));
+ if (err) {
+ printk(KERN_ERR "EXT4-fs: insufficient memory\n");
+ goto failed_mount3;
+ }
/* per fileystem reservation list head & lock */
spin_lock_init(&sbi->s_rsv_window_lock);
@@ -2523,12 +2528,12 @@ static int ext4_statfs (struct dentry *
buf->f_type = EXT4_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = ext4_blocks_count(es) - overhead;
- buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
+ buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
- buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
+ buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
buf->f_namelen = EXT4_NAME_LEN;
fsid = le64_to_cpup((void *)es->s_uuid) ^
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
Index: linux-2.6/fs/file_table.c
===================================================================
--- linux-2.6.orig/fs/file_table.c
+++ linux-2.6/fs/file_table.c
@@ -98,7 +98,7 @@ struct file *get_empty_filp(void)
* percpu_counters are inaccurate. Do an expensive check before
* we go and fail.
*/
- if (percpu_counter_sum(&nr_files) >= files_stat.max_files)
+ if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
goto over;
}
Index: linux-2.6/fs/ext2/super.c
===================================================================
--- linux-2.6.orig/fs/ext2/super.c
+++ linux-2.6/fs/ext2/super.c
@@ -652,6 +652,7 @@ static int ext2_fill_super(struct super_
int db_count;
int i, j;
__le32 features;
+ int err;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -907,12 +908,16 @@ static int ext2_fill_super(struct super_
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
- percpu_counter_init(&sbi->s_freeblocks_counter,
+ err = percpu_counter_init(&sbi->s_freeblocks_counter,
ext2_count_free_blocks(sb));
- percpu_counter_init(&sbi->s_freeinodes_counter,
+ err |= percpu_counter_init(&sbi->s_freeinodes_counter,
ext2_count_free_inodes(sb));
- percpu_counter_init(&sbi->s_dirs_counter,
+ err |= percpu_counter_init(&sbi->s_dirs_counter,
ext2_count_dirs(sb));
+ if (err) {
+ printk(KERN_ERR "EXT2-fs: insufficient memory\n");
+ goto failed_mount3;
+ }
/*
* set up enough so that it can read an inode
*/
Index: linux-2.6/block/ll_rw_blk.c
===================================================================
--- linux-2.6.orig/block/ll_rw_blk.c
+++ linux-2.6/block/ll_rw_blk.c
@@ -1783,6 +1783,7 @@ static void blk_release_queue(struct kob
blk_trace_shutdown(q);
+ bdi_destroy(&q->backing_dev_info);
kmem_cache_free(requestq_cachep, q);
}
@@ -1835,6 +1836,7 @@ static struct kobj_type queue_ktype;
request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
+ int err;
request_queue_t *q;
q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id);
@@ -1842,15 +1844,20 @@ request_queue_t *blk_alloc_queue_node(gf
return NULL;
memset(q, 0, sizeof(*q));
+ q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
+ q->backing_dev_info.unplug_io_data = q;
+ err = bdi_init(&q->backing_dev_info);
+ if (err) {
+ kmem_cache_free(requestq_cachep, q);
+ return NULL;
+ }
+
init_timer(&q->unplug_timer);
snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
q->kobj.ktype = &queue_ktype;
kobject_init(&q->kobj);
- q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
- q->backing_dev_info.unplug_io_data = q;
-
mutex_init(&q->sysfs_lock);
return q;
@@ -3984,6 +3991,73 @@ static ssize_t queue_max_hw_sectors_show
return queue_var_show(max_hw_sectors_kb, (page));
}
+static ssize_t queue_nr_reclaimable_show(struct request_queue *q, char *page)
+{
+ unsigned long long nr_reclaimable =
+ bdi_stat(&q->backing_dev_info, BDI_RECLAIMABLE);
+
+ return sprintf(page, "%llu\n",
+ nr_reclaimable << (PAGE_CACHE_SHIFT - 10));
+}
+
+static ssize_t queue_nr_writeback_show(struct request_queue *q, char *page)
+{
+ unsigned long long nr_writeback =
+ bdi_stat(&q->backing_dev_info, BDI_WRITEBACK);
+
+ return sprintf(page, "%llu\n",
+ nr_writeback << (PAGE_CACHE_SHIFT - 10));
+}
+
+extern void bdi_writeout_fraction(struct backing_dev_info *bdi,
+ long *numerator, long *denominator);
+
+static ssize_t queue_nr_cache_ratio_show(struct request_queue *q, char *page)
+{
+ long scale, div;
+
+ bdi_writeout_fraction(&q->backing_dev_info, &scale, &div);
+ scale *= 1024;
+ scale /= div;
+
+ return sprintf(page, "%ld\n", scale);
+}
+
+static ssize_t queue_nr_cache_num_show(struct request_queue *q, char *page)
+{
+ long scale, div;
+
+ bdi_writeout_fraction(&q->backing_dev_info, &scale, &div);
+
+ return sprintf(page, "%ld\n", scale);
+}
+
+static ssize_t queue_nr_cache_denom_show(struct request_queue *q, char *page)
+{
+ long scale, div;
+
+ bdi_writeout_fraction(&q->backing_dev_info, &scale, &div);
+
+ return sprintf(page, "%ld\n", div);
+}
+
+extern void
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+ struct backing_dev_info *bdi);
+
+static ssize_t queue_nr_cache_size_show(struct request_queue *q, char *page)
+{
+ long background, dirty, bdi_dirty;
+ get_dirty_limits(&background, &dirty, &bdi_dirty, &q->backing_dev_info);
+ return sprintf(page, "%ld\n", bdi_dirty);
+}
+
+static ssize_t queue_nr_cache_total_show(struct request_queue *q, char *page)
+{
+ long background, dirty, bdi_dirty;
+ get_dirty_limits(&background, &dirty, &bdi_dirty, &q->backing_dev_info);
+ return sprintf(page, "%ld\n", dirty);
+}
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -4008,6 +4082,41 @@ static struct queue_sysfs_entry queue_ma
.show = queue_max_hw_sectors_show,
};
+static struct queue_sysfs_entry queue_reclaimable_entry = {
+ .attr = {.name = "reclaimable_kb", .mode = S_IRUGO },
+ .show = queue_nr_reclaimable_show,
+};
+
+static struct queue_sysfs_entry queue_writeback_entry = {
+ .attr = {.name = "writeback_kb", .mode = S_IRUGO },
+ .show = queue_nr_writeback_show,
+};
+
+static struct queue_sysfs_entry queue_cache_ratio_entry = {
+ .attr = {.name = "cache_ratio", .mode = S_IRUGO },
+ .show = queue_nr_cache_ratio_show,
+};
+
+static struct queue_sysfs_entry queue_cache_num_entry = {
+ .attr = {.name = "cache_num", .mode = S_IRUGO },
+ .show = queue_nr_cache_num_show,
+};
+
+static struct queue_sysfs_entry queue_cache_denom_entry = {
+ .attr = {.name = "cache_denom", .mode = S_IRUGO },
+ .show = queue_nr_cache_denom_show,
+};
+
+static struct queue_sysfs_entry queue_cache_size_entry = {
+ .attr = {.name = "cache_size", .mode = S_IRUGO },
+ .show = queue_nr_cache_size_show,
+};
+
+static struct queue_sysfs_entry queue_cache_total_entry = {
+ .attr = {.name = "cache_total", .mode = S_IRUGO },
+ .show = queue_nr_cache_total_show,
+};
+
static struct queue_sysfs_entry queue_iosched_entry = {
.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
.show = elv_iosched_show,
@@ -4019,6 +4128,13 @@ static struct attribute *default_attrs[]
&queue_ra_entry.attr,
&queue_max_hw_sectors_entry.attr,
&queue_max_sectors_entry.attr,
+ &queue_reclaimable_entry.attr,
+ &queue_writeback_entry.attr,
+ &queue_cache_ratio_entry.attr,
+ &queue_cache_num_entry.attr,
+ &queue_cache_denom_entry.attr,
+ &queue_cache_size_entry.attr,
+ &queue_cache_total_entry.attr,
&queue_iosched_entry.attr,
NULL,
};
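(To put numbers on the new sysfs attributes, with invented values:
cache_ratio is the queue's writeout share scaled to 1/1024ths, so a disk
doing roughly 30% of the system's writeout reads about 307 = 0.30 * 1024;
cache_num/cache_denom expose the raw fraction behind it, cache_size and
cache_total report the per-BDI and global dirty limits in pages, and
reclaimable_kb/writeback_kb convert the per-BDI page counts to KB.)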
Index: linux-2.6/drivers/block/rd.c
===================================================================
--- linux-2.6.orig/drivers/block/rd.c
+++ linux-2.6/drivers/block/rd.c
@@ -411,6 +411,9 @@ static void __exit rd_cleanup(void)
blk_cleanup_queue(rd_queue[i]);
}
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+
+ bdi_destroy(&rd_file_backing_dev_info);
+ bdi_destroy(&rd_backing_dev_info);
}
/*
@@ -419,7 +422,19 @@ static void __exit rd_cleanup(void)
static int __init rd_init(void)
{
int i;
- int err = -ENOMEM;
+ int err;
+
+ err = bdi_init(&rd_backing_dev_info);
+ if (err)
+ goto out2;
+
+ err = bdi_init(&rd_file_backing_dev_info);
+ if (err) {
+ bdi_destroy(&rd_backing_dev_info);
+ goto out2;
+ }
+
+ err = -ENOMEM;
if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 ||
(rd_blocksize & (rd_blocksize-1))) {
@@ -473,6 +488,9 @@ out:
put_disk(rd_disks[i]);
blk_cleanup_queue(rd_queue[i]);
}
+ bdi_destroy(&rd_backing_dev_info);
+ bdi_destroy(&rd_file_backing_dev_info);
+out2:
return err;
}
Index: linux-2.6/drivers/char/mem.c
===================================================================
--- linux-2.6.orig/drivers/char/mem.c
+++ linux-2.6/drivers/char/mem.c
@@ -977,6 +977,11 @@ static struct class *mem_class;
static int __init chr_dev_init(void)
{
int i;
+ int err;
+
+ err = bdi_init(&zero_bdi);
+ if (err)
+ return err;
if (register_chrdev(MEM_MAJOR,"mem",&memory_fops))
printk("unable to get major %d for memory devs\n", MEM_MAJOR);
Index: linux-2.6/fs/char_dev.c
===================================================================
--- linux-2.6.orig/fs/char_dev.c
+++ linux-2.6/fs/char_dev.c
@@ -546,6 +546,7 @@ static struct kobject *base_probe(dev_t
void __init chrdev_init(void)
{
cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
+ bdi_init(&directly_mappable_cdev_bdi);
}
Index: linux-2.6/fs/fuse/inode.c
===================================================================
--- linux-2.6.orig/fs/fuse/inode.c
+++ linux-2.6/fs/fuse/inode.c
@@ -401,6 +401,7 @@ static int fuse_show_options(struct seq_
static struct fuse_conn *new_conn(void)
{
struct fuse_conn *fc;
+ int err;
fc = kzalloc(sizeof(*fc), GFP_KERNEL);
if (fc) {
@@ -416,10 +417,17 @@ static struct fuse_conn *new_conn(void)
atomic_set(&fc->num_waiting, 0);
fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
fc->bdi.unplug_io_fn = default_unplug_io_fn;
+ err = bdi_init(&fc->bdi);
+ if (err) {
+ kfree(fc);
+ fc = NULL;
+ goto out;
+ }
fc->reqctr = 0;
fc->blocked = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
}
+out:
return fc;
}
@@ -429,6 +437,7 @@ void fuse_conn_put(struct fuse_conn *fc)
if (fc->destroy_req)
fuse_request_free(fc->destroy_req);
mutex_destroy(&fc->inst_mutex);
+ bdi_destroy(&fc->bdi);
kfree(fc);
}
}
Index: linux-2.6/fs/nfs/client.c
===================================================================
--- linux-2.6.orig/fs/nfs/client.c
+++ linux-2.6/fs/nfs/client.c
@@ -658,6 +658,7 @@ static void nfs_server_set_fsinfo(struct
if (server->rsize > NFS_MAX_FILE_IO_SIZE)
server->rsize = NFS_MAX_FILE_IO_SIZE;
server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
if (server->wsize > max_rpc_payload)
@@ -708,6 +709,10 @@ static int nfs_probe_fsinfo(struct nfs_s
goto out_error;
nfs_server_set_fsinfo(server, &fsinfo);
+ error = bdi_init(&server->backing_dev_info);
+ if (error)
+ goto out_error;
+
/* Get some general file system info */
if (server->namelen == 0) {
@@ -787,6 +792,7 @@ void nfs_free_server(struct nfs_server *
nfs_put_client(server->nfs_client);
nfs_free_iostats(server->io_stats);
+ bdi_destroy(&server->backing_dev_info);
kfree(server);
nfs_release_automount_timer();
dprintk("<-- nfs_free_server()\n");
Index: linux-2.6/fs/hugetlbfs/inode.c
===================================================================
--- linux-2.6.orig/fs/hugetlbfs/inode.c
+++ linux-2.6/fs/hugetlbfs/inode.c
@@ -802,11 +802,15 @@ static int __init init_hugetlbfs_fs(void
int error;
struct vfsmount *vfsmount;
+ error = bdi_init(&hugetlbfs_backing_dev_info);
+ if (error)
+ return error;
+
hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
sizeof(struct hugetlbfs_inode_info),
0, 0, init_once, NULL);
if (hugetlbfs_inode_cachep == NULL)
- return -ENOMEM;
+ goto out2;
error = register_filesystem(&hugetlbfs_fs_type);
if (error)
@@ -824,6 +828,8 @@ static int __init init_hugetlbfs_fs(void
out:
if (error)
kmem_cache_destroy(hugetlbfs_inode_cachep);
+ out2:
+ bdi_destroy(&hugetlbfs_backing_dev_info);
return error;
}
@@ -831,6 +837,7 @@ static void __exit exit_hugetlbfs_fs(voi
{
kmem_cache_destroy(hugetlbfs_inode_cachep);
unregister_filesystem(&hugetlbfs_fs_type);
+ bdi_destroy(&hugetlbfs_backing_dev_info);
}
module_init(init_hugetlbfs_fs)
Index: linux-2.6/fs/ocfs2/dlm/dlmfs.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/dlm/dlmfs.c
+++ linux-2.6/fs/ocfs2/dlm/dlmfs.c
@@ -588,13 +588,17 @@ static int __init init_dlmfs_fs(void)
dlmfs_print_version();
+ status = bdi_init(&dlmfs_backing_dev_info);
+ if (status)
+ return status;
+
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
dlmfs_init_once, NULL);
if (!dlmfs_inode_cache)
- return -ENOMEM;
+ goto bail;
cleanup_inode = 1;
user_dlm_worker = create_singlethread_workqueue("user_dlm");
@@ -611,6 +615,7 @@ bail:
kmem_cache_destroy(dlmfs_inode_cache);
if (cleanup_worker)
destroy_workqueue(user_dlm_worker);
+ bdi_destroy(&dlmfs_backing_dev_info);
} else
printk("OCFS2 User DLM kernel interface loaded\n");
return status;
@@ -624,6 +629,8 @@ static void __exit exit_dlmfs_fs(void)
destroy_workqueue(user_dlm_worker);
kmem_cache_destroy(dlmfs_inode_cache);
+
+ bdi_destroy(&dlmfs_backing_dev_info);
}
MODULE_AUTHOR("Oracle");
Index: linux-2.6/fs/configfs/configfs_internal.h
===================================================================
--- linux-2.6.orig/fs/configfs/configfs_internal.h
+++ linux-2.6/fs/configfs/configfs_internal.h
@@ -55,6 +55,8 @@ extern int configfs_is_root(struct confi
extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *);
extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+extern int configfs_inode_init(void);
+extern void configfs_inode_exit(void);
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
extern int configfs_make_dirent(struct configfs_dirent *,
Index: linux-2.6/fs/configfs/inode.c
===================================================================
--- linux-2.6.orig/fs/configfs/inode.c
+++ linux-2.6/fs/configfs/inode.c
@@ -256,4 +256,12 @@ void configfs_hash_and_remove(struct den
mutex_unlock(&dir->d_inode->i_mutex);
}
+int __init configfs_inode_init(void)
+{
+ return bdi_init(&configfs_backing_dev_info);
+}
+void __exit configfs_inode_exit(void)
+{
+ bdi_destroy(&configfs_backing_dev_info);
+}
Index: linux-2.6/fs/configfs/mount.c
===================================================================
--- linux-2.6.orig/fs/configfs/mount.c
+++ linux-2.6/fs/configfs/mount.c
@@ -154,8 +154,16 @@ static int __init configfs_init(void)
subsystem_unregister(&config_subsys);
kmem_cache_destroy(configfs_dir_cachep);
configfs_dir_cachep = NULL;
+ goto out;
}
+ err = configfs_inode_init();
+ if (err) {
+ unregister_filesystem(&configfs_fs_type);
+ subsystem_unregister(&config_subsys);
+ kmem_cache_destroy(configfs_dir_cachep);
+ configfs_dir_cachep = NULL;
+ }
out:
return err;
}
@@ -166,6 +174,7 @@ static void __exit configfs_exit(void)
subsystem_unregister(&config_subsys);
kmem_cache_destroy(configfs_dir_cachep);
configfs_dir_cachep = NULL;
+ configfs_inode_exit();
}
MODULE_AUTHOR("Oracle");
Index: linux-2.6/fs/ramfs/inode.c
===================================================================
--- linux-2.6.orig/fs/ramfs/inode.c
+++ linux-2.6/fs/ramfs/inode.c
@@ -222,7 +222,17 @@ module_exit(exit_ramfs_fs)
int __init init_rootfs(void)
{
- return register_filesystem(&rootfs_fs_type);
+ int err;
+
+ err = bdi_init(&ramfs_backing_dev_info);
+ if (err)
+ return err;
+
+ err = register_filesystem(&rootfs_fs_type);
+ if (err)
+ bdi_destroy(&ramfs_backing_dev_info);
+
+ return err;
}
MODULE_LICENSE("GPL");
Index: linux-2.6/fs/sysfs/inode.c
===================================================================
--- linux-2.6.orig/fs/sysfs/inode.c
+++ linux-2.6/fs/sysfs/inode.c
@@ -44,6 +44,11 @@ void sysfs_delete_inode(struct inode *in
return generic_delete_inode(inode);
}
+int __init sysfs_inode_init(void)
+{
+ return bdi_init(&sysfs_backing_dev_info);
+}
+
int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
{
struct inode * inode = dentry->d_inode;
Index: linux-2.6/fs/sysfs/mount.c
===================================================================
--- linux-2.6.orig/fs/sysfs/mount.c
+++ linux-2.6/fs/sysfs/mount.c
@@ -98,6 +98,10 @@ int __init sysfs_init(void)
if (!sysfs_dir_cachep)
goto out;
+ err = sysfs_inode_init();
+ if (err)
+ goto out_err;
+
err = register_filesystem(&sysfs_fs_type);
if (!err) {
sysfs_mount = kern_mount(&sysfs_fs_type);
Index: linux-2.6/fs/sysfs/sysfs.h
===================================================================
--- linux-2.6.orig/fs/sysfs/sysfs.h
+++ linux-2.6/fs/sysfs/sysfs.h
@@ -17,6 +17,7 @@ extern struct kmem_cache *sysfs_dir_cach
extern void sysfs_delete_inode(struct inode *inode);
extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *);
extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+extern int sysfs_inode_init(void);
extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *);
extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *,
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c
+++ linux-2.6/mm/shmem.c
@@ -2490,6 +2490,10 @@ static int __init init_tmpfs(void)
{
int error;
+ error = bdi_init(&shmem_backing_dev_info);
+ if (error)
+ goto out4;
+
error = init_inodecache();
if (error)
goto out3;
@@ -2514,6 +2518,8 @@ out1:
out2:
destroy_inodecache();
out3:
+ bdi_destroy(&shmem_backing_dev_info);
+out4:
shm_mnt = ERR_PTR(error);
return error;
}
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c
+++ linux-2.6/mm/swap.c
@@ -505,6 +505,10 @@ void __init swap_setup(void)
{
unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
+#ifdef CONFIG_SWAP
+ bdi_init(swapper_space.backing_dev_info);
+#endif
+
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
page_cluster = 2;
Index: linux-2.6/mm/readahead.c
===================================================================
--- linux-2.6.orig/mm/readahead.c
+++ linux-2.6/mm/readahead.c
@@ -75,6 +75,12 @@ static inline void ra_off(struct file_ra
return;
}
+static int __init readahead_init(void)
+{
+ return bdi_init(&default_backing_dev_info);
+}
+subsys_initcall(readahead_init);
+
/*
* Set the initial window size, round to next power of 2 and square
* for small size, x 4 for medium, and x 2 for large
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -726,6 +726,8 @@ int __set_page_dirty_buffers(struct page
if (page->mapping) { /* Race with truncate? */
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
task_io_account_write(PAGE_CACHE_SIZE);
}
radix_tree_tag_set(&mapping->page_tree,
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -2,6 +2,7 @@
* mm/page-writeback.c
*
* Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@xxxxxxxxxx>
*
* Contains functions related to writing back dirty pages at the
* address_space level.
@@ -49,8 +50,6 @@
*/
static long ratelimit_pages = 32;
-static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
-
/*
* When balance_dirty_pages decides that the caller needs to perform some
* non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,141 @@ EXPORT_SYMBOL(laptop_mode);
static void background_writeout(unsigned long _min_pages);
/*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ *
+ */
+static struct prop_descriptor vm_completions;
+static struct prop_descriptor vm_dirties;
+
+static unsigned long determine_dirtyable_memory(void);
+
+/*
+ * couple the period to the dirty_ratio:
+ *
+ * period/2 ~ roundup_pow_of_two(dirty limit)
+ */
+static int calc_period_shift(void)
+{
+ unsigned long dirty_total;
+
+ dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+ return 2 + ilog2(dirty_total - 1);
+}
+
+/*
+ * update the period when the dirty ratio changes.
+ */
+int dirty_ratio_handler(ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int old_ratio = vm_dirty_ratio;
+ int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+ int shift = calc_period_shift();
+ prop_change_shift(&vm_completions, shift);
+ prop_change_shift(&vm_dirties, shift);
+ }
+ return ret;
+}
+
+/*
+ * Increment the BDI's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+ __prop_inc_percpu(&vm_completions, &bdi->completions);
+}
+
+static inline void task_dirty_inc(struct task_struct *tsk)
+{
+ prop_inc_single(&vm_dirties, &tsk->dirties);
+}
+
+/*
+ * Obtain an accurate fraction of the BDI's portion.
+ */
+void bdi_writeout_fraction(struct backing_dev_info *bdi,
+ long *numerator, long *denominator)
+{
+ if (bdi_cap_writeback_dirty(bdi)) {
+ prop_fraction_percpu(&vm_completions, &bdi->completions,
+ numerator, denominator);
+ } else {
+ *numerator = 0;
+ *denominator = 1;
+ }
+}
+
+/*
+ * Clip the earned share of dirty pages to that which is actually available.
+ * This avoids exceeding the total dirty_limit when the floating averages
+ * fluctuate too quickly.
+ */
+static void
+clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+{
+ long avail_dirty;
+
+ avail_dirty = dirty -
+ (global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_WRITEBACK) +
+ global_page_state(NR_UNSTABLE_NFS));
+
+ if (avail_dirty < 0)
+ avail_dirty = 0;
+
+ avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+ bdi_stat(bdi, BDI_WRITEBACK);
+
+ *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
+}
+
+static inline void task_dirties_fraction(struct task_struct *tsk,
+ long *numerator, long *denominator)
+{
+ prop_fraction_single(&vm_dirties, &tsk->dirties,
+ numerator, denominator);
+}
+
+/*
+ * scale the dirty limit
+ *
+ * task specific dirty limit:
+ *
+ * dirty -= (dirty/8) * p_{t}
+ */
+void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+{
+ long numerator, denominator;
+ long dirty = *pdirty;
+ long long inv = dirty >> 3;
+
+ task_dirties_fraction(tsk, &numerator, &denominator);
+ inv *= numerator;
+ do_div(inv, denominator);
+
+ dirty -= inv;
+ if (dirty < *pdirty/2)
+ dirty = *pdirty/2;
+
+ *pdirty = dirty;
+}
+
+/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
*
@@ -157,9 +291,9 @@ static unsigned long determine_dirtyable
return x + 1; /* Ensure that we never return 0 */
}
-static void
-get_dirty_limits(long *pbackground, long *pdirty,
- struct address_space *mapping)
+void
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+ struct backing_dev_info *bdi)
{
int background_ratio; /* Percentages */
int dirty_ratio;
@@ -193,6 +327,23 @@ get_dirty_limits(long *pbackground, long
}
*pbackground = background;
*pdirty = dirty;
+
+ if (bdi) {
+ long long bdi_dirty = dirty;
+ long numerator, denominator;
+
+ /*
+ * Calculate this BDI's share of the dirty ratio.
+ */
+ bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+ bdi_dirty *= numerator;
+ do_div(bdi_dirty, denominator);
+
+ *pbdi_dirty = bdi_dirty;
+ clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
+ task_dirty_limit(current, pbdi_dirty);
+ }
}
/*
@@ -204,9 +355,11 @@ get_dirty_limits(long *pbackground, long
*/
static void balance_dirty_pages(struct address_space *mapping)
{
- long nr_reclaimable;
+ long bdi_nr_reclaimable;
+ long bdi_nr_writeback;
long background_thresh;
long dirty_thresh;
+ long bdi_thresh;
unsigned long pages_written = 0;
unsigned long write_chunk = sync_writeback_pages();
@@ -221,15 +374,15 @@ static void balance_dirty_pages(struct a
.range_cyclic = 1,
};
- get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
- dirty_thresh)
+ get_dirty_limits(&background_thresh, &dirty_thresh,
+ &bdi_thresh, bdi);
+ bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+ bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+ if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
break;
- if (!dirty_exceeded)
- dirty_exceeded = 1;
+ if (!bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 1;
/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
* Unstable writes are a feature of certain networked
@@ -237,16 +390,37 @@ static void balance_dirty_pages(struct a
* written to the server's write cache, but has not yet
* been flushed to permanent storage.
*/
- if (nr_reclaimable) {
+ if (bdi_nr_reclaimable) {
writeback_inodes(&wbc);
- get_dirty_limits(&background_thresh,
- &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable +
- global_page_state(NR_WRITEBACK)
- <= dirty_thresh)
- break;
+
+ get_dirty_limits(&background_thresh, &dirty_thresh,
+ &bdi_thresh, bdi);
+
+ /*
+ * In order to avoid the stacked BDI deadlock we need
+ * to ensure we accurately count the 'dirty' pages when
+ * the threshold is low.
+ *
+ * Otherwise it would be possible to get thresh+n pages
+ * reported dirty, even though there are thresh-m pages
+ * actually dirty; with m+n sitting in the percpu
+ * deltas.
+ */
+ if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+ bdi_nr_reclaimable =
+ bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+ bdi_nr_writeback =
+ bdi_stat_sum(bdi, BDI_WRITEBACK);
+ } else {
+ bdi_nr_reclaimable =
+ bdi_stat(bdi, BDI_RECLAIMABLE);
+ bdi_nr_writeback =
+ bdi_stat(bdi, BDI_WRITEBACK);
+ }
+
+ if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+ break;
+
pages_written += write_chunk - wbc.nr_to_write;
if (pages_written >= write_chunk)
break; /* We've done our duty */
@@ -254,9 +428,9 @@ static void balance_dirty_pages(struct a
congestion_wait(WRITE, HZ/10);
}
- if (nr_reclaimable + global_page_state(NR_WRITEBACK)
- <= dirty_thresh && dirty_exceeded)
- dirty_exceeded = 0;
+ if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+ bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
return; /* pdflush is already working this queue */
@@ -270,7 +444,9 @@ static void balance_dirty_pages(struct a
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && (nr_reclaimable > background_thresh)))
+ (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+ + global_page_state(NR_UNSTABLE_NFS)
+ > background_thresh)))
pdflush_operation(background_writeout, 0);
}
@@ -306,7 +482,7 @@ void balance_dirty_pages_ratelimited_nr(
unsigned long *p;
ratelimit = ratelimit_pages;
- if (dirty_exceeded)
+ if (mapping->backing_dev_info->dirty_exceeded)
ratelimit = 8;
/*
@@ -342,7 +518,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
}
for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
/*
* Boost the allowable dirty threshold a bit for page
@@ -377,7 +553,7 @@ static void background_writeout(unsigned
long background_thresh;
long dirty_thresh;
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
if (global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS) < background_thresh
&& min_pages <= 0)
@@ -582,9 +758,15 @@ static struct notifier_block __cpuinitda
*/
void __init page_writeback_init(void)
{
+ int shift;
+
mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
+
+ shift = calc_period_shift();
+ prop_descriptor_init(&vm_completions, shift);
+ prop_descriptor_init(&vm_dirties, shift);
}
/**
@@ -828,6 +1010,8 @@ int __set_page_dirty_nobuffers(struct pa
BUG_ON(mapping2 != mapping);
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
task_io_account_write(PAGE_CACHE_SIZE);
}
radix_tree_tag_set(&mapping->page_tree,
@@ -860,7 +1044,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage
* If the mapping doesn't provide a set_page_dirty a_op, then
* just fall through and assume that it wants buffer_heads.
*/
-int fastcall set_page_dirty(struct page *page)
+static int __set_page_dirty(struct page *page)
{
struct address_space *mapping = page_mapping(page);
@@ -878,6 +1062,14 @@ int fastcall set_page_dirty(struct page
}
return 0;
}
+
+int fastcall set_page_dirty(struct page *page)
+{
+ int ret = __set_page_dirty(page);
+ if (ret)
+ task_dirty_inc(current);
+ return ret;
+}
EXPORT_SYMBOL(set_page_dirty);
/*
@@ -954,6 +1146,8 @@ int clear_page_dirty_for_io(struct page
set_page_dirty(page);
if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
return 1;
}
return 0;
@@ -968,14 +1162,20 @@ int test_clear_page_writeback(struct pag
int ret;
if (mapping) {
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestClearPageWriteback(page);
- if (ret)
+ if (ret) {
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
+ if (bdi_cap_writeback_dirty(bdi)) {
+ __dec_bdi_stat(bdi, BDI_WRITEBACK);
+ __bdi_writeout_inc(bdi);
+ }
+ }
write_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestClearPageWriteback(page);
@@ -989,14 +1189,18 @@ int test_set_page_writeback(struct page
int ret;
if (mapping) {
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestSetPageWriteback(page);
- if (!ret)
+ if (!ret) {
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
+ if (bdi_cap_writeback_dirty(bdi))
+ __inc_bdi_stat(bdi, BDI_WRITEBACK);
+ }
if (!PageDirty(page))
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
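(Worked example of the new clamping, with invented numbers: for a global
dirty limit of 1000 pages and a BDI whose completion fraction is 300/1024,
get_dirty_limits() yields bdi_dirty = 1000 * 300 / 1024 = 292 pages; a
task responsible for half of the recent dirtying then has
task_dirty_limit() take off (292/8) * 1/2 = 18, leaving 274, and the
result can never drop below 292/2 = 146.)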
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c
+++ linux-2.6/mm/truncate.c
@@ -72,6 +72,8 @@ void cancel_dirty_page(struct page *page
struct address_space *mapping = page->mapping;
if (mapping && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
if (account_size)
task_io_account_cancelled_write(account_size);
}
Index: linux-2.6/lib/proportions.c
===================================================================
--- /dev/null
+++ linux-2.6/lib/proportions.c
@@ -0,0 +1,385 @@
+/*
+ * FLoating proportions
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@xxxxxxxxxx>
+ *
+ * Description:
+ *
+ * The floating proportion is a time derivative with an exponentially decaying
+ * history:
+ *
+ * p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i)
+ *
+ * Where j is an element from {prop_local}, x_{j} is j's number of events,
+ * and i the time period over which the differential is taken. So d/dt_{-i} is
+ * the differential over the i-th last period.
+ *
+ * The decaying history gives smooth transitions. The time differential carries
+ * the notion of speed.
+ *
+ * The denominator is 2^(1+i) because we want the series to be normalised, i.e.
+ *
+ * \Sum_{i=0} 1/2^(1+i) = 1
+ *
+ * Furthermore, if we measure time (t) in the same events as x, so that:
+ *
+ * t = \Sum_{j} x_{j}
+ *
+ * we get that:
+ *
+ * \Sum_{j} p_{j} = 1
+ *
+ * Writing this in an iterative fashion we get (dropping the 'd's):
+ *
+ * if (++x_{j}, ++t > period)
+ * t /= 2;
+ * for_each (j)
+ * x_{j} /= 2;
+ *
+ * so that:
+ *
+ * p_{j} = x_{j} / t;
+ *
+ * We optimize away the '/= 2' for the global time delta by noting that:
+ *
+ * if (++t > period) t /= 2:
+ *
+ * Can be approximated by:
+ *
+ * period/2 + (++t % period/2)
+ *
+ * [ Furthermore, when we choose period to be 2^n it can be written in terms of
+ * binary operations and wraparound artefacts disappear. ]
+ *
+ * Also note that this yields a natural counter of the elapsed periods:
+ *
+ * c = t / (period/2)
+ *
+ * [ Its monotonically increasing property can be applied to mitigate the wrap-
+ * around issue. ]
+ *
+ * This allows us to do away with the loop over all prop_locals on each period
+ * expiration. By remembering the period count under which it was last accessed
+ * as c_{j}, we can obtain the number of 'missed' cycles from:
+ *
+ * c - c_{j}
+ *
+ * We can then lazily catch up to the global period count every time we are
+ * going to use x_{j}, by doing:
+ *
+ * x_{j} /= 2^(c - c_{j}), c_{j} = c
+ */
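+
+/*
+ * [ Illustrative aside, not part of the original derivation: if a
+ *   prop_local was last normalised at period count c_{j} and the global
+ *   count is now c = c_{j} + 3, its event count is divided by 2^3 = 8
+ *   in one go, which is exactly what three in-place '/= 2' passes would
+ *   have produced; an idle local thus decays geometrically, halving
+ *   once per elapsed period. ]
+ */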
+
+#include <linux/proportions.h>
+#include <linux/rcupdate.h>
+
+/*
+ * Limit the time part in order to ensure there are some bits left for the
+ * cycle counter.
+ */
+#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4)
+
+int prop_descriptor_init(struct prop_descriptor *pd, int shift)
+{
+ int err;
+
+ if (shift > PROP_MAX_SHIFT)
+ shift = PROP_MAX_SHIFT;
+
+ pd->index = 0;
+ pd->pg[0].shift = shift;
+ mutex_init(&pd->mutex);
+ err = percpu_counter_init_irq(&pd->pg[0].events, 0);
+ if (err)
+ goto out;
+
+ err = percpu_counter_init_irq(&pd->pg[1].events, 0);
+ if (err)
+ percpu_counter_destroy(&pd->pg[0].events);
+
+out:
+ return err;
+}
+
+/*
+ * We have two copies, and flip between them to make it seem like an atomic
+ * update. The update is not really atomic wrt the events counter, but
+ * it is internally consistent with the bit layout depending on shift.
+ *
+ * We copy the events count, move the bits around and flip the index.
+ */
+void prop_change_shift(struct prop_descriptor *pd, int shift)
+{
+ int index;
+ int offset;
+ u64 events;
+ unsigned long flags;
+
+ if (shift > PROP_MAX_SHIFT)
+ shift = PROP_MAX_SHIFT;
+
+ mutex_lock(&pd->mutex);
+
+ index = pd->index ^ 1;
+ offset = pd->pg[pd->index].shift - shift;
+ if (!offset)
+ goto out;
+
+ pd->pg[index].shift = shift;
+
+ local_irq_save(flags);
+ events = percpu_counter_sum(&pd->pg[pd->index].events);
+ if (offset < 0)
+ events <<= -offset;
+ else
+ events >>= offset;
+ percpu_counter_set(&pd->pg[index].events, events);
+
+ /*
+ * ensure the new pg is fully written before the switch
+ */
+ smp_wmb();
+ pd->index = index;
+ local_irq_restore(flags);
+
+ synchronize_rcu();
+
+out:
+ mutex_unlock(&pd->mutex);
+}
+
+/*
+ * wrap the access to the data in an rcu_read_lock() section;
+ * this is used to track the active references.
+ */
+static struct prop_global *prop_get_global(struct prop_descriptor *pd)
+{
+ int index;
+
+ rcu_read_lock();
+ index = pd->index;
+ /*
+ * match the wmb in prop_change_shift()
+ */
+ smp_rmb();
+ return &pd->pg[index];
+}
+
+static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg)
+{
+ rcu_read_unlock();
+}
+
+static void
+prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift)
+{
+ int offset = *pl_shift - new_shift;
+
+ if (!offset)
+ return;
+
+ if (offset < 0)
+ *pl_period <<= -offset;
+ else
+ *pl_period >>= offset;
+
+ *pl_shift = new_shift;
+}
+
+/*
+ * PERCPU
+ */
+
+int prop_local_init_percpu(struct prop_local_percpu *pl)
+{
+ spin_lock_init(&pl->lock);
+ pl->shift = 0;
+ pl->period = 0;
+ return percpu_counter_init_irq(&pl->events, 0);
+}
+
+void prop_local_destroy_percpu(struct prop_local_percpu *pl)
+{
+ percpu_counter_destroy(&pl->events);
+}
+
+/*
+ * Catch up with missed period expirations.
+ *
+ * until (c_{j} == c)
+ * x_{j} -= x_{j}/2;
+ * c_{j}++;
+ */
+static
+void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
+{
+ unsigned long period = 1UL << (pg->shift - 1);
+ unsigned long period_mask = ~(period - 1);
+ unsigned long global_period;
+ unsigned long flags;
+
+ global_period = percpu_counter_read(&pg->events);
+ global_period &= period_mask;
+
+ /*
+ * Fast path - check if the local and global period count still match
+ * outside of the lock.
+ */
+ if (pl->period == global_period)
+ return;
+
+ spin_lock_irqsave(&pl->lock, flags);
+ prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
+ period = 1UL << (pg->shift - 1);
+ /*
+ * For each missed period, we halve the local counter.
+ * basically:
+ * pl->events >> (global_period - pl->period);
+ *
+ * but since the distributed nature of percpu counters makes division
+ * rather hard, use a regular subtraction loop. This is safe, because
+ * the events will only ever be incremented, hence the subtraction
+ * can never result in a negative number.
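+ *
+ * For example: a local count of 1000 that missed three periods
+ * decays 1000 -> 500 -> 250 -> 125 over three loop iterations
+ * (illustrative numbers only).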
+ */
+ while (pl->period != global_period) {
+ unsigned long val = percpu_counter_read(&pl->events);
+ unsigned long half = (val + 1) >> 1;
+
+ /*
+ * Half of zero won't be much less; break out.
+ * This limits the loop to at most 'shift' iterations,
+ * even if we missed a million.
+ */
+ if (!val)
+ break;
+
+ percpu_counter_add(&pl->events, -half);
+ pl->period += period;
+ }
+ pl->period = global_period;
+ spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/*
+ * ++x_{j}, ++t
+ */
+void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
+{
+ struct prop_global *pg = prop_get_global(pd);
+
+ prop_norm_percpu(pg, pl);
+ percpu_counter_add(&pl->events, 1);
+ percpu_counter_add(&pg->events, 1);
+ prop_put_global(pd, pg);
+}
+
+/*
+ * Obtain a fraction of this proportion
+ *
+ * p_{j} = x_{j} / (period/2 + t % period/2)
+ */
+void prop_fraction_percpu(struct prop_descriptor *pd,
+ struct prop_local_percpu *pl,
+ long *numerator, long *denominator)
+{
+ struct prop_global *pg = prop_get_global(pd);
+ unsigned long period_2 = 1UL << (pg->shift - 1);
+ unsigned long counter_mask = period_2 - 1;
+ unsigned long global_count;
+
+ prop_norm_percpu(pg, pl);
+ *numerator = percpu_counter_read_positive(&pl->events);
+
+ global_count = percpu_counter_read(&pg->events);
+ *denominator = period_2 + (global_count & counter_mask);
+
+ prop_put_global(pd, pg);
+}
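+
+/*
+ * For example (sketch): with shift = 10, period/2 = 512, so the
+ * denominator walks from 512 up to 1023 and snaps back each period,
+ * while a local count of 256 yields a fraction decaying from 256/512
+ * towards 256/1023 as global events accrue.
+ */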
+
+/*
+ * SINGLE
+ */
+
+int prop_local_init_single(struct prop_local_single *pl)
+{
+ spin_lock_init(&pl->lock);
+ pl->shift = 0;
+ pl->period = 0;
+ pl->events = 0;
+ return 0;
+}
+
+void prop_local_destroy_single(struct prop_local_single *pl)
+{
+}
+
+/*
+ * Catch up with missed period expirations.
+ */
+static
+void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl)
+{
+ unsigned long period = 1UL << (pg->shift - 1);
+ unsigned long period_mask = ~(period - 1);
+ unsigned long global_period;
+ unsigned long flags;
+
+ global_period = percpu_counter_read(&pg->events);
+ global_period &= period_mask;
+
+ /*
+ * Fast path - check if the local and global period count still match
+ * outside of the lock.
+ */
+ if (pl->period == global_period)
+ return;
+
+ spin_lock_irqsave(&pl->lock, flags);
+ prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
+ /*
+ * For each missed period, we halve the local counter.
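+ * Unlike the percpu variant we can shift directly: missing three
+ * periods is simply pl->events >>= 3 (e.g. 1000 -> 125).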
+ */
+ period = (global_period - pl->period) >> (pg->shift - 1);
+ if (likely(period < BITS_PER_LONG))
+ pl->events >>= period;
+ else
+ pl->events = 0;
+ pl->period = global_period;
+ spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/*
+ * ++x_{j}, ++t
+ */
+void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl)
+{
+ struct prop_global *pg = prop_get_global(pd);
+
+ prop_norm_single(pg, pl);
+ pl->events++;
+ percpu_counter_add(&pg->events, 1);
+ prop_put_global(pd, pg);
+}
+
+/*
+ * Obtain a fraction of this proportion
+ *
+ * p_{j} = x_{j} / (period/2 + t % period/2)
+ */
+void prop_fraction_single(struct prop_descriptor *pd,
+ struct prop_local_single *pl,
+ long *numerator, long *denominator)
+{
+ struct prop_global *pg = prop_get_global(pd);
+ unsigned long period_2 = 1UL << (pg->shift - 1);
+ unsigned long counter_mask = period_2 - 1;
+ unsigned long global_count;
+
+ prop_norm_single(pg, pl);
+ *numerator = pl->events;
+
+ global_count = percpu_counter_read(&pg->events);
+ *denominator = period_2 + (global_count & counter_mask);
+
+ prop_put_global(pd, pg);
+}
Index: linux-2.6/include/linux/proportions.h
===================================================================
--- /dev/null
+++ linux-2.6/include/linux/proportions.h
@@ -0,0 +1,119 @@
+/*
+ * Floating proportions
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@xxxxxxxxxx>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+
+#ifndef _LINUX_PROPORTIONS_H
+#define _LINUX_PROPORTIONS_H
+
+#include <linux/percpu_counter.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+struct prop_global {
+ /*
+ * The period over which we differentiate
+ *
+ * period = 2^shift
+ */
+ int shift;
+ /*
+ * The total event counter aka 'time'.
+ *
+ * Treated as an unsigned long; the lower 'shift - 1' bits are the
+ * counter bits, the remaining upper bits the period counter.
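+ *
+ * Roughly (illustrative layout):
+ *
+ *   [ period count | shift-1 tick bits ]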
+ */
+ struct percpu_counter events;
+};
+
+/*
+ * global proportion descriptor
+ *
+ * this is needed to consistently flip prop_global structures.
+ */
+struct prop_descriptor {
+ int index;
+ struct prop_global pg[2];
+ struct mutex mutex; /* serialize the prop_global switch */
+};
+
+int prop_descriptor_init(struct prop_descriptor *pd, int shift);
+void prop_change_shift(struct prop_descriptor *pd, int new_shift);
+
+/*
+ * ----- PERCPU ------
+ */
+
+struct prop_local_percpu {
+ /*
+ * the local events counter
+ */
+ struct percpu_counter events;
+
+ /*
+ * snapshot of the last seen global state
+ */
+ int shift;
+ unsigned long period;
+ spinlock_t lock; /* protect the snapshot state */
+};
+
+int prop_local_init_percpu(struct prop_local_percpu *pl);
+void prop_local_destroy_percpu(struct prop_local_percpu *pl);
+void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl);
+void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl,
+ long *numerator, long *denominator);
+
+static inline
+void prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __prop_inc_percpu(pd, pl);
+ local_irq_restore(flags);
+}
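+
+/*
+ * Sketch of intended use (names are illustrative, not from this patch):
+ *
+ *	static struct prop_descriptor writeouts;
+ *
+ *	prop_descriptor_init(&writeouts, 30);
+ *	...
+ *	prop_inc_percpu(&writeouts, &bdi->completions);
+ *	prop_fraction_percpu(&writeouts, &bdi->completions,
+ *			     &numerator, &denominator);
+ *
+ * after which numerator/denominator approximates this bdi's share of
+ * the recent events.
+ */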
+
+/*
+ * ----- SINGLE ------
+ */
+
+struct prop_local_single {
+ /*
+ * the local events counter
+ */
+ unsigned long events;
+
+ /*
+ * snapshot of the last seen global state
+ * and a lock protecting this state
+ */
+ int shift;
+ unsigned long period;
+ spinlock_t lock; /* protect the snapshot state */
+};
+
+#define INIT_PROP_LOCAL_SINGLE(name) \
+{ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
+}
+
+int prop_local_init_single(struct prop_local_single *pl);
+void prop_local_destroy_single(struct prop_local_single *pl);
+void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl);
+void prop_fraction_single(struct prop_descriptor *pd, struct prop_local_single *pl,
+ long *numerator, long *denominator);
+
+static inline
+void prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __prop_inc_single(pd, pl);
+ local_irq_restore(flags);
+}
+
+#endif /* _LINUX_PROPORTIONS_H */
Index: linux-2.6/lib/Makefile
===================================================================
--- linux-2.6.orig/lib/Makefile
+++ linux-2.6/lib/Makefile
@@ -5,7 +5,8 @@
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o \
idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
- sha1.o irq_regs.o reciprocal_div.o
+ sha1.o irq_regs.o reciprocal_div.o \
+ proportions.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -160,6 +160,10 @@ extern ctl_table inotify_table[];
int sysctl_legacy_va_layout;
#endif
+extern int dirty_ratio_handler(ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+
/* The default sysctl tables: */
@@ -675,7 +679,7 @@ static ctl_table vm_table[] = {
.data = &vm_dirty_ratio,
.maxlen = sizeof(vm_dirty_ratio),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
+ .proc_handler = &dirty_ratio_handler,
.strategy = &sysctl_intvec,
.extra1 = &zero,
.extra2 = &one_hundred,
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -83,6 +83,7 @@ struct sched_param {
#include <linux/timer.h>
#include <linux/hrtimer.h>
#include <linux/task_io_accounting.h>
+#include <linux/proportions.h>
#include <asm/processor.h>
@@ -1076,6 +1077,7 @@ struct task_struct {
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
+ struct prop_local_single dirties;
};
static inline pid_t process_group(struct task_struct *tsk)
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -106,6 +106,7 @@ static struct kmem_cache *mm_cachep;
void free_task(struct task_struct *tsk)
{
+ prop_local_destroy_single(&tsk->dirties);
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
free_task_struct(tsk);
@@ -162,6 +163,7 @@ static struct task_struct *dup_task_stru
{
struct task_struct *tsk;
struct thread_info *ti;
+ int err;
prepare_to_copy(orig);
@@ -177,6 +179,14 @@ static struct task_struct *dup_task_stru
*tsk = *orig;
tsk->stack = ti;
+
+ err = prop_local_init_single(&tsk->dirties);
+ if (err) {
+ free_thread_info(ti);
+ free_task_struct(tsk);
+ return NULL;
+ }
+
setup_thread_stack(tsk, orig);
#ifdef CONFIG_CC_STACKPROTECTOR
Index: linux-2.6/include/linux/init_task.h
===================================================================
--- linux-2.6.orig/include/linux/init_task.h
+++ linux-2.6/include/linux/init_task.h
@@ -167,6 +167,7 @@ extern struct group_info init_groups;
[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
[PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \
}, \
+ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \
INIT_TRACE_IRQFLAGS \
INIT_LOCKDEP \
}