Re: [dm-devel] Re: [PATCH 0/8] I/O bandwidth controller and BIOtracking

From: Hirokazu Takahashi
Date: Thu Nov 13 2008 - 02:26:40 EST


Hi, Kamezawa-san,

> > Hi everyone,
> >
> > This is a new release of dm-ioband and bio-cgroup. With this release,
> > the overhead of bio-cgroup is significantly reduced and the accuracy
> > of block I/O tracking is much improved. These patches are for
> > 2.6.28-rc2-mm1.
> >
>
> >From my point of view, a way to record bio_cgroup_id to page_cgroup is quite neat
> and nice.
>
> My concern is "bio_cgroup_id". It's provided only for bio_cgroup.
> In this summer, I tried to add swap_cgroup_id only for mem+swap controller but
> commenters said "please provide "id and lookup" in cgroup layer, it should be useful."
> And I agree them. (and postponed it ;)

Yup, that sounds really good.

> Could you try "id" in cgroup layer ? How do you think, Paul and others ?

It seems to easy to implement this feature since what I need to do is
move the code from bio-group to there.
Okay, I'll start it if Paul agrees with this approach.

> That's my only concern and if I/O controller people decides to live with
> this bio tracking infrastracture,
> ==
> page -> page_cgroup -> bio_cgroup_id
> ==
> I have no objections. And enqueue necessary changes to my queue.

It would be nice if you can include the following patch, which just makes
the page_cgroup infrastructure can be compiled in even when the cgroup
memory controller is compiled out.

> Thanks,
> -Kame

Thank you,
Hirokazu Takahashi.

---------------------------

This patch makes the page_cgroup framework be able to be used even if
the compile option of the cgroup memory controller is off.
So bio-cgroup can use this framework without the memory controller.

Signed-off-by: Hirokazu Takahashi <taka@xxxxxxxxxxxxx>

diff -dupr linux-2.6.28-rc2.bc0/include/linux/memcontrol.h linux-2.6.28-rc2/include/linux/memcontrol.h
--- linux-2.6.28-rc2.bc0/include/linux/memcontrol.h 2008-11-10 18:31:34.000000000 +0900
+++ linux-2.6.28-rc2/include/linux/memcontrol.h 2008-11-11 13:51:42.000000000 +0900
@@ -27,6 +27,9 @@ struct mm_struct;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR

+extern void __init_mem_page_cgroup(struct page_cgroup *pc);
+#define mem_cgroup_disabled() mem_cgroup_subsys.disabled
+
extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask);
/* for swap handling */
@@ -81,6 +84,15 @@ extern long mem_cgroup_calc_reclaim(stru
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct mem_cgroup;

+static inline void __init_mem_page_cgroup(struct page_cgroup *pc)
+{
+}
+
+static inline int mem_cgroup_disabled(void)
+{
+ return 1;
+}
+
static inline int mem_cgroup_newpage_charge(struct page *page,
struct mm_struct *mm, gfp_t gfp_mask)
{
diff -dupr linux-2.6.28-rc2.bc0/include/linux/mmzone.h linux-2.6.28-rc2/include/linux/mmzone.h
--- linux-2.6.28-rc2.bc0/include/linux/mmzone.h 2008-11-10 18:50:50.000000000 +0900
+++ linux-2.6.28-rc2/include/linux/mmzone.h 2008-11-11 13:51:42.000000000 +0900
@@ -603,7 +603,7 @@ typedef struct pglist_data {
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_CGROUP_PAGE
struct page_cgroup *node_page_cgroup;
#endif
#endif
@@ -952,7 +952,7 @@ struct mem_section {

/* See declaration of similar field in struct zone */
unsigned long *pageblock_flags;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_CGROUP_PAGE
/*
* If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
* section. (see memcontrol.h/page_cgroup.h about this.)
diff -dupr linux-2.6.28-rc2.bc0/include/linux/page_cgroup.h linux-2.6.28-rc2/include/linux/page_cgroup.h
--- linux-2.6.28-rc2.bc0/include/linux/page_cgroup.h 2008-11-10 19:29:00.000000000 +0900
+++ linux-2.6.28-rc2/include/linux/page_cgroup.h 2008-11-11 14:46:47.000000000 +0900
@@ -1,7 +1,7 @@
#ifndef __LINUX_PAGE_CGROUP_H
#define __LINUX_PAGE_CGROUP_H

-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_CGROUP_PAGE
#include <linux/bit_spinlock.h>
/*
* Page Cgroup can be considered as an extended mem_map.
@@ -12,9 +12,11 @@
*/
struct page_cgroup {
unsigned long flags;
- struct mem_cgroup *mem_cgroup;
struct page *page;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+ struct mem_cgroup *mem_cgroup;
struct list_head lru; /* per cgroup LRU list */
+#endif
};

void __init pgdat_page_cgroup_init(struct pglist_data *pgdat);
@@ -88,7 +90,7 @@ static inline void unlock_page_cgroup(st
bit_spin_unlock(PCG_LOCK, &pc->flags);
}

-#else /* CONFIG_CGROUP_MEM_RES_CTLR */
+#else /* CONFIG_CGROUP_PAGE */
struct page_cgroup;

static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat)
diff -dupr linux-2.6.28-rc2.bc0/init/Kconfig linux-2.6.28-rc2/init/Kconfig
--- linux-2.6.28-rc2.bc0/init/Kconfig 2008-11-10 18:31:34.000000000 +0900
+++ linux-2.6.28-rc2/init/Kconfig 2008-11-11 14:46:47.000000000 +0900
@@ -425,6 +425,10 @@ config CGROUP_MEM_RES_CTLR
This config option also selects MM_OWNER config option, which
could in turn add some fork/exit overhead.

+config CGROUP_PAGE
+ def_bool y
+ depends on CGROUP_MEM_RES_CTLR
+
config MM_OWNER
bool

diff -dupr linux-2.6.28-rc2.bc0/mm/Makefile linux-2.6.28-rc2/mm/Makefile
--- linux-2.6.28-rc2.bc0/mm/Makefile 2008-11-10 18:31:34.000000000 +0900
+++ linux-2.6.28-rc2/mm/Makefile 2008-11-11 14:46:47.000000000 +0900
@@ -34,5 +34,6 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o cpu_alloc.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_CGROUP_PAGE) += page_cgroup.o
obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
diff -dupr linux-2.6.28-rc2.bc0/mm/memcontrol.c linux-2.6.28-rc2/mm/memcontrol.c
--- linux-2.6.28-rc2.bc0/mm/memcontrol.c 2008-11-10 18:31:34.000000000 +0900
+++ linux-2.6.28-rc2/mm/memcontrol.c 2008-11-11 14:48:17.000000000 +0900
@@ -157,6 +157,11 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
0, /* FORCE */
};

+void __meminit __init_mem_page_cgroup(struct page_cgroup *pc)
+{
+ pc->mem_cgroup = NULL;
+}
+
/*
* Always modified under lru lock. Then, not necessary to preempt_disable()
*/
diff -dupr linux-2.6.28-rc2.bc0/mm/page_cgroup.c linux-2.6.28-rc2/mm/page_cgroup.c
--- linux-2.6.28-rc2.bc0/mm/page_cgroup.c 2008-11-10 18:31:34.000000000 +0900
+++ linux-2.6.28-rc2/mm/page_cgroup.c 2008-11-11 14:46:47.000000000 +0900
@@ -8,13 +8,14 @@
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
+#include <linux/memcontrol.h>

static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
{
pc->flags = 0;
- pc->mem_cgroup = NULL;
pc->page = pfn_to_page(pfn);
+ __init_mem_page_cgroup(pc);
}
static unsigned long total_usage;

@@ -69,7 +70,7 @@ void __init page_cgroup_init(void)

int nid, fail;

- if (mem_cgroup_subsys.disabled)
+ if (mem_cgroup_disabled())
return;

for_each_online_node(nid) {
@@ -229,7 +230,7 @@ void __init page_cgroup_init(void)
unsigned long pfn;
int fail = 0;

- if (mem_cgroup_subsys.disabled)
+ if (mem_cgroup_disabled())
return;

for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/