Re: [RFD 3/9] Display /proc/stat information per cgroup

From: Glauber Costa
Date: Tue Sep 27 2011 - 14:43:59 EST

Next message: Mark Brown: "[PATCH 1/3] mfd: Add WM1811 support"
Previous message: Simon Glass: "Re: [RFC PATCH 0/3] Add accurate boot timing to a Linux system"
In reply to: Balbir Singh: "Re: [RFD 3/9] Display /proc/stat information per cgroup"
Next in thread: Peter Zijlstra: "Re: [RFD 3/9] Display /proc/stat information per cgroup"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

On 09/27/2011 02:01 PM, Balbir Singh wrote:

On Sat, Sep 24, 2011 at 3:50 AM, Glauber Costa<glommer@xxxxxxxxxxxxx> wrote:
Each cgroup has its own file, cpu.proc.stat that will
display the exact same format as /proc/stat. Users
that want to have access to a per-cgroup version of
this information, can query it for that purpose.

Signed-off-by: Glauber Costa<glommer@xxxxxxxxxxxxx>
...

Hi Balbir, thanks for reviewing this.

+static inline void task_cgroup_account_field(struct task_struct *p,
+ cputime64_t tmp, int index)
+{
+ struct kernel_stat *kstat;
+ struct task_group *tg = task_group(p);
+
+ do {
+ kstat = this_cpu_ptr(tg->cpustat);
+ kstat->cpustat[index] = cputime64_add(kstat->cpustat[index],
+ tmp);
+ tg = tg->parent;
+ } while (tg);
+}

What protects the walk (tg = tg->parent)? Could you please document it

I think that the fact that the hierarchy only grows down, thus parent never changes (or am I wrong?)

And since we run all this with preempt disabled and with the runqueue locked, we should have no problems.

Do you agree?

+
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
@@ -3763,9 +3777,8 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
* @cputime_scaled: cputime scaled by cpu frequency
*/
void account_user_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
+ cputime_t cputime_scaled)
{
- cputime64_t *cpustat = kstat_this_cpu->cpustat;
cputime64_t tmp;

/* Add user time to process. */
@@ -3775,10 +3788,11 @@ void account_user_time(struct task_struct *p, cputime_t cputime,

/* Add user time to cpustat. */
tmp = cputime_to_cputime64(cputime);
+
if (TASK_NICE(p)> 0)
- cpustat[NICE] = cputime64_add(cpustat[NICE], tmp);
+ task_cgroup_account_field(p, tmp, NICE);
else
- cpustat[USER] = cputime64_add(cpustat[USER], tmp);
+ task_cgroup_account_field(p, tmp, USER);

cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
/* Account for user time used */
@@ -3824,7 +3838,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
*/
static inline
void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, cputime64_t *target_cputime64)
+ cputime_t cputime_scaled, int index)
{
cputime64_t tmp = cputime_to_cputime64(cputime);

@@ -3834,7 +3848,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
account_group_system_time(p, cputime);

/* Add system time to cpustat. */
- *target_cputime64 = cputime64_add(*target_cputime64, tmp);
+ task_cgroup_account_field(p, tmp, index);
cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);

/* Account for system time used */
@@ -3851,8 +3865,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
void account_system_time(struct task_struct *p, int hardirq_offset,
cputime_t cputime, cputime_t cputime_scaled)
{
- cputime64_t *cpustat = kstat_this_cpu->cpustat;
- cputime64_t *target_cputime64;
+ int index;

if ((p->flags& PF_VCPU)&& (irq_count() - hardirq_offset == 0)) {
account_guest_time(p, cputime, cputime_scaled);
@@ -3860,13 +3873,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
}

if (hardirq_count() - hardirq_offset)
- target_cputime64 =&cpustat[IRQ];
+ index = IRQ;
else if (in_serving_softirq())
- target_cputime64 =&cpustat[SOFTIRQ];
+ index = SOFTIRQ;
else
- target_cputime64 =&cpustat[SYSTEM];
+ index = SYSTEM;

- __account_system_time(p, cputime, cputime_scaled, target_cputime64);
+ __account_system_time(p, cputime, cputime_scaled, index);
}

/*
@@ -3941,27 +3954,29 @@ static __always_inline bool steal_account_process_tick(void)
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq)
+ struct rq *rq)
{
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
- cputime64_t *cpustat = kstat_this_cpu->cpustat;
+ struct task_group *tg;

if (steal_account_process_tick())
return;

+ tg = task_group(p);
+
if (irqtime_account_hi_update()) {
- cpustat[IRQ] = cputime64_add(cpustat[IRQ], tmp);
+ task_cgroup_account_field(p, tmp, IRQ);
} else if (irqtime_account_si_update()) {
- cpustat[SOFTIRQ] = cputime64_add(cpustat[SOFTIRQ], tmp);
+ task_cgroup_account_field(p, tmp, SOFTIRQ);
} else if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-&cpustat[SOFTIRQ]);
+ __account_system_time(p, cputime_one_jiffy,
+ one_jiffy_scaled, SOFTIRQ);
} else if (user_tick) {
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
} else if (p == rq->idle) {
@@ -3969,8 +3984,8 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
} else if (p->flags& PF_VCPU) { /* System time or guest time */
account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
} else {
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-&cpustat[SYSTEM]);
+ __account_system_time(p, cputime_one_jiffy,
+ one_jiffy_scaled, SYSTEM);
}
}

@@ -8038,6 +8053,7 @@ void __init sched_init(void)
{
int i, j;
unsigned long alloc_size = 0, ptr;
+ struct kernel_stat *kstat;

#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
@@ -8092,6 +8108,18 @@ void __init sched_init(void)
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
+
+ root_task_group.cpustat = alloc_percpu(struct kernel_stat);
+ /* Failing that early an allocation means we're screwed anyway */
+ BUG_ON(!root_task_group.cpustat);
+ for_each_possible_cpu(i) {

for_each_possible_cpu might be an overkill, no?

+ kstat = per_cpu_ptr(root_task_group.cpustat, i);
+ kstat->cpustat[IDLE] = 0;
+ kstat->cpustat[IDLE_BASE] = 0;
+ kstat->cpustat[IOWAIT_BASE] = 0;
+ kstat->cpustat[IOWAIT] = 0;
+ }
+
#endif /* CONFIG_CGROUP_SCHED */

for_each_possible_cpu(i) {
@@ -8526,6 +8554,7 @@ static void free_sched_group(struct task_group *tg)
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
+ free_percpu(tg->cpustat);
kfree(tg);
}

@@ -8534,6 +8563,7 @@ struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
unsigned long flags;
+ int i;

tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@@ -8545,6 +8575,19 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;

+ tg->cpustat = alloc_percpu(struct kernel_stat);
+ if (!tg->cpustat)
+ goto err;
+
+ for_each_possible_cpu(i) {
+ struct kernel_stat *kstat, *root_kstat;
+
+ kstat = per_cpu_ptr(tg->cpustat, i);
+ root_kstat = per_cpu_ptr(root_task_group.cpustat, i);
+ kstat->cpustat[IDLE_BASE] = root_kstat->cpustat[IDLE];
+ kstat->cpustat[IOWAIT_BASE] = root_kstat->cpustat[IOWAIT];
+ }
+
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list,&task_groups);

@@ -9092,7 +9135,9 @@ int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft, struct seq_fil
system = cputime64_add(system, kstat->cpustat[SYSTEM]);
idle = cputime64_add(idle, root_kstat->cpustat[IDLE]);
idle = cputime64_add(idle, arch_idle_time(i));
+ idle = cputime64_sub(idle, kstat->cpustat[IDLE_BASE]);
iowait = cputime64_add(iowait, root_kstat->cpustat[IOWAIT]);
+ iowait = cputime64_sub(iowait, kstat->cpustat[IOWAIT_BASE]);
irq = cputime64_add(irq, kstat->cpustat[IRQ]);
softirq = cputime64_add(softirq, kstat->cpustat[SOFTIRQ]);
steal = cputime64_add(steal, kstat->cpustat[STEAL]);
@@ -9134,7 +9179,9 @@ int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft, struct seq_fil
system = kstat->cpustat[SYSTEM];
idle = root_kstat->cpustat[IDLE];
idle = cputime64_add(idle, arch_idle_time(i));
+ idle = cputime64_sub(idle, kstat->cpustat[IDLE_BASE]);
iowait = root_kstat->cpustat[IOWAIT];
+ iowait = cputime64_sub(iowait, kstat->cpustat[IOWAIT_BASE]);
irq = kstat->cpustat[IRQ];
softirq = kstat->cpustat[SOFTIRQ];
steal = kstat->cpustat[STEAL];
@@ -9205,6 +9252,10 @@ static struct cftype cpu_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
+ {
+ .name = "proc.stat",
+ .read_seq_string = cpu_cgroup_proc_stat,

Looks fine to me

Balbir Singh

Awesome.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Mark Brown: "[PATCH 1/3] mfd: Add WM1811 support"
Previous message: Simon Glass: "Re: [RFC PATCH 0/3] Add accurate boot timing to a Linux system"
In reply to: Balbir Singh: "Re: [RFD 3/9] Display /proc/stat information per cgroup"
Next in thread: Peter Zijlstra: "Re: [RFD 3/9] Display /proc/stat information per cgroup"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]