[RFC][PATCH 1/1] MAZE: Mazed processes monitor

From: Hirofumi Nakagawa
Date: Tue May 13 2008 - 08:21:38 EST


This patch is against linux-2.6.25-mm1 and has been tested on x86_64 hardware.

Signed-off-by: Hirofumi Nakagawa <hnakagawa@xxxxxxxxxxxxxxxx>
---
include/linux/maze.h | 92 +++++++++
include/linux/sched.h | 3
init/Kconfig | 10 +
init/main.c | 2
kernel/Makefile | 1
kernel/exit.c | 2
kernel/fork.c | 2
kernel/maze.c | 466 ++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched.c | 3
9 files changed, 581 insertions(+)

--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25-mm1-maze/include/linux/maze.h 2008-05-13 17:52:23.000000000 +0900
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2007-2008 MIRACLE LINUX Corp.
+ *
+ * Written by Hirofumi Nakagawa <hnakagawa@xxxxxxxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2.
+ */
+#ifndef _LINUX_MAZE_H
+#define _LINUX_MAZE_H
+
+#ifdef __KERNEL__
+
+#include <linux/sched.h>
+#include <linux/list.h>
+
+struct maze_context {
+ struct task_struct *task; /* The monitored task */
+ int pid; /* The pid of the monitored task */

+ /* The stime of the task when the maze count was last reset */
+ cputime_t reset_stime;
+ /* The utime of the task when the maze count was last reset */
+ cputime_t reset_utime;

+ /* The soft limit of the maze count */
+ unsigned long soft_limit;
+ /* The hard limit of the maze count */
+ unsigned long hard_limit;

+ /* The signal sent when the maze count reaches the soft limit */
+ int soft_signal;
+ /* The signal sent when the maze count reaches the hard limit */
+ int hard_signal;

+#define MAZE_SENT_SOFT_SIG 1
+#define MAZE_SENT_HARD_SIG 2
+ /* MAZE_SENT_* bits recording which limit signals were already sent */
+ unsigned long flags;

+ /* This value is 1 while the maze_context is linked on a maze_queue */
+ int enqueue;
+ /* The number of the CPU whose maze_queue the context was put on */
+ int enqueue_cpu;

+ /* This value is 1 if the preempt_notifier is registered */
+ atomic_t registered_notifier;

+ struct preempt_notifier notifier;

+ /* This list_head links the context into a per-CPU maze_queue */
+ struct list_head queue;
+ /* This list_head links the context into the global maze_list */
+ struct list_head list;
+};
+
+#ifdef CONFIG_MAZE
+
+/* Called from start_kernel() */
+extern void maze_init_early(void);
+
+/* Called from do_exit() for the exiting task */
+extern void maze_exit(struct task_struct *task);
+
+/* Called from copy_process() with the newly created task */
+extern void maze_fork(struct task_struct *task);
+
+/* Called from sys_sched_yield() with the current task */
+extern void maze_sched_yield(struct task_struct *task);
+
+#else /* !CONFIG_MAZE */
+
+/*
+ * CONFIG_MAZE is disabled: every hook is an empty inline function, so the
+ * call sites need no #ifdef.
+ */
+static inline void maze_init_early(void)
+{
+}
+
+/* Returns void, like the CONFIG_MAZE implementation it stands in for. */
+static inline void maze_exit(struct task_struct *task)
+{
+}
+
+static inline void maze_fork(struct task_struct *task)
+{
+}
+
+static inline void maze_sched_yield(struct task_struct *task)
+{
+}
+#endif /* CONFIG_MAZE */
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_MAZE_H */
--- linux-2.6.25-mm1-maze.orig/include/linux/sched.h 2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/include/linux/sched.h 2008-05-13 17:52:23.000000000 +0900
@@ -1254,6 +1254,9 @@ struct task_struct {
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
#endif
+#ifdef CONFIG_MAZE
+ struct maze_context *maze_context;
+#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
--- linux-2.6.25-mm1-maze.orig/init/Kconfig 2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/init/Kconfig 2008-05-13 17:52:23.000000000 +0900
@@ -371,6 +371,16 @@ config RESOURCE_COUNTERS
infrastructure that works with cgroups
depends on CGROUPS

+config MAZE
+ bool "MAZE monitor support"
+ select PREEMPT_NOTIFIERS
+ help
+ MAZE is a function to monitor mazed processes which use excessive CPU cycles.
+ This is a CGL (Carrier Grade Linux) requirement (AVL.14.0). MAZE detects such
+ processes and sends specified signals to the processes for their termination.
+
+ Say N if unsure.
+
config MM_OWNER
bool

--- linux-2.6.25-mm1-maze.orig/init/main.c 2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/init/main.c 2008-05-13 17:52:23.000000000 +0900
@@ -61,6 +61,7 @@
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/idr.h>
+#include <linux/maze.h>

#include <asm/io.h>
#include <asm/bugs.h>
@@ -543,6 +544,7 @@ asmlinkage void __init start_kernel(void
boot_init_stack_canary();
debug_objects_early_init();
cgroup_init_early();
+ maze_init_early();

local_irq_disable();
early_boot_irqs_off();
--- linux-2.6.25-mm1-maze.orig/kernel/Makefile 2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/kernel/Makefile 2008-05-13 17:52:23.000000000 +0900
@@ -80,6 +80,7 @@ obj-$(CONFIG_MARKERS) += marker.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_FTRACE) += trace/
obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_MAZE) += maze.o

ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@xxxxxxxxxxxxxxxx>, the -fno-omit-frame-pointer is
--- linux-2.6.25-mm1-maze.orig/kernel/exit.c 2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/kernel/exit.c 2008-05-13 17:52:23.000000000 +0900
@@ -44,6 +44,7 @@
#include <linux/resource.h>
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/maze.h>

#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -1058,6 +1059,7 @@ NORET_TYPE void do_exit(long code)
__exit_fs(tsk);
check_stack_usage();
exit_thread();
+ maze_exit(tsk);
cgroup_exit(tsk, 1);
exit_keys(tsk);

--- linux-2.6.25-mm1-maze.orig/kernel/fork.c 2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/kernel/fork.c 2008-05-13 17:52:23.000000000 +0900
@@ -53,6 +53,7 @@
#include <linux/tty.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
+#include <linux/maze.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -1363,6 +1364,7 @@ static struct task_struct *copy_process(
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ maze_fork(p);
return p;

bad_fork_free_pid:
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.25-mm1-maze/kernel/maze.c 2008-05-13 20:07:55.000000000 +0900
@@ -0,0 +1,466 @@
+/*
+ * Copyright (C) 2007-2008 MIRACLE LINUX Corp.
+ *
+ * Written by Hirofumi Nakagawa <hnakagawa@xxxxxxxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/capability.h>
+#include <linux/maze.h>
+
+#define ERRPRINT(str, ...) printk("ERROR [%s] " str, __FUNCTION__, ##__VA_ARGS__);
+
+/* The /proc/maze directory entry */
+static struct proc_dir_entry *maze_proc_dir;
+
+/* Set to 1 by maze_init() */
+static int maze_initialized;
+
+/* All maze_contexts, linked through maze_context.list, and the lock for it */
+static struct list_head maze_list;
+static spinlock_t maze_list_lock;
+
+/* Period of the per-CPU maze timer, in jiffies */
+static int maze_timer_interval = 10;
+
+/* Per-CPU queue of contexts waiting for a limit check, and the lock for it */
+DEFINE_PER_CPU(struct list_head, maze_queue);
+DEFINE_PER_CPU(spinlock_t, maze_queue_lock);
+
+/* Per-CPU timer that runs timer_handler() */
+DEFINE_PER_CPU(struct timer_list, maze_timer);
+
+/*
+ * Restart the maze count of @context: take the task's current utime and
+ * stime as the new baseline and forget which limit signals were sent.
+ */
+static inline void reset_maze_count(struct maze_context *context)
+{
+	const struct task_struct *tsk = context->task;
+
+	context->flags = 0;
+	context->reset_stime = tsk->stime;
+	context->reset_utime = tsk->utime;
+}
+
+/*
+ * Return the maze count of @context: the user plus system CPU time its task
+ * has accumulated since the baseline taken by reset_maze_count().
+ */
+static inline cputime_t get_maze_count(struct maze_context *context)
+{
+ return (context->task->utime - context->reset_utime) +
+ (context->task->stime - context->reset_stime);
+}
+
+/*
+ * Called from start_kernel(): init_task starts without a maze_context,
+ * i.e. unmonitored.
+ */
+void maze_init_early(void)
+{
+ init_task.maze_context = NULL;
+}
+
+/*
+ * maze_exit - release the maze_context of an exiting task.
+ * @task: the exiting task; do_exit() passes the task it is running in.
+ *
+ * Unregisters the preempt notifier (it is embedded in the context, so it
+ * must not stay registered once the context is freed), removes the context
+ * from the per-CPU queue it may be on and from maze_list, and frees it.
+ * Does nothing if the task has no context.
+ */
+void maze_exit(struct task_struct *task)
+{
+	struct maze_context *context;
+	spinlock_t *queue_lock;
+	unsigned long flags;
+
+	task_lock(task);
+	context = task->maze_context;
+	if (!context)
+		goto out;
+
+	/* Clear the task's pointer before the context is torn down and freed. */
+	task->maze_context = NULL;
+
+	/*
+	 * The notifier is only ever registered from the owning task (see the
+	 * callers of set_preempt_notifir()), so it is only removed when that
+	 * task is the one running here.
+	 */
+	if (task == current && atomic_xchg(&context->registered_notifier, 0))
+		preempt_notifier_unregister(&context->notifier);
+
+	/* Pick the lock once so lock and unlock always use the same one. */
+	queue_lock = &per_cpu(maze_queue_lock, context->enqueue_cpu);
+	spin_lock_irqsave(queue_lock, flags);
+	if (context->enqueue)
+		list_del(&context->queue);
+	spin_unlock_irqrestore(queue_lock, flags);
+
+	spin_lock(&maze_list_lock);
+	list_del(&context->list);
+	spin_unlock(&maze_list_lock);
+
+	kfree(context);
+out:
+	task_unlock(task);
+}
+
+/* Copy both limits and both signals from @from to @to; nothing else changes. */
+static inline void copy_limit_and_signal(struct maze_context *to,
+					 struct maze_context *from)
+{
+	to->soft_signal = from->soft_signal;
+	to->hard_signal = from->hard_signal;
+	to->soft_limit = from->soft_limit;
+	to->hard_limit = from->hard_limit;
+}
+
+/*
+ * Put @context on the maze_queue of the CPU this runs on, unless its enqueue
+ * flag says it is already queued, and record that CPU in enqueue_cpu.
+ *
+ * Must be called under task_lock().
+ *
+ * NOTE(review): the enqueue flag is tested under the local CPU's queue lock
+ * although the context may sit on another CPU's queue - confirm this cannot
+ * race with timer_handler() running on that CPU.
+ */
+static inline void enqueue(struct maze_context *context)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&__get_cpu_var(maze_queue_lock), flags);
+ if (!context->enqueue) {
+ list_add(&context->queue, &__get_cpu_var(maze_queue));
+ context->enqueue_cpu = smp_processor_id();
+ context->enqueue = 1;
+ }
+ spin_unlock_irqrestore(&__get_cpu_var(maze_queue_lock), flags);
+}
+
+/*
+ * sched_in callback of preempt_ops.  If the current task has a maze_context,
+ * restart its maze count when the task's state is not TASK_RUNNING, then
+ * queue the context on this CPU's maze_queue.
+ */
+static void sched_in_event(struct preempt_notifier *notifier, int cpu)
+{
+ task_lock(current);
+ if (current->maze_context) {
+ if (current->state != TASK_RUNNING)
+ reset_maze_count(current->maze_context);
+ enqueue(current->maze_context);
+ }
+ task_unlock(current);
+}
+
+/*
+ * sched_out callback of preempt_ops.  Queues the maze_context of @next, if
+ * it has one, on this CPU's maze_queue.
+ *
+ * NOTE(review): this acts on @next rather than on the task that owns
+ * @notifier - confirm that this is intended.
+ */
+static void sched_out_event(struct preempt_notifier *notifier,
+ struct task_struct *next)
+{
+ task_lock(next);
+ if (next->maze_context)
+ enqueue(next->maze_context);
+ task_unlock(next);
+}
+
+/* Callbacks installed in the preempt notifier of every maze_context */
+static struct preempt_ops preempt_ops = {
+ .sched_in = sched_in_event,
+ .sched_out = sched_out_event,
+};
+
+/*
+ * maze_fork - let a new child inherit monitoring from its parent.
+ * @task: the task being created; called from copy_process().
+ *
+ * If the parent (current) has a maze_context, the child gets a context of
+ * its own carrying the parent's limits and signals, and that context is
+ * added to maze_list.  If the allocation fails the child is left without a
+ * context.
+ */
+void maze_fork(struct task_struct *task)
+{
+	struct maze_context *context;
+
+	task->maze_context = NULL;
+
+	if (!current->maze_context)
+		return;
+
+	/* Allocate before task_lock() is taken so the allocation may sleep. */
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (unlikely(!context)) {
+		ERRPRINT("fail to alloc maze_context.\n");
+		return;
+	}
+
+	context->task = task;
+	context->pid = task->pid;
+	copy_limit_and_signal(context, current->maze_context);
+	preempt_notifier_init(&context->notifier, &preempt_ops);
+
+	task_lock(task);
+
+	/*
+	 * copy_process() has already dropped tasklist_lock, so
+	 * add_maze_context() may have attached a context in the meantime.
+	 * Keep that one instead of overwriting (and leaking) it.
+	 */
+	if (unlikely(task->maze_context)) {
+		task_unlock(task);
+		kfree(context);
+		return;
+	}
+
+	task->maze_context = context;
+
+	spin_lock(&maze_list_lock);
+	list_add(&context->list, &maze_list);
+	spin_unlock(&maze_list_lock);
+
+	task_unlock(task);
+}
+
+/*
+ * Called from sys_sched_yield(): a task that yields gets its maze count
+ * restarted.  Tasks without a maze_context are ignored.
+ */
+void maze_sched_yield(struct task_struct *task)
+{
+	struct maze_context *context = task->maze_context;
+
+	if (!context)
+		return;
+
+	reset_maze_count(context);
+}
+
+/* Send the soft-limit signal to @task and record that it has been sent. */
+static void soft_limit_event(struct task_struct *task)
+{
+	struct maze_context *context = task->maze_context;
+
+	send_sig(context->soft_signal, task, 1);
+	context->flags |= MAZE_SENT_SOFT_SIG;
+}
+
+/* Send the hard-limit signal to @task and record that it has been sent. */
+static void hard_limit_event(struct task_struct *task)
+{
+	struct maze_context *context = task->maze_context;
+
+	send_sig(context->hard_signal, task, 1);
+	context->flags |= MAZE_SENT_HARD_SIG;
+}
+
+/*
+ * Register the preempt notifier of @context unless that was already done.
+ * registered_notifier is set with atomic_xchg(), so only the caller that
+ * flips it from 0 to 1 performs the registration.
+ */
+static inline void set_preempt_notifir(struct maze_context *context)
+{
+ if (!atomic_xchg(&context->registered_notifier, 1))
+ preempt_notifier_register(&context->notifier);
+}
+
+/*
+ * add_maze_context - attach @context to the task with pid context->pid.
+ *
+ * If that task has no maze_context yet, @context is installed, linked into
+ * maze_list and its maze count is started; the task then owns @context.
+ * If the task already has one, only its limits and signals are updated
+ * from @context.
+ *
+ * Returns 0 if @context was taken over by the task.  Returns nonzero if the
+ * caller still owns @context and has to free it: the task already had a
+ * context, or no task with that pid exists.
+ */
+static int add_maze_context(struct maze_context *context)
+{
+	struct task_struct *task;
+	int ret = 1;
+
+	rcu_read_lock();
+
+	task = find_task_by_pid(context->pid);
+	if (unlikely(!task))
+		goto read_unlock;	/* ret stays nonzero: caller frees */
+
+	task_lock(task);
+
+	if (!task->maze_context) {
+		task->maze_context = context;
+		context->task = task;
+
+		spin_lock(&maze_list_lock);
+		list_add(&context->list, &maze_list);
+		spin_unlock(&maze_list_lock);
+
+		reset_maze_count(context);
+
+		preempt_notifier_init(&context->notifier, &preempt_ops);
+		/* Otherwise timer_handler() registers it once the task runs. */
+		if (current == task)
+			set_preempt_notifir(context);
+
+		ret = 0;
+	} else {
+		copy_limit_and_signal(task->maze_context, context);
+	}
+
+	task_unlock(task);
+read_unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
+/*
+ * build_maze_context - parse one line written to /proc/maze/entries.
+ * @read_line: NUL-terminated text of the form
+ *             "pid soft_limit hard_limit soft_signal hard_signal"
+ * @context:   receives the five parsed values
+ *
+ * All five fields are decimal numbers and must not be negative.
+ * Returns 0 on success and -EINVAL if the line cannot be parsed.
+ */
+static int build_maze_context(const char *read_line, struct maze_context *context)
+{
+	/* Signed, so that "%ld" matches and negative input can be rejected. */
+	long soft_limit, hard_limit;
+	int soft_signal, hard_signal;
+	int pid, res;
+
+	res = sscanf(read_line, "%d %ld %ld %d %d", &pid, &soft_limit,
+		     &hard_limit, &soft_signal, &hard_signal);
+
+	if (res != 5 || pid < 0 ||
+	    soft_limit < 0 || hard_limit < 0 ||
+	    soft_signal < 0 || hard_signal < 0)
+		return -EINVAL;
+
+	context->pid = pid;
+	context->soft_limit = soft_limit;
+	context->hard_limit = hard_limit;
+	context->soft_signal = soft_signal;
+	context->hard_signal = hard_signal;
+
+	return 0;
+}
+
+static void timer_handler(unsigned long data);
+
+/*
+ * (Re)arm the maze timer of @cpu so that timer_handler() runs on that CPU
+ * maze_timer_interval jiffies from now.
+ */
+static void continue_timer(int cpu)
+{
+	struct timer_list *timer = &per_cpu(maze_timer, cpu);
+
+	/* Initialise first, then fill in the fields. */
+	init_timer(timer);
+	timer->function = timer_handler;
+	timer->expires = jiffies + maze_timer_interval;
+	add_timer_on(timer, cpu);
+}
+
+/*
+ * Compare the maze count of @context with its limits and send at most one
+ * signal: the soft-limit signal as long as it has not been sent, after that
+ * the hard-limit signal as long as that one has not been sent.
+ */
+static inline void check_limit(struct maze_context *context)
+{
+	const cputime_t used = get_maze_count(context);
+
+	if (!(context->flags & MAZE_SENT_SOFT_SIG)) {
+		if (used >= context->soft_limit)
+			soft_limit_event(context->task);
+		return;
+	}
+
+	if (context->flags & MAZE_SENT_HARD_SIG)
+		return;
+
+	if (used >= context->hard_limit)
+		hard_limit_event(context->task);
+}
+
+/*
+ * Callback of the per-CPU maze timer (installed by continue_timer()).
+ *
+ * For the task running on this CPU: registers its preempt notifier if that
+ * has not happened yet and checks its limits directly when its context is
+ * not queued.  Then checks and dequeues every context on this CPU's
+ * maze_queue, and finally re-arms the timer.  @data is not used.
+ *
+ * NOTE(review): current->maze_context is read here without task_lock() -
+ * confirm this cannot race with maze_exit() freeing the context.
+ */
+static void timer_handler(unsigned long data)
+{
+ struct maze_context *context, *next;
+
+ spin_lock(&__get_cpu_var(maze_queue_lock));
+
+ if (current->maze_context) {
+ set_preempt_notifir(current->maze_context);
+ if (!current->maze_context->enqueue)
+ check_limit(current->maze_context);
+ }
+
+ /* The _safe iterator is needed because entries are unlinked in the loop */
+ if (!list_empty(&__get_cpu_var(maze_queue))) {
+ list_for_each_entry_safe (context, next,
+ &__get_cpu_var(maze_queue), queue) {
+ check_limit(context);
+ list_del(&context->queue);
+ context->enqueue = 0;
+ }
+ }
+
+ spin_unlock(&__get_cpu_var(maze_queue_lock));
+
+ continue_timer(smp_processor_id());
+}
+
+
+/*
+ * Show routine of /proc/maze/entries: prints one line per context on
+ * maze_list with its pid, current maze count, limits and signals.
+ * Always returns 0.
+ */
+static int maze_entries_file_show(struct seq_file *seq, void *nouse)
+{
+	struct maze_context *context;
+
+	spin_lock(&maze_list_lock);
+
+	list_for_each_entry(context, &maze_list, list) {
+		seq_printf(seq, "pid:%5d ", context->pid);
+		/* Cast so the argument matches %lu however cputime_t is defined. */
+		seq_printf(seq, "count:%6lu ",
+			   (unsigned long)get_maze_count(context));
+		/* The limits are unsigned long, hence %lu. */
+		seq_printf(seq, "soft limit:%6lu ", context->soft_limit);
+		seq_printf(seq, "hard limit:%6lu ", context->hard_limit);
+		seq_printf(seq, "soft signal:%2d ", context->soft_signal);
+		seq_printf(seq, "hard signal:%2d ", context->hard_signal);
+		seq_printf(seq, "\n");
+	}
+
+	spin_unlock(&maze_list_lock);
+
+	return 0;
+}
+
+/*
+ * Open operation of /proc/maze/entries: sets the file up through
+ * single_open() with maze_entries_file_show() as its show routine.
+ */
+static int maze_entries_file_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, maze_entries_file_show, NULL);
+}
+
+/*
+ * Release operation of /proc/maze/entries: counterpart of the single_open()
+ * call made in maze_entries_file_open().
+ */
+static int maze_entries_file_release(struct inode *inode, struct file *file)
+{
+ return single_release(inode, file);
+}
+
+/*
+ * Write operation of /proc/maze/entries
+ *
+ * Takes one line "pid soft_limit hard_limit soft_signal hard_signal" of at
+ * most sizeof(read_line) - 1 bytes.  The task with that pid gets a new
+ * maze_context, or, if it already has one, new limits and signals.
+ * The writer needs CAP_SYS_ADMIN.
+ *
+ * Returns @count on success, or -EPERM, -EINVAL, -EFAULT or -ENOMEM.
+ */
+static ssize_t maze_entries_file_write(struct file *file,
+				       const char __user *buffer,
+				       size_t count, loff_t *ppos)
+{
+	struct maze_context *context;
+	char read_line[32];
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (count > sizeof(read_line) - 1)
+		return -EINVAL;
+
+	if (copy_from_user(read_line, buffer, count))
+		return -EFAULT;
+
+	read_line[count] = '\0';
+
+	/* No lock is held here, so the allocation is allowed to sleep. */
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (unlikely(!context)) {
+		ERRPRINT("fail to alloc maze_context.\n");
+		return -ENOMEM;
+	}
+
+	ret = build_maze_context(read_line, context);
+	if (ret) {
+		kfree(context);
+		return ret;
+	}
+
+	/* Nonzero means the context was not taken over by a task: free it. */
+	if (add_maze_context(context))
+		kfree(context);
+
+	return count;
+}
+
+/*
+ * File operations of /proc/maze/entries: reading goes through the seq_file
+ * helpers, writing through maze_entries_file_write().
+ */
+static struct file_operations maze_entries_file_ops = {
+ .owner = THIS_MODULE,
+ .open = maze_entries_file_open,
+ .read = seq_read,
+ .write = maze_entries_file_write,
+ .llseek = seq_lseek,
+ .release = maze_entries_file_release,
+};
+
+/*
+ * Create the /proc/maze directory and remember it in maze_proc_dir.
+ * Panics if the directory cannot be created, so the only value ever
+ * returned is 0.
+ */
+static inline int init_dir(void)
+{
+ maze_proc_dir =
+ create_proc_entry("maze",
+ S_IFDIR | S_IRUGO | S_IXUGO,
+ NULL);
+
+ if (!maze_proc_dir)
+ panic("fail to create /proc/maze\n");
+
+ return 0;
+}
+
+
+/*
+ * Create /proc/maze/entries inside maze_proc_dir and hook up its file
+ * operations.  The file has a write operation, so its mode carries the
+ * owner write bit next to the read bits.
+ * Panics if the entry cannot be created, so the only value ever returned
+ * is 0.
+ */
+static inline int init_entries(void)
+{
+	struct proc_dir_entry *p;
+
+	p = create_proc_entry("entries", S_IRUGO | S_IWUSR, maze_proc_dir);
+	if (p == NULL)
+		panic("fail to create /proc/maze/entries\n");
+
+	p->proc_fops = &maze_entries_file_ops;
+
+	return 0;
+}
+
+/*
+ * Initialize the maze procfs interface: first the /proc/maze directory,
+ * then the entries file inside it.
+ */
+static void __init init_proc(void)
+{
+ init_dir();
+ init_entries();
+}
+
+/*
+ * Initialize the per-CPU maze state of @cpu (queue lock and queue) and start
+ * the maze timer of that CPU.
+ */
+static void __init init_cpu(int cpu)
+{
+ spin_lock_init(&per_cpu(maze_queue_lock, cpu));
+ INIT_LIST_HEAD(&per_cpu(maze_queue, cpu));
+
+ /* Setup softirq timer */
+ continue_timer(cpu);
+}
+
+/*
+ * Late initcall that brings MAZE up: initializes maze_list and its lock,
+ * sets up every online CPU, and only then creates the /proc/maze interface
+ * that reads and modifies that state.  Always returns 0.
+ */
+static int __init maze_init(void)
+{
+	int cpu;
+
+	printk(KERN_INFO "Maze: Initializing\n");
+
+	spin_lock_init(&maze_list_lock);
+	INIT_LIST_HEAD(&maze_list);
+
+	for_each_online_cpu(cpu)
+		init_cpu(cpu);
+
+	/* Make /proc/maze visible only after everything it uses is set up. */
+	init_proc();
+	maze_initialized = 1;
+
+	return 0;
+}
+late_initcall(maze_init);
--- linux-2.6.25-mm1-maze.orig/kernel/sched.c 2008-05-13 17:40:43.000000000 +0900
+++ linux-2.6.25-mm1-maze/kernel/sched.c 2008-05-13 17:52:23.000000000 +0900
@@ -68,6 +68,7 @@
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/ftrace.h>
+#include <linux/maze.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>
@@ -5157,6 +5158,8 @@ asmlinkage long sys_sched_yield(void)
_raw_spin_unlock(&rq->lock);
preempt_enable_no_resched();

+ maze_sched_yield(current);
+
schedule();

return 0;



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/