[RFC] ps command race fix
From: KAMEZAWA Hiroyuki
Date: Fri Jul 14 2006 - 07:36:19 EST
Hi, this is an experimental patch for the probelm
- "ps command can miss some pid occationally"
please comment
the problem itself is very rare case, but the result is sometimes terrible
for example, when a user does
alive=`ps | grep command | grep -v command | wc -l`
to check process is alive or not (I think this user should use kill-0 ;)
-Kame
==
Now, prod_pid_readir() uses direct access to task and
indexing 'task list' as fallback.
Of course, entries in this list can be removed randomly.
So, following can happen when using 'ps' command.
==
1. assume task_list as
....-(taskA)-(taskB)-(taskC)-(taskD)-(taskE)-(taskF)-(taskG)-...
2. at getdents() iteration 'N', ps command's getdents() read entries before taskC.
and remenbers "I read X entries".
....-(taskA)-(taskB)-(taskC)-(taskD)-(taskE)-(taskF)-(taskG)-...
------(f_pos=X)---------^
getdents() remembers
- "taskC is next candidate to be read"
- "we already read X ents".
3. consider taskA and taskC exits, before next getdents(N+1)
....-(lost)-(taskB)-(lost)-(taskD)-(taskE)-(taskF)-(taskG)-...
------(f_pos=X)--------^
4. at getdents(N+1), becasue getdents() cannot find taskC, it skips 'X'
ents in the list.
from head of the list.
....-(taskB)-(taskD)-(taskE)-(taskF)-(taskG)-..
------(f_pos=X)--------^
5. in this case, taskD is skipped.
==
This patch changes indexing in the list to indexing in a table.
Table is created only for storing valid tgid.(not pid)
Tested on x86/ia64.
Signed-Off-By: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
fs/proc/base.c | 187 ++++++++++++++++++++++++++++++++----------------
include/linux/proc_fs.h | 4 +
include/linux/sched.h | 6 +
kernel/exit.c | 1
kernel/fork.c | 2
5 files changed, 138 insertions(+), 62 deletions(-)
Index: linux-2.6.18-rc1-kame/fs/proc/base.c
===================================================================
--- linux-2.6.18-rc1-kame.orig/fs/proc/base.c 2006-07-14 09:14:45.000000000 +0900
+++ linux-2.6.18-rc1-kame/fs/proc/base.c 2006-07-14 17:04:35.000000000 +0900
@@ -2121,59 +2121,126 @@
* In the case of a seek we start with &init_task and walk nr
* threads past it.
*/
-static struct task_struct *first_tgid(int tgid, unsigned int nr)
+
+rwlock_t tgid_table_lock;
+struct tgid_buffer {
+ struct list_head list;
+ int base_pos;
+ int nr_free;
+ int cache;
+ int *data;
+};
+
+LIST_HEAD(tgid_table);
+DECLARE_RWSEM(tgid_table_sem);
+int nr_tgid_table;
+int tgid_end_pos;
+#define get_tgid_buf(p) ((struct tgid_buffer *)((unsigned long)p & PAGE_MASK))
+#define ENT_PER_TGIDBUF ((PAGE_SIZE - sizeof(struct tgid_buffer))/sizeof(int))
+
+static inline int tgid_ent_to_pos(int *tgid)
{
- struct task_struct *pos;
- rcu_read_lock();
- if (tgid && nr) {
- pos = find_task_by_pid(tgid);
- if (pos && thread_group_leader(pos))
+ struct tgid_buffer *buf = get_tgid_buf(tgid);
+ return buf->base_pos + (tgid - buf->data);
+}
+
+static struct tgid_buffer *grow_tgid_buf(void)
+{
+ struct tgid_buffer *buf = (void *) get_zeroed_page(GFP_KERNEL);
+ BUG_ON(!buf);
+ INIT_LIST_HEAD(&buf->list);
+ buf->nr_free = ENT_PER_TGIDBUF;
+ buf->data = (void *)buf + sizeof(struct tgid_buffer);
+ buf->base_pos = nr_tgid_table * ENT_PER_TGIDBUF;
+ /* add to the tail */
+ list_add_tail(&buf->list,&tgid_table);
+ nr_tgid_table++;
+ tgid_end_pos += ENT_PER_TGIDBUF;
+ return buf;
+}
+
+void proc_register_tgid(struct task_struct *task)
+{
+ struct tgid_buffer *buf;
+ int tgid = task->tgid;
+ int *ent;
+ /* we just record thread group leader */
+ if (!thread_group_leader(task))
+ return;
+ down_write(&tgid_table_sem);
+ list_for_each_entry(buf, &tgid_table, list) {
+ if (buf->nr_free != 0)
goto found;
}
- /* If nr exceeds the number of processes get out quickly */
- pos = NULL;
- if (nr && nr >= nr_processes())
- goto done;
-
- /* If we haven't found our starting place yet start with
- * the init_task and walk nr tasks forward.
- */
- for (pos = next_task(&init_task); nr > 0; --nr) {
- pos = next_task(pos);
- if (pos == &init_task) {
- pos = NULL;
- goto done;
+ /* no free area... allocate new one */
+ buf = grow_tgid_buf();
+found:
+ ent = buf->data + buf->cache;
+ while (*ent) {
+ ent++;
+ if (ent - buf->data >= ENT_PER_TGIDBUF) {
+ ent = buf->data;
}
}
-found:
- get_task_struct(pos);
-done:
- rcu_read_unlock();
- return pos;
+ *ent = tgid;
+ task->tgid_buf_pos = ent;
+ buf->cache = (ent - buf->data);
+ buf->nr_free--;
+ up_write(&tgid_table_sem);
+ return;
}
-/*
- * Find the next task in the task list.
- * Return NULL if we loop or there is any error.
- *
- * The reference to the input task_struct is released.
- */
-static struct task_struct *next_tgid(struct task_struct *start)
+void proc_unregister_tgid(struct task_struct *task)
{
- struct task_struct *pos;
- rcu_read_lock();
- pos = start;
- if (pid_alive(start))
- pos = next_task(start);
- if (pid_alive(pos) && (pos != &init_task)) {
- get_task_struct(pos);
- goto done;
+ struct tgid_buffer *buf;
+ int *ent;
+ if (task->tgid_buf_pos == NULL)
+ return;
+ down_write(&tgid_table_sem);
+ ent = task->tgid_buf_pos;
+ buf = get_tgid_buf(ent);
+ *ent = 0;
+ buf->cache = ent - buf->data;
+ buf->nr_free++;
+ up_write(&tgid_table_sem);
+}
+
+static int *next_tgid(int *ent)
+{
+ struct tgid_buffer *buf;
+ int pos;
+ buf = get_tgid_buf(ent);
+ /* search next valid entry */
+ ent++;
+ pos = ent - buf->data;
+ do {
+ while (pos < ENT_PER_TGIDBUF) {
+ ent = buf->data + pos;
+ if (*ent)
+ return ent;
+ pos++;
+ }
+ buf = list_entry(buf->list.next, struct tgid_buffer, list);
+ pos = 0;
+ } while (&buf->list != &tgid_table);
+ return NULL;
+}
+
+static int *first_tgid(int nr)
+{
+ int *ent;
+ struct tgid_buffer *buf;
+ list_for_each_entry(buf, &tgid_table, list) {
+ if (nr < ENT_PER_TGIDBUF)
+ goto found;
+ nr -= ENT_PER_TGIDBUF;
}
- pos = NULL;
-done:
- rcu_read_unlock();
- put_task_struct(start);
- return pos;
+ return NULL;
+found:
+ ent = buf->data + nr;
+ if (*ent)
+ return ent;
+ return next_tgid(ent);
}
/* for the /proc/ directory itself, after non-process stuff has been done */
@@ -2181,8 +2248,7 @@
{
char buf[PROC_NUMBUF];
unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
- struct task_struct *task;
- int tgid;
+ int *tgid;
if (!nr) {
ino_t ino = fake_ino(0,PROC_TGID_INO);
@@ -2192,28 +2258,25 @@
nr++;
}
nr -= 1;
-
- /* f_version caches the tgid value that the last readdir call couldn't
- * return. lseek aka telldir automagically resets f_version to 0.
- */
- tgid = filp->f_version;
- filp->f_version = 0;
- for (task = first_tgid(tgid, nr);
- task;
- task = next_tgid(task), filp->f_pos++) {
+ if (nr >= tgid_end_pos)
+ return 0;
+ down_read(&tgid_table_sem);
+ for (tgid = first_tgid(nr);
+ tgid;
+ tgid = next_tgid(tgid)) {
int len;
ino_t ino;
- tgid = task->pid;
- len = snprintf(buf, sizeof(buf), "%d", tgid);
- ino = fake_ino(tgid, PROC_TGID_INO);
+ len = snprintf(buf, sizeof(buf), "%d", *tgid);
+ ino = fake_ino(*tgid, PROC_TGID_INO);
+ filp->f_pos = tgid_ent_to_pos(tgid) + FIRST_PROCESS_ENTRY + 1;
if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) {
- /* returning this tgid failed, save it as the first
- * pid for the next readir call */
- filp->f_version = tgid;
- put_task_struct(task);
break;
}
}
+ up_read(&tgid_table_sem);
+ if (!tgid) { /* EOF */
+ filp->f_pos = tgid_end_pos + FIRST_PROCESS_ENTRY + 1;
+ }
return 0;
}
Index: linux-2.6.18-rc1-kame/include/linux/proc_fs.h
===================================================================
--- linux-2.6.18-rc1-kame.orig/include/linux/proc_fs.h 2006-07-06 13:09:49.000000000 +0900
+++ linux-2.6.18-rc1-kame/include/linux/proc_fs.h 2006-07-14 19:52:23.000000000 +0900
@@ -200,6 +200,8 @@
remove_proc_entry(name,proc_net);
}
+extern void proc_register_tgid(struct task_struct *task);
+extern void proc_unregister_tgid(struct task_struct *task);
#else
#define proc_root_driver NULL
@@ -232,6 +234,8 @@
struct tty_driver;
static inline void proc_tty_register_driver(struct tty_driver *driver) {};
static inline void proc_tty_unregister_driver(struct tty_driver *driver) {};
+static inline void proc_register_tgid(struct task_struct *task) {};
+static inline void proc_unregister_tgid(struct task_struct *task) {};
extern struct proc_dir_entry proc_root;
Index: linux-2.6.18-rc1-kame/include/linux/sched.h
===================================================================
--- linux-2.6.18-rc1-kame.orig/include/linux/sched.h 2006-07-06 13:09:49.000000000 +0900
+++ linux-2.6.18-rc1-kame/include/linux/sched.h 2006-07-14 13:54:07.000000000 +0900
@@ -940,6 +940,12 @@
atomic_t fs_excl; /* holding fs exclusive resources */
struct rcu_head rcu;
+/*
+ * for proc root directory
+ */
+#ifdef CONFIG_PROC_FS
+ int *tgid_buf_pos;
+#endif
/*
* cache last used pipe for splice
Index: linux-2.6.18-rc1-kame/kernel/exit.c
===================================================================
--- linux-2.6.18-rc1-kame.orig/kernel/exit.c 2006-07-06 13:09:49.000000000 +0900
+++ linux-2.6.18-rc1-kame/kernel/exit.c 2006-07-14 13:54:07.000000000 +0900
@@ -166,6 +166,7 @@
sched_exit(p);
write_unlock_irq(&tasklist_lock);
+ proc_unregister_tgid(p);
proc_flush_task(p);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
Index: linux-2.6.18-rc1-kame/kernel/fork.c
===================================================================
--- linux-2.6.18-rc1-kame.orig/kernel/fork.c 2006-07-06 13:09:49.000000000 +0900
+++ linux-2.6.18-rc1-kame/kernel/fork.c 2006-07-14 13:54:07.000000000 +0900
@@ -179,6 +179,7 @@
atomic_set(&tsk->fs_excl, 0);
tsk->btrace_seq = 0;
tsk->splice_pipe = NULL;
+ tsk->tgid_buf_pos = NULL;
return tsk;
}
@@ -1244,6 +1245,7 @@
spin_unlock(¤t->sighand->siglock);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
+ proc_register_tgid(p);
return p;
bad_fork_cleanup_namespace:
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/