mutex syscall patch, questions about clone and its possible bugs

Jakub Jelinek (jj@sunsite.ms.mff.cuni.cz)
Sat, 11 May 1996 23:47:58 +0200 (MET DST)


Hi!

I've started with my friends to write pthreads support for bound threads
(system wide contention scope). I would like to use the sys_clone syscall
for it. For the synchronization of threads, I think it would be valuable to
add a new syscall like the following one, since I think implementing simple
mutexes (which in the thread world are used at almost every step) using semget
etc. would be quite expensive. This is a simple syscall called
int mutex(__linux_mutex_t *m, int lock)
where m is a 64-bit structure that the process is required to initialize to
zero prior to the first call (otherwise it may get -EINVAL etc.). This
structure can be in a data segment/on the stack of some process (in that
case only processes created with clone(CLONE_VM...) can use this mutex
lock), or in shared memory. The lock parameter says, if you want to lock the
mutex (and if already locked, sleep until it is unlocked), trylock (lock if
it is possible without blocking) or unlock. Only the process which has
locked the mutex can unlock it. The lock will be unlocked, if the owning
process dies. I'm new to kernel hacking, so I would appreciate your
comments on the following patch very much.

My second question is: has anyone tried the clone system call? Was it
working? My experience (I'm testing on 1.3.98/i486) is that if clone is called
without the CLONE_VM flag, then everything goes ok (just if I set the new
stack somewhere, both the parent and child use the old stack - a simple test
program tells me that before clone(CLONE_FILES|CLONE_FS|SIGCHLD,40000000)
the stack is somewhere at BFFFFD20 and after in both parent and child it is
similar), but if I call clone(CLONE_VM|CLONE_FILES|CLONE_FS|SIGCHLD,40000000)
(- 40000000 is a return value from malloc, in fact some digits are not
zeroes), the child SIGSEGV faults before doing anything. And of course, core
doesn't show anything interesting. I've looked at the i386 port and saw that
the esp in pt_regs is set up correctly, but then in RESTORE_ALL you just
call iret and don't change esp and copy the 16 or whatever bytes from the
old to the new stack. Am I right with this? Can this be fixed? I don't think
restoring of esp in RESTORE_ALL would be a good idea, what about calling a
special RESTORE_CLONE_ALL on return from the sys_call routine?

Thank you for your comments and ideas on what's wrong with clone and how to
fix it. I'm not on the linux-kernel list, so please could you CC me the
answers?

Jakub Jelinek

diff -urN -X /usr/src/thread/backup/diffex linux-1.3.98/arch/i386/kernel/entry.S linux/arch/i386/kernel/entry.S
--- linux-1.3.98/arch/i386/kernel/entry.S Wed Mar 27 15:18:12 1996
+++ linux/arch/i386/kernel/entry.S Thu May 9 21:50:28 1996
@@ -686,4 +686,5 @@
.long SYMBOL_NAME(sys_sched_rr_get_interval)
.long SYMBOL_NAME(sys_nanosleep)
.long SYMBOL_NAME(sys_mremap)
- .space (NR_syscalls-163)*4
+ .long SYMBOL_NAME(sys_mutex)
+ .space (NR_syscalls-164)*4
diff -urN -X /usr/src/thread/backup/diffex linux-1.3.98/include/asm-i386/unistd.h linux/include/asm-i386/unistd.h
--- linux-1.3.98/include/asm-i386/unistd.h Fri Mar 22 07:34:02 1996
+++ linux/include/asm-i386/unistd.h Thu May 9 21:28:16 1996
@@ -169,6 +169,7 @@
#define __NR_sched_rr_get_interval 161
#define __NR_nanosleep 162
#define __NR_mremap 163
+#define __NR_mutex 164

/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */
#define _syscall0(type,name) \
diff -urN -X /usr/src/thread/backup/diffex linux-1.3.98/include/linux/mutex.h linux/include/linux/mutex.h
--- linux-1.3.98/include/linux/mutex.h Thu Jan 1 01:00:00 1970
+++ linux/include/linux/mutex.h Sat May 11 10:19:09 1996
@@ -0,0 +1,20 @@
+#ifndef _LINUX_MUTEX_H
+#define _LINUX_MUTEX_H
+
+#define __LINUX_MUTEX_LOCK 0 /* sys_mutex request: acquire, sleeping while another pid holds it */
+#define __LINUX_MUTEX_TRYLOCK 1 /* sys_mutex request: acquire, or fail with -EBUSY if held */
+#define __LINUX_MUTEX_UNLOCK 2 /* sys_mutex request: release; -EINVAL unless the caller is the owner */
+/* User-visible mutex state; sys_mutex treats owner == 0 as unlocked. */
+typedef struct __linux_mutex_tag {
+ __u32 owner; /* pid of the holder, 0 when free */
+ __u32 waiters; /* 1 + index of the kernel wait queue slot in use, 0 when none */
+} __linux_mutex_t;
+
+#ifdef __KERNEL__
+struct mutex_undo { /* one record per mutex a task holds, kept on a circular list */
+ __linux_mutex_t *mutex; /* user-space address of the held mutex */
+ struct mutex_undo *next; /* next record; a lone record points at itself */
+};
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_MUTEX_H */
diff -urN -X /usr/src/thread/backup/diffex linux-1.3.98/include/linux/sched.h linux/include/linux/sched.h
--- linux-1.3.98/include/linux/sched.h Mon Apr 29 16:26:04 1996
+++ linux/include/linux/sched.h Sat May 11 10:21:15 1996
@@ -24,6 +24,7 @@
#include <linux/smp.h>
#include <linux/tty.h>
#include <linux/sem.h>
+#include <linux/mutex.h>

/*
* cloning flags:
@@ -234,6 +235,7 @@
/* ipc stuff */
struct sem_undo *semundo;
struct sem_queue *semsleeping;
+ struct mutex_undo *mutexundo;
/* ldt for this task - used by Wine. If NULL, default_ldt is used */
struct desc_struct *ldt;
/* tss for this task */
@@ -304,7 +306,7 @@
/* math */ 0, \
/* comm */ "swapper", \
/* fs info */ 0,NULL, \
-/* ipc */ NULL, NULL, \
+/* ipc */ NULL, NULL, NULL, \
/* ldt */ NULL, \
/* tss */ INIT_TSS, \
/* fs */ &init_fs, \
diff -urN -X /usr/src/thread/backup/diffex linux-1.3.98/ipc/Makefile linux/ipc/Makefile
--- linux-1.3.98/ipc/Makefile Tue Jan 9 11:26:52 1996
+++ linux/ipc/Makefile Fri May 10 23:14:22 1996
@@ -8,7 +8,7 @@
# Note 2! The CFLAGS definition is now in the main makefile...

O_TARGET := ipc.o
-O_OBJS := util.o
+O_OBJS := util.o mutex.o

ifdef CONFIG_KERNELD
CONFIG_SYSVIPC=1
diff -urN -X /usr/src/thread/backup/diffex linux-1.3.98/ipc/mutex.c linux/ipc/mutex.c
--- linux-1.3.98/ipc/mutex.c Thu Jan 1 01:00:00 1970
+++ linux/ipc/mutex.c Sat May 11 15:59:41 1996
@@ -0,0 +1,216 @@
+/*
+ * mutex.c
+ *
+ * Copyright (C) 1996 Jakub Jelinek <jj@sunsite.mff.cuni.cz>
+ */
+
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/malloc.h>
+#include <asm/errno.h>
+#include <linux/mutex.h>
+
+#define MUTEX_ROOT_WAITERS 32 /* number of slot tables, each allocated on first use */
+#define MUTEX_TABLE_WAITERS 1024 /* wait queue slots per table */
+#define MUTEX_WAITERS (MUTEX_ROOT_WAITERS * MUTEX_TABLE_WAITERS)
+/* Slot tables for contended mutexes; a mutex's `waiters' field is 1 + its global slot index. */
+static struct waiters_head {
+ struct wait_queue **queue; /* table of MUTEX_TABLE_WAITERS queue heads, NULL until sys_mutex allocates it */
+ int first_free; /* index at which sys_mutex hands out the next slot of this table */
+} mutex_waiters[MUTEX_ROOT_WAITERS] = { { NULL, 0 }, };
+static struct semaphore mutex_lock = MUTEX; /* taken by sys_mutex and mutex_exit around all state changes */
+
+/* Insert a record for `mutex' right after the head of the circular undo list *p. */
+static int add_mutex_undo(struct mutex_undo **p, __linux_mutex_t *mutex)
+{
+	struct mutex_undo *head, *rec;
+	rec = (struct mutex_undo *)kmalloc (sizeof (*rec), GFP_KERNEL);
+	if (rec == NULL) return -ENOMEM;	/* the only failure; 0 otherwise */
+	head = *p;
+	rec->mutex = mutex;
+	rec->next = head ? head->next : rec;	/* a lone record links to itself */
+	if (head)
+		head->next = rec;
+	else
+		*p = rec;
+	return 0;
+}
+
+/*
+ * Unlink and free the record for `mutex' from the circular undo list *p.
+ * *p is set to NULL when the last record goes away and is moved to a
+ * surviving record when the record it pointed at is the one removed.
+ * An empty list is a no-op; a missing record is reported via printk.
+ */
+static void remove_mutex_undo(struct mutex_undo **p, __linux_mutex_t *mutex)
+{
+	struct mutex_undo *prev, *undo;
+
+	if (!*p) return;
+	/* Walk from the record after *p; *p itself is examined last. */
+	for (prev = *p, undo = prev->next; undo->mutex != mutex;
+	     prev = undo, undo = undo->next)
+		if (undo == *p) {
+			printk("Aiee, mutex_undo queue broken");
+			return;
+		}
+	if (undo == prev)	/* sole record: the list becomes empty */
+		*p = NULL;
+	else {
+		prev->next = undo->next;
+		if (*p == undo)	/* never leave *p pointing at freed memory */
+			*p = prev;
+	}
+	kfree (undo);
+}
+
+#define CHECK_WAITERS /* drop mutex_lock and return -EINVAL unless m.waiters is 1..MUTEX_WAITERS and its table exists */ \
+	if (m.waiters - 1 >= MUTEX_WAITERS || !mutex_waiters [(m.waiters - 1) / MUTEX_TABLE_WAITERS].queue) { \
+		up (&mutex_lock); \
+		return -EINVAL; \
+	}
+
+/*
+ * sys_mutex - lock, trylock or unlock the user-space mutex at `mutex'.
+ * `lock' is __LINUX_MUTEX_LOCK, __LINUX_MUTEX_TRYLOCK or __LINUX_MUTEX_UNLOCK.
+ * mutex->owner holds the pid of the holder (0 when free) and mutex->waiters
+ * holds 1 + the index of the wait queue slot assigned to the mutex (0 when
+ * none).  Everything runs under the global mutex_lock semaphore, which is
+ * released on every return path.
+ * Returns 0 on success (a lock request by the current owner also returns 0),
+ * -EBUSY when a trylock finds the mutex held, -EINVAL for a bad `lock' value,
+ * an invalid waiters slot or an unlock by a non-owner, -ENOMEM when no undo
+ * record or wait queue slot is available, or the verify_area() error.
+ */
+asmlinkage int sys_mutex(__linux_mutex_t *mutex, int lock)
+{
+	__linux_mutex_t m;
+	struct wait_queue **q = NULL;
+	struct wait_queue wait = { current, NULL };
+	unsigned long flags;
+	int i, j;
+	int retval = verify_area (VERIFY_WRITE, mutex, sizeof(__linux_mutex_t));
+
+	if (retval) return retval;
+	if ((unsigned int)lock > __LINUX_MUTEX_UNLOCK) return -EINVAL;
+
+	down (&mutex_lock);
+	memcpy_fromfs (&m, mutex, sizeof(__linux_mutex_t));
+	if (m.owner == (u32)current->pid) {
+		/* Should we shout we're trying to lock the mutex again? */
+		if (lock != __LINUX_MUTEX_UNLOCK) goto out;
+		if (m.waiters) {
+			CHECK_WAITERS
+			q = mutex_waiters [(m.waiters - 1) / MUTEX_TABLE_WAITERS].queue + ((m.waiters - 1) % MUTEX_TABLE_WAITERS);
+		}
+		remove_mutex_undo (&(current->mutexundo), mutex);
+		m.owner = 0;
+		if (!q || *q == NULL) {
+			/* Nobody is queued: drop the slot reference too. */
+			m.waiters = 0;
+			q = NULL;
+		}
+		memcpy_tofs(mutex, &m, sizeof(__linux_mutex_t));
+		if (q) wake_up(q);
+		goto out;
+	}
+	if (lock == __LINUX_MUTEX_UNLOCK) {
+		/* Only the owner may unlock. */
+		retval = -EINVAL;
+		goto out;
+	}
+	if (!m.owner) {
+		/* Free mutex: take it without sleeping. */
+		if (add_mutex_undo (&(current->mutexundo), mutex) < 0) {
+			retval = -ENOMEM;
+			goto out;
+		}
+		/*
+		 * m.waiters is left alone: sleepers woken by the previous
+		 * unlock may still sit on that slot, and clearing it here
+		 * would make the next unlock skip their wake-up.
+		 */
+		m.owner = (u32)current->pid;
+		memcpy_tofs(mutex, &m, sizeof(__linux_mutex_t));
+		goto out;
+	}
+	if (lock == __LINUX_MUTEX_TRYLOCK) {
+		retval = -EBUSY;
+		goto out;
+	}
+	/* __LINUX_MUTEX_LOCK on a held mutex: find or assign a wait slot. */
+	if (!m.waiters) {
+		for (i = 0; i < MUTEX_ROOT_WAITERS; i++) {
+			if (!mutex_waiters [i].queue) {
+				mutex_waiters [i].queue = (struct wait_queue **) kmalloc (MUTEX_TABLE_WAITERS * sizeof(struct wait_queue *), GFP_KERNEL);
+				if (!mutex_waiters [i].queue) break;
+				mutex_waiters [i].first_free = 0;
+				memset (mutex_waiters [i].queue, 0, MUTEX_TABLE_WAITERS * sizeof(struct wait_queue *));
+			}
+			if (mutex_waiters [i].first_free < MUTEX_TABLE_WAITERS)
+				break;
+		}
+		if (i == MUTEX_ROOT_WAITERS || !mutex_waiters [i].queue) {
+			/* Leave through `out' so mutex_lock is released. */
+			retval = -ENOMEM;
+			goto out;
+		}
+		j = mutex_waiters [i].first_free;
+		m.waiters = j + i * MUTEX_TABLE_WAITERS + 1;
+		q = mutex_waiters [i].queue + j;
+		/* Advance first_free to the next empty slot of table i. */
+		for (j++; j < MUTEX_TABLE_WAITERS; j++)
+			if (!mutex_waiters [i].queue [j]) break;
+		mutex_waiters [i].first_free = j;
+		memcpy_tofs(mutex, &m, sizeof(__linux_mutex_t));
+	} else {
+		CHECK_WAITERS
+		q = mutex_waiters [(m.waiters - 1) / MUTEX_TABLE_WAITERS].queue + ((m.waiters - 1) % MUTEX_TABLE_WAITERS);
+	}
+	add_wait_queue(q, &wait);
+	do {
+		current->state = TASK_UNINTERRUPTIBLE;
+		up (&mutex_lock);
+		save_flags(flags);
+		sti();
+		schedule();
+		restore_flags(flags);
+		down (&mutex_lock);
+		m.owner = get_user(&mutex->owner);
+	} while (m.owner && m.owner != (u32)current->pid);
+	remove_wait_queue(q, &wait);
+	if (*q == NULL && m.waiters) {
+		/* Last sleeper gone: hand the slot back to its table. */
+		i = (m.waiters - 1) / MUTEX_TABLE_WAITERS;
+		j = (m.waiters - 1) - i * MUTEX_TABLE_WAITERS;
+		if (mutex_waiters [i].first_free > j)
+			mutex_waiters [i].first_free = j;
+		m.waiters = 0;
+	}
+	if (add_mutex_undo (&(current->mutexundo), mutex) < 0) {
+		m.owner = 0;
+		retval = -ENOMEM;
+	} else
+		m.owner = (u32)current->pid;
+	memcpy_tofs(mutex, &m, sizeof(__linux_mutex_t));
+out:
+	up (&mutex_lock);
+	return retval;
+}
+
+void mutex_exit(void)	/* at exit: release every mutex still recorded as held */
+{
+	down(&mutex_lock);
+	while (current->mutexundo != NULL) {
+		__linux_mutex_t *mutex = current->mutexundo->mutex;	/* copy: the record may be freed below */
+		int err = -EINVAL;	/* stays non-zero unless sys_mutex unlocked (and so freed the record) */
+		if (!verify_area (VERIFY_WRITE, mutex, sizeof(__linux_mutex_t)) && get_user(&(mutex->owner)) == (u32)current->pid) {
+			up(&mutex_lock);	/* sys_mutex takes mutex_lock itself */
+			err = sys_mutex(mutex, __LINUX_MUTEX_UNLOCK);
+			down(&mutex_lock);
+		}
+		if (err) remove_mutex_undo(&(current->mutexundo), mutex);
+	}
+	up(&mutex_lock);
+}
+
diff -urN -X /usr/src/thread/backup/diffex linux-1.3.98/kernel/exit.c linux/kernel/exit.c
--- linux-1.3.98/kernel/exit.c Thu Apr 25 14:08:44 1996
+++ linux/kernel/exit.c Sat May 11 08:50:00 1996
@@ -20,6 +20,7 @@
#include <asm/pgtable.h>

extern void sem_exit (void);
+extern void mutex_exit (void);
extern void acct_process (long exitcode);
extern void kerneld_exit(void);

@@ -551,6 +552,7 @@
acct_process(code);
current->flags |= PF_EXITING;
del_timer(&current->real_timer);
+ mutex_exit();
sem_exit();
kerneld_exit();
__exit_mm(current);
diff -urN -X /usr/src/thread/backup/diffex linux-1.3.98/kernel/fork.c linux/kernel/fork.c
--- linux-1.3.98/kernel/fork.c Mon Apr 22 12:08:47 1996
+++ linux/kernel/fork.c Sat May 11 21:38:20 1996
@@ -265,6 +265,7 @@
goto bad_fork_cleanup_sighand;
copy_thread(nr, clone_flags, usp, p, regs);
p->semundo = NULL;
+ p->mutexundo = NULL;

/* ok, now we should be set up.. */
p->swappable = 1;

+---------------------------------------------------------------------------+
| Jakub Jelinek http://sunsite.mff.cuni.cz/~jj |
| Administrator of SunSITE Czech Republic jj@sunsite.mff.cuni.cz |
| Na Orechovce 7, 162 00 Praha 6, Czech Republic jj@gnu.ai.mit.edu |
| School: MFF UK, Praha; Work: MFF UK & VC CVUT, Praha jj@jfch.vc.cvut.cz |
+---------------------------------------------------------------------------+