Re: 2.2.17pre10 pauses fixed by Andrea's VM-global-patch-1

From: Andrea Arcangeli (andrea@suse.de)
Date: Wed Jul 12 2000 - 11:32:48 EST


On Sat, 8 Jul 2000, Andrea Arcangeli wrote:

>Thanks for the feedback! However I must warn that such patch have a
>deadlock condition (pointed out by SCT, thanks!) that could cause a task

I've fixed the deadlock condition in this new patch (can be applied to
17pre9 or 17pre11 indifferently):

        ftp://ftp.*.kernel.org/pub/linux/kernel/people/andrea/patches/v2.2/2.2.17pre11/VM-global-patch-2.gz

The diff between VM-global-patch-1 and VM-global-patch-2 is here below
(it's the bit that fixes the deadlock). The new -2 patch run Cerberus for
more than 2 hours without any apparent problem. (however Cerberus wasn't
able to exploit the deadlock condition anyway and old patch or new patch
won't make difference from a Cerberus point of view)

diff -urN old/fs/binfmt_aout.c 2.2.17pre9-VM/fs/binfmt_aout.c
--- old/fs/binfmt_aout.c Sun Apr 2 21:07:49 2000
+++ 2.2.17pre9-VM/fs/binfmt_aout.c Wed Jul 12 00:50:29 2000
@@ -62,9 +62,9 @@
 static int dump_write(struct file *file, const void *addr, int nr)
 {
         int r;
- down(&file->f_dentry->d_inode->i_sem);
+ fs_down(&file->f_dentry->d_inode->i_sem);
         r = file->f_op->write(file, addr, nr, &file->f_pos) == nr;
- up(&file->f_dentry->d_inode->i_sem);
+ fs_up(&file->f_dentry->d_inode->i_sem);
         return r;
 }
 
diff -urN old/fs/binfmt_elf.c 2.2.17pre9-VM/fs/binfmt_elf.c
--- old/fs/binfmt_elf.c Tue Jun 13 03:48:14 2000
+++ 2.2.17pre9-VM/fs/binfmt_elf.c Wed Jul 12 00:50:34 2000
@@ -948,9 +948,9 @@
 static int dump_write(struct file *file, const void *addr, int nr)
 {
         int r;
- down(&file->f_dentry->d_inode->i_sem);
+ fs_down(&file->f_dentry->d_inode->i_sem);
         r = file->f_op->write(file, addr, nr, &file->f_pos) == nr;
- up(&file->f_dentry->d_inode->i_sem);
+ fs_up(&file->f_dentry->d_inode->i_sem);
         return r;
 }
 
diff -urN old/fs/buffer.c 2.2.17pre9-VM/fs/buffer.c
--- old/fs/buffer.c Wed Jul 12 04:02:11 2000
+++ 2.2.17pre9-VM/fs/buffer.c Wed Jul 12 00:50:54 2000
@@ -362,9 +362,9 @@
                 goto out_putf;
 
         /* We need to protect against concurrent writers.. */
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
         err = file->f_op->fsync(file, dentry);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
 
 out_putf:
         fput(file);
@@ -399,9 +399,9 @@
                 goto out_putf;
 
         /* this needs further work, at the moment it is identical to fsync() */
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
         err = file->f_op->fsync(file, dentry);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
 
 out_putf:
         fput(file);
@@ -1467,7 +1467,7 @@
                 return 0;
         }
 
- if (!(page = __get_free_page(GFP_BUFFER)))
+ if (!(page = __get_free_page(GFP_KERNEL)))
                 return 0;
         bh = create_buffers(page, size, 0);
         if (!bh) {
diff -urN old/fs/coda/file.c 2.2.17pre9-VM/fs/coda/file.c
--- old/fs/coda/file.c Mon Jan 17 16:44:41 2000
+++ 2.2.17pre9-VM/fs/coda/file.c Wed Jul 12 00:58:02 2000
@@ -193,10 +193,10 @@
                 return -1;
         }
 
- down(&cont_inode->i_sem);
+ fs_down(&cont_inode->i_sem);
         result = cont_file.f_op->write(&cont_file , buff, count,
                                        &(cont_file.f_pos));
- up(&cont_inode->i_sem);
+ fs_up(&cont_inode->i_sem);
         coda_restore_codafile(coda_inode, coda_file, cont_inode, &cont_file);
         
         if (result)
@@ -232,14 +232,14 @@
         coda_prepare_openfile(coda_inode, coda_file, cont_inode,
                               &cont_file, &cont_dentry);
 
- down(&cont_inode->i_sem);
+ fs_down(&cont_inode->i_sem);
 
         result = file_fsync(&cont_file ,&cont_dentry);
         if ( result == 0 ) {
                 result = venus_fsync(coda_inode->i_sb, &(cnp->c_fid));
         }
 
- up(&cont_inode->i_sem);
+ fs_up(&cont_inode->i_sem);
 
         coda_restore_codafile(coda_inode, coda_file, cont_inode, &cont_file);
         return result;
diff -urN old/fs/dcache.c 2.2.17pre9-VM/fs/dcache.c
--- old/fs/dcache.c Wed Jul 12 04:02:11 2000
+++ 2.2.17pre9-VM/fs/dcache.c Mon Jul 3 17:44:25 2000
@@ -475,7 +475,7 @@
  */
 void shrink_dcache_memory(int priority, unsigned int gfp_mask)
 {
- if (gfp_mask & __GFP_IO) {
+ if (gfp_mask & __GFP_IO && !current->fs_locks) {
                 int count = 0;
                 if (priority > 1)
                         count = dentry_stat.nr_unused / priority;
diff -urN old/fs/ext2/super.c 2.2.17pre9-VM/fs/ext2/super.c
--- old/fs/ext2/super.c Wed Jul 12 04:02:11 2000
+++ 2.2.17pre9-VM/fs/ext2/super.c Mon Jul 3 17:40:21 2000
@@ -589,7 +589,7 @@
                                        EXT2_BLOCKS_PER_GROUP(sb);
         db_count = (sb->u.ext2_sb.s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
                    EXT2_DESC_PER_BLOCK(sb);
- sb->u.ext2_sb.s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_BUFFER);
+ sb->u.ext2_sb.s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
         if (sb->u.ext2_sb.s_group_desc == NULL) {
                 printk ("EXT2-fs: not enough memory\n");
                 goto failed_mount;
diff -urN old/fs/open.c 2.2.17pre9-VM/fs/open.c
--- old/fs/open.c Sun Apr 2 21:07:49 2000
+++ 2.2.17pre9-VM/fs/open.c Wed Jul 12 00:54:09 2000
@@ -73,7 +73,7 @@
         if ((off_t) length < 0)
                 return -EINVAL;
 
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
         newattrs.ia_size = length;
         newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
         error = notify_change(dentry, &newattrs);
@@ -83,7 +83,7 @@
                 if (inode->i_op && inode->i_op->truncate)
                         inode->i_op->truncate(inode);
         }
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
         return error;
 }
 
diff -urN old/fs/read_write.c 2.2.17pre9-VM/fs/read_write.c
--- old/fs/read_write.c Wed Jun 28 17:13:14 2000
+++ 2.2.17pre9-VM/fs/read_write.c Wed Jul 12 00:55:13 2000
@@ -166,9 +166,9 @@
         if (!file->f_op || !(write = file->f_op->write))
                 goto out;
 
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
         ret = write(file, buf, count, &file->f_pos);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
 out:
         fput(file);
 bad_file:
@@ -314,9 +314,9 @@
         if (!file)
                 goto bad_file;
         if (file->f_op && file->f_op->write && (file->f_mode & FMODE_WRITE)) {
- down(&file->f_dentry->d_inode->i_sem);
+ fs_down(&file->f_dentry->d_inode->i_sem);
                 ret = do_readv_writev(VERIFY_READ, file, vector, count);
- up(&file->f_dentry->d_inode->i_sem);
+ fs_up(&file->f_dentry->d_inode->i_sem);
         }
         fput(file);
 
@@ -386,9 +386,9 @@
         if (pos < 0)
                 goto out;
 
- down(&file->f_dentry->d_inode->i_sem);
+ fs_down(&file->f_dentry->d_inode->i_sem);
         ret = write(file, buf, count, &pos);
- up(&file->f_dentry->d_inode->i_sem);
+ fs_up(&file->f_dentry->d_inode->i_sem);
 
 out:
         fput(file);
diff -urN old/include/linux/fs.h 2.2.17pre9-VM/include/linux/fs.h
--- old/include/linux/fs.h Wed Jul 12 04:02:11 2000
+++ 2.2.17pre9-VM/include/linux/fs.h Wed Jul 12 01:20:19 2000
@@ -919,6 +919,9 @@
 
 extern __u32 inode_generation_count;
 
+#define fs_down(sem) do { current->fs_locks++; down(sem); } while (0)
+#define fs_up(sem) do { up(sem); current->fs_locks--; } while (0)
+
 #endif /* __KERNEL__ */
 
 #endif
diff -urN old/include/linux/locks.h 2.2.17pre9-VM/include/linux/locks.h
--- old/include/linux/locks.h Wed Jul 12 04:02:11 2000
+++ 2.2.17pre9-VM/include/linux/locks.h Wed Jul 12 01:22:36 2000
@@ -51,10 +51,12 @@
         if (sb->s_lock)
                 __wait_on_super(sb);
         sb->s_lock = 1;
+ current->fs_locks++;
 }
 
 extern inline void unlock_super(struct super_block * sb)
 {
+ current->fs_locks--;
         sb->s_lock = 0;
         wake_up(&sb->s_wait);
 }
diff -urN old/include/linux/sched.h 2.2.17pre9-VM/include/linux/sched.h
--- old/include/linux/sched.h Wed Jul 12 04:02:11 2000
+++ 2.2.17pre9-VM/include/linux/sched.h Wed Jul 12 01:22:35 2000
@@ -317,6 +317,7 @@
 /* memory management info */
         struct mm_struct *mm;
         struct list_head local_pages; int allocation_order, nr_local_pages;
+ int fs_locks;
 
 /* signal handlers */
         spinlock_t sigmask_lock; /* Protects signal and blocked */
@@ -397,7 +398,7 @@
 /* tss */ INIT_TSS, \
 /* fs */ &init_fs, \
 /* files */ &init_files, \
-/* mm */ &init_mm, { &init_task.local_pages, &init_task.local_pages}, 0, 0, \
+/* mm */ &init_mm, { &init_task.local_pages, &init_task.local_pages}, 0, 0, 0, \
 /* signals */ SPIN_LOCK_UNLOCKED, &init_signals, {{0}}, {{0}}, NULL, &init_task.sigqueue, 0, 0, \
 /* exec cts */ 0,0, \
 /* oom */ 0, \
diff -urN old/mm/filemap.c 2.2.17pre9-VM/mm/filemap.c
--- old/mm/filemap.c Wed Jul 12 04:02:11 2000
+++ 2.2.17pre9-VM/mm/filemap.c Wed Jul 12 18:09:11 2000
@@ -834,12 +834,12 @@
 
         if (size > count)
                 size = count;
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
         old_fs = get_fs();
         set_fs(KERNEL_DS);
         written = file->f_op->write(file, area, size, &file->f_pos);
         set_fs(old_fs);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
         if (written < 0) {
                 desc->error = written;
                 written = 0;
@@ -1134,9 +1134,9 @@
          * and file could be released ... increment the count to be safe.
          */
         file->f_count++;
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
         result = do_write_page(inode, file, (const char *) page, offset);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
         fput(file);
         return result;
 }
@@ -1358,9 +1358,9 @@
                         if (file) {
                                 struct dentry * dentry = file->f_dentry;
                                 struct inode * inode = dentry->d_inode;
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
                                 error = file_fsync(file, dentry);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
                         }
                 }
                 return error;
diff -urN old/mm/vmscan.c 2.2.17pre9-VM/mm/vmscan.c
--- old/mm/vmscan.c Wed Jul 12 04:02:11 2000
+++ 2.2.17pre9-VM/mm/vmscan.c Mon Jul 3 17:43:02 2000
@@ -106,7 +106,7 @@
          * we cannot do I/O! Avoid recursing on FS
          * locks etc.
          */
- if (!(gfp_mask & __GFP_IO))
+ if (!(gfp_mask & __GFP_IO) || current->fs_locks)
                 return 0;
 
         /*
@@ -395,7 +395,7 @@
                 }
 
                 /* Try to get rid of some shared memory pages.. */
- if (gfp_mask & __GFP_IO) {
+ if (gfp_mask & __GFP_IO && !current->fs_locks) {
                         while (shm_swap(priority, gfp_mask)) {
                                 if (!--count)
                                         goto done;
I checked that the fix works by writing the worse possible testcase
suggestd in earlier email by Stephen. If I drop the fs_locks trick the
below testcase deadlocks the machine. With the above fs_locks stuff
included it doesn't deadlock and I still avoid kpiod asynchronous
behaviour as much as possible.

/*
 * mmap003.c
 *
 * Writing a swapped out buffer to a file that is MAP_SHARED and
 * its pages are all dirty. Idea from Stephen C. Tweedie <sct@redhat.com>.
 *
 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <asm/page.h>

#define TMPFILENAME "/tmp/deleteme.mmap003"

static unsigned long get_ramsize(void)
{
        FILE * meminfo;
        char * sed = "sed -n 's/^Mem:\\(.*\\)$/\\1/p' < /proc/meminfo";
        unsigned long _free = 0, cache = 0, buffers = 0, ramsize;

        meminfo = popen(sed, "r");
        if (!meminfo)
                perror("popen meminfo"), exit(1);
        if (!fscanf(meminfo, "%lu %lu %lu %lu %lu %lu\n",
                    &ramsize, &ramsize, &_free, &ramsize, &buffers, &cache))
                perror("fscan meminfo"), exit(1);
        if (pclose(meminfo) < 0)
                perror("pclose meminfo"), exit(1);

        ramsize = _free + cache + buffers;
        ramsize >>= 20;

        printf("Detected %lu MB of RAM\n", ramsize);

        return ramsize;
}

int main(int argc, char * arvg[])
{
        char * swapout, * tmp;
        unsigned long ramsize, i;
        int file;

        ramsize = ((get_ramsize() << 20) * 2 / 3) & PAGE_MASK;
        printf("Using %lu MB of VM\n", ramsize >> 20);

        tmp = malloc(ramsize);
        if (!tmp)
                perror("malloc tmp"), exit(1);

        swapout = malloc(ramsize);
        if (!swapout)
                perror("malloc swapout"), exit(1);

        printf("Allocating `swapout' memory..."); fflush(stdout);
        bzero(swapout, ramsize);
        printf("done.\n");

        printf("Swapping out `swapout' memory..."); fflush(stdout);
        bzero(tmp, ramsize);
        printf("done.\n");

        free(tmp);

        printf("Creating file..."); fflush(stdout);
        file = open(TMPFILENAME,
                    O_CREAT|O_EXCL|O_TRUNC|O_RDWR, 0600);
        if (file < 0)
                perror("open file"), exit(1);
        printf("done.\n");

        printf("Truncating file..."); fflush(stdout);
        if (ftruncate(file, ramsize) < 0)
                perror("ftruncate file"), exit(1);
        printf("done.\n");

        printf("Mapping file..."); fflush(stdout);
        tmp = mmap(NULL, ramsize, PROT_READ|PROT_WRITE, MAP_SHARED, file, 0);
        if (tmp == MAP_FAILED)
                perror("mmap file");
        printf("done.\n");

        printf("Dirtifying VM..."); fflush(stdout);
        i = ramsize;
        while (i) {
                *(unsigned long *)tmp = 0;
                tmp += PAGE_SIZE;
                i -= PAGE_SIZE;
        }
        printf("done.\n");

        printf("Writing to file..."); fflush(stdout);
        if (write(file, swapout, ramsize) < 0)
                perror("write file"), exit(1);
        printf("done.\n");

        printf("Closing file..."); fflush(stdout);
        if (close(file) < 0)
                perror("close file"), exit(1);
        printf("done.\n");

        printf("Unlinking file..."); fflush(stdout);
        if (unlink(TMPFILENAME))
                perror("unlink file"), exit(1);
        printf("done.\n");

        printf("Releasing swapout memory..."); fflush(stdout);
        free(swapout);
        printf("done.\n");

        printf("Unmapping file..."); fflush(stdout);
        munmap(tmp - ramsize, ramsize);
        printf("done.\n");

        return 0;
}

I think the VM-global-patch-2 patch is the best possible compromise we can
do with the 2.2.x VM and I believe it should fix all the MAP_SHARED
issues (with DBMS too) without hurting interactive performance of unloaded
system.

In the patch there's included the no-swapout patch, if you don't want it
and I'll provide an incremental patch to back it out.

Andrea

PS. Both mmap002 and the above mmap003 works fine with 8mbyte of RAM too
    with VM-global-patch-2 applied on top of 17pre11.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Sat Jul 15 2000 - 21:00:15 EST