madvise() first draft

Chuck Lever (cel@monkey.org)
Wed, 25 Aug 1999 18:52:55 -0400 (EDT)


I'm posting a first draft of an implementation of the madvise(2) system
call, looking for feedback on the approach and the algorithms. I'd like
to see this eventually go into the kernel along with the mmap read-ahead
work I'm doing (and in fact madvise() becomes more meaningful once the
read-ahead work is there too). The patch is against 2.3.15-pre3.

It still needs some work in the non-i386 architecture-specific department.
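
For context, here is a rough sketch (not part of the patch) of how an
application might use the call against a mapped file. It assumes a libc
wrapper for madvise() and the MADV_* constants in <sys/mman.h>; with only
this patch applied you would have to invoke the syscall directly (see the
note after the patch).

/*
 * Usage sketch, not part of the patch: advise the kernel about a
 * sequential walk over a mapped file, then release the pages.
 * Assumes a libc madvise() wrapper exists.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct stat st;
	char *map;
	long sum = 0;
	off_t i;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0) {
		perror(argv[1]);
		return 1;
	}
	map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* We will walk the file front to back: let the kernel
	 * read ahead aggressively. */
	if (madvise(map, st.st_size, MADV_SEQUENTIAL) < 0)
		perror("madvise(MADV_SEQUENTIAL)");

	for (i = 0; i < st.st_size; i++)
		sum += map[i];

	/* Done with these pages; the kernel may reclaim them. */
	if (madvise(map, st.st_size, MADV_DONTNEED) < 0)
		perror("madvise(MADV_DONTNEED)");

	printf("byte sum: %ld\n", sum);
	munmap(map, st.st_size);
	close(fd);
	return 0;
}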

diff -ruN linux-2.3.15-ref/arch/alpha/kernel/osf_sys.c linux/arch/alpha/kernel/osf_sys.c
--- linux-2.3.15-ref/arch/alpha/kernel/osf_sys.c Tue Aug 24 14:40:29 1999
+++ linux/arch/alpha/kernel/osf_sys.c Wed Aug 25 16:38:44 1999
@@ -205,6 +205,8 @@
}


+#if 0
+/* Linux has an madvise implementation now. */
/*
* Heh. As documented by DEC..
*/
@@ -212,6 +214,7 @@
{
return 0;
}
+#endif

/*
* No need to acquire the kernel lock, we're local..
diff -ruN linux-2.3.15-ref/arch/i386/kernel/entry.S linux/arch/i386/kernel/entry.S
--- linux-2.3.15-ref/arch/i386/kernel/entry.S Tue Aug 24 14:40:29 1999
+++ linux/arch/i386/kernel/entry.S Wed Aug 25 16:35:31 1999
@@ -560,6 +560,7 @@
.long SYMBOL_NAME(sys_ni_syscall) /* streams1 */
.long SYMBOL_NAME(sys_ni_syscall) /* streams2 */
.long SYMBOL_NAME(sys_vfork) /* 190 */
+ .long SYMBOL_NAME(sys_madvise)

/*
* NOTE!! This doesn't have to be exact - we just have
@@ -567,6 +568,6 @@
* entries. Don't panic if you notice that this hasn't
* been shrunk every time we add a new system call.
*/
- .rept NR_syscalls-190
+ .rept NR_syscalls-189
.long SYMBOL_NAME(sys_ni_syscall)
.endr
diff -ruN linux-2.3.15-ref/arch/m68k/kernel/entry.S linux/arch/m68k/kernel/entry.S
--- linux-2.3.15-ref/arch/m68k/kernel/entry.S Tue Aug 24 14:41:01 1999
+++ linux/arch/m68k/kernel/entry.S Wed Aug 25 16:35:50 1999
@@ -600,6 +600,7 @@
.long SYMBOL_NAME(sys_ni_syscall) /* streams1 */
.long SYMBOL_NAME(sys_ni_syscall) /* streams2 */
.long SYMBOL_NAME(sys_vfork) /* 190 */
+ .long SYMBOL_NAME(sys_madvise)

.rept NR_syscalls-(.-SYMBOL_NAME(sys_call_table))/4
.long SYMBOL_NAME(sys_ni_syscall)
diff -ruN linux-2.3.15-ref/arch/mips/kernel/syscalls.h linux/arch/mips/kernel/syscalls.h
--- linux-2.3.15-ref/arch/mips/kernel/syscalls.h Tue Aug 24 14:39:26 1999
+++ linux/arch/mips/kernel/syscalls.h Wed Aug 25 16:37:06 1999
@@ -225,3 +225,4 @@
SYS(sys_sendfile, 3)
SYS(sys_ni_syscall, 0)
SYS(sys_ni_syscall, 0)
+SYS(sys_madvise, 3) /* 4210 */
diff -ruN linux-2.3.15-ref/arch/mips/kernel/sysirix.c linux/arch/mips/kernel/sysirix.c
--- linux-2.3.15-ref/arch/mips/kernel/sysirix.c Tue Aug 24 14:39:26 1999
+++ linux/arch/mips/kernel/sysirix.c Wed Aug 25 16:40:30 1999
@@ -1136,6 +1136,10 @@
return retval;
}

+/*
+ * XXX Linux has a native madvise() system call now.
+ * This may need to do something Irix-specific.
+ */
asmlinkage int irix_madvise(unsigned long addr, int len, int behavior)
{
lock_kernel();
diff -ruN linux-2.3.15-ref/arch/ppc/kernel/misc.S linux/arch/ppc/kernel/misc.S
--- linux-2.3.15-ref/arch/ppc/kernel/misc.S Tue Aug 24 14:39:28 1999
+++ linux/arch/ppc/kernel/misc.S Wed Aug 25 16:36:05 1999
@@ -894,4 +894,5 @@
.long sys_ni_syscall /* streams1 */
.long sys_ni_syscall /* streams2 */
.long sys_vfork
+ .long sys_madvise
.space (NR_syscalls-183)*4
diff -ruN linux-2.3.15-ref/include/asm-alpha/mman.h linux/include/asm-alpha/mman.h
--- linux-2.3.15-ref/include/asm-alpha/mman.h Sun Jan 25 19:31:47 1998
+++ linux/include/asm-alpha/mman.h Tue Aug 24 17:02:21 1999
@@ -31,6 +31,13 @@
#define MCL_CURRENT 8192 /* lock all currently mapped pages */
#define MCL_FUTURE 16384 /* lock all additions to address space */

+/* used by madvise() */
+#define MADV_NORMAL 0x0 /* default page-in behavior */
+#define MADV_RANDOM 0x1 /* page-in minimum required */
+#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
+#define MADV_WILLNEED 0x3 /* pre-fault pages */
+#define MADV_DONTNEED 0x4 /* free these pages */
+
/* compatibility flags */
#define MAP_ANON MAP_ANONYMOUS
#define MAP_FILE 0
diff -ruN linux-2.3.15-ref/include/asm-arm/mman.h linux/include/asm-arm/mman.h
--- linux-2.3.15-ref/include/asm-arm/mman.h Tue Jan 20 19:39:42 1998
+++ linux/include/asm-arm/mman.h Tue Aug 24 17:03:12 1999
@@ -22,6 +22,12 @@
#define MS_INVALIDATE 2 /* invalidate the caches */
#define MS_SYNC 4 /* synchronous memory sync */

+#define MADV_NORMAL 0x0 /* default page-in behavior */
+#define MADV_RANDOM 0x1 /* page-in minimum required */
+#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
+#define MADV_WILLNEED 0x3 /* pre-fault pages */
+#define MADV_DONTNEED 0x4 /* free these pages */
+
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */

diff -ruN linux-2.3.15-ref/include/asm-i386/mman.h linux/include/asm-i386/mman.h
--- linux-2.3.15-ref/include/asm-i386/mman.h Mon Oct 7 01:55:48 1996
+++ linux/include/asm-i386/mman.h Tue Aug 24 17:03:48 1999
@@ -22,6 +22,12 @@
#define MS_INVALIDATE 2 /* invalidate the caches */
#define MS_SYNC 4 /* synchronous memory sync */

+#define MADV_NORMAL 0x0 /* default page-in behavior */
+#define MADV_RANDOM 0x1 /* page-in minimum required */
+#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
+#define MADV_WILLNEED 0x3 /* pre-fault pages */
+#define MADV_DONTNEED 0x4 /* free these pages */
+
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */

diff -ruN linux-2.3.15-ref/include/asm-m68k/mman.h linux/include/asm-m68k/mman.h
--- linux-2.3.15-ref/include/asm-m68k/mman.h Fri Nov 22 08:56:36 1996
+++ linux/include/asm-m68k/mman.h Tue Aug 24 17:04:22 1999
@@ -22,6 +22,12 @@
#define MS_INVALIDATE 2 /* invalidate the caches */
#define MS_SYNC 4 /* synchronous memory sync */

+#define MADV_NORMAL 0x0 /* default page-in behavior */
+#define MADV_RANDOM 0x1 /* page-in minimum required */
+#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
+#define MADV_WILLNEED 0x3 /* pre-fault pages */
+#define MADV_DONTNEED 0x4 /* free these pages */
+
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */

diff -ruN linux-2.3.15-ref/include/asm-mips/mman.h linux/include/asm-mips/mman.h
--- linux-2.3.15-ref/include/asm-mips/mman.h Thu Jun 26 15:33:40 1997
+++ linux/include/asm-mips/mman.h Tue Aug 24 17:05:07 1999
@@ -51,6 +51,15 @@
#define MS_INVALIDATE 2 /* invalidate mappings & caches */

/*
+ * Flags for madvise
+ */
+#define MADV_NORMAL 0x0 /* default page-in behavior */
+#define MADV_RANDOM 0x1 /* page-in minimum required */
+#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
+#define MADV_WILLNEED 0x3 /* pre-fault pages */
+#define MADV_DONTNEED 0x4 /* free these pages */
+
+/*
* Flags for mlockall
*/
#define MCL_CURRENT 1 /* lock all current mappings */
diff -ruN linux-2.3.15-ref/include/asm-ppc/mman.h linux/include/asm-ppc/mman.h
--- linux-2.3.15-ref/include/asm-ppc/mman.h Wed Dec 18 03:54:09 1996
+++ linux/include/asm-ppc/mman.h Tue Aug 24 17:05:44 1999
@@ -22,6 +22,12 @@
#define MS_INVALIDATE 2 /* invalidate the caches */
#define MS_SYNC 4 /* synchronous memory sync */

+#define MADV_NORMAL 0x0 /* default page-in behavior */
+#define MADV_RANDOM 0x1 /* page-in minimum required */
+#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
+#define MADV_WILLNEED 0x3 /* pre-fault pages */
+#define MADV_DONTNEED 0x4 /* free these pages */
+
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
#define MCL_FUTURE 0x4000 /* lock all additions to address space */

diff -ruN linux-2.3.15-ref/include/asm-sparc/mman.h linux/include/asm-sparc/mman.h
--- linux-2.3.15-ref/include/asm-sparc/mman.h Sat Nov 9 03:29:41 1996
+++ linux/include/asm-sparc/mman.h Tue Aug 24 17:06:17 1999
@@ -28,6 +28,12 @@
#define MS_INVALIDATE 2 /* invalidate the caches */
#define MS_SYNC 4 /* synchronous memory sync */

+#define MADV_NORMAL 0x0 /* default page-in behavior */
+#define MADV_RANDOM 0x1 /* page-in minimum required */
+#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
+#define MADV_WILLNEED 0x3 /* pre-fault pages */
+#define MADV_DONTNEED 0x4 /* free these pages */
+
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
#define MCL_FUTURE 0x4000 /* lock all additions to address space */

diff -ruN linux-2.3.15-ref/include/asm-sparc64/mman.h linux/include/asm-sparc64/mman.h
--- linux-2.3.15-ref/include/asm-sparc64/mman.h Fri Dec 13 04:37:47 1996
+++ linux/include/asm-sparc64/mman.h Tue Aug 24 17:06:48 1999
@@ -28,6 +28,12 @@
#define MS_INVALIDATE 2 /* invalidate the caches */
#define MS_SYNC 4 /* synchronous memory sync */

+#define MADV_NORMAL 0x0 /* default page-in behavior */
+#define MADV_RANDOM 0x1 /* page-in minimum required */
+#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
+#define MADV_WILLNEED 0x3 /* pre-fault pages */
+#define MADV_DONTNEED 0x4 /* free these pages */
+
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
#define MCL_FUTURE 0x4000 /* lock all additions to address space */

diff -ruN linux-2.3.15-ref/include/linux/mm.h linux/include/linux/mm.h
--- linux-2.3.15-ref/include/linux/mm.h Tue Aug 24 14:51:52 1999
+++ linux/include/linux/mm.h Wed Aug 25 17:30:37 1999
@@ -57,6 +57,7 @@
unsigned long vm_offset;
struct file * vm_file;
unsigned long vm_pte; /* shared mem */
+ unsigned vm_rd_behavior;
};

/*
diff -ruN linux-2.3.15-ref/mm/filemap.c linux/mm/filemap.c
--- linux-2.3.15-ref/mm/filemap.c Tue Aug 24 14:43:19 1999
+++ linux/mm/filemap.c Wed Aug 25 17:30:16 1999
@@ -33,6 +33,8 @@
*
* finished 'unifying' the page and buffer cache and SMP-threaded the
* page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
+ *
+ * madvise(2), 24.08.1999, Chuck Lever <cel@monkey.org>
*/

atomic_t page_cache_size = ATOMIC_INIT(0);
@@ -1367,7 +1369,8 @@
* Otherwise, we're off the end of a privately mapped file,
* so we need to map a zero page.
*/
- if (offset < inode->i_size)
+ if ((offset < inode->i_size) &&
+ (area->vm_rd_behavior != MADV_RANDOM))
read_cluster_nonblocking(file, offset);
else
page_cache_read(file, offset);
@@ -1881,6 +1884,175 @@
err = written ? written : status;
out:
return err;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * The application can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area. The idea is to help the kernel
+ * use appropriate read-ahead techniques. The information provided is
+ * advisory only, and can be safely disregarded by the kernel.
+ *
+ * behavior values:
+ * MADV_NORMAL - the default behavior is to read clusters. This
+ * results in some read-ahead and read-behind.
+ * MADV_RANDOM - the system should read the minimum amount of data
+ * on any access, since it is unlikely that the application
+ * will need more than what it asks for.
+ * MADV_SEQUENTIAL - pages in the given range will probably be
+ * accessed once, so they can be aggressively read ahead,
+ * and can be freed soon after they are accessed.
+ * MADV_WILLNEED - the application is notifying the system to read
+ * some pages ahead.
+ * MADV_DONTNEED - the application is finished with the given range,
+ * so the kernel can free resources associated with it.
+ *
+ * return values:
+ * zero = success
+ * -1 = some error occurred, errno value set (see below).
+ *
+ * errno values:
+ * EINVAL - start + len < 0, or start is not page-aligned, or
+ * behavior is not a valid value.
+ * ENOMEM - addresses in the specified range are not mapped, or are
+ * outside the AS of the process.
+ * EIO - an I/O error occurred while paging in data.
+ */
+
+/*
+ * This reads in data the application thinks it will need eventually.
+ * Today it is synchronous, but some day this should initiate asynch-
+ * ronous read-ahead and return immediately.
+ */
+static inline int madvise_willneed(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end)
+{
+ int write = (vma->vm_flags & VM_WRITE) != 0;
+ while (start < end) {
+ if (handle_mm_fault(current, vma, start, write) < 0) {
+ return -EIO;
+ }
+ start += PAGE_CACHE_SIZE;
+ }
+ return 0;
+}
+
+/*
+ * Application no longer needs these pages. If there is dirty data,
+ * it's OK to just throw it away. The app will be more careful about
+ * data it wants to keep. Be sure to free swap resources too.
+ *
+ * This is a simple-minded "lazy" implementation that just signals
+ * shrink_mmap to do the work later, if the system needs these pages
+ * back.
+ */
+static inline int madvise_dontneed(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end)
+{
+ struct page * page;
+
+ while (start < end) {
+ page = page_cache_entry(start);
+ clear_bit(PG_referenced, &page->flags);
+ start += PAGE_CACHE_SIZE;
+ }
+ return 0;
+}
+
+static int madvise_area(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end, int behavior)
+{
+ int error = 0;
+
+ switch (behavior) {
+ case MADV_NORMAL:
+ case MADV_SEQUENTIAL:
+ case MADV_RANDOM:
+ vma->vm_rd_behavior = behavior;
+ break;
+
+ case MADV_WILLNEED:
+ error = madvise_willneed(vma, start, end);
+ break;
+
+ case MADV_DONTNEED:
+ error = madvise_dontneed(vma, start, end);
+ break;
+
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * The system call API is given a range of addresses in the
+ * current process's address space. This function does sanity
+ * checking, then breaks the address range up into a set of
+ * vm areas for processing.
+ */
+asmlinkage int sys_madvise(unsigned long start, size_t len, int behavior)
+{
+ unsigned long end;
+ struct vm_area_struct * vma;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+
+ down(&current->mm->mmap_sem);
+
+ if (start & ~PAGE_MASK)
+ goto out;
+ len = (len + ~PAGE_MASK) & PAGE_MASK;
+ end = start + len;
+ if (end < start)
+ goto out;
+
+ error = 0;
+ if (end == start)
+ goto out;
+
+ /*
+ * If the interval [start,end) covers some unmapped address ranges,
+ * just ignore them, but return -ENOMEM at the end.
+ */
+ vma = find_vma(current->mm, start);
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+
+ /* Here start < vma->vm_end. */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end. */
+ if (end <= vma->vm_end) {
+ if (start < end) {
+ error = madvise_area(vma, start, end, behavior);
+ if (error)
+ goto out;
+ }
+ error = unmapped_error;
+ goto out;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end < end. */
+ error = madvise_area(vma, start, vma->vm_end, behavior);
+ if (error)
+ goto out;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ }
+
+out:
+ up(&current->mm->mmap_sem);
+ return error;
}

/*
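
That's the end of the patch. Since libc has no wrapper for the new call
yet, here is a rough sketch of how one might exercise the draft from user
space on i386. The syscall number 191 is taken from the entry.S hunk above
(one slot past sys_vfork at 190) and the MADV_* value from the
asm-i386/mman.h hunk; both are assumptions of this draft, not a stable ABI.

/*
 * Sketch: invoking the draft madvise() directly on i386 while libc
 * has no wrapper.  Slot 191 and the MADV_* value come from this
 * patch and may change before anything is merged.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#define __NR_madvise_draft	191	/* draft i386 slot, per entry.S above */

#ifndef MADV_WILLNEED
#define MADV_WILLNEED		0x3	/* per the asm-i386/mman.h hunk above */
#endif

static int madvise_draft(void *addr, size_t len, int behavior)
{
	return (int) syscall(__NR_madvise_draft, addr, len, behavior);
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	void *map = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Ask the kernel to fault the page in ahead of time. */
	if (madvise_draft(map, page, MADV_WILLNEED) < 0)
		perror("madvise(MADV_WILLNEED)");
	else
		printf("madvise(MADV_WILLNEED) ok\n");

	munmap(map, page);
	return 0;
}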

- Chuck Lever

--
corporate:	<chuckl@netscape.com>
personal:	<chucklever@netscape.net> or <cel@monkey.org>

The Linux Scalability project: http://www.citi.umich.edu/projects/linux-scalability/
