Re: [RFC 1/3] /dev/low_mem_notify
From: Minchan Kim
Date: Tue Jan 17 2012 - 18:20:35 EST
On Tue, Jan 17, 2012 at 08:51:13PM +0200, Pekka Enberg wrote:
> Hello,
>
> Ok, so here's a proof of concept patch that implements sample-based
> per-process free threshold VM event watching using perf-like syscall
> ABI. I'd really like to see something like this that's much more
> extensible and clean than the /dev based ABIs that people have
> proposed so far.
>
> Pekka
>
> ------------------->
>
> From a07f93fdca360b20daef4a5d66f2a5746f31f6a6 Mon Sep 17 00:00:00 2001
> From: Pekka Enberg <penberg@xxxxxxxxxx>
> Date: Tue, 17 Jan 2012 17:51:48 +0200
> Subject: [PATCH] vmnotify: VM event notification system
>
> This patch implements a new sys_vmnotify_fd() system call that returns a
> pollable file descriptor that can be used to watch VM events.
>
> For example, to watch for VM event when free memory is below 99% of available
> memory using 1 second sample period, you'd do something like this:
>
> struct vmnotify_config config;
> struct vmnotify_event event;
> struct pollfd pollfd;
> int fd;
>
> config = (struct vmnotify_config) {
> .type = VMNOTIFY_TYPE_SAMPLE|VMNOTIFY_TYPE_FREE_THRESHOLD,
> .sample_period_ns = 1000000000L,
> .free_threshold = 99,
> };
>
> fd = sys_vmnotify_fd(&config);
>
> pollfd.fd = fd;
> pollfd.events = POLLIN;
>
> if (poll(&pollfd, 1, -1) < 0) {
> perror("poll failed");
> exit(1);
> }
>
> memset(&event, 0, sizeof(event));
>
> if (read(fd, &event, sizeof(event)) < 0) {
> perror("read failed");
> exit(1);
> }
Hi Pekka,
I haven't looked into your code yet (will do), but having read the description,
I'm still not convinced we really need a process-specific threshold like 99%.
I think an application can learn that by polling /proc/meminfo without this
mechanism if it really wants to.
I would like a notification when the system is in trouble with memory pressure,
without any process-specific threshold. Of course, an application can't anticipate
that (i.e., an application can learn the system's memory pressure from /proc/meminfo,
but it can't know when swapout really happens). I think the kernel low-memory
notification has to deliver that kind of notification to user space.
>
> Signed-off-by: Pekka Enberg <penberg@xxxxxxxxxx>
> ---
> arch/x86/include/asm/unistd_64.h | 2 +
> include/linux/vmnotify.h | 44 ++++++
> mm/Kconfig | 6 +
> mm/Makefile | 1 +
> mm/vmnotify.c | 235 ++++++++++++++++++++++++++++++++
> tools/testing/vmnotify/vmnotify-test.c | 68 +++++++++
> 6 files changed, 356 insertions(+), 0 deletions(-)
> create mode 100644 include/linux/vmnotify.h
> create mode 100644 mm/vmnotify.c
> create mode 100644 tools/testing/vmnotify/vmnotify-test.c
>
> diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
> index 0431f19..b0928cd 100644
> --- a/arch/x86/include/asm/unistd_64.h
> +++ b/arch/x86/include/asm/unistd_64.h
> @@ -686,6 +686,8 @@ __SYSCALL(__NR_getcpu, sys_getcpu)
> __SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
> #define __NR_process_vm_writev 311
> __SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
> +#define __NR_vmnotify_fd 312
> +__SYSCALL(__NR_vmnotify_fd, sys_vmnotify_fd)
>
> #ifndef __NO_STUBS
> #define __ARCH_WANT_OLD_READDIR
> diff --git a/include/linux/vmnotify.h b/include/linux/vmnotify.h
> new file mode 100644
> index 0000000..8f8642b
> --- /dev/null
> +++ b/include/linux/vmnotify.h
> @@ -0,0 +1,44 @@
> +#ifndef _LINUX_VMNOTIFY_H
> +#define _LINUX_VMNOTIFY_H
> +
> +#include <linux/types.h>
> +
> +enum {
> + VMNOTIFY_TYPE_FREE_THRESHOLD = 1ULL << 0,
> + VMNOTIFY_TYPE_SAMPLE = 1ULL << 1,
> +};
> +
> +struct vmnotify_config {
> + /*
> + * Size of the struct for ABI extensibility.
> + */
> + __u32 size;
> +
> + /*
> + * Notification type bitmask
> + */
> + __u64 type;
> +
> + /*
> + * Free memory threshold in percentages [1..99]
> + */
> + __u32 free_threshold;
> +
> + /*
> + * Sample period in nanoseconds
> + */
> + __u64 sample_period_ns;
> +};
> +
> +struct vmnotify_event {
> + /* Size of the struct for ABI extensibility. */
> + __u32 size;
> +
> + __u64 nr_avail_pages;
> +
> + __u64 nr_swap_pages;
> +
> + __u64 nr_free_pages;
> +};
> +
> +#endif /* _LINUX_VMNOTIFY_H */
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 011b110..6631167 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -373,3 +373,9 @@ config CLEANCACHE
> in a negligible performance hit.
>
> If unsure, say Y to enable cleancache
> +
> +config VMNOTIFY
> + bool "Enable VM event notification system"
> + default n
> + help
> + If unsure, say N to disable vmnotify
> diff --git a/mm/Makefile b/mm/Makefile
> index 50ec00e..e1b5db3 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -51,3 +51,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
> obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
> obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
> obj-$(CONFIG_CLEANCACHE) += cleancache.o
> +obj-$(CONFIG_VMNOTIFY) += vmnotify.o
> diff --git a/mm/vmnotify.c b/mm/vmnotify.c
> new file mode 100644
> index 0000000..6800450
> --- /dev/null
> +++ b/mm/vmnotify.c
> @@ -0,0 +1,235 @@
> +#include <linux/anon_inodes.h>
> +#include <linux/vmnotify.h>
> +#include <linux/syscalls.h>
> +#include <linux/file.h>
> +#include <linux/list.h>
> +#include <linux/poll.h>
> +#include <linux/slab.h>
> +#include <linux/swap.h>
> +
> +#define VMNOTIFY_MAX_FREE_THRESHOD 100
> +
> +struct vmnotify_watch {
> + struct vmnotify_config config;
> +
> + struct mutex mutex;
> + bool pending;
> + struct vmnotify_event event;
> +
> + /* sampling */
> + struct hrtimer timer;
> +
> + /* poll */
> + wait_queue_head_t waitq;
> +};
> +
> +static bool vmnotify_match(struct vmnotify_watch *watch, struct vmnotify_event *event)
> +{
> + if (watch->config.type & VMNOTIFY_TYPE_FREE_THRESHOLD) {
> + u64 threshold;
> +
> + if (!event->nr_avail_pages)
> + return false;
> +
> + threshold = event->nr_free_pages * 100 / event->nr_avail_pages;
> + if (threshold > watch->config.free_threshold)
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static void vmnotify_sample(struct vmnotify_watch *watch)
> +{
> + struct vmnotify_event event;
> + struct sysinfo si;
> +
> + memset(&event, 0, sizeof(event));
> +
> + event.size = sizeof(event);
> + event.nr_free_pages = global_page_state(NR_FREE_PAGES);
> +
> + si_meminfo(&si);
> + event.nr_avail_pages = si.totalram;
> +
> +#ifdef CONFIG_SWAP
> + si_swapinfo(&si);
> + event.nr_swap_pages = si.totalswap;
> +#endif
> +
> + if (!vmnotify_match(watch, &event))
> + return;
> +
> + mutex_lock(&watch->mutex);
> +
> + watch->pending = true;
> +
> + memcpy(&watch->event, &event, sizeof(event));
> +
> + mutex_unlock(&watch->mutex);
> +}
> +
> +static enum hrtimer_restart vmnotify_timer_fn(struct hrtimer *hrtimer)
> +{
> + struct vmnotify_watch *watch = container_of(hrtimer, struct vmnotify_watch, timer);
> + u64 sample_period = watch->config.sample_period_ns;
> +
> + vmnotify_sample(watch);
> +
> + hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
> +
> + wake_up(&watch->waitq);
> +
> + return HRTIMER_RESTART;
> +}
> +
> +static void vmnotify_start_timer(struct vmnotify_watch *watch)
> +{
> + u64 sample_period = watch->config.sample_period_ns;
> +
> + hrtimer_init(&watch->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + watch->timer.function = vmnotify_timer_fn;
> +
> + hrtimer_start(&watch->timer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED);
> +}
> +
> +static unsigned int vmnotify_poll(struct file *file, poll_table *wait)
> +{
> + struct vmnotify_watch *watch = file->private_data;
> + unsigned int events = 0;
> +
> + poll_wait(file, &watch->waitq, wait);
> +
> + mutex_lock(&watch->mutex);
> +
> + if (watch->pending)
> + events |= POLLIN;
> +
> + mutex_unlock(&watch->mutex);
> +
> + return events;
> +}
> +
> +static ssize_t vmnotify_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> +{
> + struct vmnotify_watch *watch = file->private_data;
> + int ret = 0;
> +
> + mutex_lock(&watch->mutex);
> +
> + if (!watch->pending)
> + goto out_unlock;
> +
> + if (copy_to_user(buf, &watch->event, sizeof(struct vmnotify_event))) {
> + ret = -EFAULT;
> + goto out_unlock;
> + }
> +
> + ret = watch->event.size;
> +
> + watch->pending = false;
> +
> +out_unlock:
> + mutex_unlock(&watch->mutex);
> +
> + return ret;
> +}
> +
> +static int vmnotify_release(struct inode *inode, struct file *file)
> +{
> + struct vmnotify_watch *watch = file->private_data;
> +
> + hrtimer_cancel(&watch->timer);
> +
> + kfree(watch);
> +
> + return 0;
> +}
> +
> +static const struct file_operations vmnotify_fops = {
> + .poll = vmnotify_poll,
> + .read = vmnotify_read,
> + .release = vmnotify_release,
> +};
> +
> +static struct vmnotify_watch *vmnotify_watch_alloc(void)
> +{
> + struct vmnotify_watch *watch;
> +
> + watch = kzalloc(sizeof *watch, GFP_KERNEL);
> + if (!watch)
> + return NULL;
> +
> + mutex_init(&watch->mutex);
> +
> + init_waitqueue_head(&watch->waitq);
> +
> + return watch;
> +}
> +
> +static int vmnotify_copy_config(struct vmnotify_config __user *uconfig,
> + struct vmnotify_config *config)
> +{
> + int ret;
> +
> + ret = copy_from_user(config, uconfig, sizeof(struct vmnotify_config));
> + if (ret)
> + return -EFAULT;
> +
> + if (!config->type)
> + return -EINVAL;
> +
> + if (config->type & VMNOTIFY_TYPE_SAMPLE) {
> + if (config->sample_period_ns < NSEC_PER_MSEC)
> + return -EINVAL;
> + }
> +
> + if (config->type & VMNOTIFY_TYPE_FREE_THRESHOLD) {
> + if (config->free_threshold > VMNOTIFY_MAX_FREE_THRESHOD)
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +SYSCALL_DEFINE1(vmnotify_fd,
> + struct vmnotify_config __user *, uconfig)
> +{
> + struct vmnotify_watch *watch;
> + struct file *file;
> + int err;
> + int fd;
> +
> + watch = vmnotify_watch_alloc();
> + if (!watch)
> + return -ENOMEM;
> +
> + err = vmnotify_copy_config(uconfig, &watch->config);
> + if (err)
> + goto err_free;
> +
> + fd = get_unused_fd_flags(O_RDONLY);
> + if (fd < 0) {
> + err = fd;
> + goto err_free;
> + }
> +
> + file = anon_inode_getfile("[vmnotify]", &vmnotify_fops, watch, O_RDONLY);
> + if (IS_ERR(file)) {
> + err = PTR_ERR(file);
> + goto err_fd;
> + }
> +
> + fd_install(fd, file);
> +
> + if (watch->config.type & VMNOTIFY_TYPE_SAMPLE)
> + vmnotify_start_timer(watch);
> +
> + return fd;
> +
> +err_fd:
> + put_unused_fd(fd);
> +err_free:
> + kfree(watch);
> + return err;
> +}
> diff --git a/tools/testing/vmnotify/vmnotify-test.c b/tools/testing/vmnotify/vmnotify-test.c
> new file mode 100644
> index 0000000..3c6b26d
> --- /dev/null
> +++ b/tools/testing/vmnotify/vmnotify-test.c
> @@ -0,0 +1,68 @@
> +#include "../../../include/linux/vmnotify.h"
> +
> +#if defined(__x86_64__)
> +#include "../../../arch/x86/include/asm/unistd.h"
> +#endif
> +
> +#include <stdlib.h>
> +#include <string.h>
> +#include <errno.h>
> +#include <stdio.h>
> +#include <poll.h>
> +
> +static int sys_vmnotify_fd(struct vmnotify_config *config)
> +{
> + config->size = sizeof(*config);
> +
> + return syscall(__NR_vmnotify_fd, config);
> +}
> +
> +int main(int argc, char *argv[])
> +{
> + struct vmnotify_config config;
> + struct vmnotify_event event;
> + struct pollfd pollfd;
> + int i;
> + int fd;
> +
> + config = (struct vmnotify_config) {
> + .type = VMNOTIFY_TYPE_SAMPLE|VMNOTIFY_TYPE_FREE_THRESHOLD,
> + .sample_period_ns = 1000000000L,
> + .free_threshold = 99,
> + };
> +
> + fd = sys_vmnotify_fd(&config);
> + if (fd < 0) {
> + perror("vmnotify_fd failed");
> + exit(1);
> + }
> +
> + for (i = 0; i < 10; i++) {
> + pollfd.fd = fd;
> + pollfd.events = POLLIN;
> +
> + if (poll(&pollfd, 1, -1) < 0) {
> + perror("poll failed");
> + exit(1);
> + }
> +
> + memset(&event, 0, sizeof(event));
> +
> + if (read(fd, &event, sizeof(event)) < 0) {
> + perror("read failed");
> + exit(1);
> + }
> +
> + printf("VM event:\n");
> + printf("\tsize=%lu\n", event.size);
> + printf("\tnr_avail_pages=%Lu\n", event.nr_avail_pages);
> + printf("\tnr_swap_pages=%Lu\n", event.nr_swap_pages);
> + printf("\tnr_free_pages=%Lu\n", event.nr_free_pages);
> + }
> + if (close(fd) < 0) {
> + perror("close failed");
> + exit(1);
> + }
> +
> + return 0;
> +}
> --
> 1.7.6.4
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/