Re: [RFC PATCH v9 12/16] Add mp(mediate passthru) device.

From: xiaohui . xin
Date: Mon Sep 20 2010 - 03:52:16 EST


From: Xin Xiaohui <xiaohui.xin@xxxxxxxxx>

---
Michael,
I have moved the ioctl that configures the locked memory into vhost, and
the limit is now checked against mm->locked_vm. Please have a look.

Thanks
Xiaohui

drivers/vhost/mpassthru.c | 74 +++++++++----------------------------------
drivers/vhost/net.c | 78 ++++++++++++++++++++++++++++++++++++++------
include/linux/vhost.h | 3 ++
3 files changed, 85 insertions(+), 70 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index d86d94c..fd3827b 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -109,9 +109,6 @@ struct page_ctor {
int wq_len;
int rq_len;
spinlock_t read_lock;
- /* record the locked pages */
- int lock_pages;
- struct rlimit o_rlim;
struct net_device *dev;
struct mpassthru_port port;
struct page_info **hash_table;
@@ -231,7 +228,6 @@ static int page_ctor_attach(struct mp_struct *mp)
ctor->port.ctor = page_ctor;
ctor->port.sock = &mp->socket;
ctor->port.hash = mp_lookup;
- ctor->lock_pages = 0;

/* locked by mp_mutex */
dev->mp_port = &ctor->port;
@@ -264,37 +260,6 @@ struct page_info *info_dequeue(struct page_ctor *ctor)
return info;
}

-static int set_memlock_rlimit(struct page_ctor *ctor, int resource,
- unsigned long cur, unsigned long max)
-{
- struct rlimit new_rlim, *old_rlim;
- int retval;
-
- if (resource != RLIMIT_MEMLOCK)
- return -EINVAL;
- new_rlim.rlim_cur = cur;
- new_rlim.rlim_max = max;
-
- old_rlim = current->signal->rlim + resource;
-
- /* remember the old rlimit value when backend enabled */
- ctor->o_rlim.rlim_cur = old_rlim->rlim_cur;
- ctor->o_rlim.rlim_max = old_rlim->rlim_max;
-
- if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
- !capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- retval = security_task_setrlimit(resource, &new_rlim);
- if (retval)
- return retval;
-
- task_lock(current->group_leader);
- *old_rlim = new_rlim;
- task_unlock(current->group_leader);
- return 0;
-}
-
static void relinquish_resource(struct page_ctor *ctor)
{
if (!(ctor->dev->flags & IFF_UP) &&
@@ -322,8 +287,6 @@ static void mp_ki_dtor(struct kiocb *iocb)
info->ctor->rq_len--;
} else
info->ctor->wq_len--;
- /* Decrement the number of locked pages */
- info->ctor->lock_pages -= info->pnum;
kmem_cache_free(ext_page_info_cache, info);
relinquish_resource(info->ctor);

@@ -349,7 +312,7 @@ static struct kiocb *create_iocb(struct page_info *info, int size)
iocb->ki_dtor(iocb);
iocb->private = (void *)info;
iocb->ki_dtor = mp_ki_dtor;
-
+ iocb->ki_user_data = info->pnum;
return iocb;
}

@@ -375,10 +338,6 @@ static int page_ctor_detach(struct mp_struct *mp)

relinquish_resource(ctor);

- set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
- ctor->o_rlim.rlim_cur,
- ctor->o_rlim.rlim_max);
-
/* locked by mp_mutex */
ctor->dev->mp_port = NULL;
dev_put(ctor->dev);
@@ -565,21 +524,23 @@ static struct page_info *alloc_page_info(struct page_ctor *ctor,
int rc;
int i, j, n = 0;
int len;
- unsigned long base, lock_limit;
+ unsigned long base, lock_limit, locked;
struct page_info *info = NULL;

- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
- lock_limit >>= PAGE_SHIFT;
+ down_write(&current->mm->mmap_sem);
+ locked = count + current->mm->locked_vm;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

- if (ctor->lock_pages + count > lock_limit && npages) {
- printk(KERN_INFO "exceed the locked memory rlimit.");
- return NULL;
- }
+ if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
+ goto out;

info = kmem_cache_alloc(ext_page_info_cache, GFP_KERNEL);

if (!info)
- return NULL;
+ goto out;
+
+ up_write(&current->mm->mmap_sem);
+
info->skb = NULL;
info->next = info->prev = NULL;

@@ -633,8 +594,7 @@ static struct page_info *alloc_page_info(struct page_ctor *ctor,
for (i = 0; i < j; i++)
mp_hash_insert(ctor, info->pages[i], info);
}
- /* increment the number of locked pages */
- ctor->lock_pages += j;
+
return info;

failed:
@@ -642,7 +602,9 @@ failed:
put_page(info->pages[i]);

kmem_cache_free(ext_page_info_cache, info);
-
+ return NULL;
+out:
+ up(&current->mm->mmap_sem);
return NULL;
}

@@ -1006,12 +968,6 @@ proceed:
count--;
}

- if (!ctor->lock_pages || !ctor->rq_len) {
- set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
- iocb->ki_user_data * 4096 * 2,
- iocb->ki_user_data * 4096 * 2);
- }
-
/* Translate address to kernel */
info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0);
if (!info)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c4bc815..da78837 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -42,6 +42,7 @@ enum {
};

static struct kmem_cache *notify_cache;
+static struct rlimit orig_rlim;

enum vhost_net_poll_state {
VHOST_NET_POLL_DISABLED = 0,
@@ -136,13 +137,7 @@ static void handle_async_rx_events_notify(struct vhost_net *net,
struct vhost_log *vq_log = NULL;
int rx_total_len = 0;
unsigned int head, log, in, out;
- int size;
- int count;
-
- struct virtio_net_hdr_mrg_rxbuf hdr = {
- .hdr.flags = 0,
- .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
- };
+ int size, free = 0;

if (!is_async_vq(vq))
return;
@@ -160,7 +155,7 @@ static void handle_async_rx_events_notify(struct vhost_net *net,
size = iocb->ki_nbytes;
head = iocb->ki_pos;
rx_total_len += iocb->ki_nbytes;
-
+ free += iocb->ki_user_data;
if (iocb->ki_dtor)
iocb->ki_dtor(iocb);
kmem_cache_free(net->cache, iocb);
@@ -192,6 +187,7 @@ static void handle_async_rx_events_notify(struct vhost_net *net,
size = iocb->ki_nbytes;
head = iocb->ki_pos;
rx_total_len += iocb->ki_nbytes;
+ free += iocb->ki_user_data;

if (iocb->ki_dtor)
iocb->ki_dtor(iocb);
@@ -211,7 +207,6 @@ static void handle_async_rx_events_notify(struct vhost_net *net,
break;

i++;
- iocb == NULL;
if (count)
iocb = notify_dequeue(vq);
}
@@ -219,6 +214,10 @@ static void handle_async_rx_events_notify(struct vhost_net *net,
&net->dev, vq, vq->heads, hc);
}
}
+ /* record locked memory */
+ down_write(&current->mm->mmap_sem);
+ current->mm->locked_vm -= free;
+ up_write(&current->mm->mmap_sem);
}

static void handle_async_tx_events_notify(struct vhost_net *net,
@@ -227,7 +226,7 @@ static void handle_async_tx_events_notify(struct vhost_net *net,
struct kiocb *iocb = NULL;
struct list_head *entry, *tmp;
unsigned long flags;
- int tx_total_len = 0;
+ int tx_total_len = 0, free = 0;

if (!is_async_vq(vq))
return;
@@ -242,7 +241,7 @@ static void handle_async_tx_events_notify(struct vhost_net *net,
vhost_add_used_and_signal(&net->dev, vq,
iocb->ki_pos, 0);
tx_total_len += iocb->ki_nbytes;
-
+ free += iocb->ki_user_data;
if (iocb->ki_dtor)
iocb->ki_dtor(iocb);

@@ -253,6 +252,10 @@ static void handle_async_tx_events_notify(struct vhost_net *net,
}
}
spin_unlock_irqrestore(&vq->notify_lock, flags);
+ /* record locked memory */
+ down_write(&current->mm->mmap_sem);
+ current->mm->locked_vm -= free;
+ up_write(&current->mm->mmap_sem);
}

static struct kiocb *create_iocb(struct vhost_net *net,
@@ -581,6 +584,7 @@ static void handle_rx_net(struct work_struct *work)
static int vhost_net_open(struct inode *inode, struct file *f)
{
struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
+ struct rlimit *old_rlim;
int r;
if (!n)
return -ENOMEM;
@@ -597,6 +601,12 @@ static int vhost_net_open(struct inode *inode, struct file *f)
n->tx_poll_state = VHOST_NET_POLL_DISABLED;
n->cache = NULL;

+ old_rlim = current->signal->rlim + RLIMIT_MEMLOCK;
+
+ /* remember the old rlimit value when backend enabled */
+ orig_rlim.rlim_cur = old_rlim->rlim_cur;
+ orig_rlim.rlim_max = old_rlim->rlim_max;
+
f->private_data = n;

return 0;
@@ -659,6 +669,39 @@ static void vhost_net_flush(struct vhost_net *n)
vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
}

+static long vhost_net_set_mem_locked(struct vhost_net *n,
+ unsigned long cur,
+ unsigned long max)
+{
+ struct rlimit new_rlim, *old_rlim;
+ int retval = 0;
+
+ mutex_lock(&n->dev.mutex);
+ new_rlim.rlim_cur = cur;
+ new_rlim.rlim_max = max;
+
+ old_rlim = current->signal->rlim + RLIMIT_MEMLOCK;
+
+ if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
+ !capable(CAP_SYS_RESOURCE)) {
+ retval = -EPERM;
+ goto err;
+ }
+
+ retval = security_task_setrlimit(RLIMIT_MEMLOCK, &new_rlim);
+ if (retval) {
+ retval = retval;
+ goto err;
+ }
+
+ task_lock(current->group_leader);
+ *old_rlim = new_rlim;
+ task_unlock(current->group_leader);
+err:
+ mutex_unlock(&n->dev.mutex);
+ return retval;
+}
+
static void vhost_async_cleanup(struct vhost_net *n)
{
/* clean the notifier */
@@ -691,6 +734,10 @@ static int vhost_net_release(struct inode *inode, struct file *f)
* since jobs can re-queue themselves. */
vhost_net_flush(n);
vhost_async_cleanup(n);
+ /* return back the rlimit */
+ vhost_net_set_mem_locked(n,
+ orig_rlim.rlim_cur,
+ orig_rlim.rlim_max);
kfree(n);
return 0;
}
@@ -846,6 +893,7 @@ err:
return r;
}

+
static long vhost_net_reset_owner(struct vhost_net *n)
{
struct socket *tx_sock = NULL;
@@ -913,6 +961,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
void __user *argp = (void __user *)arg;
u64 __user *featurep = argp;
struct vhost_vring_file backend;
+ struct rlimit rlim;
u64 features;
int r;
switch (ioctl) {
@@ -933,6 +982,13 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
return vhost_net_set_features(n, features);
case VHOST_RESET_OWNER:
return vhost_net_reset_owner(n);
+ case VHOST_SET_MEM_LOCKED:
+ r = copy_from_user(&rlim, argp, sizeof rlim);
+ if (r < 0)
+ return r;
+ return vhost_net_set_mem_locked(n,
+ rlim.rlim_cur,
+ rlim.rlim_max);
default:
mutex_lock(&n->dev.mutex);
r = vhost_dev_ioctl(&n->dev, ioctl, arg);
diff --git a/include/linux/vhost.h b/include/linux/vhost.h
index e847f1e..df93f5a 100644
--- a/include/linux/vhost.h
+++ b/include/linux/vhost.h
@@ -92,6 +92,9 @@ struct vhost_memory {
/* Specify an eventfd file descriptor to signal on log write. */
#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)

+/* Specify how much locked memory can be used */
+#define VHOST_SET_MEM_LOCKED _IOW(VHOST_VIRTIO, 0x08, struct rlimit)
+
/* Ring setup. */
/* Set number of descriptors in ring. This parameter can not
* be modified while ring is running (bound to a device). */
--
1.5.4.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/