Re: [RFC PATCH] shm: extend forced shm destroy to support objects from several IPC nses

From: Manfred Spraul
Date: Thu Jul 22 2021 - 14:46:23 EST


This is a multi-part message in MIME format. Hi Alexander,

A few more remarks.

On 7/14/21 7:30 PM, Alexander Mikhalitsyn wrote:
This is total rework of fix.
Thanks to Eric Biederman for suggestions (but may be I've misunderstood some of them :))

I've tested it with reproducer of the original problem. But of course it needs
detailed testing. I hope that I get some general comments about design and implementation.

ToDo: remove unneeded "ns" argument from shm_destroy, shm_rmid and other functions.

What ensures the that shp->ns is not destroyed prematurely?

I did some tests, and it seems that shmat() acquires a namespace refcount, and shm_release() puts it again, and the shm_release is late enough to ensure that the ns cannot go out of scope.

But I haven't checked all combinations (with/without shm_rmid_forced, delete via exit(), shmctl(), shmdt(), mmap()).

And: This should be documented somewhere.


--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -65,6 +65,7 @@ struct shmid_kernel /* private to the kernel */
/* The task created the shm object. NULL if the task is dead. */
struct task_struct *shm_creator;
struct list_head shm_clist; /* list by creator */

I think the comments are wrong/outdated.

Some parts of the new code checks with list_empty(shm_clist), not by looking at shm_creator.

+ struct ipc_namespace *ns;
} __randomize_layout;
/* shm_mode upper byte flags */
@@ -115,6 +116,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
struct shmid_kernel *shp;
shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+ BUG_ON(shp->ns && ns != shp->ns);

Is shp->ns == NULL allowed/possible? From what I see, it is impossible.

I think we should not have  NULL check in a few codepaths, but not in other codepaths. Either everywhere, or nowhere.


if (shp->shm_nattch) {
shp->shm_perm.mode |= SHM_DEST;
@@ -225,9 +227,32 @@ static void shm_rcu_free(struct rcu_head *head)
kfree(shp);
}
+static inline void task_shm_clist_lock(struct task_struct *task)
+{
+ spin_lock(&task->sysvshm.shm_clist_lock);
+}
+
+static inline void task_shm_clist_unlock(struct task_struct *task)
+{
+ spin_unlock(&task->sysvshm.shm_clist_lock);
+}
+
+void shm_clist_rm(struct shmid_kernel *shp)
+{
+ if (!shp->shm_creator)
+ return;
+
+ task_shm_clist_lock(shp->shm_creator);
+ list_del_init(&shp->shm_clist);
+ task_shm_clist_unlock(shp->shm_creator);
+ shp->shm_creator = NULL;
+}
+
static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
{
- list_del(&s->shm_clist);
+ WARN_ON(s->ns && ns != s->ns);
+ //list_del_init(&s->shm_clist);
+ shm_clist_rm(s);
ipc_rmid(&shm_ids(ns), &s->shm_perm);
}
@@ -306,10 +331,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
*
* 2) sysctl kernel.shm_rmid_forced is set to 1.
*/
-static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
+static bool shm_may_destroy(struct shmid_kernel *shp)
{
return (shp->shm_nattch == 0) &&
- (ns->shm_rmid_forced ||
+ (shp->ns->shm_rmid_forced ||
(shp->shm_perm.mode & SHM_DEST));
}
As written before: what ensures that shp->ns->shm_rmid_forced was not released already?
@@ -340,7 +365,7 @@ static void shm_close(struct vm_area_struct *vma)
ipc_update_pid(&shp->shm_lprid, task_tgid(current));
shp->shm_dtim = ktime_get_real_seconds();
shp->shm_nattch--;
- if (shm_may_destroy(ns, shp))
+ if (shm_may_destroy(shp))
shm_destroy(ns, shp);
else
shm_unlock(shp);
@@ -361,10 +386,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
*
* As shp->* are changed under rwsem, it's safe to skip shp locking.
*/
- if (shp->shm_creator != NULL)
+ if (!list_empty(&shp->shm_clist))
return 0;
This collides with the comment above: here, list_empty() is used.
- if (shm_may_destroy(ns, shp)) {
+ if (shm_may_destroy(shp)) {
shm_lock_by_ptr(shp);
shm_destroy(ns, shp);
}
@@ -379,51 +404,77 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
up_write(&shm_ids(ns).rwsem);
}
+void shm_init_task(struct task_struct *task)
+{
+ INIT_LIST_HEAD(&(task)->sysvshm.shm_clist);
+ spin_lock_init(&task->sysvshm.shm_clist_lock);
+}
+
/* Locking assumes this will only be called with task == current */
void exit_shm(struct task_struct *task)
{
- struct ipc_namespace *ns = task->nsproxy->ipc_ns;
+ LIST_HEAD(tmp);
struct shmid_kernel *shp, *n;
if (list_empty(&task->sysvshm.shm_clist))
return;
- /*
- * If kernel.shm_rmid_forced is not set then only keep track of
- * which shmids are orphaned, so that a later set of the sysctl
- * can clean them up.
- */
- if (!ns->shm_rmid_forced) {
- down_read(&shm_ids(ns).rwsem);
- list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
- shp->shm_creator = NULL;
- /*
- * Only under read lock but we are only called on current
- * so no entry on the list will be shared.
- */
- list_del(&task->sysvshm.shm_clist);
- up_read(&shm_ids(ns).rwsem);
- return;
- }
+ rcu_read_lock(); /* for refcount_inc_not_zero */
+ task_shm_clist_lock(task);
- /*
- * Destroy all already created segments, that were not yet mapped,
- * and mark any mapped as orphan to cover the sysctl toggling.
- * Destroy is skipped if shm_may_destroy() returns false.
- */
- down_write(&shm_ids(ns).rwsem);
list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
+ struct ipc_namespace *ns = shp->ns;
+
+ /*
+ * Remove shm from task list and nullify shm_creator which
+ * marks object as orphaned.
+ *
+ * If kernel.shm_rmid_forced is not set then only keep track of
+ * which shmids are orphaned, so that a later set of the sysctl
+ * can clean them up.
+ */
+ list_del_init(&shp->shm_clist);
shp->shm_creator = NULL;
- if (shm_may_destroy(ns, shp)) {
- shm_lock_by_ptr(shp);
- shm_destroy(ns, shp);
+ printk("exit_shm() %px refcnt=%u, id=%d,key=%x\n", shp,
+ refcount_read(&shp->shm_perm.refcount),
+ shp->shm_perm.id, shp->shm_perm.key
+ );
+
+ /*
+ * Will destroy all already created segments, that were not yet mapped,
+ * and mark any mapped as orphan to cover the sysctl toggling.
+ * Destroy is skipped if shm_may_destroy() returns false.
+ */
+ if (shp->ns->shm_rmid_forced && shm_may_destroy(shp)) {
+ /*
+ * We may race with shm_exit_ns() if refcounter
+ * already zero. Let's skip shm_destroy() of such
+ * shm object as it will be destroyed during shm_exit_ns()
+ */
+ if (!refcount_inc_not_zero(&ns->ns.count)) /* get_ipc_ns */
+ continue;
+
+ list_add(&shp->shm_clist, &tmp);
}
}
- /* Remove the list head from any segments still attached. */
list_del(&task->sysvshm.shm_clist);
- up_write(&shm_ids(ns).rwsem);
+ task_shm_clist_unlock(task);
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(shp, n, &tmp, shm_clist) {
+ struct ipc_namespace *ns = shp->ns;
+
+ list_del_init(&shp->shm_clist);
+
+ down_write(&shm_ids(ns).rwsem);
+ shm_lock_by_ptr(shp);
+ /* will do put_ipc_ns(shp->ns) */
+ shm_destroy(ns, shp);
+ up_write(&shm_ids(ns).rwsem);
+ put_ipc_ns(ns); /* see refcount_inc_not_zero */
+ }
}

I do not see the advantage of first collecting everything in a local list, and then destroying the elements.

Attached is my current test case. Feel free to merge whatever you consider as useful into your change.


--

    Manfred