[PATCH 1/1] userns: Fix/clarify memory ordering

From: Christian Brauner
Date: Thu Nov 02 2017 - 07:04:04 EST


Nikolay noticed a number of undocumented memory barriers in this code;
the ordering is fairly simple but not explicitly described. Cure that.

Switch over to smp_store_release() / smp_load_acquire() as that is the
natural fit for the pattern and includes the missing but required
WRITE_ONCE()/READ_ONCE()s.

Cc: Eric Biederman <ebiederm@xxxxxxxxxxxx>
Cc: Linux Containers <containers@xxxxxxxxxxxxxxxxxxxxxxxxxx>
Reported-by: Nikolay Borisov <nborisov@xxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Signed-off-by: Christian Brauner <christian.brauner@xxxxxxxxxx>
---
kernel/user_namespace.c | 74 +++++++++++++++++++++++++++++++------------------
1 file changed, 47 insertions(+), 27 deletions(-)
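
A note for reviewers (below the fold on purpose, not meant for the
commit message): the pattern being adopted is the usual one-shot
publish with release/acquire. Here is a minimal userspace sketch of
the same idea, using the C11 equivalents of smp_store_release() and
smp_load_acquire(); all names and sizes below are illustrative, this
is not the kernel code:

#include <stdatomic.h>

#define MAX_EXTENTS 5

struct map {
	int extent[MAX_EXTENTS];	/* payload */
	atomic_uint nr_extents;		/* published count, 0 = none yet */
};

/* Writer: fill in the extents, then publish the count exactly once. */
static void publish(struct map *m, const int *ext, unsigned int n)
{
	for (unsigned int i = 0; i < n; i++)
		m->extent[i] = ext[i];
	/*
	 * Release: the extent[] stores above are ordered before this
	 * store; a reader that observes the new count also observes
	 * the extent values.
	 */
	atomic_store_explicit(&m->nr_extents, n, memory_order_release);
}

/* Reader: an acquire load of the count makes extent[0..n) safe to read. */
static int read_first(struct map *m)
{
	unsigned int n;

	/*
	 * Acquire: pairs with the release store in publish(); the
	 * extent[] loads below cannot be hoisted above this load.
	 */
	n = atomic_load_explicit(&m->nr_extents, memory_order_acquire);
	return n ? m->extent[0] : -1;
}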

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 899c31060ff3..2129762a930e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -27,8 +27,47 @@
#include <linux/sort.h>

static struct kmem_cache *user_ns_cachep __read_mostly;
+
+/*
+ * The userns_state_mutex serializes all writes to any given map.
+ *
+ * Any map is only ever written once.
+ *
+ * An id map fits within 1 cache line on most architectures.
+ */
static DEFINE_MUTEX(userns_state_mutex);

+/*
+ * There is a data dependency between reading the count of the extents and
+ * reading the values of the extents. The desired behavior is to see the
+ * values of the extents that were written before the count of the extents.
+ *
+ * To achieve this, smp_store_release() is used to guarantee the write order,
+ * and smp_load_acquire() guarantees that we observe the written data.
+ */
+static inline void map_store_extents(struct uid_gid_map *map,
+ unsigned int extents)
+{
+ /*
+ * Ensure the map->extent[] stores happen-before we grow map->nr_extents
+ * to cover it.
+ *
+ * Matches the smp_load_acquire() in map_load_extents().
+ */
+ smp_store_release(&map->nr_extents, extents);
+}
+
+static inline unsigned int map_load_extents(struct uid_gid_map *map)
+{
+ /*
+ * Ensure the map->nr_extents load happens-before we try to access
+ * map->extent[], such that we guarantee the data is in fact there.
+ *
+ * Matches the smp_store_release() in map_store_extents().
+ */
+ return smp_load_acquire(&map->nr_extents);
+}
+
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *map);
@@ -296,9 +335,9 @@ map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 co
static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
struct uid_gid_extent *extent;
- unsigned extents = map->nr_extents;
- smp_rmb();
+ unsigned extents;

+ extents = map_load_extents(map);
if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
extent = map_id_range_down_base(extents, map, id, count);
else
@@ -359,9 +398,9 @@ map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
static u32 map_id_up(struct uid_gid_map *map, u32 id)
{
struct uid_gid_extent *extent;
- unsigned extents = map->nr_extents;
- smp_rmb();
+ unsigned extents;

+ extents = map_load_extents(map);
if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
extent = map_id_up_base(extents, map, id);
else
@@ -647,9 +686,9 @@ static void *m_start(struct seq_file *seq, loff_t *ppos,
struct uid_gid_map *map)
{
loff_t pos = *ppos;
- unsigned extents = map->nr_extents;
- smp_rmb();
+ unsigned extents;

+ extents = map_load_extents(map);
if (pos >= extents)
return NULL;

@@ -860,25 +899,6 @@ static ssize_t map_write(struct file *file, const char __user *buf,
char *kbuf = NULL, *pos, *next_line;
ssize_t ret = -EINVAL;

- /*
- * The userns_state_mutex serializes all writes to any given map.
- *
- * Any map is only ever written once.
- *
- * An id map fits within 1 cache line on most architectures.
- *
- * On read nothing needs to be done unless you are on an
- * architecture with a crazy cache coherency model like alpha.
- *
- * There is a one time data dependency between reading the
- * count of the extents and the values of the extents. The
- * desired behavior is to see the values of the extents that
- * were written before the count of the extents.
- *
- * To achieve this smp_wmb() is used on guarantee the write
- * order and smp_rmb() is guaranteed that we don't have crazy
- * architectures returning stale data.
- */
mutex_lock(&userns_state_mutex);

memset(&new_map, 0, sizeof(struct uid_gid_map));
@@ -1015,8 +1035,8 @@ static ssize_t map_write(struct file *file, const char __user *buf,
map->forward = new_map.forward;
map->reverse = new_map.reverse;
}
- smp_wmb();
- map->nr_extents = new_map.nr_extents;
+
+ map_store_extents(map, new_map.nr_extents);

*ppos = count;
ret = count;
--
2.14.1
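
P.S.: if anyone wants to see the pairing hold (or fail) outside the
kernel, below is a self-contained C11/pthreads demo of the same
publish pattern; again a sketch, not kernel code. As written, the
assert cannot trip. Weaken both accesses to memory_order_relaxed and
it can, on weakly ordered hardware; make them plain accesses and it
is a data race outright, which is the compiler-level hole the missing
READ_ONCE()/WRITE_ONCE() left open.

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>

static int payload;		/* the data being published */
static atomic_int published;	/* the "count": flips 0 -> 1 once */

static void *writer(void *arg)
{
	payload = 42;		/* plain store of the payload */
	/* release: publishes the payload store above */
	atomic_store_explicit(&published, 1, memory_order_release);
	return arg;
}

static void *reader(void *arg)
{
	/* acquire: once we see published == 1, payload == 42 follows */
	while (!atomic_load_explicit(&published, memory_order_acquire))
		;
	assert(payload == 42);
	return arg;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}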