[RFC][PATCH 1/2] fsnotify: separate fsnotify_mark_srcu for groups with permission events

From: Amir Goldstein
Date: Mon Nov 14 2016 - 06:49:10 EST


fsnotify_mark_srcu[1] protects reads of the first half of the
inode/vfsmount mark lists, which consists of the high priority (>0)
marks, whose masks may contain permission events.

fsnotify_mark_srcu[0] protects reads of the second half of the
inode/vfsmount mark lists, which consists of the low priority (0)
marks, whose masks do not contain permission events.
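
In event delivery this yields the following read-side pattern (a
condensed sketch of the fsnotify() change below, not literal code;
mark iteration details are elided):

	idx = srcu_read_lock(&fsnotify_mark_srcu[0]);
	perm_idx = srcu_read_lock(&fsnotify_mark_srcu[1]);

	/* marks are sorted so high priority (>0) groups come first */
	for each mark on the inode/vfsmount lists {
		if (group->priority == FS_PRIO_0 && perm_idx >= 0) {
			/* past the last high priority mark */
			srcu_read_unlock(&fsnotify_mark_srcu[1], perm_idx);
			perm_idx = -1;
		}
		send_to_group(...);
	}

	srcu_read_unlock(&fsnotify_mark_srcu[0], idx);
	if (perm_idx >= 0)
		srcu_read_unlock(&fsnotify_mark_srcu[1], perm_idx);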

High priority marks and low priority marks are added to different
destroy lists and freed by different reapers, each of which
synchronizes only the relevant srcu.
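
The corresponding free path, mirroring fsnotify_free_mark() and the
reaper work function below (simplified sketch; destroy_lock and the
delayed work indirection are elided):

	int id = FS_PRIO_SRCU(group->priority);

	list_add(&mark->g_list, destroy_lists[id]);
	/* ... later, in the reaper for this priority class ... */
	synchronize_srcu(&fsnotify_mark_srcu[id]);
	fsnotify_put_mark(mark);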

A follow-up change will use this separation to guarantee that
destroying low priority groups does not block on the handling of
permission events.

Signed-off-by: Amir Goldstein <amir73il@xxxxxxxxx>
---
fs/notify/fsnotify.c | 38 ++++++++++++++++++++------
fs/notify/fsnotify.h | 17 ++++++++++--
fs/notify/group.c | 2 +-
fs/notify/mark.c | 77 +++++++++++++++++++++++++++++++++++++++-------------
4 files changed, 102 insertions(+), 32 deletions(-)

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index db39de2..af5c523a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -192,9 +192,10 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
{
struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
- struct fsnotify_group *inode_group, *vfsmount_group;
+ struct fsnotify_group *inode_group, *vfsmount_group, *group;
struct mount *mnt;
- int idx, ret = 0;
+ int perm_idx, idx;
+ int ret = 0;
/* global tests shouldn't care about events on child only the specific event */
__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);

@@ -223,7 +224,14 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
!(mnt && test_mask & mnt->mnt_fsnotify_mask))
return 0;

- idx = srcu_read_lock(&fsnotify_mark_srcu);
+ /*
+ * The first mark on the list may be either low or high priority, so
+ * we must hold the read side of both srcu while traversing to it.
+ * Once we reach the first low priority mark, we may drop the
+ * read side of fsnotify_mark_srcu[1].
+ */
+ idx = srcu_read_lock(&fsnotify_mark_srcu[0]);
+ perm_idx = srcu_read_lock(&fsnotify_mark_srcu[1]);

if ((mask & FS_MODIFY) ||
(test_mask & to_tell->i_fsnotify_mask))
@@ -252,13 +260,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
if (inode_node) {
- inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
+ inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu[0]),
struct fsnotify_mark, obj_list);
- inode_group = inode_mark->group;
+ group = inode_group = inode_mark->group;
}

if (vfsmount_node) {
- vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
+ vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu[0]),
struct fsnotify_mark, obj_list);
- vfsmount_group = vfsmount_mark->group;
+ group = vfsmount_group = vfsmount_mark->group;
}

if (inode_group && vfsmount_group) {
@@ -270,8 +278,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
} else if (cmp < 0) {
vfsmount_group = NULL;
vfsmount_mark = NULL;
+ group = inode_group;
}
}
+
+ if (group->priority == FS_PRIO_0 && perm_idx >= 0) {
+ /* We are done iterating the high priority marks */
+ srcu_read_unlock(&fsnotify_mark_srcu[1], perm_idx);
+ perm_idx = -1;
+ }
+
ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
data, data_is, cookie, file_name);

@@ -287,7 +303,9 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
}
ret = 0;
out:
- srcu_read_unlock(&fsnotify_mark_srcu, idx);
+ srcu_read_unlock(&fsnotify_mark_srcu[0], idx);
+ if (perm_idx >= 0)
+ srcu_read_unlock(&fsnotify_mark_srcu[1], perm_idx);

return ret;
}
@@ -299,7 +317,9 @@ static __init int fsnotify_init(void)

BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23);

- ret = init_srcu_struct(&fsnotify_mark_srcu);
+ ret = init_srcu_struct(&fsnotify_mark_srcu[0]);
+ if (!ret)
+ ret = init_srcu_struct(&fsnotify_mark_srcu[1]);
if (ret)
panic("initializing fsnotify_mark_srcu");

diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 0a3bc2c..c1d6ae6 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -11,8 +11,19 @@
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);

-/* protects reads of inode and vfsmount marks list */
-extern struct srcu_struct fsnotify_mark_srcu;
+/*
+ * fsnotify_mark_srcu[1] protects reads of the first half of inode/vfsmount
+ * mark lists, which consists of the high priority (>0) marks, whose masks
+ * may contain permission events.
+ *
+ * fsnotify_mark_srcu[0] protects reads of the second half of inode/vfsmount
+ * mark lists, which consists of the low priority (0) marks, whose masks
+ * do not contain permission events.
+ */
+#define FS_PRIO_SRCU(prio) ((int)((prio) > FS_PRIO_0))
+#define FS_PRIO_SRCU_NUM 2
+
+extern struct srcu_struct fsnotify_mark_srcu[FS_PRIO_SRCU_NUM];

/* Calculate mask of events for a list of marks */
extern u32 fsnotify_recalc_mask(struct hlist_head *head);
@@ -61,7 +72,7 @@ extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
/*
* wait for fsnotify_mark_srcu period to end and free all marks in destroy_list
*/
-extern void fsnotify_mark_destroy_list(void);
+extern void fsnotify_mark_destroy_list(int priority);

/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
diff --git a/fs/notify/group.c b/fs/notify/group.c
index fbe3cbe..ef946c0 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -73,7 +73,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
* Wait for fsnotify_mark_srcu period to end and free all marks in
* destroy_list
*/
- fsnotify_mark_destroy_list();
+ fsnotify_mark_destroy_list(group->priority);

/*
* Since we have waited for fsnotify_mark_srcu in
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d3fea0b..44e39a5 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -93,12 +93,50 @@

#define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */

-struct srcu_struct fsnotify_mark_srcu;
+/*
+ * fsnotify_mark_srcu[1] protects reads of the first half of inode/vfsmount
+ * mark lists, which consists of the high priority (>0) marks, whose masks
+ * may contain permission events.
+ *
+ * fsnotify_mark_srcu[0] protects reads of the second half of inode/vfsmount
+ * mark lists, which consists of the low priority (0) marks, whose masks
+ * do not contain permission events.
+ *
+ * High priority marks and low priority marks are added to different
+ * destroy lists and freed by different reapers, each of which
+ * synchronizes only the relevant srcu.
+ */
+struct srcu_struct fsnotify_mark_srcu[FS_PRIO_SRCU_NUM];
static DEFINE_SPINLOCK(destroy_lock);
-static LIST_HEAD(destroy_list);
+static LIST_HEAD(destroy_list_0);
+static LIST_HEAD(destroy_list_1);
+static struct list_head *destroy_lists[FS_PRIO_SRCU_NUM] = {
+ &destroy_list_0,
+ &destroy_list_1
+};

static void fsnotify_mark_destroy_workfn(struct work_struct *work);
-static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
+static DECLARE_DELAYED_WORK(reaper_work_0, fsnotify_mark_destroy_workfn);
+static DECLARE_DELAYED_WORK(reaper_work_1, fsnotify_mark_destroy_workfn);
+static struct delayed_work *reapers[FS_PRIO_SRCU_NUM] = {
+ &reaper_work_0,
+ &reaper_work_1
+};
+
+static inline void fsnotify_destroy_list_add(struct fsnotify_mark *mark,
+ int priority)
+{
+ spin_lock(&destroy_lock);
+ list_add(&mark->g_list, destroy_lists[FS_PRIO_SRCU(priority)]);
+ spin_unlock(&destroy_lock);
+}
+
+static inline void fsnotify_destroy_list_reap(int priority)
+{
+ queue_delayed_work(system_unbound_wq,
+ reapers[FS_PRIO_SRCU(priority)],
+ FSNOTIFY_REAPER_DELAY);
+}

void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -202,9 +240,7 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
if (group->ops->freeing_mark)
group->ops->freeing_mark(mark, group);

- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
+ fsnotify_destroy_list_add(mark, group->priority);

return true;
}
@@ -216,10 +252,8 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
*/
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
- if (__fsnotify_free_mark(mark)) {
- queue_delayed_work(system_unbound_wq, &reaper_work,
- FSNOTIFY_REAPER_DELAY);
- }
+ if (__fsnotify_free_mark(mark))
+ fsnotify_destroy_list_reap(mark->group->priority);
}

void fsnotify_destroy_mark(struct fsnotify_mark *mark,
@@ -407,11 +441,8 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,

spin_unlock(&mark->lock);

- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- queue_delayed_work(system_unbound_wq, &reaper_work,
- FSNOTIFY_REAPER_DELAY);
+ fsnotify_destroy_list_add(mark, group->priority);
+ fsnotify_destroy_list_reap(group->priority);

return ret;
}
@@ -537,18 +568,26 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
/*
* Destroy all marks in destroy_list, waits for SRCU period to finish before
* actually freeing marks.
+ * High priority (>0) marks are on destroy_lists[1] and protected by
+ * fsnotify_mark_srcu[1].
+ * Low priority (0) marks are on destroy_lists[0] and protected by
+ * fsnotify_mark_srcu[0].
*/
-void fsnotify_mark_destroy_list(void)
+void fsnotify_mark_destroy_list(int priority)
{
struct fsnotify_mark *mark, *next;
struct list_head private_destroy_list;
+ int id = FS_PRIO_SRCU(priority);

spin_lock(&destroy_lock);
/* exchange the list head */
- list_replace_init(&destroy_list, &private_destroy_list);
+ list_replace_init(destroy_lists[id], &private_destroy_list);
spin_unlock(&destroy_lock);

- synchronize_srcu(&fsnotify_mark_srcu);
+ if (list_empty(&private_destroy_list))
+ return;
+
+ synchronize_srcu(&fsnotify_mark_srcu[id]);

list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
list_del_init(&mark->g_list);
@@ -558,5 +597,5 @@ void fsnotify_mark_destroy_list(void)

static void fsnotify_mark_destroy_workfn(struct work_struct *work)
{
- fsnotify_mark_destroy_list();
+ fsnotify_mark_destroy_list(to_delayed_work(work) != reapers[0]);
}
--
2.7.4