[PATCH 4.19 50/52] staging: erofs: fix race when the managed cache is enabled

From: Greg Kroah-Hartman
Date: Mon Mar 18 2019 - 05:41:53 EST

Next message: Greg Kroah-Hartman: "[PATCH 4.19 47/52] perf/x86/intel: Make dev_attr_allow_tsx_force_abort static"
Previous message: Elena Reshetova: "[RFC PATCH] x86/entry/64: randomize kernel stack offset upon syscall"
In reply to: Greg Kroah-Hartman: "[PATCH 4.19 52/52] vhost/vsock: fix vhost vsock cid hashing inconsistent"
Next in thread: Greg Kroah-Hartman: "[PATCH 4.19 47/52] perf/x86/intel: Make dev_attr_allow_tsx_force_abort static"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

4.19-stable review patch. If anyone has any objections, please let me know.

------------------

From: Gao Xiang <gaoxiang25@xxxxxxxxxx>

commit 51232df5e4b268936beccde5248f312a316800be upstream.

When the managed cache is enabled, the last reference count
of a workgroup must be used for its workstation.

Otherwise, it could lead to incorrect (un)freezes in
the reclaim path, and it would be harmful.

A typical race as follows:

Thread 1 (In the reclaim path) Thread 2
workgroup_freeze(grp, 1) refcnt = 1
...
workgroup_unfreeze(grp, 1) refcnt = 1
workgroup_get(grp) refcnt = 2 (x)
workgroup_put(grp) refcnt = 1 (x)
...unexpected behaviors

* grp is detached but still used, which violates cache-managed
freeze constraint.

Reviewed-by: Chao Yu <yuchao0@xxxxxxxxxx>
Signed-off-by: Gao Xiang <gaoxiang25@xxxxxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>

---
drivers/staging/erofs/internal.h | 1
drivers/staging/erofs/utils.c | 139 ++++++++++++++++++++++++++++-----------
2 files changed, 101 insertions(+), 39 deletions(-)

--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -260,6 +260,7 @@ repeat:
}

#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount)
+#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount)

extern int erofs_workgroup_put(struct erofs_workgroup *grp);

--- a/drivers/staging/erofs/utils.c
+++ b/drivers/staging/erofs/utils.c
@@ -87,12 +87,21 @@ int erofs_register_workgroup(struct supe
grp = (void *)((unsigned long)grp |
1UL << RADIX_TREE_EXCEPTIONAL_SHIFT);

- err = radix_tree_insert(&sbi->workstn_tree,
- grp->index, grp);
+ /*
+ * Bump up reference count before making this workgroup
+ * visible to other users in order to avoid potential UAF
+ * without serialized by erofs_workstn_lock.
+ */
+ __erofs_workgroup_get(grp);

- if (!err) {
- __erofs_workgroup_get(grp);
- }
+ err = radix_tree_insert(&sbi->workstn_tree,
+ grp->index, grp);
+ if (unlikely(err))
+ /*
+ * it's safe to decrease since the workgroup isn't visible
+ * and refcount >= 2 (cannot be freezed).
+ */
+ __erofs_workgroup_put(grp);

erofs_workstn_unlock(sbi);
radix_tree_preload_end();
@@ -101,19 +110,99 @@ int erofs_register_workgroup(struct supe

extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);

+static void __erofs_workgroup_free(struct erofs_workgroup *grp)
+{
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ erofs_workgroup_free_rcu(grp);
+}
+
int erofs_workgroup_put(struct erofs_workgroup *grp)
{
int count = atomic_dec_return(&grp->refcount);

if (count == 1)
atomic_long_inc(&erofs_global_shrink_cnt);
- else if (!count) {
- atomic_long_dec(&erofs_global_shrink_cnt);
- erofs_workgroup_free_rcu(grp);
- }
+ else if (!count)
+ __erofs_workgroup_free(grp);
return count;
}

+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+/* for cache-managed case, customized reclaim paths exist */
+static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp)
+{
+ erofs_workgroup_unfreeze(grp, 0);
+ __erofs_workgroup_free(grp);
+}
+
+bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
+ struct erofs_workgroup *grp,
+ bool cleanup)
+{
+ void *entry;
+
+ /*
+ * for managed cache enabled, the refcount of workgroups
+ * themselves could be < 0 (freezed). So there is no guarantee
+ * that all refcount > 0 if managed cache is enabled.
+ */
+ if (!erofs_workgroup_try_to_freeze(grp, 1))
+ return false;
+
+ /*
+ * note that all cached pages should be unlinked
+ * before delete it from the radix tree.
+ * Otherwise some cached pages of an orphan old workgroup
+ * could be still linked after the new one is available.
+ */
+ if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
+ erofs_workgroup_unfreeze(grp, 1);
+ return false;
+ }
+
+ /*
+ * it is impossible to fail after the workgroup is freezed,
+ * however in order to avoid some race conditions, add a
+ * DBG_BUGON to observe this in advance.
+ */
+ entry = radix_tree_delete(&sbi->workstn_tree, grp->index);
+ DBG_BUGON((void *)((unsigned long)entry &
+ ~RADIX_TREE_EXCEPTIONAL_ENTRY) != grp);
+
+ /*
+ * if managed cache is enable, the last refcount
+ * should indicate the related workstation.
+ */
+ erofs_workgroup_unfreeze_final(grp);
+ return true;
+}
+
+#else
+/* for nocache case, no customized reclaim path at all */
+bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
+ struct erofs_workgroup *grp,
+ bool cleanup)
+{
+ int cnt = atomic_read(&grp->refcount);
+ void *entry;
+
+ DBG_BUGON(cnt <= 0);
+ DBG_BUGON(cleanup && cnt != 1);
+
+ if (cnt > 1)
+ return false;
+
+ entry = radix_tree_delete(&sbi->workstn_tree, grp->index);
+ DBG_BUGON((void *)((unsigned long)entry &
+ ~RADIX_TREE_EXCEPTIONAL_ENTRY) != grp);
+
+ /* (rarely) could be grabbed again when freeing */
+ erofs_workgroup_put(grp);
+ return true;
+}
+
+#endif
+
unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
unsigned long nr_shrink,
bool cleanup)
@@ -130,43 +219,15 @@ repeat:
batch, first_index, PAGEVEC_SIZE);

for (i = 0; i < found; ++i) {
- int cnt;
struct erofs_workgroup *grp = (void *)
((unsigned long)batch[i] &
~RADIX_TREE_EXCEPTIONAL_ENTRY);

first_index = grp->index + 1;

- cnt = atomic_read(&grp->refcount);
- BUG_ON(cnt <= 0);
-
- if (cleanup)
- BUG_ON(cnt != 1);
-
-#ifndef EROFS_FS_HAS_MANAGED_CACHE
- else if (cnt > 1)
-#else
- if (!erofs_workgroup_try_to_freeze(grp, 1))
-#endif
- continue;
-
- if (radix_tree_delete(&sbi->workstn_tree,
- grp->index) != grp) {
-#ifdef EROFS_FS_HAS_MANAGED_CACHE
-skip:
- erofs_workgroup_unfreeze(grp, 1);
-#endif
+ /* try to shrink each valid workgroup */
+ if (!erofs_try_to_release_workgroup(sbi, grp, cleanup))
continue;
- }
-
-#ifdef EROFS_FS_HAS_MANAGED_CACHE
- if (erofs_try_to_free_all_cached_pages(sbi, grp))
- goto skip;
-
- erofs_workgroup_unfreeze(grp, 1);
-#endif
- /* (rarely) grabbed again when freeing */
- erofs_workgroup_put(grp);

++freed;
if (unlikely(!--nr_shrink))

Next message: Greg Kroah-Hartman: "[PATCH 4.19 47/52] perf/x86/intel: Make dev_attr_allow_tsx_force_abort static"
Previous message: Elena Reshetova: "[RFC PATCH] x86/entry/64: randomize kernel stack offset upon syscall"
In reply to: Greg Kroah-Hartman: "[PATCH 4.19 52/52] vhost/vsock: fix vhost vsock cid hashing inconsistent"
Next in thread: Greg Kroah-Hartman: "[PATCH 4.19 47/52] perf/x86/intel: Make dev_attr_allow_tsx_force_abort static"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]