Re: [PATCH] dax: Fix missed PMD wakeups

From: Matthew Wilcox
Date: Wed Jul 03 2019 - 15:53:07 EST


On Wed, Jul 03, 2019 at 10:01:37AM -0700, Dan Williams wrote:
> On Wed, Jul 3, 2019 at 5:17 AM Matthew Wilcox <willy@xxxxxxxxxxxxx> wrote:
> >
> > On Wed, Jul 03, 2019 at 12:24:54AM -0700, Dan Williams wrote:
> > > This fix may increase waitqueue contention, but a fix for that is saved
> > > for a larger rework. In the meantime this fix is suitable for -stable
> > > backports.
> >
> > I think this is too big for what it is; just the two-line patch to stop
> > incorporating the low bits of the PTE would be more appropriate.
>
> Sufficient, yes, "appropriate", not so sure. All those comments about
> pmd entry size are stale after this change.

But then they'll have to be put back in again. This seems to be working
for me, although I doubt I'm actually hitting the edge case that rocksdb
hits:

diff --git a/fs/dax.c b/fs/dax.c
index 2e48c7ebb973..e77bd6aef10c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -198,6 +198,10 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
* if it did.
*
* Must be called with the i_pages lock held.
+ *
+ * If the xa_state refers to a larger entry, then it may return a locked
+ * smaller entry (eg a PTE entry) without waiting for the smaller entry
+ * to be unlocked.
*/
static void *get_unlocked_entry(struct xa_state *xas)
{
@@ -211,7 +215,8 @@ static void *get_unlocked_entry(struct xa_state *xas)
for (;;) {
entry = xas_find_conflict(xas);
if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
- !dax_is_locked(entry))
+ !dax_is_locked(entry) ||
+ dax_entry_order(entry) < xas_get_order(xas))
return entry;

wq = dax_entry_waitqueue(xas, entry, &ewait.key);
@@ -253,8 +258,12 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)

static void put_unlocked_entry(struct xa_state *xas, void *entry)
{
- /* If we were the only waiter woken, wake the next one */
- if (entry)
+ /*
+ * If we were the only waiter woken, wake the next one.
+ * Do not wake anybody if the entry is locked; that indicates
+ * we weren't woken.
+ */
+ if (entry && !dax_is_locked(entry))
dax_wake_entry(xas, entry, false);
}

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 052e06ff4c36..b17289d92af4 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1529,6 +1529,27 @@ static inline void xas_set_order(struct xa_state *xas, unsigned long index,
#endif
}

+/**
+ * xas_get_order() - Get the order of the entry being operated on.
+ * @xas: XArray operation state.
+ *
+ * Return: The order of the entry.
+ */
+static inline unsigned int xas_get_order(const struct xa_state *xas)
+{
+ unsigned int order = xas->xa_shift;
+
+#ifdef CONFIG_XARRAY_MULTI
+ unsigned int sibs = xas->xa_sibs;
+
+ while (sibs) {
+ order++;
+ sibs /= 2;
+ }
+#endif
+ return order;
+}
+
/**
* xas_set_update() - Set up XArray operation state for a callback.
* @xas: XArray operation state.
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index 7df4f7f395bf..af024477ec93 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -95,6 +95,17 @@ static noinline void check_xa_err(struct xarray *xa)
// XA_BUG_ON(xa, xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) != -EINVAL);
}

+static noinline void check_xas_order(struct xarray *xa)
+{
+ XA_STATE(xas, xa, 0);
+ unsigned int i;
+
+ for (i = 0; i < sizeof(long) * 8; i++) {
+ xas_set_order(&xas, 0, i);
+ XA_BUG_ON(xa, xas_get_order(&xas) != i);
+ }
+}
+
static noinline void check_xas_retry(struct xarray *xa)
{
XA_STATE(xas, xa, 0);
@@ -1583,6 +1594,7 @@ static DEFINE_XARRAY(array);
static int xarray_checks(void)
{
check_xa_err(&array);
+ check_xas_order(&array);
check_xas_retry(&array);
check_xa_load(&array);
check_xa_mark(&array);