[PATCH v3 1/4] lib/dlock-list: Distributed and lock-protected lists

From: Waiman Long
Date: Fri Jul 15 2016 - 13:40:05 EST


Linked lists are used everywhere in the Linux kernel. However, if many
threads are trying to add entries to or delete entries from the same
linked list, it can create a performance bottleneck.

This patch introduces a new set of list APIs that provide a set of
distributed lists (one per CPU), each of which is protected by its own
spinlock. To the callers, however, the set of lists acts like a single
consolidated list. This allows list entry insertion and deletion
operations to happen in parallel instead of being serialized with a
global list and lock.

List entry insertion is strictly per cpu. List deletion, however, can
happen on a cpu other than the one that did the insertion. So we still
need a lock to protect each list. Because of that, there may still be
a small amount of contention when deletion is being done.

A new header file include/linux/dlock-list.h will be added with the
associated dlock_list_head and dlock_list_node structures. The following
functions are provided to manage the per-cpu list:

1. int alloc_dlock_list_head(struct dlock_list_head *dlist)
2. void dlock_list_add(struct dlock_list_node *node,
struct dlock_list_head *dlist)
3. void dlock_list_del(struct dlock_list_node *node)

Iteration over all the list entries within a group of per-cpu lists is
done by calling either the dlock_list_next() or dlock_list_next_safe()
function in a while loop. They correspond to the list_for_each_entry()
and list_for_each_entry_safe() macros respectively. The iteration
state is kept in a dlock_list_iter structure that is passed to the
iteration functions.

Suggested-by: Tejun Heo <tj@xxxxxxxxxx>
Signed-off-by: Waiman Long <Waiman.Long@xxxxxxx>
Reviewed-by: Jan Kara <jack@xxxxxxx>
---
include/linux/dlock-list.h | 135 +++++++++++++++++++++++
lib/Makefile | 2 +-
lib/dlock-list.c | 254 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 390 insertions(+), 1 deletions(-)
create mode 100644 include/linux/dlock-list.h
create mode 100644 lib/dlock-list.c

diff --git a/include/linux/dlock-list.h b/include/linux/dlock-list.h
new file mode 100644
index 0000000..2647b7d
--- /dev/null
+++ b/include/linux/dlock-list.h
@@ -0,0 +1,135 @@
+/*
+ * Distributed and locked list
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ *
+ * Authors: Waiman Long <waiman.long@xxxxxxx>
+ */
+#ifndef __LINUX_DLOCK_LIST_H
+#define __LINUX_DLOCK_LIST_H
+
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+
+/*
+ * include/linux/dlock-list.h
+ *
+ * A distributed (per-cpu) set of lists, each of which is protected by its
+ * own spinlock, but which acts like a single consolidated list to the
+ * callers.
+ *
+ * The dlock_list_head_percpu structure contains the spinlock; each
+ * dlock_list_node structure only contains a pointer to the spinlock of
+ * the dlock_list_head_percpu it has been added to.
+ */
+/*
+ * One per-cpu list head.
+ *
+ * @list: head of the list of dlock_list_node entries added on this CPU
+ * @lock: spinlock taken by the add, delete and iteration functions
+ *        before they modify or walk @list
+ */
+struct dlock_list_head_percpu {
+ struct list_head list;
+ spinlock_t lock;
+};
+
+/*
+ * Handle for the whole set of per-cpu lists.
+ *
+ * @head: per-cpu allocation of dlock_list_head_percpu; set up by
+ *        alloc_dlock_list_head() and released (and reset to NULL) by
+ *        free_dlock_list_head()
+ */
+struct dlock_list_head {
+ struct dlock_list_head_percpu __percpu *head;
+};
+
+/*
+ * dlock list node data structure
+ *
+ * @list:    linkage into one of the per-cpu lists
+ * @lockptr: spinlock of the per-cpu list the node was added to. It is
+ *           NULL after init_dlock_list_node(), set by dlock_list_add()
+ *           and reset to NULL by dlock_list_del().
+ */
+struct dlock_list_node {
+ struct list_head list;
+ spinlock_t *lockptr;
+};
+
+/*
+ * Static initializer for a struct dlock_list_head_percpu variable @name:
+ * an empty list (both links point back at the head itself) and an
+ * unlocked spinlock.
+ *
+ * The spinlock is a direct member of dlock_list_head_percpu, not of the
+ * embedded list_head, so it has to be initialized through .lock.
+ */
+#define DLOCK_LIST_HEAD_PERCPU_INIT(name)			\
+	{							\
+		.list.prev = &name.list,			\
+		.list.next = &name.list,			\
+		.lock = __SPIN_LOCK_UNLOCKED(name.lock),	\
+	}
+
+/*
+ * dlock list iteration state
+ *
+ * @cpu:  CPU whose per-cpu list is being walked; -1 before the first step
+ * @lock: spinlock of the per-cpu list selected by the iterator; NULL
+ *        before the first step
+ * @head: list head of the per-cpu list being walked
+ * @curr: current entry; meaningful after an iteration function has
+ *        returned true
+ * @next: look-ahead entry maintained by dlock_list_next_safe() only
+ *
+ * A fresh iterator has cpu == -1 and all other members zero/NULL, see
+ * DLOCK_LIST_ITER_INIT().
+ */
+struct dlock_list_iter {
+ int cpu;
+ spinlock_t *lock;
+ struct list_head *head; /* List head of current per-cpu list */
+ struct dlock_list_node *curr;
+ struct dlock_list_node *next;
+};
+
+/*
+ * Initializer for a struct dlock_list_iter. Only .cpu is named: it starts
+ * at -1, the value the iteration code feeds into cpumask_next() to look up
+ * the first possible CPU. Members that are not named in a designated
+ * initializer are implicitly zero/NULL.
+ */
+#define DLOCK_LIST_ITER_INIT() \
+ { \
+ .cpu = -1, \
+ }
+
+/*
+ * Define an iterator variable @s, initialized with DLOCK_LIST_ITER_INIT()
+ * so that it can be passed straight to the iteration functions.
+ */
+#define DEFINE_DLOCK_LIST_ITER(s) \
+ struct dlock_list_iter s = DLOCK_LIST_ITER_INIT()
+
+/*
+ * Put an existing iterator back into its initial state (cpu == -1, all
+ * other members zero/NULL), i.e. the state DEFINE_DLOCK_LIST_ITER() gives
+ * to a newly defined iterator.
+ */
+static inline void init_dlock_list_iter(struct dlock_list_iter *iter)
+{
+	struct dlock_list_iter pristine = DLOCK_LIST_ITER_INIT();
+
+	*iter = pristine;
+}
+
+/*
+ * Static initializer for a struct dlock_list_node variable @name: the
+ * list linkage points back at the node itself (an unlinked node). The
+ * lockptr member is not named and is therefore implicitly NULL.
+ */
+#define DLOCK_LIST_NODE_INIT(name) \
+ { \
+ .list.prev = &name.list, \
+ .list.next = &name.list, \
+ }
+
+/*
+ * Run-time initialization of a node: no associated lock and a list
+ * linkage that points back at the node itself.
+ */
+static inline void init_dlock_list_node(struct dlock_list_node *node)
+{
+	node->lockptr = NULL;
+	INIT_LIST_HEAD(&node->list);
+}
+
+/*
+ * Check if all the dlock lists are empty
+ *
+ * Every possible CPU's list is inspected in turn, without taking any of
+ * the per-cpu locks, and the walk stops at the first non-empty list.
+ *
+ * This can be a pretty expensive function call. If this function is required
+ * in a performance critical path, we may have to maintain a global count
+ * of the list entries in the global dlock_list_head structure instead.
+ */
+static inline bool dlock_list_empty(struct dlock_list_head *dlist)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct dlock_list_head_percpu *pcpu;
+
+		pcpu = per_cpu_ptr(dlist->head, cpu);
+		if (!list_empty(&pcpu->list))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Allocation and freeing of dlock list
+ *
+ * alloc_dlock_list_head() returns 0 on success and -ENOMEM if the per-cpu
+ * allocation fails. Neither function allocates or frees the
+ * dlock_list_head structure itself.
+ */
+extern int alloc_dlock_list_head(struct dlock_list_head *dlist);
+extern void free_dlock_list_head(struct dlock_list_head *dlist);
+
+/*
+ * The dlock list iteration functions which return true if iteration has
+ * to be continued. The iterator must have been initialized with
+ * DEFINE_DLOCK_LIST_ITER() or init_dlock_list_iter() before the first call.
+ */
+extern bool dlock_list_next(struct dlock_list_head *dlist,
+ struct dlock_list_iter *iter);
+extern bool dlock_list_next_safe(struct dlock_list_head *dlist,
+ struct dlock_list_iter *iter);
+
+/*
+ * The dlock list addition and deletion functions here are not irq-safe.
+ * Special irq-safe variants will have to be added if we need them.
+ */
+extern void dlock_list_add(struct dlock_list_node *node,
+ struct dlock_list_head *dlist);
+extern void dlock_list_del(struct dlock_list_node *node);
+
+#endif /* __LINUX_DLOCK_LIST_H */
diff --git a/lib/Makefile b/lib/Makefile
index 499fb35..92e8c38 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -40,7 +40,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
bsearch.o find_bit.o llist.o memweight.o kfifo.o \
percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
- once.o
+ once.o dlock-list.o
obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
obj-y += hexdump.o
diff --git a/lib/dlock-list.c b/lib/dlock-list.c
new file mode 100644
index 0000000..af4a9f3
--- /dev/null
+++ b/lib/dlock-list.c
@@ -0,0 +1,254 @@
+/*
+ * Distributed and locked list
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ *
+ * Authors: Waiman Long <waiman.long@xxxxxxx>
+ */
+#include <linux/dlock-list.h>
+#include <linux/lockdep.h>
+#include <linux/export.h>
+
+/*
+ * As all the locks in the dlock list are dynamically allocated, they need
+ * to belong to their own special lock class to avoid a warning and stack
+ * trace in the kernel log when lockdep is enabled. Statically allocated
+ * locks don't have this problem.
+ *
+ * alloc_dlock_list_head() assigns this key to every per-cpu lock it
+ * initializes.
+ */
+static struct lock_class_key dlock_list_key;
+
+/**
+ * alloc_dlock_list_head - Allocate and initialize the per-cpu list heads
+ * @dlist: Pointer to the dlock_list_head structure to be initialized
+ * Return: 0 if successful, -ENOMEM if memory allocation error
+ *
+ * Only the per-cpu part is allocated here; the dlock_list_head structure
+ * itself has to be provided by the caller, which allows it to be embedded
+ * directly into other structures. @dlist is left untouched if the
+ * allocation fails.
+ */
+int alloc_dlock_list_head(struct dlock_list_head *dlist)
+{
+	struct dlock_list_head_percpu __percpu *pcpu_heads;
+	int cpu;
+
+	pcpu_heads = alloc_percpu(struct dlock_list_head_percpu);
+	if (!pcpu_heads)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		/*
+		 * Keep the name "head": the lock initializer below may
+		 * turn its argument into the lock's debug name.
+		 */
+		struct dlock_list_head_percpu *head;
+
+		head = per_cpu_ptr(pcpu_heads, cpu);
+		INIT_LIST_HEAD(&head->list);
+		head->lock = __SPIN_LOCK_UNLOCKED(&head->lock);
+		/* Put all dlock list locks into their own lockdep class */
+		lockdep_set_class(&head->lock, &dlock_list_key);
+	}
+
+	/* Publish the fully initialized per-cpu heads */
+	dlist->head = pcpu_heads;
+	return 0;
+}
+EXPORT_SYMBOL(alloc_dlock_list_head);
+
+/**
+ * free_dlock_list_head - Free the per-cpu list heads of a dlock list
+ * @dlist: Pointer of the dlock_list_head structure to be freed
+ *
+ * Only the per-cpu allocation is released, and the pointer to it is
+ * reset to NULL. The dlock_list_head structure itself is not freed; the
+ * caller will have to do it, if necessary.
+ */
+void free_dlock_list_head(struct dlock_list_head *dlist)
+{
+	struct dlock_list_head_percpu __percpu *pcpu_heads = dlist->head;
+
+	free_percpu(pcpu_heads);
+	dlist->head = NULL;
+}
+EXPORT_SYMBOL(free_dlock_list_head);
+
+/**
+ * dlock_list_add - Adds a node to the given dlock list
+ * @node : The node to be added
+ * @dlist: The dlock list where the node is to be added
+ *
+ * The node goes onto the list of the CPU that executes dlock_list_add().
+ * As deletion may be done by a different CPU, the per-cpu lock is still
+ * taken to protect the content of the list. The node remembers that lock
+ * in its lockptr so that dlock_list_del() can find it again.
+ */
+void dlock_list_add(struct dlock_list_node *node,
+		    struct dlock_list_head *dlist)
+{
+	struct dlock_list_head_percpu *pcpu;
+	spinlock_t *lock;
+
+	/*
+	 * Disable preemption to make sure that the CPU won't get changed.
+	 */
+	pcpu = get_cpu_ptr(dlist->head);
+	lock = &pcpu->lock;
+
+	spin_lock(lock);
+	node->lockptr = lock;
+	list_add(&node->list, &pcpu->list);
+	spin_unlock(lock);
+
+	put_cpu_ptr(dlist->head);
+}
+EXPORT_SYMBOL(dlock_list_add);
+
+/**
+ * dlock_list_del - Delete a node from a dlock list
+ * @node : The node to be deleted
+ *
+ * The node's lock pointer is sampled once without any lock held and then
+ * checked again after that lock has been taken, to guard against
+ * concurrent deletion of the same node. If the lock pointer has changed
+ * in between (to NULL or to a different lock), the deletion is assumed to
+ * have been done elsewhere and the node is left alone. A warning is
+ * printed in that case, as well as when the node has no lock at all, as
+ * either is likely to be a bug.
+ */
+void dlock_list_del(struct dlock_list_node *node)
+{
+	spinlock_t *lock = READ_ONCE(node->lockptr);
+
+	if (unlikely(!lock)) {
+		WARN_ONCE(1,
+			"dlock_list_del: node 0x%lx has no associated lock\n",
+			(unsigned long)node);
+		return;
+	}
+
+	spin_lock(lock);
+	if (unlikely(node->lockptr != lock)) {
+		/*
+		 * This path should never be executed.
+		 */
+		WARN_ON_ONCE(1);
+	} else {
+		list_del_init(&node->list);
+		node->lockptr = NULL;
+	}
+	spin_unlock(lock);
+}
+EXPORT_SYMBOL(dlock_list_del);
+
+/*
+ * Helper function to find the first entry of the next non-empty per-cpu
+ * list. It works somewhat like for_each_possible_cpu(cpu).
+ *
+ * The lock of the list that was being walked, if any, is released first.
+ * On success the lock of the newly selected list is held, iter->head
+ * points to that list and iter->curr to its first entry.
+ *
+ * iter->lock is non-NULL only while the iterator really holds that lock:
+ * it is cleared every time the lock is dropped, so that a later call
+ * cannot unlock a lock that has already been released.
+ *
+ * Return: true if the entry is found, false if all the lists exhausted
+ */
+static inline bool dlock_list_next_cpu(struct dlock_list_head *dlist,
+				       struct dlock_list_iter *iter)
+{
+	if (iter->lock) {
+		spin_unlock(iter->lock);
+		iter->lock = NULL;
+	}
+next_cpu:
+	/*
+	 * for_each_possible_cpu(cpu)
+	 */
+	iter->cpu = cpumask_next(iter->cpu, cpu_possible_mask);
+	if (iter->cpu >= nr_cpu_ids)
+		return false;	/* All the per-cpu lists iterated */
+
+	iter->head = &per_cpu_ptr(dlist->head, iter->cpu)->list;
+	/* Lockless peek: skip a list that looks empty without locking it */
+	if (list_empty(iter->head))
+		goto next_cpu;
+
+	iter->lock = &per_cpu_ptr(dlist->head, iter->cpu)->lock;
+	spin_lock(iter->lock);
+	/*
+	 * There is a slight chance that the list may become empty just
+	 * before the lock is acquired. So an additional check is
+	 * needed to make sure that iter->curr points to a valid entry.
+	 */
+	if (list_empty(iter->head)) {
+		spin_unlock(iter->lock);
+		iter->lock = NULL;	/* Nothing is held any more */
+		goto next_cpu;
+	}
+	iter->curr = list_entry(iter->head->next,
+				struct dlock_list_node, list);
+	return true;
+}
+
+/**
+ * dlock_list_next - Iterate to the next entry of the dlock list
+ * @dlist: Pointer to the dlock_list_head structure
+ * @iter : Pointer to the dlock list iterator structure
+ * Return: true if the next entry is found, false if all the entries iterated
+ *
+ * The iterator has to be properly initialized before calling this function.
+ * When true is returned, iter->curr is the entry to process.
+ *
+ * This iteration function isn't safe with respect to list entry removal, but
+ * it can correctly iterate newly added entries right after the current one.
+ * This iteration function is designed to be used in a while loop.
+ *
+ * Usage example:
+ *
+ * DEFINE_DLOCK_LIST_ITER(iter);
+ * while (dlock_list_next(dlist, &iter)) {
+ *	...
+ * }
+ */
+bool dlock_list_next(struct dlock_list_head *dlist,
+		     struct dlock_list_iter *iter)
+{
+	bool exhausted;
+
+	/* Step past the current entry once the walk has started */
+	if (iter->curr)
+		iter->curr = list_next_entry(iter->curr, list);
+
+	/*
+	 * Either nothing has been visited yet or the step above arrived
+	 * back at the list head: the current per-cpu list is done, so try
+	 * the next per-cpu list.
+	 */
+	exhausted = !iter->curr || (&iter->curr->list == iter->head);
+	if (exhausted && !dlock_list_next_cpu(dlist, iter))
+		return false;
+
+	WARN_ON_ONCE(iter->curr->lockptr != iter->lock);
+	return true;	/* Continue the iteration */
+}
+EXPORT_SYMBOL(dlock_list_next);
+
+/**
+ * dlock_list_next_safe - Removal-safe iterator of dlock list
+ * @dlist: Pointer to the dlock_list_head structure
+ * @iter : Pointer to the dlock list iterator structure
+ * Return: true if the next entry is found, false if all the entries iterated
+ *
+ * The iterator has to be properly initialized before calling this function.
+ * When true is returned, iter->curr is the entry to process and iter->next
+ * already holds its successor, which is what makes this function safe with
+ * respect to removal of the current entry. However, it cannot correctly
+ * iterate newly added entries right after the current one.
+ */
+bool dlock_list_next_safe(struct dlock_list_head *dlist,
+			  struct dlock_list_iter *iter)
+{
+	bool exhausted;
+
+	/*
+	 * Once the walk has started, advance to the successor saved by the
+	 * previous call and look one entry further ahead.
+	 */
+	if (iter->curr) {
+		iter->curr = iter->next;
+		iter->next = list_next_entry(iter->curr, list);
+	}
+
+	exhausted = !iter->curr || (&iter->curr->list == iter->head);
+	if (exhausted) {
+		/*
+		 * The current per-cpu list has been exhausted, try the next
+		 * per-cpu list and prime the look-ahead from its first entry.
+		 */
+		if (!dlock_list_next_cpu(dlist, iter))
+			return false;
+		iter->next = list_next_entry(iter->curr, list);
+	}
+
+	WARN_ON_ONCE(iter->curr->lockptr != iter->lock);
+	return true;	/* Continue the iteration */
+}
+EXPORT_SYMBOL(dlock_list_next_safe);
--
1.7.1