Re: [PATCH v3 1/3] lib/percpu-list: Per-cpu list with associated per-cpu locks
From: Boqun Feng
Date: Tue Feb 23 2016 - 21:00:59 EST
Hi Waiman,
On Tue, Feb 23, 2016 at 02:04:30PM -0500, Waiman Long wrote:
> Linked list is used everywhere in the Linux kernel. However, if many
> threads are trying to add or delete entries into the same linked list,
> it can create a performance bottleneck.
>
> This patch introduces a new per-cpu list subsystem with associated
> per-cpu locks for protecting each of the lists individually. This
> allows list entries insertion and deletion operations to happen in
> parallel instead of being serialized with a global list and lock.
>
> List entry insertion is strictly per cpu. List deletion, however, can
> happen in a cpu other than the one that did the insertion. So we still
> need lock to protect the list. Because of that, there may still be
> a small amount of contention when deletion is being done.
>
> A new header file include/linux/percpu-list.h will be added with the
> associated pcpu_list_head and pcpu_list_node structures. The following
> functions are provided to manage the per-cpu list:
>
> 1. int init_pcpu_list_head(struct pcpu_list_head **ppcpu_head)
> 2. void pcpu_list_add(struct pcpu_list_node *node,
> struct pcpu_list_head *head)
> 3. void pcpu_list_del(struct pcpu_list_node *node)
>
> Iteration of all the list entries within a group of per-cpu
> lists is done by calling either the pcpu_list_iterate() or
> pcpu_list_iterate_safe() functions in a while loop. They correspond
> to the list_for_each_entry() and list_for_each_entry_safe() macros
> respectively. The iteration states are kept in a pcpu_list_state
> structure that is passed to the iteration functions.
>
> Signed-off-by: Waiman Long <Waiman.Long@xxxxxxx>
> ---
> include/linux/percpu-list.h | 235 +++++++++++++++++++++++++++++++++++++++++++
> lib/Makefile | 2 +-
> lib/percpu-list.c | 85 ++++++++++++++++
> 3 files changed, 321 insertions(+), 1 deletions(-)
> create mode 100644 include/linux/percpu-list.h
> create mode 100644 lib/percpu-list.c
>
> diff --git a/include/linux/percpu-list.h b/include/linux/percpu-list.h
> new file mode 100644
> index 0000000..8759fec
> --- /dev/null
> +++ b/include/linux/percpu-list.h
> @@ -0,0 +1,235 @@
> +/*
> + * Per-cpu list
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
> + *
> + * Authors: Waiman Long <waiman.long@xxxxxxx>
> + */
> +#ifndef __LINUX_PERCPU_LIST_H
> +#define __LINUX_PERCPU_LIST_H
> +
> +#include <linux/spinlock.h>
> +#include <linux/list.h>
> +#include <linux/percpu.h>
> +
> +/*
> + * include/linux/percpu-list.h
> + *
> + * A per-cpu list protected by a per-cpu spinlock.
> + *
> + * The pcpu_list_head structure contains the spinlock, the other
> + * pcpu_list_node structures only contains a pointer to the spinlock in
> + * pcpu_list_head.
> + */
> +struct pcpu_list_head {
> + struct list_head list;
> + spinlock_t lock;
> +};
> +
> +struct pcpu_list_node {
> + struct list_head list;
> + spinlock_t *lockptr;
> +};
> +
> +/*
> + * Per-cpu list iteration state
> + */
> +struct pcpu_list_state {
> + int cpu;
> + spinlock_t *lock;
> + struct list_head *head; /* List head of current per-cpu list */
> + struct pcpu_list_node *curr;
> + struct pcpu_list_node *next;
> +};
> +
> +#define PCPU_LIST_HEAD_INIT(name) \
> + { \
> + .list.prev = &name.list, \
> + .list.next = &name.list, \
> + .list.lock = __SPIN_LOCK_UNLOCKED(name), \
> + }
> +
> +#define PCPU_LIST_NODE_INIT(name) \
> + { \
> + .list.prev = &name.list, \
> + .list.next = &name.list, \
> + .list.lockptr = NULL \
> + }
> +
> +#define PCPU_LIST_STATE_INIT() \
> + { \
> + .cpu = -1, \
> + .lock = NULL, \
> + .head = NULL, \
> + .curr = NULL, \
> + .next = NULL, \
> + }
> +
> +#define DEFINE_PCPU_LIST_STATE(s) \
> + struct pcpu_list_state s = PCPU_LIST_STATE_INIT()
> +
> +#define pcpu_list_next_entry(pos, member) list_next_entry(pos, member.list)
> +
> +static inline void init_pcpu_list_node(struct pcpu_list_node *node)
> +{
> + INIT_LIST_HEAD(&node->list);
> + node->lockptr = NULL;
> +}
> +
> +static inline void free_pcpu_list_head(struct pcpu_list_head **ppcpu_head)
> +{
> + free_percpu(*ppcpu_head);
> + *ppcpu_head = NULL;
> +}
> +
> +static inline void init_pcpu_list_state(struct pcpu_list_state *state)
> +{
> + state->cpu = -1;
> + state->lock = NULL;
> + state->head = NULL;
> + state->curr = NULL;
> + state->next = NULL;
> +}
> +
> +#if NR_CPUS == 1
> +/*
> + * For uniprocessor, the list head and lock in struct pcpu_list_head are
> + * used directly.
> + */
> +static inline bool pcpu_list_empty(struct pcpu_list_head *pcpu_head)
> +{
> + return list_empty(&pcpu_head->list);
> +}
> +
> +static __always_inline bool
> +__pcpu_list_next_cpu(struct pcpu_list_head *head, struct pcpu_list_state *state)
> +{
> + if (state->lock)
> + spin_unlock(state->lock);
> +
> + if (state->cpu++ >= 0)
> + return false;
> +
> + state->curr = list_entry(head->list.next, struct pcpu_list_node, list);
> + if (list_empty(&state->curr->list))
> + return false;
> + state->lock = &head->lock;
> + spin_lock(state->lock);
> + return true;
> +
> +}
> +#else /* NR_CPUS == 1 */
> +/*
> + * Multiprocessor
> + */
> +static inline bool pcpu_list_empty(struct pcpu_list_head *pcpu_head)
> +{
> + int cpu;
> +
> + for_each_possible_cpu(cpu)
> + if (!list_empty(&per_cpu_ptr(pcpu_head, cpu)->list))
> + return false;
> + return true;
> +}
> +
> +/*
> + * Helper function to find the first entry of the next per-cpu list
> + * It works somewhat like for_each_possible_cpu(cpu).
> + *
> + * Return: true if the entry is found, false if all the lists exhausted
> + */
> +static __always_inline bool
> +__pcpu_list_next_cpu(struct pcpu_list_head *head, struct pcpu_list_state *state)
> +{
> + if (state->lock)
> + spin_unlock(state->lock);
> +next_cpu:
> + /*
> + * for_each_possible_cpu(cpu)
> + */
> + state->cpu = cpumask_next(state->cpu, cpu_possible_mask);
> + if (state->cpu >= nr_cpu_ids)
> + return false; /* All the per-cpu lists iterated */
> +
> + state->head = &per_cpu_ptr(head, state->cpu)->list;
> + state->lock = &per_cpu_ptr(head, state->cpu)->lock;
> + state->curr = list_entry(state->head->next,
> + struct pcpu_list_node, list);
> + if (&state->curr->list == state->head)
> + goto next_cpu;
> +
> + spin_lock(state->lock);
> + return true;
> +}
> +#endif /* NR_CPUS == 1 */
> +
> +/*
> + * Iterate to the next entry of the group of per-cpu lists
> + *
> + * Return: true if the next entry is found, false if all the entries iterated
> + */
> +static inline bool pcpu_list_iterate(struct pcpu_list_head *head,
> + struct pcpu_list_state *state)
> +{
> + /*
> + * Find next entry
> + */
> + if (state->curr)
> + state->curr = list_next_entry(state->curr, list);
> +
> + if (!state->curr || (&state->curr->list == state->head)) {
> + /*
> + * The current per-cpu list has been exhausted, try the next
> + * per-cpu list.
> + */
> + if (!__pcpu_list_next_cpu(head, state))
> + return false;
> + }
> + return true; /* Continue the iteration */
> +}
> +
> +/*
> + * Iterate to the next entry of the group of per-cpu lists and safe
> + * against removal of list_entry
> + *
> + * Return: true if the next entry is found, false if all the entries iterated
> + */
> +static inline bool pcpu_list_iterate_safe(struct pcpu_list_head *head,
> + struct pcpu_list_state *state)
> +{
> + /*
> + * Find next entry
> + */
> + if (state->curr) {
> + state->curr = state->next;
> + state->next = list_next_entry(state->next, list);
> + }
> +
> + if (!state->curr || (&state->curr->list == state->head)) {
> + /*
> + * The current per-cpu list has been exhausted, try the next
> + * per-cpu list.
> + */
> + if (!__pcpu_list_next_cpu(head, state))
> + return false;
> + state->next = list_next_entry(state->curr, list);
> + }
> + return true; /* Continue the iteration */
> +}
> +
> +extern int init_pcpu_list_head(struct pcpu_list_head **ppcpu_head);
> +extern void pcpu_list_add(struct pcpu_list_node *node,
> + struct pcpu_list_head *head);
> +extern void pcpu_list_del(struct pcpu_list_node *node);
> +
> +#endif /* __LINUX_PERCPU_LIST_H */
> diff --git a/lib/Makefile b/lib/Makefile
> index a7c26a4..71a25d4 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -27,7 +27,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
> gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
> bsearch.o find_bit.o llist.o memweight.o kfifo.o \
> percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
> - once.o
> + once.o percpu-list.o
> obj-y += string_helpers.o
> obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
> obj-y += hexdump.o
> diff --git a/lib/percpu-list.c b/lib/percpu-list.c
> new file mode 100644
> index 0000000..45bbb2a
> --- /dev/null
> +++ b/lib/percpu-list.c
> @@ -0,0 +1,85 @@
> +/*
> + * Per-cpu list
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
> + *
> + * Authors: Waiman Long <waiman.long@xxxxxxx>
> + */
> +#include <linux/percpu-list.h>
> +
> +/*
> + * Initialize the per-cpu list
> + */
> +int init_pcpu_list_head(struct pcpu_list_head **ppcpu_head)
> +{
> + struct pcpu_list_head *pcpu_head = alloc_percpu(struct pcpu_list_head);
> + int cpu;
> +
> + if (!pcpu_head)
> + return -ENOMEM;
> +
> + for_each_possible_cpu(cpu) {
> + struct pcpu_list_head *head = per_cpu_ptr(pcpu_head, cpu);
> +
> + INIT_LIST_HEAD(&head->list);
> + head->lock = __SPIN_LOCK_UNLOCKED(&head->lock);
> + }
> +
> + *ppcpu_head = pcpu_head;
> + return 0;
> +}
> +
> +/*
> + * List selection is based on the CPU being used when the pcpu_list_add()
> + * function is called. However, deletion may be done by a different CPU.
> + * So we still need to use a lock to protect the content of the list.
> + */
> +void pcpu_list_add(struct pcpu_list_node *node, struct pcpu_list_head *head)
> +{
> + spinlock_t *lock;
> +
> + /*
> + * There is a very slight chance the cpu will be changed
> + * (by preemption) before calling spin_lock(). We only need to put
> + * the node in one of the per-cpu lists. It may not need to be
> + * that of the current cpu.
> + */
Just curious about the comment here, what if the following happens:
    CPU 0                               CPU 1
    =====================               =====================
    task_1:
    lock = this_cpu_ptr(&head->lock); // head->lock is on CPU0
    <preempted>
                                        continue to task_1:
                                        spin_lock(lock);
                                        node->lockptr = lock;
                                        // head->list is on CPU1
                                        list_add(&node->list, this_cpu_ptr(&head->list));
                                        spin_unlock(lock);
This ends up with the node on CPU1's list while its ->lockptr points to
the lock on CPU0.
If there is another node that is on CPU1's list and whose ->lockptr points
to the lock on CPU1, what will happen if these two nodes get deleted
simultaneously?
Regards,
Boqun
> + lock = this_cpu_ptr(&head->lock);
> + spin_lock(lock);
> + node->lockptr = lock;
> + list_add(&node->list, this_cpu_ptr(&head->list));
> + spin_unlock(lock);
> +}
> +
> +/*
> + * Delete a node from a percpu list
> + *
> + * We need to check the lock pointer again after taking the lock to guard
> + * against concurrent delete of the same node. If the lock pointer changes
> + * (becomes NULL or to a different one), we assume that the deletion was done
> + * elsewhere.
> + */
> +void pcpu_list_del(struct pcpu_list_node *node)
> +{
> + spinlock_t *lock = READ_ONCE(node->lockptr);
> +
> + if (unlikely(!lock))
> + return;
> +
> + spin_lock(lock);
> + if (likely(lock == node->lockptr)) {
> + list_del_init(&node->list);
> + node->lockptr = NULL;
> + }
> + spin_unlock(lock);
> +}
> --
> 1.7.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Attachment:
signature.asc
Description: PGP signature