Re: [PATCH v8 4/6] lib/dlock-list: Make sibling CPUs share the same linked list

From: Jan Kara
Date: Wed Nov 01 2017 - 04:38:35 EST


On Tue 31-10-17 14:50:58, Waiman Long wrote:
> The dlock list needs one list for each of the CPUs available. However,
> for sibling CPUs, they are sharing the L2 and probably L1 caches
> too. As a result, there is not much to gain in term of avoiding
> cacheline contention while increasing the cacheline footprint of the
> L1/L2 caches as separate lists may need to be in the cache.
>
> This patch makes all the sibling CPUs share the same list, thus
> reducing the number of lists that need to be maintained in each
> dlock list without having any noticeable impact on performance. It
> also improves dlock list iteration performance as fewer lists need
> to be iterated.
>
> Signed-off-by: Waiman Long <longman@xxxxxxxxxx>

The patch looks good to me. You can add:

Reviewed-by: Jan Kara <jack@xxxxxxx>

Honza

> ---
> lib/dlock-list.c | 74 ++++++++++++++++++++++++++++++++++++++++++++------------
> 1 file changed, 59 insertions(+), 15 deletions(-)
>
> diff --git a/lib/dlock-list.c b/lib/dlock-list.c
> index 17ced06..a4ddecc 100644
> --- a/lib/dlock-list.c
> +++ b/lib/dlock-list.c
> @@ -25,31 +25,65 @@
> * The distributed and locked list is a distributed set of lists each of
> * which is protected by its own spinlock, but acts like a single
> * consolidated list to the callers. For scaling purpose, the number of
> - * lists used is equal to the number of possible CPUs in the system to
> - * minimize contention.
> + * lists used is equal to the number of possible cores in the system to
> + * minimize contention. All threads of the same CPU core will share the
> + * same list.
> *
> - * However, it is possible that individual CPU numbers may be equal to
> - * or greater than the number of possible CPUs when there are holes in
> - * the CPU number list. As a result, we need to map the CPU number to a
> - * list index.
> + * We need to map each CPU number to a list index.
> */
> static DEFINE_PER_CPU_READ_MOSTLY(int, cpu2idx);
> +static int nr_dlock_lists __read_mostly;
>
> /*
> - * Initialize cpu2idx mapping table
> + * Initialize cpu2idx mapping table & nr_dlock_lists.
> *
> * It is possible that a dlock-list can be allocated before the cpu2idx is
> * initialized. In this case, all the cpus are mapped to the first entry
> * before initialization.
> *
> + * All the sibling CPUs of a sibling group will map to the same dlock list so
> + * as to reduce the number of dlock lists to be maintained while minimizing
> + * cacheline contention.
> + *
> + * As the sibling masks are set up in the core initcall phase, this function
> + * has to be done in the postcore phase to get the right data.
> */
> static int __init cpu2idx_init(void)
> {
> int idx, cpu;
> + struct cpumask *sibling_mask;
> + static struct cpumask mask __initdata;
>
> + cpumask_clear(&mask);
> idx = 0;
> - for_each_possible_cpu(cpu)
> - per_cpu(cpu2idx, cpu) = idx++;
> + for_each_possible_cpu(cpu) {
> + int scpu;
> +
> + if (cpumask_test_cpu(cpu, &mask))
> + continue;
> + per_cpu(cpu2idx, cpu) = idx;
> + cpumask_set_cpu(cpu, &mask);
> +
> + sibling_mask = topology_sibling_cpumask(cpu);
> + if (sibling_mask) {
> + for_each_cpu(scpu, sibling_mask) {
> + per_cpu(cpu2idx, scpu) = idx;
> + cpumask_set_cpu(scpu, &mask);
> + }
> + }
> + idx++;
> + }
> +
> + /*
> + * nr_dlock_lists can only be set after cpu2idx is properly
> + * initialized.
> + */
> + smp_mb();
> + nr_dlock_lists = idx;
> + WARN_ON(nr_dlock_lists > nr_cpu_ids);
> +
> + pr_info("dlock-list: %d head entries per dlock list.\n",
> + nr_dlock_lists);
> return 0;
> }
> postcore_initcall(cpu2idx_init);
> @@ -67,19 +101,23 @@ static int __init cpu2idx_init(void)
> *
> * Dynamically allocated locks need to have their own special lock class
> * to avoid lockdep warning.
> + *
> + * Since nr_dlock_lists will always be <= nr_cpu_ids, having more lists
> + * than necessary allocated is not a problem other than some wasted memory.
> + * The extra lists will not be ever used as all the cpu2idx entries will be
> + * 0 before initialization.
> */
> int __alloc_dlock_list_heads(struct dlock_list_heads *dlist,
> struct lock_class_key *key)
> {
> - int idx;
> + int idx, cnt = nr_dlock_lists ? nr_dlock_lists : nr_cpu_ids;
>
> - dlist->heads = kcalloc(nr_cpu_ids, sizeof(struct dlock_list_head),
> - GFP_KERNEL);
> + dlist->heads = kcalloc(cnt, sizeof(struct dlock_list_head), GFP_KERNEL);
>
> if (!dlist->heads)
> return -ENOMEM;
>
> - for (idx = 0; idx < nr_cpu_ids; idx++) {
> + for (idx = 0; idx < cnt; idx++) {
> struct dlock_list_head *head = &dlist->heads[idx];
>
> INIT_LIST_HEAD(&head->list);
> @@ -117,7 +155,10 @@ bool dlock_lists_empty(struct dlock_list_heads *dlist)
> {
> int idx;
>
> - for (idx = 0; idx < nr_cpu_ids; idx++)
> + /* Shouldn't be called before nr_dlock_lists is initialized */
> + WARN_ON_ONCE(!nr_dlock_lists);
> +
> + for (idx = 0; idx < nr_dlock_lists; idx++)
> if (!list_empty(&dlist->heads[idx].list))
> return false;
> return true;
> @@ -199,6 +240,9 @@ struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter)
> struct dlock_list_node *next;
> struct dlock_list_head *head;
>
> + /* Shouldn't be called before nr_dlock_lists is initialized */
> + WARN_ON_ONCE(!nr_dlock_lists);
> +
> restart:
> if (iter->entry) {
> spin_unlock(&iter->entry->lock);
> @@ -209,7 +253,7 @@ struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter)
> /*
> * Try next list
> */
> - if (++iter->index >= nr_cpu_ids)
> + if (++iter->index >= nr_dlock_lists)
> return NULL; /* All the entries iterated */
>
> if (list_empty(&iter->head[iter->index].list))
> --
> 1.8.3.1
>
>
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR