severe proc dentry lock contention

From: Junxiao Bi
Date: Thu Jun 18 2020 - 18:17:42 EST


Hi,

When debugging some performance issue, i found that thousands of threads exit around same time could cause a severe spin lock contention on proc dentry "/proc/$parent_process_pid/task/", that's because threads needs to clean up their pid file from that dir when exit. Check the following standalone test case that simulated the case and perf top result on v5.7 kernel. Any idea on how to fix this?


 PerfTop: 48891 irqs/sec kernel:95.6% exact: 100.0% lost: 0/0 drop: 0/0 [4000Hz cycles], (all, 72 CPUs)
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

ÂÂÂ 66.10%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] native_queued_spin_lock_slowpath
ÂÂÂÂ 1.13%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] _raw_spin_lock
ÂÂÂÂ 0.84%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] clear_page_erms
ÂÂÂÂ 0.82%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] queued_write_lock_slowpath
ÂÂÂÂ 0.64%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] proc_task_readdir
ÂÂÂÂ 0.61%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] find_idlest_group.isra.95
ÂÂÂÂ 0.61%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] syscall_return_via_sysret
ÂÂÂÂ 0.55%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] entry_SYSCALL_64
ÂÂÂÂ 0.49%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] memcpy_erms
ÂÂÂÂ 0.46%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] update_cfs_group
ÂÂÂÂ 0.41%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] get_pid_task
ÂÂÂÂ 0.39%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] _raw_spin_lock_irqsave
ÂÂÂÂ 0.37%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] __list_del_entry_valid
ÂÂÂÂ 0.34%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] get_page_from_freelist
ÂÂÂÂ 0.34%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] __d_lookup
ÂÂÂÂ 0.32%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] update_load_avg
ÂÂÂÂ 0.31%Â libc-2.17.soÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [.] get_next_seq
ÂÂÂÂ 0.27%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] avc_has_perm_noaudit
ÂÂÂÂ 0.26%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] __sched_text_start
ÂÂÂÂ 0.25%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] selinux_inode_permission
ÂÂÂÂ 0.25%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] __slab_free
ÂÂÂÂ 0.24%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] detach_entity_cfs_rq
ÂÂÂÂ 0.23%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] zap_pte_range
ÂÂÂÂ 0.22%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] _find_next_bit.constprop.1
ÂÂÂÂ 0.22%Â libc-2.17.soÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [.] vfprintf
ÂÂÂÂ 0.20%Â libc-2.17.soÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [.] _int_malloc
ÂÂÂÂ 0.19%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] _raw_spin_lock_irq
ÂÂÂÂ 0.18%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] rb_erase
ÂÂÂÂ 0.18%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] pid_revalidate
ÂÂÂÂ 0.18%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] lockref_get_not_dead
ÂÂÂÂ 0.18%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] __alloc_pages_nodemask
ÂÂÂÂ 0.17%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] set_task_cpu
ÂÂÂÂ 0.17%Â libc-2.17.soÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [.] __strcoll_l
ÂÂÂÂ 0.17%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] do_syscall_64
ÂÂÂÂ 0.17%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] __vmalloc_node_range
ÂÂÂÂ 0.17%Â libc-2.17.soÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [.] _IO_vfscanf
ÂÂÂÂ 0.17%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] refcount_dec_not_one
ÂÂÂÂ 0.15%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] __task_pid_nr_ns
ÂÂÂÂ 0.15%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] native_irq_return_iret
ÂÂÂÂ 0.15%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] free_pcppages_bulk
ÂÂÂÂ 0.14%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] kmem_cache_alloc
ÂÂÂÂ 0.14%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] link_path_walk
ÂÂÂÂ 0.14%Â libc-2.17.soÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [.] _int_free
ÂÂÂÂ 0.14%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] __update_load_avg_cfs_rq
ÂÂÂÂ 0.14%Â perf.5.7.0-master.20200601.ol7.x86_64Â [.] 0x00000000000eac29
ÂÂÂÂ 0.13%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] kmem_cache_free
ÂÂÂÂ 0.13%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] number
ÂÂÂÂ 0.13%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] memset_erms
ÂÂÂÂ 0.12%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] proc_pid_status
ÂÂÂÂ 0.12%Â [kernel]ÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂÂ [k] __d_lookup_rcu


=========== runme.sh ==========

#!/bin/bash

threads=${1:-10000}
prog=proc_race
while [ 1 ]; do ./$prog $threads; done &

while [ 1 ]; do
ÂÂÂ pid=`ps aux | grep $prog | grep -v grep| awk '{print $2}'`
ÂÂÂ if [ -z $pid ]; then continue; fi
ÂÂÂ threadnum=`ls -l /proc/$pid/task | wc -l`
ÂÂÂ if [ $threadnum -gt $threads ]; then
ÂÂÂ ÂÂÂ echo kill $pid
ÂÂÂ ÂÂÂ kill -9 $pid
ÂÂÂ fi
done


===========proc_race.c=========


#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>

#define handle_error_en(en, msg) \
ÂÂÂ do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0)

#define handle_error(msg) \
ÂÂÂ do { perror(msg); exit(EXIT_FAILURE); } while (0)

struct thread_info {
ÂÂÂ pthread_t thread_id;
ÂÂÂ intÂÂÂÂÂÂ thread_num;
};

static void *child_thread()
{
ÂÂÂ int i;

ÂÂÂ while (1) { if (!(i++ % 1000000)) sleep(1);}
ÂÂÂ return NULL;
}

int main(int argc, char *argv[])
{
ÂÂÂ int s, tnum, opt, num_threads;
ÂÂÂ struct thread_info *tinfo;
ÂÂÂ void *res;

ÂÂÂ if (argc == 2)
ÂÂÂ ÂÂÂ num_threads = atoi(argv[1]);
ÂÂÂ else
ÂÂÂ ÂÂÂ num_threads = 10000;

ÂÂÂ tinfo = calloc(num_threads, sizeof(struct thread_info));
ÂÂÂ if (tinfo == NULL)
ÂÂÂ ÂÂÂ handle_error("calloc");


ÂÂÂ for (tnum = 0; tnum < num_threads; tnum++) {
ÂÂÂ ÂÂÂ tinfo[tnum].thread_num = tnum + 1;

ÂÂÂ ÂÂÂ s = pthread_create(&tinfo[tnum].thread_id, NULL,
ÂÂÂ ÂÂÂ ÂÂÂ ÂÂÂ &child_thread, NULL);
ÂÂÂ ÂÂÂ if (s != 0)
ÂÂÂ ÂÂÂ ÂÂÂ handle_error_en(s, "pthread_create");
ÂÂÂ }

ÂÂÂ for (tnum = 0; tnum < num_threads; tnum++) {
ÂÂÂ ÂÂÂ s = pthread_join(tinfo[tnum].thread_id, &res);
ÂÂÂ ÂÂÂ if (s != 0)
ÂÂÂ ÂÂÂ ÂÂÂ handle_error_en(s, "pthread_join");

ÂÂÂ ÂÂÂ free(res);
ÂÂÂ }

ÂÂÂ free(tinfo);
ÂÂÂ exit(EXIT_SUCCESS);
}

==========

Thanks,

Junxiao.