Hi,
I posted a patch to completely remove the lock here from the
ep_poll_safewake() patch here:
http://lkml.iu.edu/hypermail/linux/kernel/1710.1/05834.html
So these are going to conflict. The reason its safe to remove the lock
is that there are loop and depth checks now that are performed during
EPOLL_CTL_ADD. Specifically, ep_loop_check(). I would prefer to these
checks once add add time as opposed to at each wakeup (even if they can
be scaled better).
I also have a pending patch to do something similar for
poll_readywalk_ncalls, but I think that poll_safewake_ncalls is the most
egregious one here?
Thanks,
-Jason
On 10/13/2017 11:45 AM, Davidlohr Bueso wrote:
A customer reported massive contention on the ncalls->lock in which
the workload is designed around nested epolls (where the fd is already
an epoll).
83.49% [kernel] [k] __pv_queued_spin_lock_slowpath
2.70% [kernel] [k] ep_call_nested.constprop.13
1.94% [kernel] [k] _raw_spin_lock_irqsave
1.83% [kernel] [k]
__raw_callee_save___pv_queued_spin_unlock
1.45% [kernel] [k] _raw_spin_unlock_irqrestore
0.41% [kernel] [k] ep_scan_ready_list.isra.8
0.36% [kernel] [k] pvclock_clocksource_read
The application running on kvm, is using a shared memory IPC communication
with a pipe wakeup mechanism, and a two level dispatcher both built around
'epoll_wait'. There is one process per physical core and a full mesh of
pipes
between them, so on a 18 core system (the actual case), there are 18*18
pipes
for the IPCs alone.
This patch proposes replacing the nested calls global linked list with a
dlock
interface, for which we can benefit from pcpu lists when doing
ep_poll_safewake(),
and hashing for the current task, which provides two benefits:
1. Speeds up the process of loop and max-depth checks from O(N) lookups
to O(1)
(albeit possible collisions, which we have to iterate); and,
2. Provides smaller lock granularity.
cpus before after diff
1 1409370 1344804 -4.58%
2 1015690 1337053 31.63%
3 721009 1273254 76.59%
4 380959 1128931 196.33%
5 287603 1028362 257.56%
6 221104 894943 304.76%
7 173592 976395 462.46%
8 145860 922584 532.51%
9 127877 925900 624.05%
10 112603 791456 602.87%
11 97926 724296 639.63%
12 80732 730485 804.82%
With the exception of a single cpu, where the overhead of jhashing
influences), we
get some pretty good raw throughput increase. Similarly, the amount of
time spent
decreases immensely as well.
Passes ltp related testcases.
Signed-off-by: Davidlohr Bueso <dbueso@xxxxxxx>
---
fs/eventpoll.c | 88
+++++++++++++++++++++++++++++++++++-----------------------
1 file changed, 53 insertions(+), 35 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..675c97fdc5da 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -22,7 +22,7 @@
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
-#include <linux/list.h>
+#include <linux/dlock-list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
@@ -119,7 +119,7 @@ struct epoll_filefd {
* and loop cycles.
*/
struct nested_call_node {
- struct list_head llink;
+ struct dlock_list_node llink;
void *cookie;
void *ctx;
};
@@ -129,8 +129,7 @@ struct nested_call_node {
* maximum recursion dept and loop cycles.
*/
struct nested_calls {
- struct list_head tasks_call_list;
- spinlock_t lock;
+ struct dlock_list_heads tasks_call_list;
};
/*
@@ -371,10 +370,14 @@ static inline int ep_op_has_event(int op)
}
/* Initialize the poll safe wake up structure */
-static void ep_nested_calls_init(struct nested_calls *ncalls)
+static inline int ep_nested_calls_init(struct nested_calls *ncalls)
+{
+ return alloc_dlock_list_heads(&ncalls->tasks_call_list, true);
+}
+
+static inline void ep_nested_calls_free(struct nested_calls *ncalls)
{
- INIT_LIST_HEAD(&ncalls->tasks_call_list);
- spin_lock_init(&ncalls->lock);
+ free_dlock_list_heads(&ncalls->tasks_call_list);
}
/**
@@ -483,45 +486,50 @@ static int ep_call_nested(struct nested_calls
*ncalls, int max_nests,
{
int error, call_nests = 0;
unsigned long flags;
- struct list_head *lsthead = &ncalls->tasks_call_list;
- struct nested_call_node *tncur;
- struct nested_call_node tnode;
+ struct dlock_list_head *head;
+ struct nested_call_node *tncur, tnode;
- spin_lock_irqsave(&ncalls->lock, flags);
+ head = dlock_list_hash(&ncalls->tasks_call_list, ctx);
+ spin_lock_irqsave(&head->lock, flags);
/*
* Try to see if the current task is already inside this wakeup call.
- * We use a list here, since the population inside this set is always
- * very much limited.
+ *
+ * We use a chained hash table with linked lists here, and take the
+ * lock to avoid racing when collisions (where ctx pointer is the
key).
+ * Calls for which context is the cpu number, avoid hashing and act as
+ * pcpu add/removal.
+ *
+ * Since the population inside this set is always very much limited,
+ * list scanning should be short.
*/
- list_for_each_entry(tncur, lsthead, llink) {
- if (tncur->ctx == ctx &&
- (tncur->cookie == cookie || ++call_nests > max_nests)) {
- /*
- * Ops ... loop detected or maximum nest level reached.
- * We abort this wake by breaking the cycle itself.
- */
- error = -1;
- goto out_unlock;
- }
- }
+ list_for_each_entry(tncur, &head->list, llink.list) {
+ if (tncur->ctx == ctx &&
+ (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
+ /*
+ * Ops ... loop detected or maximum nest level reached.
+ * We abort this wake by breaking the cycle itself.
+ */
+ error = -1;
+ spin_unlock_irqrestore(&head->lock, flags);
+ goto done;
+ }
+ }
+
/* Add the current task and cookie to the list */
tnode.ctx = ctx;
tnode.cookie = cookie;
- list_add(&tnode.llink, lsthead);
-
- spin_unlock_irqrestore(&ncalls->lock, flags);
+ tnode.llink.head = head;
+ list_add(&tnode.llink.list, &head->list);
+ spin_unlock_irqrestore(&head->lock, flags);
/* Call the nested function */
error = (*nproc)(priv, cookie, call_nests);
/* Remove the current task from the list */
- spin_lock_irqsave(&ncalls->lock, flags);
- list_del(&tnode.llink);
-out_unlock:
- spin_unlock_irqrestore(&ncalls->lock, flags);
-
+ dlock_lists_del(&tnode.llink);
+done:
return error;
}
@@ -2313,13 +2321,16 @@ static int __init eventpoll_init(void)
* Initialize the structure used to perform epoll file descriptor
* inclusion loops checks.
*/
- ep_nested_calls_init(&poll_loop_ncalls);
+ if (ep_nested_calls_init(&poll_loop_ncalls))
+ goto err;
/* Initialize the structure used to perform safe poll wait head wake
ups */
- ep_nested_calls_init(&poll_safewake_ncalls);
+ if (ep_nested_calls_init(&poll_safewake_ncalls))
+ goto err_free1;
/* Initialize the structure used to perform file's f_op->poll()
calls */
- ep_nested_calls_init(&poll_readywalk_ncalls);
+ if (ep_nested_calls_init(&poll_readywalk_ncalls))
+ goto err_free0;
/*
* We can have many thousands of epitems, so prevent this from
@@ -2336,5 +2347,12 @@ static int __init eventpoll_init(void)
sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
return 0;
+
+err_free0:
+ ep_nested_calls_free(&poll_safewake_ncalls);
+err_free1:
+ ep_nested_calls_free(&poll_loop_ncalls);
+err:
+ return -ENOMEM;
}
fs_initcall(eventpoll_init);