Re: [PATCH rfc] workqueue: honour cond_resched() more effectively.
From: NeilBrown
Date: Wed Nov 18 2020 - 00:11:56 EST
On Wed, Nov 18 2020, Hillf Danton wrote:
> On Wed, 18 Nov 2020 09:16:09 +1100
>> From: NeilBrown <neilb@xxxxxxx>
>> Date: Mon, 9 Nov 2020 13:37:17 +1100
>
> What is the brand of your wall clock? The gap between the Date tag
> above and 18 Nov is longer than a week.
I guess 'git commit --amend' doesn't update the Date: stamp.
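(I believe passing --reset-author, or an explicit --date, to 'git commit
--amend' would refresh it, though I haven't checked.)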
>
>> Subject: [PATCH] workqueue: warn when cond_resched called from concurrency-managed worker
>>
>> Several workers in concurrency-managed work queues call cond_resched().
>> This suggests that they might consume a lot of CPU time and are willing
>> to allow other code to run concurrently.
>> This *does* allow non-workqueue code to run but does *not* allow other
>> concurrency-managed work items to run on the same CPU.
>>
>> In general such workers should not be run from a concurrency-managed
>> workqueue. However:
>> 1/ there is no mechanism to alert coders to the fact that they are
>> using the wrong work queue, and
>> 2/ sometimes it is not clear, when a work item is created, whether it
>> will end up in code that might consume a lot of CPU time. So
>> choosing the best workqueue is not always easy.
>>
>> This patch addresses both these issues:
>> 1/ If a concurrency-managed worker calls cond_resched() a warning
>> (WARN_ON_ONCE) is generated (if CONFIG_WQ_WATCHDOG is selected).
>> 2/ A new interface - workqueue_prepare_use_cpu() - is provided which
>> may be called by low-level code which might be called in a workqueue
>> and might consume CPU. This switches the worker to CPU_INTENSIVE
>> mode so that other work items can run on the same CPU.
>> This means there is no need to choose the required behaviour when
>> submitting a work item, as the code can switch behaviour when needed.
>
> Hm... sounds like the cure can be prepared by splicing the new interface
> above around the cond_resched() in the worker's context, like
>
> static void foo_work_fn(struct work_struct *work)
> {
> +	bool use;
> 	...
> +	use = workqueue_prepare_use_cpu();
> 	cond_resched();
> +	workqueue_end_use_cpu(use);
> 	...
> }
>
> bool workqueue_prepare_use_cpu(void)
> {
> 	struct worker *worker = current_wq_worker();
>
> 	if (worker && !(worker->flags & WORKER_CPU_INTENSIVE)) {
> 		worker_set_flags(worker, WORKER_CPU_INTENSIVE);
> 		return true;
> 	} else
> 		return false;
> }
>
> void workqueue_end_use_cpu(bool use)
> {
> 	if (use) {
> 		struct worker *worker = current_wq_worker();
>
> 		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
> 	}
> }
>
> And in a nutshell it boils down to the helper below, which avoids
> touching cond_resched() itself.
>
> void workqueue_cond_resched(void)
> {
> 	struct worker *worker = current_wq_worker();
>
> 	if (worker && !(worker->flags & WORKER_CPU_INTENSIVE)) {
> 		worker_set_flags(worker, WORKER_CPU_INTENSIVE);
> 		cond_resched();
> 		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
> 	} else
> 		cond_resched();
> }
I don't think this is a good idea.
cond_resched() is expected to be called often. Adding all this extra
work every time is excessive and unnecessary.
It might make sense to add a "workqueue_end_use_cpu()" call at the end
of functions that include "workqueue_prepare_use_cpu()" at the start. I
don't think it is likely to make a useful difference, but I'm open to
the possibility if anyone can make a case for it.
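
To be concrete, the pattern I have in mind is a single switch at the
top of the work function - a sketch only, and note that
workqueue_end_use_cpu() is hypothetical, not something this patch adds:

static void foo_work_fn(struct work_struct *work)
{
	/* Drop out of concurrency management once, up front,
	 * rather than wrapping every cond_resched() call.
	 */
	workqueue_prepare_use_cpu();

	while (more_work_to_do()) {	/* hypothetical loop */
		do_some_cpu_intensive_work();
		cond_resched();	/* no warning: worker is now CPU_INTENSIVE */
	}
	/* a workqueue_end_use_cpu() call could go here, if it existed */
}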
Thanks,
NeilBrown
>
>>
>> This patch also changes some code to avoid the warning:
>> - in some cases, system_unbound_wq is used instead of system_wq,
>> when the work item will normally call cond_resched()
>> - in other cases, calls to workqueue_prepare_use_cpu() are inserted.
>>
>> - in slab.c, a cond_resched() call is replaced by
>> if (tif_need_resched())
>> break;
>> so that the worker will relinquish CPU and try again later.
>>
>> There are doubtless more changes needed, but these allow me to boot and
>> access NFS files without warnings.
>>
>> Signed-off-by: NeilBrown <neilb@xxxxxxx>
>> ---
>> fs/dcache.c | 2 ++
>> fs/nfs/delegation.c | 5 +++++
>> fs/nfs/write.c | 6 ++++++
>> include/linux/rhashtable.h | 4 ++--
>> include/linux/sched.h | 2 ++
>> include/linux/workqueue.h | 30 ++++++++++++++++++++++++++++++
>> kernel/rcu/tree.c | 2 +-
>> kernel/workqueue.c | 33 +++++++++++++++++++++++++++++++--
>> lib/rhashtable.c | 8 ++++----
>> mm/page_alloc.c | 14 ++++----------
>> mm/slab.c | 3 ++-
>> mm/truncate.c | 3 +++
>> net/sunrpc/clnt.c | 2 +-
>> security/keys/gc.c | 8 ++++----
>> security/keys/key.c | 2 +-
>> 15 files changed, 98 insertions(+), 26 deletions(-)
>>
>> diff --git a/fs/dcache.c b/fs/dcache.c
>> index ea0485861d93..46c83f712ad3 100644
>> --- a/fs/dcache.c
>> +++ b/fs/dcache.c
>> @@ -686,6 +686,8 @@ static struct dentry *dentry_kill(struct dentry *dentry)
>> struct inode *inode = dentry->d_inode;
>> struct dentry *parent = NULL;
>>
>> + workqueue_prepare_use_cpu();
>> +
>> if (inode && unlikely(!spin_trylock(&inode->i_lock)))
>> goto slow_positive;
>>
>> diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
>> index 816e1427f17e..cbf4e586ee2c 100644
>> --- a/fs/nfs/delegation.c
>> +++ b/fs/nfs/delegation.c
>> @@ -573,6 +573,7 @@ static int nfs_server_return_marked_delegations(struct nfs_server *server,
>> struct nfs_delegation *place_holder_deleg = NULL;
>> int err = 0;
>>
>> + workqueue_prepare_use_cpu();
>> restart:
>> /*
>> * To avoid quadratic looping we hold a reference
>> @@ -1097,6 +1098,8 @@ static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server,
>> {
>> struct nfs_delegation *delegation;
>> struct inode *inode;
>> +
>> + workqueue_prepare_use_cpu();
>> restart:
>> rcu_read_lock();
>> restart_locked:
>> @@ -1229,6 +1232,8 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server,
>> struct inode *inode;
>> const struct cred *cred;
>> nfs4_stateid stateid;
>> +
>> + workqueue_prepare_use_cpu();
>> restart:
>> rcu_read_lock();
>> restart_locked:
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index 639c34fec04a..e84df784acc6 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -1034,6 +1034,8 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
>> struct nfs_page *req, *tmp;
>> int ret = 0;
>>
>> + workqueue_prepare_use_cpu();
>> +
>> restart:
>> list_for_each_entry_safe(req, tmp, src, wb_list) {
>> kref_get(&req->wb_kref);
>> @@ -1839,6 +1841,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
>> struct nfs_commit_info cinfo;
>> struct nfs_server *nfss;
>>
>> + workqueue_prepare_use_cpu();
>> +
>> while (!list_empty(&data->pages)) {
>> req = nfs_list_entry(data->pages.next);
>> nfs_list_remove_request(req);
>> @@ -1924,6 +1928,8 @@ static int __nfs_commit_inode(struct inode *inode, int how,
>> int may_wait = how & FLUSH_SYNC;
>> int ret, nscan;
>>
>> + workqueue_prepare_use_cpu();
>> +
>> nfs_init_cinfo_from_inode(&cinfo, inode);
>> nfs_commit_begin(cinfo.mds);
>> for (;;) {
>> diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
>> index 68dab3e08aad..01ab43d4b9bb 100644
>> --- a/include/linux/rhashtable.h
>> +++ b/include/linux/rhashtable.h
>> @@ -788,7 +788,7 @@ static inline void *__rhashtable_insert_fast(
>> rht_assign_unlock(tbl, bkt, obj);
>>
>> if (rht_grow_above_75(ht, tbl))
>> - schedule_work(&ht->run_work);
>> + queue_work(system_unbound_wq, &ht->run_work);
>>
>> data = NULL;
>> out:
>> @@ -1056,7 +1056,7 @@ static inline int __rhashtable_remove_fast_one(
>> atomic_dec(&ht->nelems);
>> if (unlikely(ht->p.automatic_shrinking &&
>> rht_shrink_below_30(ht, tbl)))
>> - schedule_work(&ht->run_work);
>> + queue_work(system_unbound_wq, &ht->run_work);
>> err = 0;
>> }
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 063cd120b459..3a3f1d9c0bb9 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1837,6 +1837,7 @@ static inline int _cond_resched(void) { return 0; }
>>
>> #define cond_resched() ({ \
>> ___might_sleep(__FILE__, __LINE__, 0); \
>> + WARN_ON_ONCE(workqueue_mustnt_use_cpu()); \
>> _cond_resched(); \
>> })
>>
>> @@ -1844,6 +1845,7 @@ extern int __cond_resched_lock(spinlock_t *lock);
>>
>> #define cond_resched_lock(lock) ({ \
>> ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
>> + WARN_ON_ONCE(workqueue_mustnt_use_cpu()); \
>> __cond_resched_lock(lock); \
>> })
>>
>> diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
>> index 26de0cae2a0a..6c6473ee02e1 100644
>> --- a/include/linux/workqueue.h
>> +++ b/include/linux/workqueue.h
>> @@ -626,6 +626,36 @@ static inline bool schedule_delayed_work(struct delayed_work *dwork,
>> return queue_delayed_work(system_wq, dwork, delay);
>> }
>>
>> +
>> +/* Following are #define rather than 'static inline' because
>> + * workqueue.h (this file) is included by sched.h, so it cannot
>> + * use types and macros defined in that file.
>> + *
>> + * If workqueue_mustnt_use_cpu() returns true and triggers a warning,
>> + * it means that a worker in a concurrency-managed workqueue is calling
>> + * cond_resched(). This does NOT allow other workers to run, so it can
>> + * block them unduly. Such workers should either be run on some other
>> + * workqueue, such as system_unbound_wq, or must call
>> + * workqueue_prepare_use_cpu() so that the worker drops out of
>> + * concurrency management.
>> + */
>> +
>> +#define current_is_wq_worker() (current->flags & PF_WQ_WORKER)
>> +#ifdef CONFIG_WQ_WATCHDOG
>> +bool __workqueue_may_use_cpu(void);
>> +#define workqueue_mustnt_use_cpu() \
>> + (current_is_wq_worker() && !__workqueue_may_use_cpu())
>> +#else
>> +#define workqueue_mustnt_use_cpu() false
>> +#endif
>> +
>> +void __workqueue_prepare_use_cpu(void);
>> +#define workqueue_prepare_use_cpu() \
>> + do { \
>> + if (current_is_wq_worker()) \
>> + __workqueue_prepare_use_cpu(); \
>> + } while(0)
>> +
>> #ifndef CONFIG_SMP
>> static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
>> {
>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
>> index 06895ef85d69..907707481d36 100644
>> --- a/kernel/rcu/tree.c
>> +++ b/kernel/rcu/tree.c
>> @@ -3302,7 +3302,7 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
>> * channels have been detached following by each
>> * other.
>> */
>> - queue_rcu_work(system_wq, &krwp->rcu_work);
>> + queue_rcu_work(system_unbound_wq, &krwp->rcu_work);
>> }
>>
>> // Repeat if any "free" corresponding channel is still busy.
>> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
>> index ab593b20fb94..3f5ee4468493 100644
>> --- a/kernel/workqueue.c
>> +++ b/kernel/workqueue.c
>> @@ -2295,12 +2295,12 @@ __acquires(&pool->lock)
>> * stop_machine. At the same time, report a quiescent RCU state so
>> * the same condition doesn't freeze RCU.
>> */
>> - cond_resched();
>> + _cond_resched();
>>
>> raw_spin_lock_irq(&pool->lock);
>>
>> /* clear cpu intensive status */
>> - if (unlikely(cpu_intensive))
>> + if (unlikely(worker->flags & WORKER_CPU_INTENSIVE))
>> worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
>>
>> /* tag the worker for identification in schedule() */
>> @@ -5345,6 +5345,35 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
>> return ret;
>> }
>>
>> +#ifdef CONFIG_WQ_WATCHDOG
>> +bool __workqueue_may_use_cpu(void)
>> +{
>> + struct worker *worker = current_wq_worker();
>> + if (!worker)
>> + return true;
>> + /* If any flag in WORKER_NOT_RUNNING is set, the worker is not
>> + * counted for concurrency control, so it can use as much CPU
>> + * as it likes.
>> + */
>> + return worker->flags & WORKER_NOT_RUNNING;
>> +}
>> +EXPORT_SYMBOL(__workqueue_may_use_cpu);
>> +#endif
>> +
>> +void __workqueue_prepare_use_cpu(void)
>> +{
>> + struct worker *worker = current_wq_worker();
>> +
>> + if (worker && !(worker->flags & WORKER_CPU_INTENSIVE)) {
>> + struct worker_pool *pool = worker->pool;
>> +
>> + worker_set_flags(worker, WORKER_CPU_INTENSIVE);
>> + if (need_more_worker(pool))
>> + wake_up_worker(pool);
>> + }
>> +}
>> +EXPORT_SYMBOL(__workqueue_prepare_use_cpu);
>> +
>> #ifdef CONFIG_SYSFS
>> /*
>> * Workqueues with WQ_SYSFS flag set is visible to userland via
>> diff --git a/lib/rhashtable.c b/lib/rhashtable.c
>> index c949c1e3b87c..1ef41411cda7 100644
>> --- a/lib/rhashtable.c
>> +++ b/lib/rhashtable.c
>> @@ -433,7 +433,7 @@ static void rht_deferred_worker(struct work_struct *work)
>> mutex_unlock(&ht->mutex);
>>
>> if (err)
>> - schedule_work(&ht->run_work);
>> + queue_work(system_unbound_wq, &ht->run_work);
>> }
>>
>> static int rhashtable_insert_rehash(struct rhashtable *ht,
>> @@ -468,7 +468,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
>> if (err == -EEXIST)
>> err = 0;
>> } else
>> - schedule_work(&ht->run_work);
>> + queue_work(system_unbound_wq, &ht->run_work);
>>
>> return err;
>>
>> @@ -479,7 +479,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
>>
>> /* Schedule async rehash to retry allocation in process context. */
>> if (err == -ENOMEM)
>> - schedule_work(&ht->run_work);
>> + queue_work(system_unbound_wq, &ht->run_work);
>>
>> return err;
>> }
>> @@ -579,7 +579,7 @@ static struct bucket_table *rhashtable_insert_one(
>>
>> atomic_inc(&ht->nelems);
>> if (rht_grow_above_75(ht, tbl))
>> - schedule_work(&ht->run_work);
>> + queue_work(system_unbound_wq, &ht->run_work);
>>
>> return NULL;
>> }
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 23f5066bd4a5..69770135e8eb 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -4557,17 +4557,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
>> }
>>
>> out:
>> - /*
>> - * Memory allocation/reclaim might be called from a WQ context and the
>> - * current implementation of the WQ concurrency control doesn't
>> - * recognize that a particular WQ is congested if the worker thread is
>> - * looping without ever sleeping. Therefore we have to do a short sleep
>> - * here rather than calling cond_resched().
>> + /* This might take a while - allow workqueue to schedule other
>> + * work in parallel.
>> */
>> - if (current->flags & PF_WQ_WORKER)
>> - schedule_timeout_uninterruptible(1);
>> - else
>> - cond_resched();
>> + workqueue_prepare_use_cpu();
>> + cond_resched();
>> return ret;
>> }
>>
>> diff --git a/mm/slab.c b/mm/slab.c
>> index b1113561b98b..2e50603e1b26 100644
>> --- a/mm/slab.c
>> +++ b/mm/slab.c
>> @@ -3994,7 +3994,8 @@ static void cache_reap(struct work_struct *w)
>> STATS_ADD_REAPED(searchp, freed);
>> }
>> next:
>> - cond_resched();
>> + if (tif_need_resched())
>> + break;
>> }
>> check_irq_on();
>> mutex_unlock(&slab_mutex);
>> diff --git a/mm/truncate.c b/mm/truncate.c
>> index 18cec39a9f53..b333130a7629 100644
>> --- a/mm/truncate.c
>> +++ b/mm/truncate.c
>> @@ -324,6 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
>> else
>> end = (lend + 1) >> PAGE_SHIFT;
>>
>> + workqueue_prepare_use_cpu();
>> pagevec_init(&pvec);
>> index = start;
>> while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
>> @@ -538,6 +539,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping,
>> unsigned long count = 0;
>> int i;
>>
>> + workqueue_prepare_use_cpu();
>> pagevec_init(&pvec);
>> while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
>> min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
>> @@ -717,6 +719,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
>> if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
>> goto out;
>>
>> + workqueue_prepare_use_cpu();
>> pagevec_init(&pvec);
>> index = start;
>> while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
>> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
>> index 62e0b6c1e8cf..1ce66672481f 100644
>> --- a/net/sunrpc/clnt.c
>> +++ b/net/sunrpc/clnt.c
>> @@ -906,7 +906,7 @@ rpc_free_client(struct rpc_clnt *clnt)
>> put_cred(clnt->cl_cred);
>>
>> INIT_WORK(&clnt->cl_work, rpc_free_client_work);
>> - schedule_work(&clnt->cl_work);
>> + queue_work(system_unbound_wq, &clnt->cl_work);
>> return parent;
>> }
>>
>> diff --git a/security/keys/gc.c b/security/keys/gc.c
>> index 3c90807476eb..a4ad04b78833 100644
>> --- a/security/keys/gc.c
>> +++ b/security/keys/gc.c
>> @@ -57,7 +57,7 @@ void key_schedule_gc(time64_t gc_at)
>>
>> if (gc_at <= now || test_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) {
>> kdebug("IMMEDIATE");
>> - schedule_work(&key_gc_work);
>> + queue_work(system_unbound_wq, &key_gc_work);
>> } else if (gc_at < key_gc_next_run) {
>> kdebug("DEFERRED");
>> key_gc_next_run = gc_at;
>> @@ -72,7 +72,7 @@ void key_schedule_gc(time64_t gc_at)
>> void key_schedule_gc_links(void)
>> {
>> set_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags);
>> - schedule_work(&key_gc_work);
>> + queue_work(system_unbound_wq, &key_gc_work);
>> }
>>
>> /*
>> @@ -106,7 +106,7 @@ void key_gc_keytype(struct key_type *ktype)
>> set_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags);
>>
>> kdebug("schedule");
>> - schedule_work(&key_gc_work);
>> + queue_work(system_unbound_wq, &key_gc_work);
>>
>> kdebug("sleep");
>> wait_on_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE,
>> @@ -319,7 +319,7 @@ static void key_garbage_collector(struct work_struct *work)
>> }
>>
>> if (gc_state & KEY_GC_REAP_AGAIN)
>> - schedule_work(&key_gc_work);
>> + queue_work(system_unbound_wq, &key_gc_work);
>> kleave(" [end %x]", gc_state);
>> return;
>>
>> diff --git a/security/keys/key.c b/security/keys/key.c
>> index e282c6179b21..f990e226d74a 100644
>> --- a/security/keys/key.c
>> +++ b/security/keys/key.c
>> @@ -647,7 +647,7 @@ void key_put(struct key *key)
>> key_check(key);
>>
>> if (refcount_dec_and_test(&key->usage))
>> - schedule_work(&key_gc_work);
>> + queue_work(system_unbound_wq, &key_gc_work);
>> }
>> }
>> EXPORT_SYMBOL(key_put);
>> --
>> 2.29.2