Re: Bisected GFP in bfq_bfqq_expire on v5.1-rc1

From: Paolo Valente
Date: Tue Apr 09 2019 - 05:55:30 EST




> Il giorno 4 apr 2019, alle ore 21:22, Dmitrii Tcvetkov <demfloro@xxxxxxxxxxx> ha scritto:
>
> On Mon, 1 Apr 2019 12:35:11 +0200
> Paolo Valente <paolo.valente@xxxxxxxxxx> wrote:
>
>>
>>
>>> Il giorno 1 apr 2019, alle ore 11:22, Dmitrii Tcvetkov
>>> <demfloro@xxxxxxxxxxx> ha scritto:
>>>
>>> On Mon, 1 Apr 2019 11:01:27 +0200
>>> Paolo Valente <paolo.valente@xxxxxxxxxx> wrote:
>>>> Ok, thank you. Could you please do a
>>>>
>>>> list *(bfq_bfqq_expire+0x1f3)
>>>>
>>>> for me?
>>>>
>>>> Thanks,
>>>> Paolo
>>>>
>>>>>
>>>>> <gpf.txt><gpf-w-bfq-group-iosched.txt><config.txt>
>>>
>>> Reading symbols from vmlinux...done.
>>> (gdb) list *(bfq_bfqq_expire+0x1f3)
>>> 0xffffffff813d02c3 is in bfq_bfqq_expire (block/bfq-iosched.c:3390).
>>> 3385 * even in case bfqq and thus parent entities go on receiving
>>> 3386 * service with the same budget.
>>> 3387 */
>>> 3388 entity = entity->parent;
>>> 3389 for_each_entity(entity)
>>> 3390 entity->service = 0;
>>> 3391 }
>>> 3392
>>> 3393 /*
>>> 3394 * Budget timeout is not implemented through a dedicated timer, but
>>
>> Thank you very much. Unfortunately this doesn't ring any bell. I'm
>> trying to reproduce the failure. It will probably take a little
>> time. If I don't make it, I'll ask you to kindly retry after applying
>> some instrumentation patch.
>>
>
> I looked at what git is doing just before panic and it's doing a lot of
> lstat() syscalls on working tree.
>
> I've attached a python script which reproduces the crash in about
> 10 seconds after it prepares testdir, git checkout origin/linux-5.0.y
> reproduces it in about 2 seconds. I have to use multiprocessing Pool as
> I couldn't reproduce the crash using ThreadPool, probably due to Python
> GIL.
>

Unfortunately this failure doesn't reproduce on my systems. But I
have a suspicion. Could you please test this patch? (also attached as a
compressed file):

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index fac188dd78fa..0a435bcfed20 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2822,7 +2822,7 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
bfq_remove_request(q, rq);
}

-static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
/*
* If this bfqq is shared between multiple processes, check
@@ -2857,7 +2857,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
* or requeued before executing the next function, which
* resets all in-service entites as no more in service.
*/
- __bfq_bfqd_reset_in_service(bfqd);
+ return __bfq_bfqd_reset_in_service(bfqd);
}

/**
@@ -3262,7 +3262,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
bool slow;
unsigned long delta = 0;
struct bfq_entity *entity = &bfqq->entity;
- int ref;

/*
* Check whether the process is slow (see bfq_bfqq_is_slow).
@@ -3347,10 +3346,8 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
* reason.
*/
__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
- ref = bfqq->ref;
- __bfq_bfqq_expire(bfqd, bfqq);
-
- if (ref == 1) /* bfqq is gone, no more actions on it */
+ if (__bfq_bfqq_expire(bfqd, bfqq))
+ /* bfqq is gone, no more actions on it */
return;

bfqq->injected_service = 0;
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 062e1c4787f4..86394e503ca9 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -995,7 +995,7 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity,
bool ins_into_idle_tree);
bool next_queue_may_preempt(struct bfq_data *bfqd);
struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd);
-void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
+bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool ins_into_idle_tree, bool expiration);
void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index a11bef75483d..a0c60c47ed1c 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1605,7 +1605,7 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
return bfqq;
}

-void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
{
struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
@@ -1629,8 +1629,18 @@ void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
* service tree either, then release the service reference to
* the queue it represents (taken with bfq_get_entity).
*/
- if (!in_serv_entity->on_st)
+ if (!in_serv_entity->on_st) {
+ /*
+ * bfqq may be freed here, if bfq_exit_bfqq(bfqq) has
+ * already been executed
+ */
+ int ref = in_serv_bfqq->ref;
bfq_put_queue(in_serv_bfqq);
+ if (ref == 1)
+ return true;
+ }
+
+ return false;
}

void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,

Attachment: 0001-block-bfq-tentative-fix-of-use-after-free-in-bfq_bfq.patch.gz
Description: GNU Zip compressed data




> <crash.py>