Re: [PATCH] sched: refill quota for current period when refilling timer activates

From: Konstantin Khlebnikov
Date: Mon May 14 2018 - 07:05:21 EST


On 14.05.2018 13:58, Konstantin Khlebnikov wrote:
The period timer deactivates if the task group has had no activity during the
past period, i.e. if there was no throttling and no runtime was consumed from
the global pool. When the timer is activated again, the global pool contains an
unpredictable amount of expired runtime allocated long ago. In some cases this
works fine and the task can use it until the next refill. But a series of short
execution slices can drain all of those leftovers, because each switch expires
the local pool. In this case the task group will be throttled until the next
quota refill.

The attached piece of code reproduces the problem.

run
sched-burst 10 10000000 1000000000

in cgroup with

cpu.cfs_period_us = 100000
cpu.cfs_quota_us = 1100000

The series of sched_yield() calls drains the stale runtime from the global pool before it is refilled.


This patch refills the quota at the moment the timer is activated.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>
---
kernel/sched/fair.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79f574dba096..b8d73ed17ff6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5165,6 +5165,8 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
if (!cfs_b->period_active) {
cfs_b->period_active = 1;
+ /* refill quota for current period after inactivity */
+ __refill_cfs_bandwidth_runtime(cfs_b);
hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}

#define _GNU_SOURCE
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdatomic.h>
#include <sched.h>
#include <unistd.h>
#include <string.h>
#include <time.h>


/* Lock taken by the workers around their updates of nr_booting/nr_running and around both condition waits. */
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
/* Broadcast by main to start a round; every worker waits on it between rounds. */
pthread_cond_t wakeup = PTHREAD_COND_INITIALIZER;
/* Signalled by the worker that brings nr_booting or nr_running down to zero; main waits on it. */
pthread_cond_t done = PTHREAD_COND_INITIALIZER;
/* Number of worker threads, taken from argv[1]. */
int nr_threads;
/* Workers that have not yet checked in after creation; each worker decrements it once. */
int nr_booting;
/* Workers still busy in the current round; set by main, decremented by each worker when it finishes. */
int nr_running;
/* How long (ns) each worker keeps calling sched_yield() per round, taken from argv[2]. */
int64_t burn_ns;
/* How long (ns) main sleeps before starting each round, taken from argv[3]. */
int64_t pace_ns;

#define NSEC_PER_SEC 1000000000L

/*
 * Return the signed difference (a - b) in nanoseconds.
 *
 * Both deltas are widened to int64_t before the seconds part is scaled,
 * so the multiplication is never carried out in a 32-bit time_t/long
 * (which would overflow for differences of more than about two seconds).
 */
int64_t clock_diff(const struct timespec *a, const struct timespec *b) {
	int64_t sec_delta = (int64_t)a->tv_sec - (int64_t)b->tv_sec;
	int64_t nsec_delta = (int64_t)a->tv_nsec - (int64_t)b->tv_nsec;

	return sec_delta * NSEC_PER_SEC + nsec_delta;
}

/* Per-worker timestamps; one instance is handed to each thread_fn() call. */
struct thread_data {
/* NOTE(review): not referenced anywhere in the visible code. */
struct timespec wakeup;
/* CLOCK_MONOTONIC time taken right after the worker is woken for a round. */
struct timespec start;
/* CLOCK_MONOTONIC time taken after the most recent sched_yield() of the round. */
struct timespec finish;
/* NOTE(review): not referenced anywhere in the visible code. */
int64_t burn;
};

/*
 * Worker thread body.
 *
 * Checks in by decrementing nr_booting (signalling `done` when it is the
 * last one), then loops forever: wait on `wakeup`, record the start time,
 * call sched_yield() repeatedly until at least burn_ns nanoseconds have
 * elapsed, and finally decrement nr_running (signalling `done` when it
 * reaches zero).  The mutex is held except while yielding.
 *
 * _data points to this worker's struct thread_data.
 */
void *thread_fn(void *_data) {
	struct thread_data *td = _data;

	pthread_mutex_lock(&mutex);
	nr_booting--;
	if (nr_booting == 0)
		pthread_cond_signal(&done);

	for (;;) {
		/* Sleep until main starts the next round. */
		pthread_cond_wait(&wakeup, &mutex);
		clock_gettime(CLOCK_MONOTONIC, &td->start);
		pthread_mutex_unlock(&mutex);

		/* Yield at least once, then keep going until burn_ns has elapsed. */
		for (;;) {
			sched_yield();
			clock_gettime(CLOCK_MONOTONIC, &td->finish);
			if (clock_diff(&td->finish, &td->start) >= burn_ns)
				break;
		}

		pthread_mutex_lock(&mutex);
		nr_running--;
		if (nr_running == 0)
			pthread_cond_signal(&done);
	}

	/* Not reached: the loop above never terminates. */
	pthread_mutex_unlock(&mutex);

	return NULL;
}

int main(int argc, char **argv) {
nr_threads = atoi(argv[1]);
burn_ns = atol(argv[2]);
pace_ns = atol(argv[3]);

pthread_t thread[nr_threads];
struct thread_data thread_data[nr_threads];
int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);

nr_booting = nr_threads;
for (int i = 0; i < nr_threads; i++)
pthread_create(thread + i, NULL, thread_fn, thread_data + i);

pthread_mutex_lock(&mutex);
pthread_cond_wait(&done, &mutex);
pthread_mutex_unlock(&mutex);

while (1) {
struct timespec pace_ts = {
.tv_sec = pace_ns / NSEC_PER_SEC,
.tv_nsec = pace_ns % NSEC_PER_SEC
};

clock_nanosleep(CLOCK_MONOTONIC, 0, &pace_ts, NULL);

nr_running = nr_threads;
pthread_cond_broadcast(&wakeup);

pthread_mutex_lock(&mutex);
pthread_cond_wait(&done, &mutex);
pthread_mutex_unlock(&mutex);
}

return 0;
}