Re: [PATCH 1/3] fork: dynamically allocate cache array for vmapped stacks using cpuhp

From: Hoeun Ryu
Date: Fri Feb 03 2017 - 11:43:01 EST


On Sat, Feb 4, 2017 at 12:39 AM, Michal Hocko <mhocko@xxxxxxxxxx> wrote:
> On Sat 04-02-17 00:30:05, Hoeun Ryu wrote:
>> With virtually mapped stacks, kernel stacks are allocated via vmalloc.
>> In the current implementation, two stacks per cpu can be cached when
>> tasks are freed and the cached stacks are reused in task duplication,
>> but the array holding the cached stacks is statically allocated by the
>> per-cpu api. In this new implementation, the array for the cached stacks
>> is dynamically allocated and freed by cpu hotplug callbacks, and the
>> cached stacks are freed when a cpu goes down. The cpu hotplug setup is
>> done in fork_init().
>
> Why do we want this? I can see that the follow up patch makes the number
> configurable but the changelog doesn't describe the motivation for that.
> Which workload would benefit from a higher value?
>

The key difference in this implementation is that the cached stacks for a
cpu are freed when that cpu goes down, so the cached stacks are no longer
wasted. In the current implementation, the cached stacks for a cpu remain
allocated on the system even after the cpu is taken offline.
I also think it is easy to imagine a machine with many cpus where someone
wants a bigger stack cache per cpu.
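
For context, the mechanism this relies on is the cpu hotplug state machine:
cpuhp_setup_state() takes a startup callback that is run for every cpu that
is (or later comes) online and a teardown callback that is run when a cpu
goes offline. Below is a minimal, self-contained sketch of that pattern;
all names (demo_cache, demo_cache_init, ...) are made up for illustration
and are not the patch's code, which follows in the quoted hunks.

#include <linux/cpuhotplug.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/vmalloc.h>

/*
 * Illustration only: a per-cpu cache of vmalloc'ed buffers that is torn
 * down when the owning cpu goes offline.
 */
#define DEMO_CACHED_BUFS 2

struct demo_cache {
	void *bufs[DEMO_CACHED_BUFS];
	int cur;			/* number of cached buffers */
};

static DEFINE_PER_CPU(struct demo_cache, demo_caches);

/* startup callback: runs for every cpu that is online or comes online */
static int demo_cache_online(unsigned int cpu)
{
	per_cpu(demo_caches, cpu).cur = 0;
	return 0;
}

/* teardown callback: runs when a cpu goes down, drops its cached buffers */
static int demo_cache_offline(unsigned int cpu)
{
	struct demo_cache *cache = &per_cpu(demo_caches, cpu);
	int i;

	for (i = 0; i < cache->cur; i++) {
		vfree(cache->bufs[i]);
		cache->bufs[i] = NULL;
	}
	cache->cur = 0;
	return 0;
}

static int __init demo_cache_init(void)
{
	int ret;

	/* dynamic AP state: startup is also invoked for already-online cpus */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo_cache",
				demo_cache_online, demo_cache_offline);
	return ret < 0 ? ret : 0;
}
early_initcall(demo_cache_init);

The patch does the same wiring from fork_init(), with alloc_vm_stack_cache()
as the startup callback and free_vm_stack_cache() as the teardown callback,
so a cpu's cached stacks are dropped as part of taking that cpu offline.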

>> Signed-off-by: Hoeun Ryu <hoeun.ryu@xxxxxxxxx>
>> ---
>> kernel/fork.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++-------------
>> 1 file changed, 64 insertions(+), 17 deletions(-)
>>
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 61284d8..54421a9 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -167,26 +167,71 @@ void __weak arch_release_thread_stack(unsigned long *stack)
>> * flush. Try to minimize the number of calls by caching stacks.
>> */
>> #define NR_CACHED_STACKS 2
>> -static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
>> +
>> +struct vm_stack_cache {
>> + struct vm_struct **vm_stacks;
>> + int nr;
>> + int cur;
>> +};
>> +
>> +static DEFINE_PER_CPU(struct vm_stack_cache, vm_stacks);
>> +
>> +static int alloc_vm_stack_cache(unsigned int cpu)
>> +{
>> + struct vm_stack_cache *vm_stack_cache = &per_cpu(vm_stacks, cpu);
>> + struct vm_struct **vm_stacks = vm_stack_cache->vm_stacks;
>> + int i;
>> +
>> + /* if free_vm_stack_cache() didn't free it */
>> + if (!vm_stacks) {
>> + vm_stacks =
>> + vzalloc(sizeof(struct vm_struct *) * NR_CACHED_STACKS);
>> + if (!vm_stacks)
>> + return -ENOMEM;
>> + }
>> +
>> + vm_stack_cache->vm_stacks = vm_stacks;
>> + vm_stack_cache->cur = 0;
>> + vm_stack_cache->nr = 0;
>> +
>> + return 0;
>> +}
>> +
>> +static int free_vm_stack_cache(unsigned int cpu)
>> +{
>> + struct vm_stack_cache *vm_stack_cache = &per_cpu(vm_stacks, cpu);
>> + struct vm_struct **vm_stacks = vm_stack_cache->vm_stacks;
>> + int i;
>> +
>> + for (i = 0; i < vm_stack_cache->nr; i++) {
>> + vfree(vm_stacks[i]->addr);
>> + vm_stacks[i] = NULL;
>> + }
>> +
>> + vm_stack_cache->nr = 0;
>> + vm_stack_cache->cur = 0;
>> + /* do not free vm_stack[cpu]->vm_stacks itself, reused in allocation */
>> +
>> + return 0;
>> +}
>> +
>> #endif
>>
>> static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>> {
>> #ifdef CONFIG_VMAP_STACK
>> + struct vm_stack_cache *vm_stack_cache =
>> + &per_cpu(vm_stacks, smp_processor_id());
>> + struct vm_struct **vm_stacks = vm_stack_cache->vm_stacks;
>> void *stack;
>> - int i;
>>
>> local_irq_disable();
>> - for (i = 0; i < NR_CACHED_STACKS; i++) {
>> - struct vm_struct *s = this_cpu_read(cached_stacks[i]);
>> -
>> - if (!s)
>> - continue;
>> - this_cpu_write(cached_stacks[i], NULL);
>> -
>> - tsk->stack_vm_area = s;
>> + if (vm_stack_cache->cur > 0) {
>> + struct vm_struct *vm_stack = vm_stacks[--vm_stack_cache->cur];
>> + tsk->stack_vm_area = vm_stack;
>> local_irq_enable();
>> - return s->addr;
>> +
>> + return vm_stack->addr;
>> }
>> local_irq_enable();
>>
>> @@ -216,15 +261,14 @@ static inline void free_thread_stack(struct task_struct *tsk)
>> {
>> #ifdef CONFIG_VMAP_STACK
>> if (task_stack_vm_area(tsk)) {
>> + struct vm_stack_cache *vm_stack_cache =
>> + &per_cpu(vm_stacks, smp_processor_id());
>> + struct vm_struct **vm_stacks = vm_stack_cache->vm_stacks;
>> unsigned long flags;
>> - int i;
>>
>> local_irq_save(flags);
>> - for (i = 0; i < NR_CACHED_STACKS; i++) {
>> - if (this_cpu_read(cached_stacks[i]))
>> - continue;
>> -
>> - this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
>> + if (vm_stack_cache->cur < vm_stack_cache->nr) {
>> + vm_stacks[vm_stack_cache->cur++] = tsk->stack_vm_area;
>> local_irq_restore(flags);
>> return;
>> }
>> @@ -456,6 +500,9 @@ void __init fork_init(void)
>> for (i = 0; i < UCOUNT_COUNTS; i++) {
>> init_user_ns.ucount_max[i] = max_threads/2;
>> }
>> +
>> + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "vm_stack_cache",
>> + alloc_vm_stack_cache, free_vm_stack_cache);
>> }
>>
>> int __weak arch_dup_task_struct(struct task_struct *dst,
>> --
>> 2.7.4
>>
>
> --
> Michal Hocko
> SUSE Labs