[RFC 2/2] pid: Replace PID bitmap implementation with IDR API

From: Gargi Sharma
Date: Sat Sep 09 2017 - 11:21:54 EST


This patch replaces the current bitmap implemetation for
Process ID allocation. Functions that are no longer required,
for example, free_pidmap(), alloc_pidmap(), etc. are removed.
The rest of the functions are modified to use the IDR API.
The change was made to make the PID allocation less complex by
replacing custom code with calls to generic API.

Signed-off-by: Gargi Sharma <gs051095@xxxxxxxxx>
---
include/linux/pid.h | 1 +
include/linux/pid_namespace.h | 5 +-
init/main.c | 4 +-
kernel/pid.c | 204 ++++++++----------------------------------
kernel/pid_namespace.c | 39 ++++----
5 files changed, 59 insertions(+), 194 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 7195827..de85a7b 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -99,6 +99,7 @@ extern void transfer_pid(struct task_struct *old, struct task_struct *new,

struct pid_namespace;
extern struct pid_namespace init_pid_ns;
+extern int pid_max;

/*
* look up a PID in the hash table. Must be called with the tasklist_lock
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index b09136f..1ac6e85 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -9,6 +9,7 @@
#include <linux/nsproxy.h>
#include <linux/kref.h>
#include <linux/ns_common.h>
+#include <linux/idr.h>

struct pidmap {
atomic_t nr_free;
@@ -29,7 +30,7 @@ enum { /* definitions for pid_namespace's hide_pid field */

struct pid_namespace {
struct kref kref;
- struct pidmap pidmap[PIDMAP_ENTRIES];
+ struct idr idr;
struct rcu_head rcu;
int last_pid;
unsigned int nr_hashed;
@@ -105,6 +106,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)

extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
-void pidmap_init(void);
+void pid_idr_init(void);

#endif /* _LINUX_PID_NS_H */
diff --git a/init/main.c b/init/main.c
index a21a1a8..8821c1d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/binfmts.h>
+#include <linux/delay.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/stackprotector.h>
@@ -667,7 +668,7 @@ asmlinkage __visible void __init start_kernel(void)
if (late_time_init)
late_time_init();
calibrate_delay();
- pidmap_init();
+ pid_idr_init();
anon_vma_init();
acpi_early_init();
#ifdef CONFIG_X86
@@ -989,7 +990,6 @@ static inline void mark_readonly(void)
static int __ref kernel_init(void *unused)
{
int ret;
-
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
diff --git a/kernel/pid.c b/kernel/pid.c
index 020dedb..27e43fd 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -39,6 +39,7 @@
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
#include <linux/sched/task.h>
+#include <linux/idr.h>

#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -53,12 +54,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;

-static inline int mk_pid(struct pid_namespace *pid_ns,
- struct pidmap *map, int off)
-{
- return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
-
#define find_next_offset(map, off) \
find_next_zero_bit((map)->page, BITS_PER_PAGE, off)

@@ -70,10 +65,8 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
*/
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
- .pidmap = {
- [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
- },
.last_pid = 0,
+ .idr = IDR_INIT,
.nr_hashed = PIDNS_HASH_ADDING,
.level = 0,
.child_reaper = &init_task,
@@ -101,138 +94,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

-static void free_pidmap(struct upid *upid)
-{
- int nr = upid->nr;
- struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
- int offset = nr & BITS_PER_PAGE_MASK;
-
- clear_bit(offset, map->page);
- atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
- /*
- * This is the same as saying
- *
- * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
- * and that mapping orders 'a' and 'b' with respect to 'base'.
- */
- return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value. We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
- int prev;
- int last_write = base;
- do {
- prev = last_write;
- last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
- } while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
- int i, offset, max_scan, pid, last = pid_ns->last_pid;
- struct pidmap *map;
-
- pid = last + 1;
- if (pid >= pid_max)
- pid = RESERVED_PIDS;
- offset = pid & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
- /*
- * If last_pid points into the middle of the map->page we
- * want to scan this bitmap block twice, the second time
- * we start with offset == 0 (or RESERVED_PIDS).
- */
- max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
- for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
- return -ENOMEM;
- }
- if (likely(atomic_read(&map->nr_free))) {
- for ( ; ; ) {
- if (!test_and_set_bit(offset, map->page)) {
- atomic_dec(&map->nr_free);
- set_last_pid(pid_ns, last, pid);
- return pid;
- }
- offset = find_next_offset(map, offset);
- if (offset >= BITS_PER_PAGE)
- break;
- pid = mk_pid(pid_ns, map, offset);
- if (pid >= pid_max)
- break;
- }
- }
- if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
- ++map;
- offset = 0;
- } else {
- map = &pid_ns->pidmap[0];
- offset = RESERVED_PIDS;
- if (unlikely(last == offset))
- break;
- }
- pid = mk_pid(pid_ns, map, offset);
- }
- return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
- int offset;
- struct pidmap *map, *end;
-
- if (last >= PID_MAX_LIMIT)
- return -1;
-
- offset = (last + 1) & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
- end = &pid_ns->pidmap[PIDMAP_ENTRIES];
- for (; map < end; map++, offset = 0) {
- if (unlikely(!map->page))
- continue;
- offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
- if (offset < BITS_PER_PAGE)
- return mk_pid(pid_ns, map, offset);
- }
- return -1;
-}
-
void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
@@ -285,10 +146,14 @@ void free_pid(struct pid *pid)
break;
}
}
- spin_unlock_irqrestore(&pidmap_lock, flags);

- for (i = 0; i <= pid->level; i++)
- free_pidmap(pid->numbers + i);
+ for (i = 0; i <= pid->level; i++) {
+ struct upid *upid = pid->numbers + i;
+ struct pid_namespace *ns = upid->ns;
+
+ idr_remove(&ns->idr, upid->nr);
+ }
+ spin_unlock_irqrestore(&pidmap_lock, flags);

call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -309,7 +174,22 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ int pid_min = 1;
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&pidmap_lock);
+
+ /*
+ * init really needs pid 1, but after reaching the maximum
+ * wrap back to RESERVED_PIDS
+ */
+ if (tmp->idr.idr_next > RESERVED_PIDS)
+ pid_min = RESERVED_PIDS;
+
+ nr = idr_alloc_cyclic(&tmp->idr, pid, pid_min,
+ pid_max, GFP_ATOMIC);
+ spin_unlock_irq(&pidmap_lock);
+ idr_preload_end();
+
if (nr < 0) {
retval = nr;
goto out_free;
@@ -346,12 +226,14 @@ struct pid *alloc_pid(struct pid_namespace *ns)
return pid;

out_unlock:
- spin_unlock_irq(&pidmap_lock);
put_pid_ns(ns);
+ spin_unlock_irq(&pidmap_lock);

out_free:
+ spin_lock_irq(&pidmap_lock);
while (++i <= ns->level)
- free_pidmap(pid->numbers + i);
+ idr_remove(&ns->idr, (pid->numbers + i)->nr);
+ spin_unlock_irq(&pidmap_lock);

kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -527,11 +409,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (!ns)
ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
- if (type != PIDTYPE_PID) {
- if (type == __PIDTYPE_TGID)
- type = PIDTYPE_PID;
+ if (type != PIDTYPE_PID)
task = task->group_leader;
- }
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
}
rcu_read_unlock();
@@ -553,16 +432,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
- struct pid *pid;
-
- do {
- pid = find_pid_ns(nr, ns);
- if (pid)
- break;
- nr = next_pidmap(ns, nr);
- } while (nr > 0);
-
- return pid;
+ return idr_get_next(&ns->idr, &nr);
}

/*
@@ -572,13 +442,16 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
+ unsigned int pidhash_size;
+
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
+ pidhash_size = 1U << pidhash_shift;
}

-void __init pidmap_init(void)
+void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
@@ -590,10 +463,9 @@ void __init pidmap_init(void)
PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);

- init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /* Reserve PID 0. We never call free_pidmap(0) */
- set_bit(0, init_pid_ns.pidmap[0].page);
- atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ idr_init(&init_pid_ns.idr);
+ /* Reserve PID 0. */
+ idr_alloc_cyclic(&init_pid_ns.idr, &init_pid_ns, 0, 0, GFP_KERNEL);

init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 74a5a72..d0dc231 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
#include <linux/export.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
+#include <linux/idr.h>

struct pid_cache {
int nr_ids;
@@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
struct ucounts *ucounts;
- int i;
int err;

err = -ENOSPC;
@@ -113,17 +113,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns == NULL)
goto out_dec;

- ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (!ns->pidmap[0].page)
- goto out_free;
+ idr_init(&ns->idr);

ns->pid_cachep = create_pid_cachep(level + 1);
if (ns->pid_cachep == NULL)
- goto out_free_map;
+ goto out_free_idr;

err = ns_alloc_inum(&ns->ns);
if (err)
- goto out_free_map;
+ goto out_free_idr;
ns->ns.ops = &pidns_operations;

kref_init(&ns->kref);
@@ -134,17 +132,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->nr_hashed = PIDNS_HASH_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);

- set_bit(0, ns->pidmap[0].page);
- atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
- for (i = 1; i < PIDMAP_ENTRIES; i++)
- atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-
return ns;

-out_free_map:
- kfree(ns->pidmap[0].page);
-out_free:
+out_free_idr:
+ idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
out_dec:
dec_pid_namespaces(ucounts);
@@ -164,11 +155,9 @@ static void delayed_free_pidns(struct rcu_head *p)

static void destroy_pid_namespace(struct pid_namespace *ns)
{
- int i;
-
ns_free_inum(&ns->ns);
- for (i = 0; i < PIDMAP_ENTRIES; i++)
- kfree(ns->pidmap[i].page);
+
+ idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}

@@ -205,10 +194,11 @@ EXPORT_SYMBOL_GPL(put_pid_ns);

void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
- int nr;
+ int nr = 2;
int rc;
struct task_struct *task, *me = current;
int init_pids = thread_group_leader(me) ? 1 : 2;
+ struct pid *pid;

/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -236,8 +226,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*
*/
read_lock(&tasklist_lock);
- nr = next_pidmap(pid_ns, 1);
- while (nr > 0) {
+ pid = idr_get_next(&pid_ns->idr, &nr);
+ while (pid) {
rcu_read_lock();

task = pid_task(find_vpid(nr), PIDTYPE_PID);
@@ -246,7 +236,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)

rcu_read_unlock();

- nr = next_pidmap(pid_ns, nr);
+ nr++;
+ pid = idr_get_next(&pid_ns->idr, &nr);
}
read_unlock(&tasklist_lock);

@@ -307,7 +298,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
* it should synchronize its usage with external means.
*/

- tmp.data = &pid_ns->last_pid;
+ tmp.data = &pid_max;
return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
}

--
2.7.4