[PATCH] let proc net directory inodes reflect to active net namespace

From: Hallsmark, Per
Date: Tue Jun 25 2019 - 06:36:17 EST


Hi,

Linux kernel recently got a bugfix 1fde6f21d90f ("proc: fix /proc/net/* after setns(2)"),
but unfortunately it only solves the issue for procfs net file inodes so they get correct
content after a process change namespace.

Checking on a v5.2-rc6 kernel :

sh-4.4# sh netns_procfs_test.sh
[ 16.451640] ip (108) used greatest stack depth: 12264 bytes left
Before net namespace change :
==== /proc/net/dev ====
Inter-| Receive | Transmit
face |bytes packets errs drop fifo frame compressed multicast|bytes packd
eth0: 0 0 0 0 0 0 0 0 0 0
lo: 0 0 0 0 0 0 0 0 0 0
if_default: 0 0 0 0 0 0 0 0 0 0
sit0: 0 0 0 0 0 0 0 0 0 0

==== files in /proc/net/dev_snmp6 ====
.
..
lo
eth0
sit0
if_default


After net namespace change :
==== /proc/net/dev ====
Inter-| Receive | Transmit
face |bytes packets errs drop fifo frame compressed multicast|bytes packd
sit0: 0 0 0 0 0 0 0 0 0 0
if_other: 0 0 0 0 0 0 0 0 0 0
lo: 0 0 0 0 0 0 0 0 0 0

==== files in /proc/net/dev_snmp6 ====
.
..
lo
eth0
sit0
if_default
This kernel is fixed for file inode bug but suffers dir inode bug
sh-4.4#

As can be seen above, after the namespace change we see new content in procfs net/dev
but the listing of procfs net/dev_snmp6 still shows entries from previous namespace.
We need to apply similar bugfix for directory creation in procfs net as the mentioned
commit do for files.

Checking on a v5.2-rc6 kernel with bugfixes :

sh-4.4# sh netns_procfs_test.sh
[ 745.993882] ip (108) used greatest stack depth: 12264 bytes left
Before net namespace change :
==== /proc/net/dev ====
Inter-| Receive | Transmit
face |bytes packets errs drop fifo frame compressed multicast|bytes packd
lo: 0 0 0 0 0 0 0 0 0 0
sit0: 0 0 0 0 0 0 0 0 0 0
eth0: 0 0 0 0 0 0 0 0 0 0
if_default: 0 0 0 0 0 0 0 0 0 0

==== files in /proc/net/dev_snmp6 ====
.
..
lo
eth0
sit0
if_default


After net namespace change :
==== /proc/net/dev ====
Inter-| Receive | Transmit
face |bytes packets errs drop fifo frame compressed multicast|bytes packd
if_other: 0 0 0 0 0 0 0 0 0 0
sit0: 0 0 0 0 0 0 0 0 0 0
lo: 0 0 0 0 0 0 0 0 0 0

==== files in /proc/net/dev_snmp6 ====
.
..
lo
sit0
if_other
This kernel is fixed for both file and dir inode bug
sh-4.4#

Here we see that the directory procfs net/dev_snmp6 is updated according to the namespace
change.

The fix is two commits, first updates proc_net_mkdir() entries similar to mentioned patch
and second one is changing net/ipv6/proc.c to use proc_net_mkdir() instead.

Speaking about proc_net_mkdir()...

[phallsma@arn-phallsma-l3 linux]$ git grep proc_mkdir | grep proc_net
drivers/isdn/divert/divert_procfs.c: isdn_proc_entry = proc_mkdir("isdn", init_net.proc_net);
drivers/isdn/hysdn/hysdn_procconf.c: hysdn_proc_entry = proc_mkdir(PROC_SUBDIR_NAME, init_net.proc_net);
drivers/net/bonding/bond_procfs.c: bn->proc_dir = proc_mkdir(DRV_NAME, bn->net->proc_net);
drivers/net/wireless/intel/ipw2x00/libipw_module.c: libipw_proc = proc_mkdir(DRV_PROCNAME, init_net.proc_net);
drivers/net/wireless/intersil/hostap/hostap_main.c: hostap_proc = proc_mkdir("hostap", init_net.proc_net);
drivers/staging/rtl8192u/ieee80211/ieee80211_module.c: ieee80211_proc = proc_mkdir(DRV_NAME, init_net.proc_net);
drivers/staging/rtl8192u/r8192U_core.c: rtl8192_proc = proc_mkdir(RTL819XU_MODULE_NAME, init_net.proc_net);
net/appletalk/atalk_proc.c: if (!proc_mkdir("atalk", init_net.proc_net))
net/core/pktgen.c: pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net);
net/ipv4/netfilter/ipt_CLUSTERIP.c: cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
net/ipv6/proc.c: net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net);
net/llc/llc_proc.c: llc_proc_dir = proc_mkdir("llc", init_net.proc_net);
net/netfilter/xt_hashlimit.c: hashlimit_net->ipt_hashlimit = proc_mkdir("ipt_hashlimit", net->proc_net);
net/netfilter/xt_hashlimit.c: hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net);
net/netfilter/xt_recent.c: recent_net->xt_recent = proc_mkdir("xt_recent", net->proc_net);
net/sunrpc/cache.c: cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc);
net/sunrpc/stats.c: sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net);
net/x25/x25_proc.c: if (!proc_mkdir("x25", init_net.proc_net))
[phallsma@arn-phallsma-l3 linux]$

IMHO all code should use proc_net_mkdir() instead of proc_mkdir() for procfs net entries,
or am I missing something here? If not possible to changeover to proc_net_mkdir() there
is a need for duplicating my first commit at those places. I'm fixing the one for dev_snmp6()
which is what I've tested as well.

Also wonder if it all is optimal. Wouldn't it be better to re-enable dcache for these (files as well as directories)
and in addition have kernel drop dcache in case of a namespace change?

Attaching patches and app/script for verifying.

I'm not on the mailing lists so please keep me on CC in case of responding.

Best regards,
Per

--
Per Hallsmark per.hallsmark@xxxxxxxxxxxxx
Senior Member Technical Staff Wind River AB
Mobile: +46733249340 Office: +46859461127
Torshamnsgatan 27 164 40 Kista
From 66769af1f931124da1aa04b34350b3623a9003e3 Mon Sep 17 00:00:00 2001
From: Per Hallsmark <per.hallsmark@xxxxxxxxxxxxx>
Date: Wed, 19 Jun 2019 15:46:39 +0200
Subject: [PATCH 1/2] Make directory inodes in /proc/net adhere to net
namespace

This patch fixes /proc/net directory inodes in similar way as
commit 1fde6f21d90f ("proc: fix /proc/net/* after setns(2)")
fixes file inodes.

Signed-off-by: Per Hallsmark <per.hallsmark@xxxxxxxxxxxxx>
---
fs/proc/proc_net.c | 12 ++++++++++++
include/linux/proc_fs.h | 7 ++-----
2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 76ae278df1c4..1c4231a60c9e 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -55,6 +55,18 @@ static void pde_force_lookup(struct proc_dir_entry *pde)
pde->proc_dops = &proc_net_dentry_ops;
}

+struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
+ struct proc_dir_entry *parent)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_mkdir_data(name, 0, parent, net);
+ pde->proc_dops = &proc_net_dentry_ops;
+
+ return pde;
+}
+EXPORT_SYMBOL(proc_net_mkdir);
+
static int seq_open_net(struct inode *inode, struct file *file)
{
unsigned int state_size = PDE(inode)->state_size;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 52a283ba0465..608b8a10e338 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -124,11 +124,8 @@ static inline struct pid *tgid_pidfd_to_pid(const struct file *file)

struct net;

-static inline struct proc_dir_entry *proc_net_mkdir(
- struct net *net, const char *name, struct proc_dir_entry *parent)
-{
- return proc_mkdir_data(name, 0, parent, net);
-}
+extern struct proc_dir_entry *proc_net_mkdir(
+ struct net *net, const char *name, struct proc_dir_entry *parent);

struct ns_common;
int open_related_ns(struct ns_common *ns,
--
2.20.1

From 6925a5408ffce37dc71871c2ee05db96e60cae0d Mon Sep 17 00:00:00 2001
From: Per Hallsmark <per.hallsmark@xxxxxxxxxxxxx>
Date: Thu, 20 Jun 2019 10:21:31 +0200
Subject: [PATCH 2/2] net: Directories created in /proc/net should be done via
proc_net_mkdir()

Directories created in /proc/net should be done via proc_net_mkdir()

Signed-off-by: Per Hallsmark <per.hallsmark@xxxxxxxxxxxxx>
---
net/ipv6/proc.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 4a8da679866e..3728c57e93dc 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -282,7 +282,8 @@ static int __net_init ipv6_proc_init_net(struct net *net)
snmp6_seq_show, NULL))
goto proc_snmp6_fail;

- net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net);
+ net->mib.proc_net_devsnmp6 = proc_net_mkdir(net,
+ "dev_snmp6", net->proc_net);
if (!net->mib.proc_net_devsnmp6)
goto proc_dev_snmp6_fail;
return 0;
--
2.20.1

/*
Verifier of /proc/net and net namespace consistency
Author : Per Hallsmark <per.hallsmark@xxxxxxxxxxxxx>

First setup a net namespace and add a veth so we get something to test with

ip netns add netns_other
ip link add if_default type veth peer name if_other
ip link set if_other netns netns_other

Run test app without first reading some info from /proc/net in default namespace

./netns_procfs_test

Run test app again, this time reading some info from /proc/net in default namespace
before changing net namespace

./netns_procfs_test cached

On a bugfree kernel, the output after "After net namespace change" should be same as
in first test run, meaning we should see if_other in the /proc/net/dev dump and we
should see if_other in the directory.

The issue is not visible if it is different processes doing the readings since then
the inode's aren't cached.
*/


#define _GNU_SOURCE
#include <sched.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <fcntl.h>
#include <errno.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/mount.h>
#include <errno.h>
#include <dirent.h>

#define NETNS_RUN_DIR "/var/run/netns"

static int get_nsfd(const char *ns)
{
char path[strlen(NETNS_RUN_DIR) + strlen(ns) + 2];
int fd;
snprintf(path, sizeof(path), "%s/%s", NETNS_RUN_DIR, ns);
fd = open(path, O_RDONLY, 0);
return fd;
}

static int dump_file(const char *fn, const int checkforbug)
{
int fd;
char buf[512];
ssize_t len;
char *wholebuf = NULL;
ssize_t wholelen = 0;
int status = 0;

printf("==== %s ====\n", fn);

fd = open(fn, O_RDONLY);
if (fd == -1) {
printf("Couldn't open %s (%s)\n", fn, strerror(errno));
return -1;
}

do {
len = read(fd, buf, sizeof(buf)-1);
buf[len] = 0;
wholebuf = realloc(wholebuf, wholelen + len + 1);
sprintf(&wholebuf[wholelen], "%s", buf);
wholelen += len;
} while (len > 0);
printf("%s\n", wholebuf);
if (checkforbug && strstr(wholebuf, "if_other")) {
status = 1;
}

free(wholebuf);
close(fd);

return status;
}

static int dump_dir(const char *dn, const int checkforbug)
{
DIR *dir;
struct dirent *dir_entry;
int status = 0;

printf("==== files in %s ====\n", dn);

dir = opendir(dn);
if (!dir) {
printf("Couldn't open %s (%s)\n", dn, strerror(errno));
return -1;
}

while ((dir_entry = readdir(dir)) != NULL) {
printf(" %s", dir_entry->d_name);
if (checkforbug && !strcmp(dir_entry->d_name, "if_other")) {
status = 2;
}
printf("\n");
}

closedir(dir);

return status;
}

static int switch_ns(const char *ns)
{
int nsfd = get_nsfd(ns);

if (setns(nsfd, CLONE_NEWNET) < 0) {
fprintf(stderr, "Unable to switch to namespace(fd=%d): %s.\n",
nsfd, strerror(errno));
exit(-1);
}
return 0;
}

int main (int argc, char **argv)
{
int bugcheck = 0;
int checkforbug = 1;
if (argc == 2 && !strcmp(argv[1], "cached")) {
// access /proc/net a bit so we dcache file and
// directory inodes
printf("Before net namespace change :\n");
dump_file("/proc/net/dev", 0);
dump_dir("/proc/net/dev_snmp6", 0);
} else {
// we cannot check for bug if not run with cached arg
checkforbug = 0;
}

// change namespace
switch_ns("netns_other");

// access /proc/net again, if all works we should see
// new info because of net namespace change
printf("\n\nAfter net namespace change :\n");
bugcheck += dump_file("/proc/net/dev", 1);
bugcheck += dump_dir("/proc/net/dev_snmp6", 1);

if (!checkforbug)
return 0;

switch (bugcheck) {
case 1 :
printf("This kernel is fixed for file inode bug but suffers dir inode bug\n");
break;
case 3 :
printf("This kernel is fixed for both file and dir inode bug\n");
break;
default :
printf("This kernel suffers both file and dir inode bug\n");
}

return bugcheck != 3;
}

Attachment: netns_procfs_test.sh
Description: netns_procfs_test.sh