net/netfilter/nf_tables_api.c: BUG_ON(ctx->table->use > 0)

From: Michael Tirado
Date: Thu Aug 03 2017 - 09:48:20 EST


I have been getting beaten up by this bug for a few days now. I made a small
test program for you netfilter experts to try, because I'm running out
of ideas over here. Attached is a C program that triggers the BUG_ON. I
have narrowed the possible causes down to the portion of my code that
sends NFT_MSG_NEWRULE; if you comment that out, the bug does not
happen. Let me know if you need more information or have a patch to
try.

The kernel config is nothing special, minimal x86 qemu with ipv{4,6}
and full nftables options, no modules.

------------[ cut here ]------------
kernel BUG at net/netfilter/nf_tables_api.c:816!
invalid opcode: 0000 [#1]
CPU: 0 PID: 42 Comm: kworker/u2:2 Not tainted 4.9.40 #1
Workqueue: netns cleanup_net
task: c0225540 task.stack: c026e000
EIP: 0060:[<c1289440>] EFLAGS: 00000202 CPU: 0
EIP is at nf_tables_table_destroy.isra.23.part.24+0x0/0x10
EAX: f4e613d8 EBX: f4e613d8 ECX: 000000a8 EDX: 00000001
ESI: f4e613d8 EDI: f4e613d8 EBP: f4e613e8 ESP: c026fe90
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
CR0: 80050033 CR2: 08197148 CR3: 002a5000 CR4: 00000690
Stack:
c128caa6 f4e613e8 f4e4a000 f4e52464 f4e52450 f4e52464 f4e4a000 f4e52450
f4e613d8 f4e61664 00000000 00000000 00000000 00000000 f4e4a000 c026ff08
c1470348 c147034c c133f31e f4e4a000 c124e506 c147033c c026fee8 c026ff10
Call Trace:
[<c128caa6>] ? nft_unregister_afinfo+0x1f6/0x200
[<c133f31e>] ? nf_tables_ipv6_exit_net+0xe/0x20
[<c124e506>] ? ops_exit_list.isra.6+0x26/0x50
[<c124edc5>] ? cleanup_net+0x135/0x210
[<c1044c1a>] ? pick_next_task_fair+0xba/0x120
[<c1038c7e>] ? process_one_work+0x19e/0x350
[<c1038e77>] ? worker_thread+0x47/0x4a0
[<c1038e30>] ? process_one_work+0x350/0x350
[<c103d248>] ? kthread+0x98/0xb0
[<c103d1b0>] ? kthread_worker_fn+0xb0/0xb0
[<c134c377>] ? ret_from_fork+0x1b/0x28
Code: 8b 04 24 8d 53 b4 8b 08 89 f8 e8 7c f5 fe ff 8b 5b 08 83 eb 08
39 de 75 c2 83 c4 04 5b 5e 5f 5d c3 8d 76 00 8d bc 27 00 00 00 00 <0f>
0b 8d b4 26 00 00 00 00 8d bc 27 00 00 00 00 8b 50 1c 85 d2
EIP: [<c1289440>] nf_tables_table_destroy.isra.23.part.24+0x0/0x10
SS:ESP 0068:c026fe90
---[ end trace 20fa171526d8ba2a ]---





BUG: unable to handle kernel paging request at fffffff0
IP: [<c103d4a6>] kthread_data+0x6/0x10
*pde = 014b5067 *pte = 00000000

Oops: 0000 [#2]
CPU: 0 PID: 42 Comm: kworker/u2:2 Tainted: G D 4.9.40 #1
task: c0225540 task.stack: c026e000
EIP: 0060:[<c103d4a6>] EFLAGS: 00000002 CPU: 0
EIP is at kthread_data+0x6/0x10
EAX: 00000000 EBX: c0225540 ECX: 00000001 EDX: c0225570
ESI: 00000000 EDI: c026ff98 EBP: c026ff80 ESP: c026ff64
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
CR0: 80050033 CR2: 00000014 CR3: 002a5000 CR4: 00000690
Stack:
c10387f5 c1349a27 c0225750 c026fdf0 c0225540 c026fdf0 c026ff98 c026ff88
c104210d 00000000 c102aee9 c02256cc 0126ff90 c026ff98 c026ff98 0000000b
c0270000 c13f9a10 00000000 c134cdfc 00000000 00000000 00000000 00000000
Call Trace:
[<c10387f5>] ? wq_worker_sleeping+0x5/0x70
[<c1349a27>] ? __schedule+0x207/0x350
[<c104210d>] ? do_task_dead+0x1d/0x20
[<c102aee9>] ? do_exit+0x4c9/0x7f0
[<c134cdfc>] ? rewind_stack_do_exit+0x10/0x12
Code: 27 00 00 00 00 85 c0 74 03 c6 00 00 a1 a8 11 45 c1 8b 80 e4 01
00 00 8b 40 e8 d1 e8 83 e0 01 c3 90 8d 74 26 00 8b 80 e4 01 00 00 <8b>
40 f0 c3 8d b6 00 00 00 00 83 ec 04 8b 90 e4 01 00 00 b9 04
EIP: [<c103d4a6>] kthread_data+0x6/0x10
SS:ESP 0068:c026ff64
CR2: 00000000fffffff0
---[ end trace 20fa171526d8ba2b ]---
/* (c) 2017 Michael R. Tirado
* GPLv3+, GNU General Public License, version 3 or later.
*
* build with: `gcc -o netph netph.c -lmnl -lnftnl`
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <sched.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>

#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <linux/netfilter/nfnetlink.h>
#include <libmnl/libmnl.h>
#include <libnftnl/common.h>
#include <libnftnl/ruleset.h>
#include <libnftnl/table.h>
#include <libnftnl/chain.h>
#include <libnftnl/set.h>
#include <libnftnl/rule.h>
#include <libnftnl/batch.h>

#define ST_SOCKBUF_SIZE 8192

/*
 * Iterate over a linked list: pos starts at head->next and follows ->next
 * until it reaches head again, so head itself is never visited.
 * NOTE(review): nothing in this file as shown appears to use this macro.
 */
#define list_for_each(pos, head) \
for (pos = (head)->next; pos != (head); pos = pos->next)


/* State handed from the dump step to the replay step in main(). */
struct net_data {
/* set by create_ruleset(); read by netns_replace_nftables() */
struct nftnl_ruleset *ruleset;
};

/*
 * nft script that main() writes to testtables.nft and loads with `nft -f`.
 * It flushes the ruleset, then defines an "ip firewall" and an "ip6 firewall"
 * table, each with an input-hook filter chain "incoming" (policy drop) holding
 * two ct state rules; the ip6 chain additionally accepts a set of icmpv6
 * neighbour-discovery types.
 */
const char fw[]=" \n\
flush ruleset \n\
\n\
table ip firewall { \n\
chain incoming { \n\
type filter hook input priority 0; policy drop; \n\
ct state invalid drop \n\
ct state established,related accept \n\
} \n\
} \n\
\n\
table ip6 firewall { \n\
chain incoming { \n\
type filter hook input priority 0; policy drop; \n\
ct state invalid drop \n\
ct state established,related accept \n\
icmpv6 type { \n\
nd-neighbor-solicit, \n\
nd-neighbor-advert, \n\
nd-router-advert, \n\
nd-redirect \n\
} accept \n\
} \n\
} \n\
";


static int mnl_talk(struct mnl_socket *nf_sock, const void *data, unsigned int len,
int (*cb)(const struct nlmsghdr *nlh, void *data), void *cb_data,
unsigned int seq)
{
char buf[ST_SOCKBUF_SIZE*2];
uint32_t portid = mnl_socket_get_portid(nf_sock);
int ret;

if (len >= ST_SOCKBUF_SIZE) {
printf("bad len?\n");
return -1;
}

if (mnl_socket_sendto(nf_sock, data, len) < 0) {
printf("sendto\n");
return -1;
}

ret = mnl_socket_recvfrom(nf_sock, buf, ST_SOCKBUF_SIZE);
while (cb && ret > 0)
{
errno = 0;
ret = mnl_cb_run(buf, ret, seq, portid, cb, cb_data);
if (ret <= 0) {
printf("errno=%s\n", strerror(errno));
printf("cb_run=%d\n",ret);
goto out;
}

ret = mnl_socket_recvfrom(nf_sock, buf, sizeof(buf));
}
out:
if (ret < 0 && errno == EAGAIN)
return 0;

return ret;
}

/* Id given to the next cloned set; clone_list() resets it to 1 before a dump. */
static int setid;

/*
 * mnl_cb_run() callback used while dumping the ruleset.
 *
 * Parses one NFT_MSG_NEW* message into a newly allocated libnftnl object and
 * appends it to the list passed in `data`. For NFT_MSG_NEWSETELEM, `data` is
 * instead the set that receives the parsed elements.
 *
 * Returns MNL_CB_OK on success. Returns MNL_CB_ERROR for a foreign subsystem,
 * an unexpected message type, an allocation failure or a parse failure; the
 * object allocated for the message is released when its parse fails.
 */
static int clone_cb(const struct nlmsghdr *nlmsg_hdr, void *data)
{
	unsigned short msg_type = NFNL_MSG_TYPE(nlmsg_hdr->nlmsg_type);

	if (NFNL_SUBSYS_ID(nlmsg_hdr->nlmsg_type) != NFNL_SUBSYS_NFTABLES) {
		printf("unexpected subsys id\n");
		return MNL_CB_ERROR;
	}

	switch (msg_type) {
	case NFT_MSG_NEWTABLE: {
		struct nftnl_table *table = nftnl_table_alloc();

		if (!table)
			return MNL_CB_ERROR;
		if (nftnl_table_nlmsg_parse(nlmsg_hdr, table)) {
			nftnl_table_free(table);
			return MNL_CB_ERROR;
		}
		nftnl_table_list_add(table, data);
		break;
	}
	case NFT_MSG_NEWCHAIN: {
		struct nftnl_chain *chain = nftnl_chain_alloc();

		if (!chain)
			return MNL_CB_ERROR;
		if (nftnl_chain_nlmsg_parse(nlmsg_hdr, chain)) {
			nftnl_chain_free(chain);
			return MNL_CB_ERROR;
		}
		nftnl_chain_list_add(chain, data);
		break;
	}
	case NFT_MSG_NEWSET: {
		struct nftnl_set *set = nftnl_set_alloc();

		if (!set)
			return MNL_CB_ERROR;
		if (nftnl_set_nlmsg_parse(nlmsg_hdr, set)) {
			nftnl_set_free(set);
			return MNL_CB_ERROR;
		}
		nftnl_set_set_u32(set, NFTNL_SET_ID, setid++);
		nftnl_set_list_add(set, data);
		break;
	}
	case NFT_MSG_NEWSETELEM:
		/* data is the caller's set here; nothing is allocated. */
		if (nftnl_set_elems_nlmsg_parse(nlmsg_hdr, data))
			return MNL_CB_ERROR;
		break;
	case NFT_MSG_NEWRULE: {
		struct nftnl_rule *rule = nftnl_rule_alloc();

		if (!rule)
			return MNL_CB_ERROR;
		if (nftnl_rule_nlmsg_parse(nlmsg_hdr, rule)) {
			nftnl_rule_free(rule);
			return MNL_CB_ERROR;
		}
		/* Drop the dumped handle and position before the rule is stored. */
		nftnl_rule_unset(rule, NFTNL_RULE_HANDLE);
		nftnl_rule_unset(rule, NFTNL_RULE_POSITION);
		nftnl_rule_list_add(rule, data);
		break;
	}
	default:
		printf("unexpected msg type: %d\n", msg_type);
		return MNL_CB_ERROR;
	}

	return MNL_CB_OK;
}


static int clone_list(unsigned short msg_type, struct mnl_socket *nl_sock, void *list)
{
char buf[ST_SOCKBUF_SIZE];
struct nlmsghdr *hdr;
unsigned int sq = rand()%20000;
unsigned int flags;
int r;
struct nftnl_set *nfset;
struct nftnl_rule *nfrule;

setid = 1;
memset(buf, 0, sizeof(buf));

switch (msg_type)
{
/* 0 == NFPROTO_UNSPEC */
case NFT_MSG_GETTABLE:
flags = NLM_F_DUMP;
hdr = nftnl_table_nlmsg_build_hdr(buf, msg_type, 0, flags, sq);
break;
case NFT_MSG_GETCHAIN:
flags = NLM_F_DUMP;
hdr = nftnl_chain_nlmsg_build_hdr(buf, msg_type, 0, flags, sq);
break;
case NFT_MSG_GETSET:
flags = NLM_F_DUMP;
hdr = nftnl_set_nlmsg_build_hdr(buf, msg_type, 0, flags, sq);
nfset = nftnl_set_alloc();
if (nfset == NULL)
return -1;
/* was this needed? */
nftnl_set_nlmsg_build_payload(hdr, nfset);
nftnl_set_free(nfset);
break;
case NFT_MSG_GETSETELEM:

return 0;

nfset = list;
flags = NLM_F_DUMP;
hdr = nftnl_set_nlmsg_build_hdr(buf, msg_type,
nftnl_set_get_u32(nfset, NFTNL_SET_FAMILY), flags, sq);
nftnl_set_elems_nlmsg_build_payload(hdr, nfset);
break;
case NFT_MSG_GETRULE:

flags = NLM_F_DUMP;
(void) nfrule;
hdr = nftnl_rule_nlmsg_build_hdr(buf, msg_type, 0, flags, sq);
break;
default:
return -1;
}
r = mnl_talk(nl_sock, hdr, hdr->nlmsg_len, clone_cb, list, sq);
if (r < 0 || r >= ST_SOCKBUF_SIZE)
return -1;
return 0;
}
/*
static int get_setelems(struct nftnl_set *cur, void *data)
{
printf("get_setelems\n");
printf("set name: %s\n", nftnl_set_get_str(cur, NFTNL_SET_NAME));
printf("set table: %s\n", nftnl_set_get_str(cur, NFTNL_SET_TABLE));
printf("set family: ");
switch (nftnl_set_get_u32(cur, NFTNL_SET_FAMILY) )
{
case NFPROTO_IPV4: printf("ip4\n"); break;
case NFPROTO_IPV6: printf("ip6\n"); break;
case NFPROTO_BRIDGE: printf("bridge\n"); break;
case NFPROTO_ARP: printf("arp\n"); break;
default: printf("?\n");
}
return clone_list(NFT_MSG_GETSETELEM, data, cur);
}
*/
/*
 * Dump tables, chains, sets and rules over nl_sock and bundle them into a
 * single nftnl_ruleset, which is also printed to stdout as JSON.
 *
 * nnp     - receives the finished ruleset in nnp->ruleset on success only
 * nl_sock - bound netlink socket used for the dumps
 *
 * Set elements are not fetched.
 * Returns 0 on success and -1 on failure. On failure everything allocated
 * here is released and nnp->ruleset is left untouched.
 */
static int create_ruleset(struct net_data *nnp, struct mnl_socket *nl_sock)
{
	struct nftnl_ruleset *ruleset = NULL;
	struct nftnl_table_list *table_list = NULL;
	struct nftnl_chain_list *chain_list = NULL;
	struct nftnl_set_list *set_list = NULL;
	struct nftnl_rule_list *rule_list = NULL;

	table_list = nftnl_table_list_alloc();
	if (table_list == NULL)
		return -1;
	if (clone_list(NFT_MSG_GETTABLE, nl_sock, table_list))
		goto free_fail;

	chain_list = nftnl_chain_list_alloc();
	if (chain_list == NULL)
		goto free_fail;
	if (clone_list(NFT_MSG_GETCHAIN, nl_sock, chain_list))
		goto free_fail;

	set_list = nftnl_set_list_alloc();
	if (set_list == NULL)
		goto free_fail;
	if (clone_list(NFT_MSG_GETSET, nl_sock, set_list))
		goto free_fail;

	rule_list = nftnl_rule_list_alloc();
	if (rule_list == NULL)
		goto free_fail;
	if (clone_list(NFT_MSG_GETRULE, nl_sock, rule_list))
		goto free_fail;

	ruleset = nftnl_ruleset_alloc();
	if (ruleset == NULL)
		goto free_fail;

	/* From here on the ruleset owns the four lists. */
	nftnl_ruleset_set(ruleset, NFTNL_RULESET_TABLELIST, table_list);
	nftnl_ruleset_set(ruleset, NFTNL_RULESET_CHAINLIST, chain_list);
	nftnl_ruleset_set(ruleset, NFTNL_RULESET_SETLIST, set_list);
	nftnl_ruleset_set(ruleset, NFTNL_RULESET_RULELIST, rule_list);

	if (nftnl_ruleset_fprintf(stdout, ruleset, NFTNL_OUTPUT_JSON, 0) < 0) {
		printf("ruleset print failure\n");
		/* Frees the attached lists too, so do not fall into free_fail. */
		nftnl_ruleset_free(ruleset);
		return -1;
	}

	nnp->ruleset = ruleset;
	return 0;

free_fail:
	printf("create_ruleset failed\n");
	if (table_list)
		nftnl_table_list_free(table_list);
	if (chain_list)
		nftnl_chain_list_free(chain_list);
	if (set_list)
		nftnl_set_list_free(set_list);
	if (rule_list)
		nftnl_rule_list_free(rule_list);
	return -1;
}

/*
 * Open a NETLINK_NETFILTER socket, capture the current ruleset into nnp via
 * create_ruleset(), and close the socket again.
 *
 * Returns 0 on success, -1 if the socket cannot be opened or bound or if
 * create_ruleset() fails.
 */
static int netns_get_nftables(struct net_data *nnp)
{
	struct mnl_socket *sock = mnl_socket_open(NETLINK_NETFILTER);
	int rc = -1;

	if (sock == NULL) {
		printf("mnl_socket_open: %s\n", strerror(errno));
		return -1;
	}

	if (mnl_socket_bind(sock, 0, MNL_SOCKET_AUTOPID) != 0)
		printf("mnl_socket_bind: %s\n", strerror(errno));
	else if (create_ruleset(nnp, sock) == 0)
		rc = 0;

	/* Single exit: the socket is closed on every path past the open. */
	mnl_socket_close(sock);
	return rc;
}

/* Context passed to nftnl_send_item() through the list foreach callbacks. */
struct cb_data {
/* netlink socket the item is sent on and the replies are read from */
struct mnl_socket *sock;
/* NFT_MSG_NEW* value that tells nftnl_send_item() how to treat the item */
unsigned int msg_type;
/* port id handed to mnl_cb_run() when replies are processed */
unsigned int portid;
};

/* Reply callback that accepts every message without looking at it. */
static int noop_cb(const struct nlmsghdr *nlmsg_hdr, void *data)
{
	/* Both arguments are intentionally unused. */
	(void)data;
	(void)nlmsg_hdr;

	return MNL_CB_OK;
}
/*
 * Send a single table, chain, set or rule to the kernel and wait for the
 * replies.
 *
 * cur  - libnftnl object whose real type is selected by dat->msg_type
 * data - struct cb_data with the socket, message type and port id
 *
 * The item message is placed in its own batch; when
 * nftnl_batch_is_supported() reports support it is wrapped in batch
 * begin/end messages. Replies are run through noop_cb.
 *
 * Returns MNL_CB_OK on success and MNL_CB_ERROR when the batch cannot be
 * started, the message type is unknown, the send fails, or reply processing
 * ends with a negative result.
 */
int nftnl_send_item(void *cur, void *data)
{
char buf[ST_SOCKBUF_SIZE*2];
struct mnl_nlmsg_batch *batch = NULL;
struct nlmsghdr *hdr;
struct cb_data *dat = data;
unsigned int sq, item_sq;
int r;
uint32_t family, flags;
int batching; /* result of nftnl_batch_is_supported() */

memset(buf, 0, sizeof(buf));
/* starting sequence number; begin/item/end messages use consecutive values */
sq = (rand()%20000);

batching = nftnl_batch_is_supported();
/* the batch is limited to ST_SOCKBUF_SIZE, half of buf */
batch = mnl_nlmsg_batch_start(buf, ST_SOCKBUF_SIZE);
if (batch == NULL)
return MNL_CB_ERROR;
if (batching) {
nftnl_batch_begin(mnl_nlmsg_batch_current(batch), sq);
mnl_nlmsg_batch_next(batch);
}

/* build the item message; each case pre-increments sq for the item */
switch (dat->msg_type)
{
case NFT_MSG_NEWTABLE:
flags = NLM_F_ACK | NLM_F_CREATE;
family = nftnl_table_get_u32(cur, NFTNL_TABLE_FAMILY);
hdr = nftnl_table_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch),
dat->msg_type, family,
flags, ++sq);
nftnl_table_nlmsg_build_payload(hdr, (struct nftnl_table *)cur);
break;
case NFT_MSG_NEWCHAIN:
flags = NLM_F_ACK;
/* the dumped chain handle is cleared before the payload is built */
nftnl_chain_unset(cur, NFTNL_CHAIN_HANDLE);
family = nftnl_chain_get_u32(cur, NFTNL_CHAIN_FAMILY);
hdr = nftnl_chain_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch),
dat->msg_type, family,
flags, ++sq);
nftnl_chain_nlmsg_build_payload(hdr, (struct nftnl_chain *)cur);
break;
case NFT_MSG_NEWSET:
flags = NLM_F_ACK | NLM_F_CREATE;
family = nftnl_set_get_u32(cur, NFTNL_SET_FAMILY);
hdr = nftnl_set_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch),
dat->msg_type, family,
flags, ++sq);
nftnl_set_nlmsg_build_payload(hdr, (struct nftnl_set *)cur);
break;
case NFT_MSG_NEWRULE:
flags = NLM_F_ACK | NLM_F_CREATE | NLM_F_APPEND;
family = nftnl_rule_get_u32(cur, NFTNL_RULE_FAMILY);
hdr = nftnl_rule_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch),
dat->msg_type, family,
flags, ++sq);
nftnl_rule_nlmsg_build_payload(hdr, (struct nftnl_rule *)cur);
break;
default:
mnl_nlmsg_batch_stop(batch);
return MNL_CB_ERROR;
}
/* remember the item's sequence number; replies are checked against it */
item_sq = sq;
mnl_nlmsg_batch_next(batch);

if (batching) {
nftnl_batch_end(mnl_nlmsg_batch_current(batch), ++sq);
mnl_nlmsg_batch_next(batch);
}

r = mnl_socket_sendto(dat->sock, mnl_nlmsg_batch_head(batch),
mnl_nlmsg_batch_size(batch));
if (r < 0 || r >= ST_SOCKBUF_SIZE) {
printf("r == %d -- %s\n", r, strerror(errno));
mnl_nlmsg_batch_stop(batch);
return MNL_CB_ERROR;
}
mnl_nlmsg_batch_stop(batch);

/* buf is reused as the receive buffer now that the batch has been sent */
r = mnl_socket_recvfrom(dat->sock, buf, ST_SOCKBUF_SIZE);
while (r > 0) {
r = mnl_cb_run(buf, r, item_sq, dat->portid, noop_cb, &r);
if (r <= 0)
break;
r = mnl_socket_recvfrom(dat->sock, buf, ST_SOCKBUF_SIZE);
}
if (r < 0) {
return MNL_CB_ERROR;
}

return MNL_CB_OK;
}
/* nftnl_table_list_foreach() callback: hands the table to nftnl_send_item(). */
int send_table(struct nftnl_table *table, void *data)
{
	void *item = table;

	return nftnl_send_item(item, data);
}
/* nftnl_chain_list_foreach() callback: hands the chain to nftnl_send_item(). */
int send_chain(struct nftnl_chain *chain, void *data)
{
	void *item = chain;

	return nftnl_send_item(item, data);
}
/* nftnl_set_list_foreach() callback: hands the set to nftnl_send_item(). */
int send_set(struct nftnl_set *set, void *data)
{
	void *item = set;

	return nftnl_send_item(item, data);
}
/* nftnl_rule_list_foreach() callback: hands the rule to nftnl_send_item(). */
int send_rule(struct nftnl_rule *rule, void *data)
{
	void *item = rule;

	return nftnl_send_item(item, data);
}

static int netns_replace_nftables(struct net_data *nnp)
{
struct mnl_socket *nl_sock;
struct nftnl_table_list *table_list;
struct nftnl_chain_list *chain_list;
struct nftnl_set_list *set_list;
struct nftnl_rule_list *rule_list;
struct cb_data dat;

nl_sock = mnl_socket_open(NETLINK_NETFILTER);
if (nl_sock == NULL) {
printf("mnl_socket_open: %s\n", strerror(errno));
return -1;
}
if (mnl_socket_bind(nl_sock, 0, MNL_SOCKET_AUTOPID)) {
printf("mnl_socket_bind: %s\n", strerror(errno));
goto failure;
}
dat.sock = nl_sock;
dat.portid = mnl_socket_get_portid(nl_sock);

/* table list */
dat.msg_type = NFT_MSG_NEWTABLE;
table_list = nftnl_ruleset_get(nnp->ruleset, NFTNL_RULESET_TABLELIST);
if (!table_list)
goto failure;
if (nftnl_table_list_foreach(table_list, send_table, &dat)) {
printf("problem adding table_list\n");
goto failure;
}
/* chain list */
dat.msg_type = NFT_MSG_NEWCHAIN;
chain_list = nftnl_ruleset_get(nnp->ruleset, NFTNL_RULESET_CHAINLIST);
if (!chain_list)
goto failure;
if (nftnl_chain_list_foreach(chain_list, send_chain, &dat)) {
printf("problem adding chain_list\n");
goto failure;
}

/* set list */
dat.msg_type = NFT_MSG_NEWSET;
set_list = nftnl_ruleset_get(nnp->ruleset, NFTNL_RULESET_SETLIST);
if (!set_list)
goto failure;
if (nftnl_set_list_foreach(set_list, send_set, &dat)) {
printf("problem adding set_list\n");
goto failure;
}

/* rule list */
dat.msg_type = NFT_MSG_NEWRULE;
rule_list = nftnl_ruleset_get(nnp->ruleset, NFTNL_RULESET_RULELIST);
if (!rule_list)
goto failure;
if (nftnl_rule_list_foreach(rule_list, send_rule, &dat)) {
printf("problem adding rule_list\n");
goto failure;
}

mnl_socket_close(nl_sock);
return 0;
failure:
mnl_socket_close(nl_sock);

return -1;
}

int main()
{
int fd;
struct net_data data;
memset(&data, 0, sizeof(data));

/* write nftables file */
fd = open("testtables.nft", O_RDWR|O_TRUNC|O_CREAT, 0755);
if (fd == -1) {
printf("open(testtables.nft): %s\n", strerror(errno));
return -1;
}
if (write(fd, fw, sizeof(fw)) != sizeof(fw)) {
printf("write error\n");
return -1;
}
close(fd);

/* install tables */
system("nft -f testtables.nft");

if (netns_get_nftables(&data)) {
printf("failed to get tables\n");
return -1;
}
if (unshare(CLONE_NEWNET)) {
printf("unshare(CLONE_NEWNET): %s\n", strerror(errno));
return -1;
}
if (netns_replace_nftables(&data)) {
printf("failed to replace tables\n");
return -1;
}

return 0;
}