Re: [PATCH net] net/sched: Add xmit_recursion level in sch_direct_xmit()

From: Yue Haibing
Date: Fri May 24 2024 - 06:41:02 EST


On 2024/5/24 17:24, Eric Dumazet wrote:
> On Fri, May 24, 2024 at 10:49 AM Yue Haibing <yuehaibing@xxxxxxxxxx> wrote:
>>
>> A packet sent from a PF_PACKET socket on top of an IPv6-backed ipvlan device
>> hits the WARN_ON_ONCE() in sk_mc_loop() via the sch_direct_xmit() path when
>> the ipvlan device has a qdisc queue.
>>
>> WARNING: CPU: 2 PID: 0 at net/core/sock.c:775 sk_mc_loop+0x2d/0x70
>> Modules linked in: sch_netem ipvlan rfkill cirrus drm_shmem_helper sg drm_kms_helper
>> CPU: 2 PID: 0 Comm: swapper/2 Kdump: loaded Not tainted 6.9.0+ #279
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
>> RIP: 0010:sk_mc_loop+0x2d/0x70
>> Code: fa 0f 1f 44 00 00 65 0f b7 15 f7 96 a3 4f 31 c0 66 85 d2 75 26 48 85 ff 74 1c
>> RSP: 0018:ffffa9584015cd78 EFLAGS: 00010212
>> RAX: 0000000000000011 RBX: ffff91e585793e00 RCX: 0000000002c6a001
>> RDX: 0000000000000000 RSI: 0000000000000040 RDI: ffff91e589c0f000
>> RBP: ffff91e5855bd100 R08: 0000000000000000 R09: 3d00545216f43d00
>> R10: ffff91e584fdcc50 R11: 00000060dd8616f4 R12: ffff91e58132d000
>> R13: ffff91e584fdcc68 R14: ffff91e5869ce800 R15: ffff91e589c0f000
>> FS: 0000000000000000(0000) GS:ffff91e898100000(0000) knlGS:0000000000000000
>> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> CR2: 00007f788f7c44c0 CR3: 0000000008e1a000 CR4: 00000000000006f0
>> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>> Call Trace:
>> <IRQ>
>> ? __warn+0x83/0x130
>> ? sk_mc_loop+0x2d/0x70
>> ? report_bug+0x18e/0x1a0
>> ? handle_bug+0x3c/0x70
>> ? exc_invalid_op+0x18/0x70
>> ? asm_exc_invalid_op+0x1a/0x20
>> ? sk_mc_loop+0x2d/0x70
>> ip6_finish_output2+0x31e/0x590
>> ? nf_hook_slow+0x43/0xf0
>> ip6_finish_output+0x1f8/0x320
>> ? __pfx_ip6_finish_output+0x10/0x10
>> ipvlan_xmit_mode_l3+0x22a/0x2a0 [ipvlan]
>> ipvlan_start_xmit+0x17/0x50 [ipvlan]
>> dev_hard_start_xmit+0x8c/0x1d0
>> sch_direct_xmit+0xa2/0x390
>> __qdisc_run+0x66/0xd0
>> net_tx_action+0x1ca/0x270
>> handle_softirqs+0xd6/0x2b0
>> __irq_exit_rcu+0x9b/0xc0
>> sysvec_apic_timer_interrupt+0x75/0x90
>
> Please provide full symbols in stack traces.

Call Trace:
<IRQ>
? __warn (kernel/panic.c:693)
? sk_mc_loop (net/core/sock.c:775 net/core/sock.c:760)
? report_bug (lib/bug.c:201 lib/bug.c:219)
? handle_bug (arch/x86/kernel/traps.c:239)
? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1))
? asm_exc_invalid_op (./arch/x86/include/asm/idtentry.h:621)
? sk_mc_loop (net/core/sock.c:775 net/core/sock.c:760)
ip6_finish_output2 (net/ipv6/ip6_output.c:83 (discriminator 1))
? nf_hook_slow (./include/linux/netfilter.h:154 net/netfilter/core.c:626)
ip6_finish_output (net/ipv6/ip6_output.c:211 net/ipv6/ip6_output.c:222)
? __pfx_ip6_finish_output (net/ipv6/ip6_output.c:215)
ipvlan_xmit_mode_l3 (drivers/net/ipvlan/ipvlan_core.c:498 drivers/net/ipvlan/ipvlan_core.c:538 drivers/net/ipvlan/ipvlan_core.c:602) ipvlan
ipvlan_start_xmit (drivers/net/ipvlan/ipvlan_main.c:226) ipvlan
dev_hard_start_xmit (./include/linux/netdevice.h:4882 ./include/linux/netdevice.h:4896 net/core/dev.c:3578 net/core/dev.c:3594)
sch_direct_xmit (net/sched/sch_generic.c:343)
__qdisc_run (net/sched/sch_generic.c:416)
net_tx_action (./include/net/sch_generic.h:219 ./include/net/pkt_sched.h:128 ./include/net/pkt_sched.h:124 net/core/dev.c:5286)
handle_softirqs (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/irq.h:142 kernel/softirq.c:555)
__irq_exit_rcu (kernel/softirq.c:589 kernel/softirq.c:428 kernel/softirq.c:637)
sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1043 arch/x86/kernel/apic/apic.c:1043)

>
>> </IRQ>
>>
>> Fixes: f60e5990d9c1 ("ipv6: protect skb->sk accesses from recursive dereference inside the stack")
>> Signed-off-by: Yue Haibing <yuehaibing@xxxxxxxxxx>
>> ---
>> include/linux/netdevice.h | 17 +++++++++++++++++
>> net/core/dev.h | 17 -----------------
>> net/sched/sch_generic.c | 8 +++++---
>> 3 files changed, 22 insertions(+), 20 deletions(-)
>
> This patch seems unrelated to the WARN_ON_ONCE(1) met in sk_mc_loop()
>
> If sk_mc_loop() is called with a socket which is not inet, we are in trouble.
>
> Please fix the root cause instead of trying to shortcut sk_mc_loop() as you did.
First, set up the environment like this:
ip netns add ns0
ip netns add ns1
ip link add ip0 link eth0 type ipvlan mode l3 vepa
ip link add ip1 link eth0 type ipvlan mode l3 vepa
ip link set ip0 netns ns0
ip netns exec ns0 ip link set ip0 up
ip link set ip1 netns ns1
ip netns exec ns1 ip link set ip1 up
ip netns exec ns1 tc qdisc add dev ip1 root netem delay 10ms

Second, build the attached reproducer and run it in ns1 to send a raw IPv6 multicast packet.

packet_sendmsg
packet_snd //skb->sk is packet sk
__dev_queue_xmit
__dev_xmit_skb //q->enqueue is not NULL
__qdisc_run
qdisc_restart
sch_direct_xmit
dev_hard_start_xmit
netdev_start_xmit
ipvlan_start_xmit
ipvlan_xmit_mode_l3 //l3 mode
ipvlan_process_outbound //vepa flag
ipvlan_process_v6_outbound //skb->protocol is ETH_P_IPV6
ip6_local_out
...
__ip6_finish_output
ip6_finish_output2 //multicast packet
sk_mc_loop //dev_recursion_level is 0
WARN_ON_ONCE //sk->sk_family is AF_PACKET

> .
>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h> // close()
#include <string.h> // strcpy, memset(), and memcpy()
#include <netdb.h> // struct addrinfo
#include <sys/types.h> // needed for socket(), uint8_t, uint16_t, uint32_t
#include <sys/socket.h> // needed for socket()
#include <netinet/in.h> // IPPROTO_TCP, INET6_ADDRSTRLEN
#include <netinet/ip.h> // IP_MAXPACKET (which is 65535)
#include <netinet/ip6.h> // struct ip6_hdr
#define __FAVOR_BSD // Use BSD format of tcp header
#include <netinet/tcp.h> // struct tcphdr
#include <arpa/inet.h> // inet_pton() and inet_ntop()
#include <sys/ioctl.h> // macro ioctl is defined
#include <bits/ioctls.h> // defines values for argument "request" of ioctl.
#include <net/if.h> // struct ifreq
#include <linux/if_ether.h> // ETH_P_IP = 0x0800, ETH_P_IPV6 = 0x86DD
#include <linux/if_packet.h> // struct sockaddr_ll (see man 7 packet)
#include <net/ethernet.h>
#include <errno.h> // errno, perror()

#define ETH_HDRLEN 14 // Ethernet header length
#define IP6_HDRLEN 40 // IPv6 header length
#define TCP_HDRLEN 20 // TCP header length, excludes options data

uint16_t checksum (uint16_t *, int);
uint16_t tcp6_checksum (struct ip6_hdr, struct tcphdr);
char *allocate_strmem (int);
uint8_t *allocate_ustrmem (int);
int *allocate_intmem (int);

// Reproducer: build one Ethernet + IPv6 + TCP SYN frame addressed to the
// multicast destination ff00::01 and send it on interface "ip1" through a
// PF_PACKET/SOCK_RAW socket.
// Returns EXIT_SUCCESS after the frame is sent; every failure prints a
// diagnostic to stderr and terminates with EXIT_FAILURE.
int main (int argc, char **argv)
{
  int i, status, frame_length, sd, bytes, *tcp_flags;
  char *interface, *target, *src_ip, *dst_ip;
  struct ip6_hdr iphdr;
  struct tcphdr tcphdr;
  uint8_t *src_mac, *dst_mac, *ether_frame;
  struct addrinfo hints, *res;
  struct sockaddr_in6 *ipv6;
  struct sockaddr_ll device;
  struct ifreq ifr;
  void *tmp;

  // Command-line arguments are not used.
  (void) argc;
  (void) argv;

  // Start from all-zero headers and link-layer address so that no
  // uninitialized bytes are handed to the kernel.
  memset (&iphdr, 0, sizeof (iphdr));
  memset (&tcphdr, 0, sizeof (tcphdr));
  memset (&device, 0, sizeof (device));

  // Allocate memory for various arrays.
  src_mac = allocate_ustrmem (6);
  dst_mac = allocate_ustrmem (6);
  ether_frame = allocate_ustrmem (IP_MAXPACKET);
  interface = allocate_strmem (40);
  target = allocate_strmem (INET6_ADDRSTRLEN);
  src_ip = allocate_strmem (INET6_ADDRSTRLEN);
  dst_ip = allocate_strmem (INET6_ADDRSTRLEN);
  tcp_flags = allocate_intmem (8);

  // Interface to send packet through.
  strcpy (interface, "ip1");

  // Submit request for a socket descriptor to look up interface.
  if ((sd = socket (PF_PACKET, SOCK_RAW, htons (ETH_P_ALL))) < 0) {
    perror ("socket() failed to get socket descriptor for using ioctl() ");
    exit (EXIT_FAILURE);
  }

  // Use ioctl() to look up interface name and get its MAC address.
  memset (&ifr, 0, sizeof (ifr));
  snprintf (ifr.ifr_name, sizeof (ifr.ifr_name), "%s", interface);
  if (ioctl (sd, SIOCGIFHWADDR, &ifr) < 0) {
    perror ("ioctl() failed to get source MAC address ");
    close (sd);
    return (EXIT_FAILURE);
  }
  close (sd);

  // Copy source MAC address.
  memcpy (src_mac, ifr.ifr_hwaddr.sa_data, 6 * sizeof (uint8_t));
  // Set destination MAC address, same as src_mac
  memcpy (dst_mac, src_mac, 6 * sizeof (uint8_t));

  // Find interface index from interface name and store index in
  // struct sockaddr_ll device, which will be used as an argument of sendto().
  if ((device.sll_ifindex = if_nametoindex (interface)) == 0) {
    perror ("if_nametoindex() failed to obtain interface index ");
    exit (EXIT_FAILURE);
  }

  strcpy (src_ip, "fe80::5254:0:93d:f416");
  strcpy (target, "ff00::01");

  // Fill out hints for getaddrinfo().
  memset (&hints, 0, sizeof (struct addrinfo));
  hints.ai_family = AF_INET6;
  hints.ai_socktype = SOCK_RAW;
  hints.ai_flags = hints.ai_flags | AI_CANONNAME;

  // Resolve target using getaddrinfo().
  if ((status = getaddrinfo (target, NULL, &hints, &res)) != 0) {
    fprintf (stderr, "getaddrinfo() failed: %s\n", gai_strerror (status));
    exit (EXIT_FAILURE);
  }
  ipv6 = (struct sockaddr_in6 *) res->ai_addr;
  tmp = &(ipv6->sin6_addr);
  if (inet_ntop (AF_INET6, tmp, dst_ip, INET6_ADDRSTRLEN) == NULL) {
    status = errno;
    fprintf (stderr, "inet_ntop() failed.\nError message: %s", strerror (status));
    exit (EXIT_FAILURE);
  }
  freeaddrinfo (res);

  // Fill out sockaddr_ll.
  device.sll_family = AF_PACKET;
  memcpy (device.sll_addr, src_mac, 6 * sizeof (uint8_t));
  // sll_halen is a single byte: store the length directly. Passing it
  // through htons() would truncate it to 0 on little-endian hosts.
  device.sll_halen = 6;
  device.sll_protocol = htons (ETH_P_IPV6);

  // IPv6 header
  // IPv6 version (4 bits), Traffic class (8 bits), Flow label (20 bits)
  iphdr.ip6_flow = htonl ((6 << 28) | (0 << 20) | 0);

  // Payload length (16 bits): TCP header
  iphdr.ip6_plen = htons (TCP_HDRLEN);

  // Next header (8 bits): 6 for TCP
  iphdr.ip6_nxt = IPPROTO_TCP;

  // Hop limit (8 bits): default to maximum value
  iphdr.ip6_hops = 255;

  // Source IPv6 address (128 bits)
  // inet_pton() returns 0 for an unparsable string (errno untouched) and
  // -1 with errno set otherwise, so the return value is not an errno code.
  if ((status = inet_pton (AF_INET6, src_ip, &(iphdr.ip6_src))) != 1) {
    fprintf (stderr, "inet_pton() failed.\nError message: %s",
             status == 0 ? "not a valid IPv6 address string" : strerror (errno));
    exit (EXIT_FAILURE);
  }

  // Destination IPv6 address (128 bits)
  if ((status = inet_pton (AF_INET6, dst_ip, &(iphdr.ip6_dst))) != 1) {
    fprintf (stderr, "inet_pton() failed.\nError message: %s",
             status == 0 ? "not a valid IPv6 address string" : strerror (errno));
    exit (EXIT_FAILURE);
  }

  // TCP header
  // Source port number (16 bits)
  tcphdr.th_sport = htons (60);

  // Destination port number (16 bits)
  tcphdr.th_dport = htons (80);

  // Sequence number (32 bits)
  tcphdr.th_seq = htonl (0);

  // Acknowledgement number (32 bits): 0 in first packet of SYN/ACK process
  tcphdr.th_ack = htonl (0);

  // Reserved (4 bits): should be 0
  tcphdr.th_x2 = 0;

  // Data offset (4 bits): size of TCP header in 32-bit words
  tcphdr.th_off = TCP_HDRLEN / 4;

  // Flags (8 bits), one array slot per bit, least significant first:
  // FIN, SYN, RST, PSH, ACK, URG, ECE, CWR. Only SYN is set.
  tcp_flags[0] = 0;  // FIN
  tcp_flags[1] = 1;  // SYN
  tcp_flags[2] = 0;  // RST
  tcp_flags[3] = 0;  // PSH
  tcp_flags[4] = 0;  // ACK
  tcp_flags[5] = 0;  // URG
  tcp_flags[6] = 0;  // ECE
  tcp_flags[7] = 0;  // CWR

  tcphdr.th_flags = 0;
  for (i = 0; i < 8; i++) {
    tcphdr.th_flags += (tcp_flags[i] << i);
  }

  // Window size (16 bits)
  tcphdr.th_win = htons (65535);

  // Urgent pointer (16 bits): 0 (only valid if URG flag is set)
  tcphdr.th_urp = htons (0);

  // TCP checksum (16 bits)
  tcphdr.th_sum = tcp6_checksum (iphdr, tcphdr);

  // Fill out ethernet frame header.
  // Ethernet frame length = ethernet header (MAC + MAC + ethernet type) + ethernet data (IP header + TCP header)
  frame_length = ETH_HDRLEN + IP6_HDRLEN + TCP_HDRLEN;

  // Destination and Source MAC addresses
  memcpy (ether_frame, dst_mac, 6 * sizeof (uint8_t));
  memcpy (ether_frame + 6, src_mac, 6 * sizeof (uint8_t));

  // Next is ethernet type code (ETH_P_IPV6 for IPv6), high byte first.
  // http://www.iana.org/assignments/ethernet-numbers
  ether_frame[12] = ETH_P_IPV6 / 256;
  ether_frame[13] = ETH_P_IPV6 % 256;

  // Next is ethernet frame data (IPv6 header + TCP header).
  // IPv6 header
  memcpy (ether_frame + ETH_HDRLEN, &iphdr, IP6_HDRLEN * sizeof (uint8_t));

  // TCP header
  memcpy (ether_frame + ETH_HDRLEN + IP6_HDRLEN, &tcphdr, TCP_HDRLEN * sizeof (uint8_t));

  // Submit request for a raw socket descriptor.
  if ((sd = socket (PF_PACKET, SOCK_RAW, htons (ETH_P_ALL))) < 0) {
    perror ("socket() failed ");
    exit (EXIT_FAILURE);
  }

  // Send ethernet frame to socket.
  if ((bytes = sendto (sd, ether_frame, frame_length, 0, (struct sockaddr *) &device, sizeof (device))) <= 0) {
    perror ("sendto() failed");
    exit (EXIT_FAILURE);
  }

  // Close socket descriptor.
  close (sd);

  // Free allocated memory.
  free (src_mac);
  free (dst_mac);
  free (ether_frame);
  free (interface);
  free (target);
  free (src_ip);
  free (dst_ip);
  free (tcp_flags);

  return (EXIT_SUCCESS);
}

// Checksum function
// Compute the 16-bit one's-complement Internet checksum over a buffer.
//
// addr: start of the data; it is read as native-order 16-bit words.
// len:  number of bytes to sum. A trailing odd byte is placed in the first
//       byte of an otherwise zero 16-bit word; len <= 0 sums nothing.
// Returns the bitwise complement of the folded sum (0xFFFF for empty input).
uint16_t
checksum (uint16_t *addr, int len)
{
  const uint8_t *cursor = (const uint8_t *) addr;
  uint64_t sum = 0;  // wide unsigned accumulator: cannot overflow for any int len
  int remaining = len;

  while (remaining > 1) {
    uint16_t word;

    // memcpy keeps the load valid even when the caller's buffer is a byte
    // array that is not 16-bit aligned.
    memcpy (&word, cursor, sizeof (word));
    sum += word;
    cursor += sizeof (word);
    remaining -= (int) sizeof (word);
  }

  if (remaining == 1) {
    uint16_t tail = 0;

    memcpy (&tail, cursor, 1);
    sum += tail;
  }

  // Fold the carries back into the low 16 bits until none remain.
  while (sum >> 16) {
    sum = (sum & 0xFFFF) + (sum >> 16);
  }

  return (uint16_t) ~sum;
}

// Build IPv6 TCP pseudo-header and call checksum function (Section 8.1 of RFC 2460).
// Build IPv6 TCP pseudo-header and call checksum function (Section 8.1 of RFC 2460).
//
// iphdr:  IPv6 header; its source address, destination address and next
//         header field are used.
// tcphdr: TCP header; every field except th_sum is used, and the checksum
//         field is summed as zero.
// Returns the value produced by checksum() over the pseudo-header followed
// by the TCP header fields.
uint16_t
tcp6_checksum (struct ip6_hdr iphdr, struct tcphdr tcphdr)
{
  // Pseudo-header: source (16) + destination (16) + length (4) + zero (3) +
  // next header (1) bytes; the TCP header fields copied below add 20 more.
  enum { PSEUDO_HDRLEN = 40, TCP_FIXED_HDRLEN = 20 };

  // Backing store is an array of uint16_t so the pointer handed to
  // checksum() is correctly aligned; it is sized for exactly what is written.
  uint16_t words[(PSEUDO_HDRLEN + TCP_FIXED_HDRLEN) / 2];
  uint8_t *start = (uint8_t *) words;
  uint8_t *ptr = start;
  uint32_t lvalue;
  uint8_t cvalue;

  memset (words, 0, sizeof (words));

  // Copy source IP address into buf (128 bits)
  memcpy (ptr, &iphdr.ip6_src, sizeof (iphdr.ip6_src));
  ptr += sizeof (iphdr.ip6_src);

  // Copy destination IP address into buf (128 bits)
  memcpy (ptr, &iphdr.ip6_dst, sizeof (iphdr.ip6_dst));
  ptr += sizeof (iphdr.ip6_dst);

  // Copy TCP length to buf (32 bits)
  lvalue = htonl (sizeof (tcphdr));
  memcpy (ptr, &lvalue, sizeof (lvalue));
  ptr += sizeof (lvalue);

  // Zero field (24 bits)
  *ptr++ = 0;
  *ptr++ = 0;
  *ptr++ = 0;

  // Copy next header field to buf (8 bits)
  memcpy (ptr, &iphdr.ip6_nxt, sizeof (iphdr.ip6_nxt));
  ptr += sizeof (iphdr.ip6_nxt);

  // Copy TCP source port to buf (16 bits)
  memcpy (ptr, &tcphdr.th_sport, sizeof (tcphdr.th_sport));
  ptr += sizeof (tcphdr.th_sport);

  // Copy TCP destination port to buf (16 bits)
  memcpy (ptr, &tcphdr.th_dport, sizeof (tcphdr.th_dport));
  ptr += sizeof (tcphdr.th_dport);

  // Copy sequence number to buf (32 bits)
  memcpy (ptr, &tcphdr.th_seq, sizeof (tcphdr.th_seq));
  ptr += sizeof (tcphdr.th_seq);

  // Copy acknowledgement number to buf (32 bits)
  memcpy (ptr, &tcphdr.th_ack, sizeof (tcphdr.th_ack));
  ptr += sizeof (tcphdr.th_ack);

  // Data offset in the high 4 bits, reserved bits in the low 4 bits
  cvalue = (tcphdr.th_off << 4) + tcphdr.th_x2;
  memcpy (ptr, &cvalue, sizeof (cvalue));
  ptr += sizeof (cvalue);

  // Copy TCP flags to buf (8 bits)
  memcpy (ptr, &tcphdr.th_flags, sizeof (tcphdr.th_flags));
  ptr += sizeof (tcphdr.th_flags);

  // Copy TCP window size to buf (16 bits)
  memcpy (ptr, &tcphdr.th_win, sizeof (tcphdr.th_win));
  ptr += sizeof (tcphdr.th_win);

  // TCP checksum field (16 bits): zero, since we don't know it yet
  *ptr++ = 0;
  *ptr++ = 0;

  // Copy urgent pointer to buf (16 bits)
  memcpy (ptr, &tcphdr.th_urp, sizeof (tcphdr.th_urp));
  ptr += sizeof (tcphdr.th_urp);

  return checksum (words, (int) (ptr - start));
}

// Allocate memory for an array of chars.
// Allocate a zero-filled array of len chars.
//
// len: number of elements; must be greater than zero.
// Returns a pointer the caller releases with free(). Never returns NULL:
// an invalid len or an allocation failure prints an error to stderr and
// terminates the process with EXIT_FAILURE.
char *
allocate_strmem (int len)
{
  char *mem;

  if (len <= 0) {
    fprintf (stderr, "ERROR: Cannot allocate memory because len = %i in allocate_strmem().\n", len);
    exit (EXIT_FAILURE);
  }

  // calloc() zero-fills the block and checks the count * size product.
  mem = calloc ((size_t) len, sizeof (*mem));
  if (mem == NULL) {
    fprintf (stderr, "ERROR: Cannot allocate memory for array allocate_strmem().\n");
    exit (EXIT_FAILURE);
  }

  return mem;
}

// Allocate memory for an array of unsigned chars.
// Allocate a zero-filled array of len unsigned bytes.
//
// len: number of elements; must be greater than zero.
// Returns a pointer the caller releases with free(). Never returns NULL:
// an invalid len or an allocation failure prints an error to stderr and
// terminates the process with EXIT_FAILURE.
uint8_t *
allocate_ustrmem (int len)
{
  uint8_t *mem;

  if (len <= 0) {
    fprintf (stderr, "ERROR: Cannot allocate memory because len = %i in allocate_ustrmem().\n", len);
    exit (EXIT_FAILURE);
  }

  // calloc() zero-fills the block and checks the count * size product.
  mem = calloc ((size_t) len, sizeof (*mem));
  if (mem == NULL) {
    fprintf (stderr, "ERROR: Cannot allocate memory for array allocate_ustrmem().\n");
    exit (EXIT_FAILURE);
  }

  return mem;
}

// Allocate memory for an array of ints.
// Allocate a zero-filled array of len ints.
//
// len: number of elements; must be greater than zero.
// Returns a pointer the caller releases with free(). Never returns NULL:
// an invalid len or an allocation failure prints an error to stderr and
// terminates the process with EXIT_FAILURE.
int *
allocate_intmem (int len)
{
  int *mem;

  if (len <= 0) {
    fprintf (stderr, "ERROR: Cannot allocate memory because len = %i in allocate_intmem().\n", len);
    exit (EXIT_FAILURE);
  }

  // calloc() zero-fills the block and checks the count * size product.
  mem = calloc ((size_t) len, sizeof (*mem));
  if (mem == NULL) {
    fprintf (stderr, "ERROR: Cannot allocate memory for array allocate_intmem().\n");
    exit (EXIT_FAILURE);
  }

  return mem;
}