Re: [RFC PATCH 0/4] net/io_uring: pass a kernel pointer via optlen_t to proto[_ops].getsockopt()

From: Stefan Metzmacher
Date: Tue Apr 01 2025 - 09:38:22 EST


Am 01.04.25 um 10:19 schrieb Stefan Metzmacher:
Am 31.03.25 um 23:04 schrieb Stanislav Fomichev:
On 03/31, Stefan Metzmacher wrote:
The motivation for this is to remove the SOL_SOCKET limitation
from io_uring_cmd_getsockopt().

The reason for this limitation is that io_uring_cmd_getsockopt()
passes a kernel pointer as optlen to do_sock_getsockopt()
and can't reach the ops->getsockopt() path.

The first idea would be to change the optval and optlen arguments
to the protocol specific hooks also to sockptr_t, as that
is already used for setsockopt() and also by do_sock_getsockopt()
sk_getsockopt() and BPF_CGROUP_RUN_PROG_GETSOCKOPT().

But as Linus don't like 'sockptr_t' I used a different approach.

@Linus, would that optlen_t approach fit better for you?

[..]

Instead of passing the optlen as user or kernel pointer,
we only ever pass a kernel pointer and do the
translation from/to userspace in do_sock_getsockopt().

At this point why not just fully embrace iov_iter? You have the size
now + the user (or kernel) pointer. Might as well do
s/sockptr_t/iov_iter/ conversion?

I think that would only be possible if we introduce
proto[_ops].getsockopt_iter() and then convert the implementations
step by step. Doing it all in one go has a lot of potential to break
the uapi. I could try to convert things like socket, ip and tcp myself, but
the rest needs to be converted by the maintainer of the specific protocol,
as it needs to be tested. As there are crazy things happening in the existing
implementations, e.g. some getsockopt() implementations use optval as in and out
buffer.

I first tried to convert both optval and optlen of getsockopt to sockptr_t,
and that showed that touching the optval part starts to get complex very soon,
see https://git.samba.org/?p=metze/linux/wip.git;a=commitdiff;h=141912166473bf8843ec6ace76dc9c6945adafd1
(note it didn't converted everything, I gave up after hitting
sctp_getsockopt_peer_addrs and sctp_getsockopt_local_addrs.
sctp_getsockopt_context, sctp_getsockopt_maxseg, sctp_getsockopt_associnfo and maybe
more are the ones also doing both copy_from_user and copy_to_user on optval)

I come also across one implementation that returned -ERANGE because *optlen was
too short and put the required length into *optlen, which means the returned
*optlen is larger than the optval buffer given from userspace.

Because of all these strange things I tried to do a minimal change
in order to get rid of the io_uring limitation and only converted
optlen and leave optval as is.

In order to have a patchset that has a low risk to cause regressions.

But as alternative introducing a prototype like this:

        int (*getsockopt_iter)(struct socket *sock, int level, int optname,
                               struct iov_iter *optval_iter);

That returns a non-negative value which can be placed into *optlen
or negative value as error and *optlen will not be changed on error.
optval_iter will get direction ITER_DEST, so it can only be written to.

Implementations could then opt in for the new interface and
allow do_sock_getsockopt() work also for the io_uring case,
while all others would still get -EOPNOTSUPP.

So what should be the way to go?

Ok, I've added the infrastructure for getsockopt_iter, see below,
but the first part I wanted to convert was
tcp_ao_copy_mkts_to_user() and that also reads from userspace before
writing.

So we could go with the optlen_t approach, or we need
logic for ITER_BOTH or pass two iov_iters one with ITER_SRC and one
with ITER_DEST...

So who wants to decide?

Thanks!
metze
---
include/linux/net.h | 4 +++
include/net/sock.h | 64 +++++++++++++++++++++++++++++++++++++++++++++
net/core/sock.c | 12 +++++++--
net/socket.c | 12 +++++++--
4 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index 0ff950eecc6b..ceb9f9ed84b9 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -194,6 +194,10 @@ struct proto_ops {
unsigned int optlen);
int (*getsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int __user *optlen);
+ int (*getsockopt_iter)(struct socket *sock,
+ int level,
+ int optname,
+ struct iov_iter *optval_iter);
void (*show_fdinfo)(struct seq_file *m, struct socket *sock);
int (*sendmsg) (struct socket *sock, struct msghdr *m,
size_t total_len);
diff --git a/include/net/sock.h b/include/net/sock.h
index 8daf1b3b12c6..e741b219056e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1249,6 +1249,11 @@ struct proto {
int (*getsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int __user *option);
+ int (*getsockopt_iter)(struct sock *sk,
+ int level,
+ int optname,
+ struct iov_iter *optval_iter);
+
void (*keepalive)(struct sock *sk, int valbool);
#ifdef CONFIG_COMPAT
int (*compat_ioctl)(struct sock *sk,
@@ -1781,6 +1786,65 @@ int do_sock_setsockopt(struct socket *sock, bool compat, int level,
int do_sock_getsockopt(struct socket *sock, bool compat, int level,
int optname, sockptr_t optval, sockptr_t optlen);

+#define __generic_wrap_getsockopt_iter(__s, __level, \
+ __optname, __optval, __optlen, \
+ __getsockopt_iter) \
+do { \
+ struct iov_iter optval_iter; \
+ struct kvec optval_kvec; \
+ int len; \
+ int err; \
+ \
+ if (unlikely(__getsockopt_iter == NULL)) \
+ return -EOPNOTSUPP; \
+ \
+ if (copy_from_sockptr(&len, __optlen, sizeof(len))) \
+ return -EFAULT; \
+ \
+ if (len < 0) \
+ return -EINVAL; \
+ \
+ if (__optval.is_kernel) { \
+ if (__optval.kernel == NULL && len != 0) \
+ return -EFAULT; \
+ \
+ optval_kvec = (struct kvec) { \
+ .iov_base = __optval.kernel, \
+ .iov_len = len, \
+ }; \
+ \
+ iov_iter_kvec(&optval_iter, ITER_DEST, \
+ &optval_kvec, 1, optval_kvec.iov_len); \
+ } else { \
+ if (import_ubuf(ITER_DEST, __optval.user, len, &optval_iter)) \
+ return -EFAULT; \
+ } \
+ \
+ err = getsockopt_iter(__s, __level, __optname, &optval_iter); \
+ if (unlikely(err < 0)) \
+ return err; \
+ \
+ len = err; \
+ if (copy_to_sockptr(__optlen, &len, sizeof(len))) \
+ return -EFAULT; \
+ \
+ return 0; \
+} while (0)
+
+static __always_inline
+int sk_wrap_getsockopt_iter(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen,
+ int (*getsockopt_iter)(struct sock *sk, int level, int optname, struct iov_iter *optval_iter))
+{
+ __generic_wrap_getsockopt_iter(sk, level, optname, optval, optlen, getsockopt_iter);
+}
+
+static __always_inline
+int sock_wrap_getsockopt_iter(struct socket *sock, int level, int optname, sockptr_t optval, sockptr_t optlen,
+ int (*getsockopt_iter)(struct socket *sock, int level, int optname, struct iov_iter *optval_iter))
+{
+ __generic_wrap_getsockopt_iter(sock, level, optname, optval, optlen, getsockopt_iter);
+}
+
int sk_getsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, sockptr_t optlen);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
diff --git a/net/core/sock.c b/net/core/sock.c
index 323892066def..61625060e724 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3857,9 +3857,17 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
-
/* IPV6_ADDRFORM can change sk->sk_prot under us. */
- return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
+ struct proto *prot = READ_ONCE(sk->sk_prot);
+
+ if (prot->getsockopt_iter) {
+ return sk_wrap_getsockopt_iter(sk, level, optname,
+ USER_SOCKPTR(optval),
+ USER_SOCKPTR(optlen),
+ prot->getsockopt_iter);
+ }
+
+ return prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

diff --git a/net/socket.c b/net/socket.c
index 9a0e720f0859..792cfd272611 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2335,6 +2335,7 @@ int do_sock_getsockopt(struct socket *sock, bool compat, int level,
{
int max_optlen __maybe_unused = 0;
const struct proto_ops *ops;
+ const struct proto *prot;
int err;

err = security_socket_getsockopt(sock, level, optname);
@@ -2345,12 +2346,19 @@ int do_sock_getsockopt(struct socket *sock, bool compat, int level,
copy_from_sockptr(&max_optlen, optlen, sizeof(int));

ops = READ_ONCE(sock->ops);
+ prot = READ_ONCE(sock->sk->sk_prot);
if (level == SOL_SOCKET) {
err = sk_getsockopt(sock->sk, level, optname, optval, optlen);
- } else if (unlikely(!ops->getsockopt)) {
+ } else if (ops->getsockopt_iter) {
+ err = sock_wrap_getsockopt_iter(sock, level, optname, optval, optlen,
+ ops->getsockopt_iter);
+ } else if (ops->getsockopt == sock_common_getsockopt && prot->getsockopt_iter) {
+ err = sk_wrap_getsockopt_iter(sock->sk, level, optname, optval, optlen,
+ prot->getsockopt_iter);
+ } else if (unlikely(!ops->getsockopt || optlen.is_kernel)) {
err = -EOPNOTSUPP;
} else {
- if (WARN_ONCE(optval.is_kernel || optlen.is_kernel,
+ if (WARN_ONCE(optval.is_kernel,
"Invalid argument type"))
return -EOPNOTSUPP;

--
2.34.1