Re: [PATCH net-next v12 11/22] ovpn: implement TCP transport

From: Antonio Quartulli
Date: Wed Dec 04 2024 - 06:16:50 EST

Next message: kernel test robot: "Re: [PATCH v1 3/4] iio: afe: rescale: Re-use generic struct s32_fract"
Previous message: Jan Kara: "Re: [PATCH 01/27] ext4: remove writable userspace mappings before truncating page cache"
In reply to: Paolo Abeni: "Re: [PATCH net-next v12 11/22] ovpn: implement TCP transport"
Next in thread: Antonio Quartulli: "Re: [PATCH net-next v12 11/22] ovpn: implement TCP transport"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

On 03/12/2024 16:19, Paolo Abeni wrote:

On 12/2/24 16:07, Antonio Quartulli wrote:

+void ovpn_tcp_socket_detach(struct socket *sock)
+{
+ struct ovpn_socket *ovpn_sock;
+ struct ovpn_peer *peer;
+
+ if (!sock)
+ return;
+
+ rcu_read_lock();
+ ovpn_sock = rcu_dereference_sk_user_data(sock->sk);
+
+ if (!ovpn_sock->peer) {
+ rcu_read_unlock();
+ return;
+ }
+
+ peer = ovpn_sock->peer;
+ strp_stop(&peer->tcp.strp);
+
+ skb_queue_purge(&peer->tcp.user_queue);
+
+ /* restore CBs that were saved in ovpn_sock_set_tcp_cb() */
+ sock->sk->sk_data_ready = peer->tcp.sk_cb.sk_data_ready;
+ sock->sk->sk_write_space = peer->tcp.sk_cb.sk_write_space;
+ sock->sk->sk_prot = peer->tcp.sk_cb.prot;
+ sock->sk->sk_socket->ops = peer->tcp.sk_cb.ops;
+ /* drop reference to peer */
+ rcu_assign_sk_user_data(sock->sk, NULL);
+
+ rcu_read_unlock();
+
+ barrier();

It's unclear to me the role of the above barrier. A comment would help

Unless I misinterpreted Sabrina's previous comment, the barrier() is needed to prevent reordering and therefore ensure that the assumption described in the comment below is true.

I can re-arrange the comment to make it clear that the barrier() is serving this specific purpose.

+ /* cancel any ongoing work. Done after removing the CBs so that these
+ * workers cannot be re-armed
+ */
+ cancel_work_sync(&peer->tcp.tx_work);
+ strp_done(&peer->tcp.strp);
+ skb_queue_purge(&peer->tcp.out_queue);
+
+ ovpn_peer_put(peer);
+}
+
+static void ovpn_tcp_send_sock(struct ovpn_peer *peer)
+{
+ struct sk_buff *skb = peer->tcp.out_msg.skb;
+
+ if (!skb)
+ return;
+
+ if (peer->tcp.tx_in_progress)
+ return;
+
+ peer->tcp.tx_in_progress = true;
+
+ do {
+ int ret = skb_send_sock_locked(peer->sock->sock->sk, skb,
+ peer->tcp.out_msg.offset,
+ peer->tcp.out_msg.len);
+ if (unlikely(ret < 0)) {
+ if (ret == -EAGAIN)
+ goto out;
+
+ net_warn_ratelimited("%s: TCP error to peer %u: %d\n",
+ netdev_name(peer->ovpn->dev),
+ peer->id, ret);
+
+ /* in case of TCP error we can't recover the VPN
+ * stream therefore we abort the connection
+ */
+ ovpn_peer_del(peer,
+ OVPN_DEL_PEER_REASON_TRANSPORT_ERROR);
+ break;
+ }
+
+ peer->tcp.out_msg.len -= ret;
+ peer->tcp.out_msg.offset += ret;
+ } while (peer->tcp.out_msg.len > 0);
+
+ if (!peer->tcp.out_msg.len)
+ dev_sw_netstats_tx_add(peer->ovpn->dev, 1, skb->len);
+
+ kfree_skb(peer->tcp.out_msg.skb);
+ peer->tcp.out_msg.skb = NULL;
+ peer->tcp.out_msg.len = 0;
+ peer->tcp.out_msg.offset = 0;
+
+out:
+ peer->tcp.tx_in_progress = false;
+}
+
+static void ovpn_tcp_tx_work(struct work_struct *work)
+{
+ struct ovpn_peer *peer;
+
+ peer = container_of(work, struct ovpn_peer, tcp.tx_work);
+
+ lock_sock(peer->sock->sock->sk);
+ ovpn_tcp_send_sock(peer);
+ release_sock(peer->sock->sock->sk);
+}
+
+static void ovpn_tcp_send_sock_skb(struct ovpn_peer *peer, struct sk_buff *skb)
+{
+ if (peer->tcp.out_msg.skb)
+ ovpn_tcp_send_sock(peer);
+
+ if (peer->tcp.out_msg.skb) {
+ dev_core_stats_rx_dropped_inc(peer->ovpn->dev);
+ kfree_skb(skb);
+ return;
+ }
+
+ peer->tcp.out_msg.skb = skb;
+ peer->tcp.out_msg.len = skb->len;
+ peer->tcp.out_msg.offset = 0;
+ ovpn_tcp_send_sock(peer);
+}
+
+void ovpn_tcp_send_skb(struct ovpn_peer *peer, struct sk_buff *skb)
+{
+ u16 len = skb->len;
+
+ *(__be16 *)__skb_push(skb, sizeof(u16)) = htons(len);
+
+ bh_lock_sock(peer->sock->sock->sk);

Are you sure this runs in BH context? AFAICS we reach here from an AEAD
callback.

It could be from the AEAD callback (crypto async case), but it may also be directly from the packet RX path (crypto sync case).

If I am not wrong, in the latter case we are in BH context.

+ if (sock_owned_by_user(peer->sock->sock->sk)) {
+ if (skb_queue_len(&peer->tcp.out_queue) >=
+ READ_ONCE(net_hotdata.max_backlog)) {
+ dev_core_stats_rx_dropped_inc(peer->ovpn->dev);
+ kfree_skb(skb);
+ goto unlock;
+ }
+ __skb_queue_tail(&peer->tcp.out_queue, skb);
+ } else {
+ ovpn_tcp_send_sock_skb(peer, skb);
+ }
+unlock:
+ bh_unlock_sock(peer->sock->sock->sk);
+}

[...]

+static void ovpn_tcp_build_protos(struct proto *new_prot,
+ struct proto_ops *new_ops,
+ const struct proto *orig_prot,
+ const struct proto_ops *orig_ops);
+
+/* Set TCP encapsulation callbacks */
+int ovpn_tcp_socket_attach(struct socket *sock, struct ovpn_peer *peer)
+{
+ struct strp_callbacks cb = {
+ .rcv_msg = ovpn_tcp_rcv,
+ .parse_msg = ovpn_tcp_parse,
+ };
+ int ret;
+
+ /* make sure no pre-existing encapsulation handler exists */
+ if (sock->sk->sk_user_data)
+ return -EBUSY;
+
+ /* sanity check */
+ if (sock->sk->sk_protocol != IPPROTO_TCP) {
+ net_err_ratelimited("%s: provided socket is not TCP as expected\n",
+ netdev_name(peer->ovpn->dev));
+ return -EINVAL;
+ }
+
+ /* only a fully connected socket are expected. Connection should be
+ * handled in userspace
+ */
+ if (sock->sk->sk_state != TCP_ESTABLISHED) {
+ net_err_ratelimited("%s: provided TCP socket is not in ESTABLISHED state: %d\n",
+ netdev_name(peer->ovpn->dev),
+ sock->sk->sk_state);
+ return -EINVAL;
+ }
+
+ lock_sock(sock->sk);
+
+ ret = strp_init(&peer->tcp.strp, sock->sk, &cb);
+ if (ret < 0) {
+ DEBUG_NET_WARN_ON_ONCE(1);
+ release_sock(sock->sk);
+ return ret;
+ }
+
+ INIT_WORK(&peer->tcp.tx_work, ovpn_tcp_tx_work);
+ __sk_dst_reset(sock->sk);
+ skb_queue_head_init(&peer->tcp.user_queue);
+ skb_queue_head_init(&peer->tcp.out_queue);
+
+ /* save current CBs so that they can be restored upon socket release */
+ peer->tcp.sk_cb.sk_data_ready = sock->sk->sk_data_ready;
+ peer->tcp.sk_cb.sk_write_space = sock->sk->sk_write_space;
+ peer->tcp.sk_cb.prot = sock->sk->sk_prot;
+ peer->tcp.sk_cb.ops = sock->sk->sk_socket->ops;
+
+ /* assign our static CBs and prot/ops */
+ sock->sk->sk_data_ready = ovpn_tcp_data_ready;
+ sock->sk->sk_write_space = ovpn_tcp_write_space;
+
+ if (sock->sk->sk_family == AF_INET) {
+ sock->sk->sk_prot = &ovpn_tcp_prot;
+ sock->sk->sk_socket->ops = &ovpn_tcp_ops;
+ } else {
+ mutex_lock(&tcp6_prot_mutex);
+ if (!ovpn_tcp6_prot.recvmsg)
+ ovpn_tcp_build_protos(&ovpn_tcp6_prot, &ovpn_tcp6_ops,
+ sock->sk->sk_prot,
+ sock->sk->sk_socket->ops);
+ mutex_unlock(&tcp6_prot_mutex);

This looks like an hack to avoid a build dependency on IPV6, I think the
explicit

I happily copied this approach from espintcp.c:espintcp_init_sk() :-D

#if IS_ENABLED(CONFIG_IPV6)

at init time should be preferable

Ok, will try to take this other approach.

I also have the feeling that by going this way I can get rid of the checkpatch warning about proto_ops not being consts.

+
+ sock->sk->sk_prot = &ovpn_tcp6_prot;
+ sock->sk->sk_socket->ops = &ovpn_tcp6_ops;
+ }

[...]

+static void ovpn_tcp_close(struct sock *sk, long timeout)
+{
+ struct ovpn_socket *sock;
+
+ rcu_read_lock();
+ sock = rcu_dereference_sk_user_data(sk);
+
+ strp_stop(&sock->peer->tcp.strp);
+ barrier();

Again, is not clear to me the role of the above barrier, please document it.

Also taken from espintcp_close(), with the idea to avoid reordering during the shut down sequence.

Will add a comment here too.

Thanks a lot.

Regards,

Thanks,

Paolo

--
Antonio Quartulli
OpenVPN Inc.

Next message: kernel test robot: "Re: [PATCH v1 3/4] iio: afe: rescale: Re-use generic struct s32_fract"
Previous message: Jan Kara: "Re: [PATCH 01/27] ext4: remove writable userspace mappings before truncating page cache"
In reply to: Paolo Abeni: "Re: [PATCH net-next v12 11/22] ovpn: implement TCP transport"
Next in thread: Antonio Quartulli: "Re: [PATCH net-next v12 11/22] ovpn: implement TCP transport"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]