Re: [PATCH net-next v3 11/13] net: replace page_frag with page_frag_cache

From: Mat Martineau
Date: Thu May 09 2024 - 12:22:19 EST


On Wed, 8 May 2024, Yunsheng Lin wrote:

Use the newly introduced prepare/probe/commit API to
replace page_frag with page_frag_cache for sk_page_frag().

CC: Alexander Duyck <alexander.duyck@xxxxxxxxx>
Signed-off-by: Yunsheng Lin <linyunsheng@xxxxxxxxxx>
---
.../chelsio/inline_crypto/chtls/chtls.h | 3 -
.../chelsio/inline_crypto/chtls/chtls_io.c | 100 ++++---------
.../chelsio/inline_crypto/chtls/chtls_main.c | 3 -
drivers/net/tun.c | 28 ++--
include/linux/sched.h | 4 +-
include/net/sock.h | 14 +-
kernel/exit.c | 3 +-
kernel/fork.c | 3 +-
net/core/skbuff.c | 32 ++--
net/core/skmsg.c | 22 +--
net/core/sock.c | 46 ++++--
net/ipv4/ip_output.c | 33 +++--
net/ipv4/tcp.c | 35 ++---
net/ipv4/tcp_output.c | 28 ++--
net/ipv6/ip6_output.c | 33 +++--
net/kcm/kcmsock.c | 30 ++--
net/mptcp/protocol.c | 70 +++++----
net/sched/em_meta.c | 2 +-
net/tls/tls_device.c | 139 ++++++++++--------
19 files changed, 331 insertions(+), 297 deletions(-)


<snip>

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index bb8f96f2b86f..ab844011d442 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -960,17 +960,18 @@ static bool mptcp_skb_can_collapse_to(u64 write_seq,
}

/* we can append data to the given data frag if:
- * - there is space available in the backing page_frag
- * - the data frag tail matches the current page_frag free offset
+ * - there is space available for the current page
+ * - the data frag tail matches the current page and offset
* - the data frag end sequence number matches the current write seq
*/
static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
- const struct page_frag *pfrag,
+ const struct page *page,
+ const unsigned int offset,
+ const unsigned int size,

Hi Yunsheng -

Why add the 'size' parameter here? It is only checked for being nonzero, but it can only be 0 if 'page' is also NULL. In that case "page == df->page" is already false, so the function returns false even without checking 'size'.

Thanks,

Mat

const struct mptcp_data_frag *df)
{
- return df && pfrag->page == df->page &&
- pfrag->size - pfrag->offset > 0 &&
- pfrag->offset == (df->offset + df->data_len) &&
+ return df && size && page == df->page &&
+ offset == (df->offset + df->data_len) &&
df->data_seq + df->data_len == msk->write_seq;
}

@@ -1085,30 +1086,36 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
* data
*/
-static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+static struct page *mptcp_page_frag_alloc_prepare(struct sock *sk,
+ struct page_frag_cache *pfrag,
+ unsigned int *offset,
+ unsigned int *size, void **va)
{
- if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
- pfrag, sk->sk_allocation)))
- return true;
+ struct page *page;
+
+ page = page_frag_alloc_prepare(pfrag, offset, size, va,
+ sk->sk_allocation);
+ if (likely(page))
+ return page;

mptcp_enter_memory_pressure(sk);
- return false;
+ return NULL;
}

static struct mptcp_data_frag *
-mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
- int orig_offset)
+mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page *page,
+ unsigned int orig_offset)
{
int offset = ALIGN(orig_offset, sizeof(long));
struct mptcp_data_frag *dfrag;

- dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
+ dfrag = (struct mptcp_data_frag *)(page_to_virt(page) + offset);
dfrag->data_len = 0;
dfrag->data_seq = msk->write_seq;
dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
dfrag->offset = offset + sizeof(struct mptcp_data_frag);
dfrag->already_sent = 0;
- dfrag->page = pfrag->page;
+ dfrag->page = page;

return dfrag;
}
@@ -1793,7 +1800,7 @@ static u32 mptcp_send_limit(const struct sock *sk)
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct page_frag *pfrag;
+ struct page_frag_cache *pfrag;
size_t copied = 0;
int ret = 0;
long timeo;
@@ -1832,9 +1839,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
while (msg_data_left(msg)) {
int total_ts, frag_truesize = 0;
struct mptcp_data_frag *dfrag;
- bool dfrag_collapsed;
- size_t psize, offset;
+ bool dfrag_collapsed = false;
+ unsigned int offset, size;
+ struct page *page;
+ size_t psize;
u32 copy_limit;
+ void *va;

/* ensure fitting the notsent_lowat() constraint */
copy_limit = mptcp_send_limit(sk);
@@ -1845,21 +1855,26 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
* page allocator
*/
dfrag = mptcp_pending_tail(sk);
- dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
+ page = page_frag_alloc_probe(pfrag, &offset, &size, &va);
+ dfrag_collapsed = mptcp_frag_can_collapse_to(msk, page, offset,
+ size, dfrag);
if (!dfrag_collapsed) {
- if (!mptcp_page_frag_refill(sk, pfrag))
+ size = 32U + sizeof(struct mptcp_data_frag);
+ page = mptcp_page_frag_alloc_prepare(sk, pfrag, &offset,
+ &size, &va);
+ if (!page)
goto wait_for_memory;

- dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset);
+ dfrag = mptcp_carve_data_frag(msk, page, offset);
frag_truesize = dfrag->overhead;
+ va += dfrag->overhead;
}

/* we do not bound vs wspace, to allow a single packet.
* memory accounting will prevent execessive memory usage
* anyway
*/
- offset = dfrag->offset + dfrag->data_len;
- psize = pfrag->size - offset;
+ psize = size - frag_truesize;
psize = min_t(size_t, psize, msg_data_left(msg));
psize = min_t(size_t, psize, copy_limit);
total_ts = psize + frag_truesize;
@@ -1867,8 +1882,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!sk_wmem_schedule(sk, total_ts))
goto wait_for_memory;

- ret = do_copy_data_nocache(sk, psize, &msg->msg_iter,
- page_address(dfrag->page) + offset);
+ ret = do_copy_data_nocache(sk, psize, &msg->msg_iter, va);
if (ret)
goto do_error;

@@ -1877,7 +1891,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
copied += psize;
dfrag->data_len += psize;
frag_truesize += psize;
- pfrag->offset += frag_truesize;
WRITE_ONCE(msk->write_seq, msk->write_seq + psize);

/* charge data on mptcp pending queue to the msk socket
@@ -1885,11 +1898,14 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
*/
sk_wmem_queued_add(sk, frag_truesize);
if (!dfrag_collapsed) {
- get_page(dfrag->page);
+ page_frag_alloc_commit(pfrag, frag_truesize);
list_add_tail(&dfrag->list, &msk->rtx_queue);
if (!msk->first_pending)
WRITE_ONCE(msk->first_pending, dfrag);
+ } else {
+ page_frag_alloc_commit_noref(pfrag, frag_truesize);
}
+
pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk,
dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
!dfrag_collapsed);