[no subject]

From: Unknown
Date: Fri Jun 06 2025 - 13:02:37 EST


The lazy switch mechanism defines three states for the sender thread:

- RUNNING: The sender starts in this state. When the sender initiates
an RPAL call to switch from user mode to the receiver, it transitions
to the CALL state.

- CALL: The sender remains in this state while the receiver is executing
the code triggered by the RPAL call. When the receiver switches back to
the sender from user mode, the sender returns to the RUNNING state.

- KERNEL_RET: If the receiver takes an extended period to switch back to
the sender after a lazy switch, the scheduler may preempt the sender to
run other tasks. In this case, the sender enters the KERNEL_RET state
while in the kernel. Once the sender resumes execution in user mode, it
transitions back to the RUNNING state.

This patch implements the handling and transition of the receiver's state.
When a receiver leaves the run queue in the READY state, its state
transitions to the WAIT state; otherwise, it transitions to the RUNNING
state. The patch also modifies try_to_wake_up() to handle different
states: for the READY and WAIT states, try_to_wake_up() causes the state
to change to the RUNNING state. For the CALL state, try_to_wake_up() cannot
wake up the task. The patch provides a special interface,
rpal_try_to_wake_up(), to wake up tasks in the CALL state, which can be
used for lazy switches.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/kernel/process_64.c | 43 ++++++++++++
arch/x86/rpal/internal.h | 7 ++
include/linux/rpal.h | 50 ++++++++++++++
kernel/sched/core.c | 130 +++++++++++++++++++++++++++++++++++
4 files changed, 230 insertions(+)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index f39ff02e498d..4830e9215de7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -40,6 +40,7 @@
#include <linux/ftrace.h>
#include <linux/syscalls.h>
#include <linux/iommu.h>
+#include <linux/rpal.h>

#include <asm/processor.h>
#include <asm/pkru.h>
@@ -596,6 +597,36 @@ void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
}
#endif

+#ifdef CONFIG_RPAL
+static void rpal_receiver_enter_wait(struct task_struct *prev_p)
+{
+ if (READ_ONCE(prev_p->__state) == TASK_INTERRUPTIBLE) {
+ atomic_cmpxchg(&prev_p->rpal_rd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_READY,
+ RPAL_RECEIVER_STATE_WAIT);
+ } else {
+ /*
+ * Simply checking RPAL_RECEIVER_STATE_READY is not enough. It is
+ * possible the task's state is TASK_RUNNING. Consider the following case:
+ *
+ * CPU 0(prev_p) CPU 1(waker)
+ * set TASK_INTERRUPTIBLE
+ * set RPAL_RECEIVER_STATE_READY
+ * check TASK_INTERRUPTIBLE
+ * clear RPAL_RECEIVER_STATE_READY
+ * clear TASK_INTERRUPTIBLE
+ * set TASK_INTERRUPTIBLE
+ * set RPAL_RECEIVER_STATE_READY
+ * ttwu_runnable()
+ * schedule()
+ */
+ atomic_cmpxchg(&prev_p->rpal_rd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_READY,
+ RPAL_RECEIVER_STATE_RUNNING);
+ }
+}
+#endif
+
/*
* switch_to(x,y) should switch tasks from x to y.
*
@@ -704,6 +735,18 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
loadsegment(ss, __KERNEL_DS);
}

+#ifdef CONFIG_RPAL
+ /*
+ * When we get here, the stack switching is finished. Therefore,
+ * the receiver thread is prepared for a lazy switch. We then change
+ * the receiver_state from RPAL_RECEIVER_STATE_READY to
+ * RPAL_RECEIVER_STATE_WAIT so that other threads are able to call it
+ * with an RPAL call.
+ */
+ if (rpal_test_task_thread_flag(prev_p, RPAL_RECEIVER_BIT))
+ rpal_receiver_enter_wait(prev_p);
+#endif
+
/* Load the Intel cache allocation PQR MSR. */
resctrl_sched_in(next_p);

diff --git a/arch/x86/rpal/internal.h b/arch/x86/rpal/internal.h
index cf6d608a994a..6256172bb79e 100644
--- a/arch/x86/rpal/internal.h
+++ b/arch/x86/rpal/internal.h
@@ -47,3 +47,10 @@ int rpal_unregister_sender(void);
int rpal_register_receiver(unsigned long addr);
int rpal_unregister_receiver(void);
void exit_rpal_thread(void);
+
+static inline unsigned long
+rpal_build_call_state(const struct rpal_sender_data *rsd)
+{
+ return ((rsd->rcd.service_id << RPAL_SID_SHIFT) |
+ (rsd->scc->sender_id << RPAL_ID_SHIFT) | RPAL_RECEIVER_STATE_CALL);
+}
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 3310d222739e..4f4719bb7eae 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -87,6 +87,13 @@ enum {

#define RPAL_ERROR_MAGIC 0x98CC98CC

+#define RPAL_SID_SHIFT 24
+#define RPAL_ID_SHIFT 8
+#define RPAL_RECEIVER_STATE_MASK ((1 << RPAL_ID_SHIFT) - 1)
+#define RPAL_SID_MASK (~((1 << RPAL_SID_SHIFT) - 1))
+#define RPAL_ID_MASK (~(0 | RPAL_RECEIVER_STATE_MASK | RPAL_SID_MASK))
+#define RPAL_MAX_ID ((1 << (RPAL_SID_SHIFT - RPAL_ID_SHIFT)) - 1)
+
extern unsigned long rpal_cap;

enum rpal_task_flag_bits {
@@ -94,6 +101,22 @@ enum rpal_task_flag_bits {
RPAL_RECEIVER_BIT,
};

+enum rpal_receiver_state {
+ RPAL_RECEIVER_STATE_RUNNING,
+ RPAL_RECEIVER_STATE_KERNEL_RET,
+ RPAL_RECEIVER_STATE_READY,
+ RPAL_RECEIVER_STATE_WAIT,
+ RPAL_RECEIVER_STATE_CALL,
+ RPAL_RECEIVER_STATE_LAZY_SWITCH,
+ RPAL_RECEIVER_STATE_MAX,
+};
+
+enum rpal_sender_state {
+ RPAL_SENDER_STATE_RUNNING,
+ RPAL_SENDER_STATE_CALL,
+ RPAL_SENDER_STATE_KERNEL_RET,
+};
+
/*
* user_meta will be sent to other service when requested.
*/
@@ -215,6 +238,8 @@ struct rpal_task_context {
struct rpal_receiver_call_context {
struct rpal_task_context rtc;
int receiver_id;
+ atomic_t receiver_state;
+ atomic_t sender_state;
};

/* recovery point for sender */
@@ -390,11 +415,35 @@ static inline bool rpal_test_current_thread_flag(unsigned long bit)
{
return test_bit(bit, &current->rpal_flag);
}
+
+static inline bool rpal_test_task_thread_flag(struct task_struct *tsk,
+ unsigned long bit)
+{
+ return test_bit(bit, &tsk->rpal_flag);
+}
+
+static inline void rpal_set_task_thread_flag(struct task_struct *tsk,
+ unsigned long bit)
+{
+ set_bit(bit, &tsk->rpal_flag);
+}
+
+static inline void rpal_clear_task_thread_flag(struct task_struct *tsk,
+ unsigned long bit)
+{
+ clear_bit(bit, &tsk->rpal_flag);
+}
#else
static inline struct rpal_service *rpal_current_service(void) { return NULL; }
static inline void rpal_set_current_thread_flag(unsigned long bit) { }
static inline void rpal_clear_current_thread_flag(unsigned long bit) { }
static inline bool rpal_test_current_thread_flag(unsigned long bit) { return false; }
+static inline bool rpal_test_task_thread_flag(struct task_struct *tsk,
+ unsigned long bit) { return false; }
+static inline void rpal_set_task_thread_flag(struct task_struct *tsk,
+ unsigned long bit) { }
+static inline void rpal_clear_task_thread_flag(struct task_struct *tsk,
+ unsigned long bit) { }
#endif

void rpal_unregister_service(struct rpal_service *rs);
@@ -414,4 +463,5 @@ struct mm_struct *rpal_pf_get_real_mm(unsigned long address, int *rebuild);

extern void rpal_pick_mmap_base(struct mm_struct *mm,
struct rlimit *rlim_stack);
+int rpal_try_to_wake_up(struct task_struct *p);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62b3416f5e43..045e92ee2e3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -67,6 +67,7 @@
#include <linux/wait_api.h>
#include <linux/workqueue_api.h>
#include <linux/livepatch_sched.h>
+#include <linux/rpal.h>

#ifdef CONFIG_PREEMPT_DYNAMIC
# ifdef CONFIG_GENERIC_ENTRY
@@ -3820,6 +3821,40 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
return ret;
}

+#ifdef CONFIG_RPAL
+static bool rpal_check_state(struct task_struct *p)
+{
+ bool ret = true;
+
+ if (rpal_test_task_thread_flag(p, RPAL_RECEIVER_BIT)) {
+ struct rpal_receiver_call_context *rcc = p->rpal_rd->rcc;
+ int state;
+
+retry:
+ state = atomic_read(&rcc->receiver_state) & RPAL_RECEIVER_STATE_MASK;
+ switch (state) {
+ case RPAL_RECEIVER_STATE_READY:
+ case RPAL_RECEIVER_STATE_WAIT:
+ if (state != atomic_cmpxchg(&rcc->receiver_state, state,
+ RPAL_RECEIVER_STATE_RUNNING))
+ goto retry;
+ break;
+ case RPAL_RECEIVER_STATE_KERNEL_RET:
+ case RPAL_RECEIVER_STATE_LAZY_SWITCH:
+ case RPAL_RECEIVER_STATE_RUNNING:
+ break;
+ case RPAL_RECEIVER_STATE_CALL:
+ ret = false;
+ break;
+ default:
+ rpal_err("%s: invalid state: %d\n", __func__, state);
+ break;
+ }
+ }
+ return ret;
+}
+#endif
+
#ifdef CONFIG_SMP
void sched_ttwu_pending(void *arg)
{
@@ -3841,6 +3876,11 @@ void sched_ttwu_pending(void *arg)
if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
set_task_cpu(p, cpu_of(rq));

+#ifdef CONFIG_RPAL
+ if (!rpal_check_state(p))
+ continue;
+#endif
+
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
}

@@ -4208,6 +4248,17 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (!ttwu_state_match(p, state, &success))
goto out;

+#ifdef CONFIG_RPAL
+ /*
+ * For an RPAL thread, we need to check if it can be woken up. If not,
+ * we do not wake it up here but wake it up later via a kernel worker.
+ *
+ * For normal thread, nothing happens.
+ */
+ if (!rpal_check_state(p))
+ goto out;
+#endif
+
trace_sched_waking(p);
ttwu_do_wakeup(p);
goto out;
@@ -4224,6 +4275,11 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (!ttwu_state_match(p, state, &success))
break;

+#ifdef CONFIG_RPAL
+ if (!rpal_check_state(p))
+ break;
+#endif
+
trace_sched_waking(p);

/*
@@ -4344,6 +4400,56 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
return success;
}

+#ifdef CONFIG_RPAL
+int rpal_try_to_wake_up(struct task_struct *p)
+{
+ guard(preempt)();
+ int cpu, success = 0;
+ int wake_flags = WF_TTWU;
+
+ BUG_ON(READ_ONCE(p->__state) == TASK_RUNNING);
+
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ smp_mb__after_spinlock();
+ if (!ttwu_state_match(p, TASK_NORMAL, &success))
+ break;
+
+ trace_sched_waking(p);
+ /* see try_to_wake_up() */
+ smp_rmb();
+
+#ifdef CONFIG_SMP
+ smp_acquire__after_ctrl_dep();
+ WRITE_ONCE(p->__state, TASK_WAKING);
+ /* see try_to_wake_up() */
+ if (smp_load_acquire(&p->on_cpu) &&
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
+ break;
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
+ if (task_cpu(p) != cpu) {
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ wake_flags |= WF_MIGRATED;
+ psi_ttwu_dequeue(p);
+ set_task_cpu(p, cpu);
+ }
+#else
+ cpu = task_cpu(p);
+#endif
+ }
+ ttwu_queue(p, cpu, wake_flags);
+ if (success)
+ ttwu_stat(p, task_cpu(p), wake_flags);
+
+ return success;
+}
+#endif
+
static bool __task_needs_rq_lock(struct task_struct *p)
{
unsigned int state = READ_ONCE(p->__state);
@@ -6574,6 +6680,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#define SM_PREEMPT 1
#define SM_RTLOCK_WAIT 2

+#ifdef CONFIG_RPAL
+static inline void rpal_check_ready_state(struct task_struct *tsk, int state)
+{
+ if (rpal_test_task_thread_flag(tsk, RPAL_RECEIVER_BIT)) {
+ struct rpal_receiver_call_context *rcc = tsk->rpal_rd->rcc;
+
+ atomic_cmpxchg(&rcc->receiver_state, state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ }
+}
+#endif
+
/*
* Helper function for __schedule()
*
@@ -6727,7 +6845,19 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
+#ifdef CONFIG_RPAL
+ if (!try_to_block_task(rq, prev, &prev_state)) {
+ /*
+ * As the task enters the TASK_RUNNING state, we should clean up
+ * the RPAL_RECEIVER_STATE_READY status. Therefore, the receiver's
+ * state will not be changed to RPAL_RECEIVER_STATE_WAIT. Thus,
+ * there is no RPAL call when a receiver is in the TASK_RUNNING state.
+ */
+ rpal_check_ready_state(prev, RPAL_RECEIVER_STATE_READY);
+ }
+#else
try_to_block_task(rq, prev, &prev_state);
+#endif
switch_count = &prev->nvcsw;
}

--
2.20.1


Return-Path: <linux-kernel+bounces-667881-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 9254C41E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:36:05 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id 7C2751C000C5
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:36:01 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 073C622CBDC;
Fri, 30 May 2025 09:32:36 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="A3BLCd+H"
Received: from mail-pg1-f170.google.com (mail-pg1-f170.google.com [209.85.215.170])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3A24E223DC4
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:32:33 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.215.170
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597554; cv=none; b=kOs4jWXjzjd+V5V9iizfAPjCPWFPNid8wjaPgJ/mw28mDZUTeHMxyEy7WU1XRLDrXFhXqi9C3iH1YH6H+4gmMNTrUF8SCJFzuer+JtIhvdJrAAu54QO1C8iP75RgtdKT7HIB8CZOEp86oXoFEl2QoEqA3fMRTVSo2/nf/aa50lg=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597554; c=relaxed/simple;
bh=p7fOy/mpX7DgxiI+SapqF82ZoB9lHQGYA8nT1M0yWa8=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=dPtVY/uXf/0CVV8BqSGWFYsodoUDib2KurkA6Qkdwzw2Vggau7bQu1oBhmSs1/dxejmYTkOWc6oklgPz9vLqLvCsqHTjbBJ5Kpxioy1krqADr1KfNuTEnO78dG/YSi8Tar5+act4abdZVpIjQxrJ4QtNFYnGENdgOhTrPFGsknQ=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=A3BLCd+H; arc=none smtp.client-ip=209.85.215.170
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pg1-f170.google.com with SMTP id 41be03b00d2f7-b2c2c762a89so1337908a12.0
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:32:32 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597552; x=1749202352; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=PX/EGT89O+ZEiBiVi4yPzaBsD+Uha5lcnizaJutApCU=;
b=A3BLCd+H0XngyTK/aF3Qe3w7oFLL/TjuuoQLSpAqpevzMPXgXES+cXNeABDMSAxj54
jnJJob7xXOtFqVanAP1tnKnxxegezSXETfo+yFjgYlpoEhoWrQ70MtCjgii2VPGOkACD
+qWuJ+yVZmVykAIebhMKB4OkY4V8mNEWJWaeKE6B8MoRkKXXCWoMt2yzGTuIId3eiWNG
BBku4qA7Sd31bsvfmRnY7hfj7RCLU1m4vbzep0cDBhxBX8nbfipk058dW/RL2cSaSw7Q
1/ugwCYRTPAO9I+Yg1nstYKXH1V7v5zsR3dThG7ULbqGrk1KlHjAlkabkC6r/6dm/kIe
fQEQ==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597552; x=1749202352;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=PX/EGT89O+ZEiBiVi4yPzaBsD+Uha5lcnizaJutApCU=;
b=jnPANCFP6xZY10q0QKxGJfQRoFfKBmn+p8DS+a29aNfnsn6BwVh6F4ieQni14b/oDx
BUS1nzXbAS/lK4oR153uTK9/FaKEHrIg43lISZ889V5T8m3i/rybdMJinrYp4e1E1LZi
BDN/BeWtM4Zv+0ScjDFXqsSgSsikD9UnjrwV9cYDihiyS7XDk2ldIUDmApZqeBWRFkzg
7ko52plYFcEFVYimlA1h8QRTP1azHd/GTFey4fqvS7vHDcHEpOAhf2QJyHScjuwwswHi
aloHUEsxodmSS00R+tXfJ0jkpA85ajKGuYG4KzzvFRF2bxrUr8dbwTFly9jWnIsPD53j
U1Hg==
X-Forwarded-Encrypted: i=1; AJvYcCV9pVa5DwQVr6N4oLMp3iIOovKt3N5MPgCE5OKQ/dhLyhH4GcjHM7zSqO472jD4ts/XqGx9f0RPOOHNWX0=@vger.kernel.org
X-Gm-Message-State: AOJu0Yzwr+aSYiQ8WQOUeAtfs7qi8WXSjScKW93L0JEy/fcuWLtdDPLI
jdsOkKpxYNHTCoNJJHojJHi+POe0gYXLIvprwiKNhrM3LsCW2WLE31E2smHm+dqY/pw=
X-Gm-Gg: ASbGncsOukfMB5zxIQv2oKv9fmzHy0gBILoMY4mHo8IzwGAIxsND47btTt8gQrUqU78
VQKi0EbIu1KI39XeuTT+AOajB36otqRxGtuE2QhhlAJ0s+n838lQ1cYkt+9pDTuy/TdVkwimKzI
JOdqyXYlWwOIysh5nTFOccrLTuXreKXYVEUMQM7JylKVyrEn7XCEDQjJxYf+lLa1+9U2RfxjZLl
BkiJIpPfXYq6qL+N52GlxERKp96egU9FsWitskS4JbulX9PEQpiDiRumrEB108zAtS0OK+CBM7/
j9o5IqPhD/nGp/hk5Z6dsJLCbh8wCNHjuP7TtNhkPyKjG2UI8boMu7+fwarGWy7cTc3GGB1zdfm
vXz1TzmD1lA==
X-Google-Smtp-Source: AGHT+IHBBO2+1F1DkG3gsf9hrt/klQKja6FHxL7s2ecvsv49mNDcLA/dM81wOfpp/PHgMJRmMz0RTg==
X-Received: by 2002:a17:90b:3dce:b0:311:baa0:89ca with SMTP id 98e67ed59e1d1-31241e98d1bmr3478119a91.34.1748597552297;
Fri, 30 May 2025 02:32:32 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.32.17
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:32:32 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 16/35] RPAL: add cpu lock interface
Date: Fri, 30 May 2025 17:27:44 +0800
Message-Id: <8ff6cea94a6438a0856c86a11d56be462314b1f8.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

Lazy switch enables the kernel to switch from one task to another to keep
the kernel context and user context matched. For the scheduler, both tasks
involved in the context switch must reside in the same run queue (rq).
Therefore, before a lazy switch occurs, the kernel must first bind both
tasks to the same CPU to facilitate the subsequent context switch.

This patch introduces the rpal_lock_cpu() interface, which binds two tasks
to the same CPU while bypassing cpumask restrictions. The rpal_unlock_cpu()
function serves as the inverse operation to release this binding. To ensure
consistency, the kernel must prevent other threads from modifying the CPU
affinity of tasks locked by rpal_lock_cpu(). Therefore, when using
set_cpus_allowed_ptr() to change a task's CPU affinity, other threads must
wait until the binding established by rpal_lock_cpu() is released before
proceeding with modifications.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/rpal/core.c | 18 +++++++
arch/x86/rpal/thread.c | 14 ++++++
include/linux/rpal.h | 8 +++
kernel/sched/core.c | 109 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 149 insertions(+)

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 61f5d40b0157..c185a453c1b2 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -15,6 +15,24 @@ int __init rpal_init(void);
bool rpal_inited;
unsigned long rpal_cap;

+static inline void rpal_lock_cpu(struct task_struct *tsk)
+{
+ rpal_set_cpus_allowed_ptr(tsk, true);
+ if (unlikely(!irqs_disabled())) {
+ local_irq_disable();
+ rpal_err("%s: irq is enabled\n", __func__);
+ }
+}
+
+static inline void rpal_unlock_cpu(struct task_struct *tsk)
+{
+ rpal_set_cpus_allowed_ptr(tsk, false);
+ if (unlikely(!irqs_disabled())) {
+ local_irq_disable();
+ rpal_err("%s: irq is enabled\n", __func__);
+ }
+}
+
int __init rpal_init(void)
{
int ret = 0;
diff --git a/arch/x86/rpal/thread.c b/arch/x86/rpal/thread.c
index e50a4c865ff8..bc203e9c6e5e 100644
--- a/arch/x86/rpal/thread.c
+++ b/arch/x86/rpal/thread.c
@@ -47,6 +47,10 @@ int rpal_register_sender(unsigned long addr)
}

rpal_common_data_init(&rsd->rcd);
+ if (rpal_init_thread_pending(&rsd->rcd)) {
+ ret = -ENOMEM;
+ goto free_rsd;
+ }
rsd->rsp = rsp;
rsd->scc = (struct rpal_sender_call_context *)(addr - rsp->user_start +
rsp->kernel_start);
@@ -58,6 +62,8 @@ int rpal_register_sender(unsigned long addr)

return 0;

+free_rsd:
+ kfree(rsd);
put_shared_page:
rpal_put_shared_page(rsp);
out:
@@ -77,6 +83,7 @@ int rpal_unregister_sender(void)

rpal_put_shared_page(rsd->rsp);
rpal_clear_current_thread_flag(RPAL_SENDER_BIT);
+ rpal_free_thread_pending(&rsd->rcd);
kfree(rsd);

atomic_dec(&cur->thread_cnt);
@@ -116,6 +123,10 @@ int rpal_register_receiver(unsigned long addr)
}

rpal_common_data_init(&rrd->rcd);
+ if (rpal_init_thread_pending(&rrd->rcd)) {
+ ret = -ENOMEM;
+ goto free_rrd;
+ }
rrd->rsp = rsp;
rrd->rcc =
(struct rpal_receiver_call_context *)(addr - rsp->user_start +
@@ -128,6 +139,8 @@ int rpal_register_receiver(unsigned long addr)

return 0;

+free_rrd:
+ kfree(rrd);
put_shared_page:
rpal_put_shared_page(rsp);
out:
@@ -147,6 +160,7 @@ int rpal_unregister_receiver(void)

rpal_put_shared_page(rrd->rsp);
rpal_clear_current_thread_flag(RPAL_RECEIVER_BIT);
+ rpal_free_thread_pending(&rrd->rcd);
kfree(rrd);

atomic_dec(&cur->thread_cnt);
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 4f4719bb7eae..5b115be14a55 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -99,6 +99,7 @@ extern unsigned long rpal_cap;
enum rpal_task_flag_bits {
RPAL_SENDER_BIT,
RPAL_RECEIVER_BIT,
+ RPAL_CPU_LOCKED_BIT,
};

enum rpal_receiver_state {
@@ -270,8 +271,12 @@ struct rpal_shared_page {
struct rpal_common_data {
/* back pointer to task_struct */
struct task_struct *bp_task;
+ /* pending struct for cpu locking */
+ void *pending;
/* service id of rpal_service */
int service_id;
+ /* cpumask before locked */
+ cpumask_t old_mask;
};

struct rpal_receiver_data {
@@ -464,4 +469,7 @@ struct mm_struct *rpal_pf_get_real_mm(unsigned long address, int *rebuild);
extern void rpal_pick_mmap_base(struct mm_struct *mm,
struct rlimit *rlim_stack);
int rpal_try_to_wake_up(struct task_struct *p);
+int rpal_init_thread_pending(struct rpal_common_data *rcd);
+void rpal_free_thread_pending(struct rpal_common_data *rcd);
+int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 045e92ee2e3b..a862bf4a0161 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3155,6 +3155,104 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
return ret;
}

+#ifdef CONFIG_RPAL
+int rpal_init_thread_pending(struct rpal_common_data *rcd)
+{
+ struct set_affinity_pending *pending;
+
+ pending = kzalloc(sizeof(*pending), GFP_KERNEL);
+ if (!pending)
+ return -ENOMEM;
+ pending->stop_pending = 0;
+ pending->arg = (struct migration_arg){
+ .task = current,
+ .pending = NULL,
+ };
+ rcd->pending = pending;
+ return 0;
+}
+
+void rpal_free_thread_pending(struct rpal_common_data *rcd)
+{
+ if (rcd->pending != NULL)
+ kfree(rcd->pending);
+}
+
+/*
+ * CPU lock is forced and all cpumasks will be ignored by RPAL temporarily.
+ */
+int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock)
+{
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ struct set_affinity_pending *pending = p->rpal_cd->pending;
+ struct cpumask mask;
+ unsigned int dest_cpu;
+ struct rq_flags rf;
+ struct rq *rq;
+ int ret = 0;
+ struct affinity_context ac = {
+ .new_mask = &mask,
+ .flags = 0,
+ };
+
+ if (unlikely(p->flags & PF_KTHREAD))
+ rpal_err("p: %d, p->flags & PF_KTHREAD\n", p->pid);
+
+ rq = task_rq_lock(p, &rf);
+
+ if (is_lock) {
+ cpumask_copy(&p->rpal_cd->old_mask, &p->cpus_mask);
+ cpumask_clear(&mask);
+ cpumask_set_cpu(smp_processor_id(), &mask);
+ rpal_set_task_thread_flag(p, RPAL_CPU_LOCKED_BIT);
+ } else {
+ cpumask_copy(&mask, &p->rpal_cd->old_mask);
+ rpal_clear_task_thread_flag(p, RPAL_CPU_LOCKED_BIT);
+ }
+
+ update_rq_clock(rq);
+
+ if (cpumask_equal(&p->cpus_mask, ac.new_mask))
+ goto out;
+ /*
+ * Picking a ~random cpu helps in cases where we are changing affinity
+ * for groups of tasks (ie. cpuset), so that load balancing is not
+ * immediately required to distribute the tasks within their new mask.
+ */
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ac.new_mask);
+ if (dest_cpu >= nr_cpu_ids) {
+ ret = -EINVAL;
+ goto out;
+ }
+ __do_set_cpus_allowed(p, &ac);
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ preempt_disable();
+ task_rq_unlock(rq, p, &rf);
+ preempt_enable();
+ } else {
+ pending->arg.dest_cpu = dest_cpu;
+
+ if (task_on_cpu(rq, p) ||
+ READ_ONCE(p->__state) == TASK_WAKING) {
+ preempt_disable();
+ task_rq_unlock(rq, p, &rf);
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ } else {
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(rq, &rf, p, dest_cpu);
+ task_rq_unlock(rq, p, &rf);
+ }
+ }
+
+ return 0;
+
+out:
+ task_rq_unlock(rq, p, &rf);
+ return ret;
+}
+#endif
+
/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
@@ -3169,7 +3267,18 @@ int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
struct rq_flags rf;
struct rq *rq;

+#ifdef CONFIG_RPAL
+retry:
+ rq = task_rq_lock(p, &rf);
+ if (rpal_test_task_thread_flag(p, RPAL_CPU_LOCKED_BIT)) {
+ update_rq_clock(rq);
+ task_rq_unlock(rq, p, &rf);
+ schedule();
+ goto retry;
+ }
+#else
rq = task_rq_lock(p, &rf);
+#endif
/*
* Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
* flags are set.
--
2.20.1


Return-Path: <linux-kernel+bounces-667882-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 8D87941E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:36:17 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id 6E45C1889E14
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:36:18 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 45C8121CA1C;
Fri, 30 May 2025 09:32:51 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="TGRyou1+"
Received: from mail-pg1-f172.google.com (mail-pg1-f172.google.com [209.85.215.172])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9CEF3191499
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:32:48 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.215.172
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597570; cv=none; b=YcHWlywGf/LG/SmA0VnbzB1yiWwQGCLzl693T+t8enpcN4JGNrV4MAOv8lSqHc1E0KZYgQxLa3p/oCLKdOp4EB2FMXZBkOkeWocYgxFMz9FbKnBBomRSBOxvvSBaAXu6TiiCtzXREESMk6OBuLdZa5ozYnm5kvAv5p7tgtKa8fk=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597570; c=relaxed/simple;
bh=gli6f5Pht7GF/KlCotN9/63ZsUUJh74HTo8OtVCQXPg=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=YWJwjktg2KDtYXoMklmtp3isLSvcM0wXnJHGpv7iZrzHhXy1iBFgnU407Kpg+flij+KC7ml6DLdTJQoN661mFMNA3D8LkDSDJac1KayLvX5TQqx9IanraTLsbK6HHZZSQ3eechPHNQyL+FlkFKlqozeYNAh+25FYB0PeaBxXLTM=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=TGRyou1+; arc=none smtp.client-ip=209.85.215.172
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pg1-f172.google.com with SMTP id 41be03b00d2f7-b26c5fd40a9so2319221a12.1
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:32:48 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597568; x=1749202368; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=7odL+uROHJjULwRoyw6w9QE9qQsYAMCGXEhdmggZkMI=;
b=TGRyou1+eoQBbWF9THJjh1oSMbLF2olFxpCnXsDswPGjPu0YnZ2bJxH94qIXOBGdZ9
3nf74OUenUuM3UitMYVqcsSUR8fPNwxslOznu1WUFo0TiGV7ZVFWe6Co94WUjwHpxPcL
SV9wdUwyXHcHfyxsArc5HSRPIaaYHekpxs4V1pEl2/E0gp9DTlLS3HY57ZP/FE/kxHOl
TYUqMcWq4KtwsurVUNXgxm+hPyraAPBpSkWB1vNiTtzIwtH63i+AQhWjNCGI7KeoBi8v
yLKfp3nWPL0O9eOrUpaY5oG0r2wBdZ1hQlz4QdT0FXwhgRwB+HbcI/BVHIz0b9vLrF7k
QrSg==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597568; x=1749202368;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=7odL+uROHJjULwRoyw6w9QE9qQsYAMCGXEhdmggZkMI=;
b=vF/WQrzTXWmCR6PsM3rmOc8AZ2Fy2n/Cfh5/tzSGAlznfFDcLfpsTx2VqWKrt8OP2Q
G3DSSmy7C+ui2VroQEZIZuTuRXjmakqUiHH3b8ffZ3eJz8enxfMeXslGxwx4YT16Gn02
hYPl0l1rLU1OlCY4yRVnpzGBrZnwi3ShCwusyTQNMDm90z5CyLRbFzYDhNINs1p5fDwI
RjEHN6BjrAFgc6BYLbjUeYROi/wAsEhvtnVsmaQ7b6hIAbi2HGzyQTq+g+1pYKNFiQ0M
FiRY0saa8svsZ1aoP2zVgWOuKhriqjMxlyDciEPQFOZXdOlA/R50gGEoOdpX9lyypJR6
0FYQ==
X-Forwarded-Encrypted: i=1; AJvYcCXdmKzUaak1kNhrksN2UL3Bo/pb2S35tj8b0mwOzJ1Tqo42A7lDGEWRoJmNFI+UPdWCdXF7JhbvpeY0ASc=@vger.kernel.org
X-Gm-Message-State: AOJu0YypeiCf8pSI9LUma11pT+kqRakI63QBfGiSwV3DTb/sb+Hk22ec
dldiC7dYxM2vWdon7YH64Jos5AwoTZjhULyhTrXqg7r1WypbTkThUrcTpB5NO7zbBf0=
X-Gm-Gg: ASbGncuOe7IrcPNto6rGnZQufPx6w/FNqvsH5jfosHAoAZeDpGF9mi9ZomJN1TLJdqj
lo4pkt21nBYvUBrHIAG6smbRwKY4akbEVtUeLRdUKWnP20BfAbfdgb5J31tU2kKjPUz04srWgpE
g24JJh3A6R1OA2+FtbkyWlYuNICWDX70+uj11GARPuGgrFSRDWTKkLskwtQlwMmW/TqPjZgGJ28
ng4gvtxKaasfB3yHZ0m5eaXiCRE9KWV7loKl3+TtJsPUa7xNt0rgzoljrzbLaRV9UqgkaDipiNk
6YAX/Ns3wiV1LhW3p6Xbv8L8K/4xRTZ1gNRVrnoemkaY1S1lHhHdsI0/yMWe2Yv1pr0eYpx8lVh
5TArE6tEzQw==
X-Google-Smtp-Source: AGHT+IE+I3OxPDYMOIkiL8a1fAG5KorJApLQ8jYCZgv/H0F4D9V+Weoec5Tdc5pVnNzU/vxuroLuaA==
X-Received: by 2002:a17:90b:3eca:b0:30e:5c74:53c9 with SMTP id 98e67ed59e1d1-31246453833mr3702580a91.11.1748597567790;
Fri, 30 May 2025 02:32:47 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.32.32
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:32:47 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 17/35] RPAL: add a mapping between fsbase and tasks
Date: Fri, 30 May 2025 17:27:45 +0800
Message-Id: <964eab3190221c0c880ee9a52957865512c8571c.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

RPAL relies on the value of the fsbase register to determine whether a lazy
switch is necessary. Therefore, a mapping between fsbase and tasks must be
established.

This patch allows a thread to register its fsbase value when it is
registered as a receiver. The rpal_find_next_task() interface is used to
locate the receiver corresponding to a given fsbase value. Additionally, a
new rpal_misidentify() interface has been added to check if the current
fsbase value matches the current task. If they do not match, the task
corresponding to the fsbase is identified, the RPAL_LAZY_SWITCHED_BIT flag
is set, and the current task is recorded. The kernel can later use this
flag and the recorded task to backtrack to the task before the lazy switch.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/rpal/core.c | 85 ++++++++++++++++++++++++++++++++++++++++++
arch/x86/rpal/thread.c | 57 +++++++++++++++++++++++++++-
include/linux/rpal.h | 15 ++++++++
3 files changed, 156 insertions(+), 1 deletion(-)

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index c185a453c1b2..19c4ef38bca3 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -7,6 +7,7 @@
*/

#include <linux/rpal.h>
+#include <asm/fsgsbase.h>

#include "internal.h"

@@ -33,12 +34,96 @@ static inline void rpal_unlock_cpu(struct task_struct *tsk)
}
}

+
+/*
+ * Consume the sender recorded on the current (receiver) task.
+ *
+ * Returns the task_struct previously stored in current->rpal_rd->sender
+ * (by rpal_misidentify() on the sender->receiver lazy switch) and clears
+ * the field so the sender is handed back exactly once.
+ *
+ * NOTE(review): no reference is taken on the returned task here; the
+ * sender must be guaranteed alive by the lazy-switch protocol — confirm.
+ */
+static inline struct task_struct *rpal_get_sender_task(void)
+{
+ struct task_struct *next;
+
+ next = current->rpal_rd->sender;
+ current->rpal_rd->sender = NULL;
+
+ return next;
+}
+
+/*
+ * RPAL uses the value of fsbase (which libc uses as the base
+ * address for thread-local storage) to determine whether a
+ * lazy switch should be performed.
+ */
+static inline struct task_struct *rpal_misidentify(void)
+{
+ struct task_struct *next = NULL;
+ struct rpal_service *cur = rpal_current_service();
+ unsigned long fsbase;
+
+ /*
+ * If fsbase no longer points into the current service's address
+ * range, user space has performed (or is returning from) a lazy
+ * switch and the kernel is running on behalf of a different task.
+ */
+ fsbase = rdfsbase();
+ if (unlikely(!rpal_is_correct_address(cur, fsbase))) {
+ if (rpal_test_current_thread_flag(RPAL_LAZY_SWITCHED_BIT)) {
+ /* current is receiver, next is sender */
+ next = rpal_get_sender_task();
+ if (unlikely(next == NULL)) {
+ rpal_err("cannot find sender task\n");
+ goto out;
+ }
+ } else {
+ /* current is sender, next is receiver */
+ next = rpal_find_next_task(fsbase);
+ if (unlikely(next == NULL)) {
+ rpal_err(
+ "cannot find receiver task, fsbase: 0x%016lx\n",
+ fsbase);
+ goto out;
+ }
+ /*
+ * Mark the receiver so the kernel can later backtrack
+ * to the pre-switch task, and remember who called it.
+ */
+ rpal_set_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT);
+ next->rpal_rd->sender = current;
+ }
+ }
+out:
+ return next;
+}
+
+/*
+ * Resolve @fsbase to the receiver task that registered it.
+ *
+ * Looks up the service mapped at @fsbase (taking a temporary reference
+ * via rpal_get_mapped_service_by_addr(), dropped with rpal_put_service()
+ * before returning), then scans its fsbase->task map for an exact match.
+ * Returns the matching task, or NULL if no service or no slot matches.
+ *
+ * NOTE(review): the fs_tsk_map scan is lockless. set_fs_tsk_map() claims
+ * a slot by cmpxchg64() on ->fsbase and only then stores ->tsk, so a
+ * reader matching ->fsbase here could observe a stale or NULL ->tsk.
+ * Confirm the required ordering (e.g. publish ->tsk before ->fsbase, or
+ * add matching barriers).
+ */
+struct task_struct *rpal_find_next_task(unsigned long fsbase)
+{
+ struct rpal_service *cur = rpal_current_service();
+ struct rpal_service *tgt;
+ struct task_struct *tsk = NULL;
+ int i;
+
+ tgt = rpal_get_mapped_service_by_addr(cur, fsbase);
+ if (unlikely(!tgt)) {
+ pr_debug("rpal debug: cannot find legal rs, fsbase: 0x%016lx\n",
+ fsbase);
+ return NULL;
+ }
+ for (i = 0; i < RPAL_MAX_RECEIVER_NUM; ++i) {
+ if (tgt->fs_tsk_map[i].fsbase == fsbase) {
+ tsk = tgt->fs_tsk_map[i].tsk;
+ break;
+ }
+ }
+ rpal_put_service(tgt);
+
+ return tsk;
+}
+
+/*
+ * Verify the CPU features RPAL depends on. RPAL identifies lazy switches
+ * by reading fsbase directly from user-mode-visible state, so the
+ * FSGSBASE instructions (RDFSBASE et al.) are mandatory. Returns false
+ * (and logs an error) if the requirement is not met.
+ */
+static bool check_hardware_features(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) {
+ rpal_err("no fsgsbase feature\n");
+ return false;
+ }
+ return true;
+}
+
int __init rpal_init(void)
{
int ret = 0;

rpal_cap = 0;

+ if (!check_hardware_features())
+ goto fail;
+
ret = rpal_service_init();
if (ret)
goto fail;
diff --git a/arch/x86/rpal/thread.c b/arch/x86/rpal/thread.c
index bc203e9c6e5e..db3b13ff82be 100644
--- a/arch/x86/rpal/thread.c
+++ b/arch/x86/rpal/thread.c
@@ -7,9 +7,53 @@
*/

#include <linux/rpal.h>
+#include <asm/fsgsbase.h>

#include "internal.h"

+/*
+ * Register the calling task in the current service's fsbase->task map.
+ *
+ * Claims a free slot (one whose ->fsbase is 0) with cmpxchg64() so that
+ * concurrent registrations cannot take the same slot, then records the
+ * caller. Returns true on success, false when all
+ * RPAL_MAX_RECEIVER_NUM slots are occupied.
+ *
+ * NOTE(review): ->tsk is written after ->fsbase is published and there
+ * is no write barrier in between (clear_fs_tsk_map() uses barrier(),
+ * which is compiler-only). A concurrent lockless reader such as
+ * rpal_find_next_task() may match ->fsbase yet read a stale ->tsk —
+ * confirm and add smp_wmb()/smp_rmb() or publish ->tsk first.
+ */
+static bool set_fs_tsk_map(void)
+{
+ struct rpal_service *cur = rpal_current_service();
+ struct rpal_fsbase_tsk_map *ftm;
+ unsigned long fsbase = rdfsbase();
+ bool success = false;
+ int i = 0;
+
+ for (i = 0; i < RPAL_MAX_RECEIVER_NUM; ++i) {
+ ftm = &cur->fs_tsk_map[i];
+ if (ftm->fsbase == 0 &&
+ cmpxchg64(&ftm->fsbase, 0, fsbase) == 0) {
+ ftm->tsk = current;
+ success = true;
+ break;
+ }
+ }
+
+ return success;
+}
+
+/*
+ * Remove the calling task's entry from the current service's
+ * fsbase->task map.
+ *
+ * Finds the slot whose ->fsbase matches the caller's current fsbase,
+ * clears ->tsk first and then releases the slot by zeroing ->fsbase,
+ * so a slot is never observable as claimed-with-valid-task while being
+ * torn down. Returns true if an entry was found and cleared.
+ *
+ * NOTE(review): barrier() only prevents compiler reordering of the two
+ * stores; it is not an SMP memory barrier. Lockless readers on other
+ * CPUs (rpal_find_next_task()) may still observe the stores out of
+ * order — confirm whether smp_wmb() is needed here.
+ */
+static bool clear_fs_tsk_map(void)
+{
+ struct rpal_service *cur = rpal_current_service();
+ struct rpal_fsbase_tsk_map *ftm;
+ unsigned long fsbase = rdfsbase();
+ bool success = false;
+ int i = 0;
+
+ for (i = 0; i < RPAL_MAX_RECEIVER_NUM; ++i) {
+ ftm = &cur->fs_tsk_map[i];
+ if (ftm->fsbase == fsbase) {
+ ftm->tsk = NULL;
+ barrier();
+ ftm->fsbase = 0;
+ success = true;
+ break;
+ }
+ }
+
+ return success;
+}
+
static void rpal_common_data_init(struct rpal_common_data *rcd)
{
rcd->bp_task = current;
@@ -54,6 +98,7 @@ int rpal_register_sender(unsigned long addr)
rsd->rsp = rsp;
rsd->scc = (struct rpal_sender_call_context *)(addr - rsp->user_start +
rsp->kernel_start);
+ rsd->receiver = NULL;

current->rpal_sd = rsd;
rpal_set_current_thread_flag(RPAL_SENDER_BIT);
@@ -122,15 +167,21 @@ int rpal_register_receiver(unsigned long addr)
goto put_shared_page;
}

+ if (!set_fs_tsk_map()) {
+ ret = -EAGAIN;
+ goto free_rrd;
+ }
+
rpal_common_data_init(&rrd->rcd);
if (rpal_init_thread_pending(&rrd->rcd)) {
ret = -ENOMEM;
- goto free_rrd;
+ goto clear_fs;
}
rrd->rsp = rsp;
rrd->rcc =
(struct rpal_receiver_call_context *)(addr - rsp->user_start +
rsp->kernel_start);
+ rrd->sender = NULL;

current->rpal_rd = rrd;
rpal_set_current_thread_flag(RPAL_RECEIVER_BIT);
@@ -139,6 +190,8 @@ int rpal_register_receiver(unsigned long addr)

return 0;

+clear_fs:
+ clear_fs_tsk_map();
free_rrd:
kfree(rrd);
put_shared_page:
@@ -158,6 +211,8 @@ int rpal_unregister_receiver(void)
goto out;
}

+ clear_fs_tsk_map();
+
rpal_put_shared_page(rrd->rsp);
rpal_clear_current_thread_flag(RPAL_RECEIVER_BIT);
rpal_free_thread_pending(&rrd->rcd);
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 5b115be14a55..45137770fac6 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -80,6 +80,9 @@
/* No more than 15 services can be requested due to limitation of MPK. */
#define MAX_REQUEST_SERVICE 15

+/* We allow at most 16 receiver threads in one process */
+#define RPAL_MAX_RECEIVER_NUM 16
+
enum {
RPAL_REQUEST_MAP,
RPAL_REVERSE_MAP,
@@ -100,6 +103,7 @@ enum rpal_task_flag_bits {
RPAL_SENDER_BIT,
RPAL_RECEIVER_BIT,
RPAL_CPU_LOCKED_BIT,
+ RPAL_LAZY_SWITCHED_BIT,
};

enum rpal_receiver_state {
@@ -145,6 +149,11 @@ struct rpal_poll_data {
wait_queue_head_t rpal_waitqueue;
};

+struct rpal_fsbase_tsk_map {
+ unsigned long fsbase;
+ struct task_struct *tsk;
+};
+
/*
* Each RPAL process (a.k.a RPAL service) should have a pointer to
* struct rpal_service in all its tasks' task_struct.
@@ -202,6 +211,9 @@ struct rpal_service {
/* Notify service is released by others */
struct rpal_poll_data rpd;

+ /* fsbase / pid map */
+ struct rpal_fsbase_tsk_map fs_tsk_map[RPAL_MAX_RECEIVER_NUM];
+
/* delayed service put work */
struct delayed_work delayed_put_work;

@@ -283,12 +295,14 @@ struct rpal_receiver_data {
struct rpal_common_data rcd;
struct rpal_shared_page *rsp;
struct rpal_receiver_call_context *rcc;
+ struct task_struct *sender;
};

struct rpal_sender_data {
struct rpal_common_data rcd;
struct rpal_shared_page *rsp;
struct rpal_sender_call_context *scc;
+ struct task_struct *receiver;
};

enum rpal_command_type {
@@ -465,6 +479,7 @@ struct rpal_service *rpal_get_mapped_service_by_id(struct rpal_service *rs,
int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,
unsigned long addr, int error_code);
struct mm_struct *rpal_pf_get_real_mm(unsigned long address, int *rebuild);
+struct task_struct *rpal_find_next_task(unsigned long fsbase);

extern void rpal_pick_mmap_base(struct mm_struct *mm,
struct rlimit *rlim_stack);
--
2.20.1


Return-Path: <linux-kernel+bounces-667883-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from sv.mirrors.kernel.org (sv.mirrors.kernel.org [139.178.88.99])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 2F6FE41E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:36:21 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by sv.mirrors.kernel.org (Postfix) with ESMTPS id F04713BCB12
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:35:59 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 1CE68227BAA;
Fri, 30 May 2025 09:33:07 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="R9FVY4LF"
Received: from mail-pj1-f52.google.com (mail-pj1-f52.google.com [209.85.216.52])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2757522170B
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:33:03 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.216.52
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597586; cv=none; b=TKOZOJn26rLUJ1Xyt9nhdi8SzGvoG4C3WRUlP6B3gKNdyWCKGv5SOkaaQGkjf4RZv78/msIxV4j+WGLmZGOgB9YauK1fBcCWQsdhgO4/O/hPci9uNVqLb4iXnBPbVeeIYJlTOgj4VQbGArF/XaOSqYDJTmzpXXajD3UwW+GJ4pg=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597586; c=relaxed/simple;
bh=eF1gU64raWj5Emit2P3crNbm/xRze6gEOQkFioffxcs=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=FnJdftun2OGJQ7BddFXxFkimTgoV6RIUt7xmgaej5KtCwPwkP9vuwJi4KUppmt1OhxVk8CbWb76uZapYdZlpIynd0XFf1/xUVHZNDaFcypLLCpIxPQUfJPPmHsQSvGnwtNr5Fyd+gBcLX94lnlJH0qxQJ56m+svdToyhYkWY/oM=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=R9FVY4LF; arc=none smtp.client-ip=209.85.216.52
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pj1-f52.google.com with SMTP id 98e67ed59e1d1-3081f72c271so1408383a91.0
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:33:03 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597583; x=1749202383; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=dlxrRjV8tJmbvpYFl1bSrUikb+IFpVRg2V1s2EQ5voc=;
b=R9FVY4LF1Zrf/Gp3vENFInTrH6itORQS+OA1/+RoJ+ucGi05UAckNOWotomOVQctzl
zG7AWofEo4WqwFTlXq50CKPfhYItqxmTicTdRO5SWw7ohYdp02PlQddlnla1xotDbo+W
CeGfj86h1fo4ZKax11DGGNo6+4KLAOifBpSNtrGAhViBoHwkYhwKN+P6DijMC57AyoWK
GghgOqvu8A54AzXTqf1gxGTET8iBpxpIArrUZVZSqr51J1pjrGa950Ba6pib26pYu7LB
KugC80k8QvQRYrihDIhMSHDZFF4VnXJUR+siVu2gcstIfhNNC6smd4Ra1/VHIMj1J1r+
HGag==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597583; x=1749202383;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=dlxrRjV8tJmbvpYFl1bSrUikb+IFpVRg2V1s2EQ5voc=;
b=ZL/V7nulTRP1vti1ugRPkqwT8dod9EpA+XP6JxF5X7G9t0YYHDYBUwoK2qClpNeSLw
5kw8QN9ASWOOWh8uQpz12+ZH1H+QJ1rScCeinI5w+0zzsxAeqGYoCEZdk7wU+VgaeBmC
a9nmj4UbssZPhbHXbKgKdPP0k8213Fg23VH1mZRnN2a+t8LJ7CkPc9F46ijiRkaK6qGs
9DUe3IKiJ6fh++XE6+UP119xK24F+P7TYW4eUOtaOmTKApwtYVOI+yAf29IH5esenKZu
vZVci5RUjFKDWbIpLsCjPKx/DjFORi/Gv4Nouw5a+whC3KwESPyRK/lR2EYrQNHFFtiZ
n3Pw==
X-Forwarded-Encrypted: i=1; AJvYcCURPVZTBUH2Z5UhKFHSmmYFzve3C9K1shtJstbs4qkP6oyXCfYmLIzAFM+kGegtG1mAjwJEtIwA/cYbEjc=@vger.kernel.org
X-Gm-Message-State: AOJu0YxxOyL6wyY/U0ARwDwsOrzc9JlVeH/1ILMcm7ce5sDaEZcGBYAE
zEPr7HrH0nab4glT1S1bFNxVAK4qFRFh9mcYROOB1Rtz6StxVVSzvqG8I6Ne0DeLZyQ=
X-Gm-Gg: ASbGnctD1qWR3D5K0zp1WUcJ3stw1rAS+EFHdRedEdgbTeE9GMIO9pSdNNXtASvyG2q
mgPIggxNHATwJujHTCOdb0Yz3YU0WthjdYV0AQvOTR4iJe+P5+obcpyTkmqJY4iGoDeHi4egaMc
UCpqAaI7vMyP6ygmk5MEfOjDx9/4S/cgOkLL2V/TronuZ/x2xYJ36NinxI2cpiM9DThrKNZBagn
6L5gSSNtHA5iAnf70hFCwgvNEiSw6sKDIlX4oPYmcy6IZOU8xZN5pdpYd/yfR+3Xqpegei0LGLl
ac4PybmXSilUaYosDxwjIT4ESbYydBXuTq+itlMhfIlEPsDYVctkFliEW8sge9jVqKbdllzN2iF
yaI0+UDjJQeQEmDQYU/G7
X-Google-Smtp-Source: AGHT+IFRxV/uO1ynrbPQpKYbWo1BdkPpqPJfwKaGM+M1EblRzTIKuZgxM4JDshmIh4OUS5cYyqK/IQ==
X-Received: by 2002:a17:90b:380a:b0:311:c596:5c6f with SMTP id 98e67ed59e1d1-31250422c83mr2343246a91.17.1748597583233;
Fri, 30 May 2025 02:33:03 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.32.48
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:33:02 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 18/35] sched: pick a specified task
Date: Fri, 30 May 2025 17:27:46 +0800
Message-Id: <6e785c48ed266694748e0e71e264b94b27d9fa7b.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

When a lazy switch occurs, the kernel already has the task_struct of the
next task to switch to. However, CFS does not provide an interface to
explicitly specify the next task. Therefore, RPAL must implement its own
mechanism to pick a specified task.

This patch introduces two interfaces, rpal_pick_next_task_fair() and
rpal_pick_task_fair(), to achieve this functionality. These interfaces
leverage the sched_entity of the target task to modify the CFS data
structures directly. Additionally, the patch adapts to the SCHED_CORE
feature by temporarily setting the highest weight for the specified task,
ensuring that the core will select this task preferentially during
scheduling decisions.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
kernel/sched/core.c | 212 +++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 109 ++++++++++++++++++++++
kernel/sched/sched.h | 8 ++
3 files changed, 329 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a862bf4a0161..2e76376c5172 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11003,3 +11003,215 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
set_next_task(rq, ctx->p);
}
#endif /* CONFIG_SCHED_CLASS_EXT */
+
+#ifdef CONFIG_RPAL
+#ifdef CONFIG_SCHED_CORE
+/*
+ * Pick the explicitly requested @next task on @rq, bypassing the normal
+ * class-iterating pick. Only the fair class is supported: lazy switches
+ * are defined between CFS tasks, so any other class combination here is
+ * a protocol violation and hits BUG().
+ *
+ * NOTE(review): BUG() on a reachable condition is heavy-handed; consider
+ * whether a WARN + fallback to the regular pick path is safer.
+ */
+static inline struct task_struct *
+__rpal_pick_next_task(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next, struct rq_flags *rf)
+{
+ struct task_struct *p;
+
+ if (likely(prev->sched_class == &fair_sched_class &&
+ next->sched_class == &fair_sched_class)) {
+ p = rpal_pick_next_task_fair(prev, next, rq, rf);
+ return p;
+ }
+
+ BUG();
+}
+
+/*
+ * Core-scheduling-aware variant of picking an explicitly specified task.
+ *
+ * Mirrors the structure of the upstream core-sched pick_next_task(),
+ * with one key difference: on the local CPU the pick is forced to @next
+ * (via rpal_pick_task_fair()) instead of consulting task priority, and
+ * any cached rq->core_pick is honoured only when it already equals
+ * @next. SMT siblings are still picked normally and force-idled when
+ * their candidate's cookie does not match @next's, preserving the L1TF
+ * core-isolation invariant.
+ */
+static struct task_struct *rpal_pick_next_task(struct rq *rq,
+ struct task_struct *prev,
+ struct task_struct *next,
+ struct rq_flags *rf)
+{
+ struct task_struct *p;
+ const struct cpumask *smt_mask;
+ bool fi_before = false;
+ bool core_clock_updated = (rq == rq->core);
+ unsigned long cookie;
+ int i, cpu, occ = 0;
+ struct rq *rq_i;
+ bool need_sync;
+
+ /* No core-wide selection needed; fall back to the simple path. */
+ if (!sched_core_enabled(rq))
+ return __rpal_pick_next_task(rq, prev, next, rf);
+
+ cpu = cpu_of(rq);
+
+ /* Stopper task is switching into idle, no need core-wide selection. */
+ if (cpu_is_offline(cpu)) {
+ rq->core_pick = NULL;
+ return __rpal_pick_next_task(rq, prev, next, rf);
+ }
+
+ /*
+ * A core-wide pick from a sibling is pending for this rq. Unlike the
+ * regular path we only honour it when it already names @next.
+ */
+ if (rq->core->core_pick_seq == rq->core->core_task_seq &&
+ rq->core->core_pick_seq != rq->core_sched_seq &&
+ rq->core_pick) {
+ WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
+
+ /* ignore rq->core_pick, always pick next */
+ if (rq->core_pick == next) {
+ put_prev_task(rq, prev);
+ set_next_task(rq, next);
+
+ rq->core_pick = NULL;
+ goto out;
+ }
+ }
+
+ put_prev_task_balance(rq, prev, rf);
+
+ smt_mask = cpu_smt_mask(cpu);
+ need_sync = !!rq->core->core_cookie;
+
+ /* reset state */
+ rq->core->core_cookie = 0UL;
+ if (rq->core->core_forceidle_count) {
+ if (!core_clock_updated) {
+ update_rq_clock(rq->core);
+ core_clock_updated = true;
+ }
+ sched_core_account_forceidle(rq);
+ /* reset after accounting force idle */
+ rq->core->core_forceidle_start = 0;
+ rq->core->core_forceidle_count = 0;
+ rq->core->core_forceidle_occupation = 0;
+ need_sync = true;
+ fi_before = true;
+ }
+
+ rq->core->core_task_seq++;
+
+ /*
+ * Uncookied @next with no core-wide cookie in force: no sibling
+ * coordination required, take the fast uncontended path.
+ */
+ if (!need_sync) {
+ next = rpal_pick_task_fair(rq, next);
+ if (!next->core_cookie) {
+ rq->core_pick = NULL;
+ /*
+ * For robustness, update the min_vruntime_fi for
+ * unconstrained picks as well.
+ */
+ WARN_ON_ONCE(fi_before);
+ task_vruntime_update(rq, next, false);
+ goto out_set_next;
+ }
+ }
+
+ for_each_cpu_wrap(i, smt_mask, cpu) {
+ rq_i = cpu_rq(i);
+
+ if (i != cpu && (rq_i != rq->core || !core_clock_updated))
+ update_rq_clock(rq_i);
+
+ /* ignore prio, always pick next */
+ if (i == cpu)
+ rq_i->core_pick = rpal_pick_task_fair(rq, next);
+ else
+ rq_i->core_pick = pick_task(rq_i);
+ }
+
+ /* @next's cookie becomes the core-wide cookie siblings must match. */
+ cookie = rq->core->core_cookie = next->core_cookie;
+
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick;
+
+ if (!cookie_equals(p, cookie)) {
+ p = NULL;
+ if (cookie)
+ p = sched_core_find(rq_i, cookie);
+ if (!p)
+ p = idle_sched_class.pick_task(rq_i);
+ }
+
+ rq_i->core_pick = p;
+
+ if (p == rq_i->idle) {
+ if (rq_i->nr_running) {
+ rq->core->core_forceidle_count++;
+ if (!fi_before)
+ rq->core->core_forceidle_seq++;
+ }
+ } else {
+ occ++;
+ }
+ }
+
+ if (schedstat_enabled() && rq->core->core_forceidle_count) {
+ rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_occupation = occ;
+ }
+
+ rq->core->core_pick_seq = rq->core->core_task_seq;
+ WARN_ON_ONCE(next != rq->core_pick);
+ rq->core_sched_seq = rq->core->core_pick_seq;
+
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+
+ /*
+ * An online sibling might have gone offline before a task
+ * could be picked for it, or it might be offline but later
+ * happen to come online, but it's too late and nothing was
+ * picked for it. That's Ok - it will pick tasks for itself,
+ * so ignore it.
+ */
+ if (!rq_i->core_pick)
+ continue;
+
+ /*
+ * Update for new !FI->FI transitions, or if continuing to be in !FI:
+ * fi_before fi update?
+ * 0 0 1
+ * 0 1 1
+ * 1 0 1
+ * 1 1 0
+ */
+ if (!(fi_before && rq->core->core_forceidle_count))
+ task_vruntime_update(rq_i, rq_i->core_pick,
+ !!rq->core->core_forceidle_count);
+
+ rq_i->core_pick->core_occupation = occ;
+
+ if (i == cpu) {
+ rq_i->core_pick = NULL;
+ continue;
+ }
+
+ /* Did we break L1TF mitigation requirements? */
+ WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
+
+ if (rq_i->curr == rq_i->core_pick) {
+ rq_i->core_pick = NULL;
+ continue;
+ }
+
+ resched_curr(rq_i);
+ }
+
+out_set_next:
+ set_next_task(rq, next);
+out:
+ if (rq->core->core_forceidle_count && next == rq->idle)
+ queue_core_balance(rq);
+
+ return next;
+}
+#else
+/*
+ * !CONFIG_SCHED_CORE variant: pick the explicitly requested @next task
+ * directly. Only CFS->CFS lazy switches are supported; any other class
+ * combination is a protocol violation and hits BUG().
+ */
+static inline struct task_struct *
+rpal_pick_next_task(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next, struct rq_flags *rf)
+{
+ struct task_struct *p;
+
+ if (likely(prev->sched_class == &fair_sched_class &&
+ next->sched_class == &fair_sched_class)) {
+ p = rpal_pick_next_task_fair(prev, next, rq, rf);
+ return p;
+ }
+
+ BUG();
+}
+#endif
+#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 125912c0e9dd..d9c16d974a47 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8983,6 +8983,115 @@ void fair_server_init(struct rq *rq)
dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
}

+#ifdef CONFIG_RPAL
+/*
+ * Walk @next's sched_entity hierarchy and unthrottle every throttled
+ * cfs_rq on the way up to @rq's root cfs_rq, so @next can be picked
+ * immediately even if its group exhausted its CFS bandwidth quota.
+ */
+static void rpal_unthrottle(struct rq *rq, struct task_struct *next)
+{
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+
+ se = &next->se;
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
+
+ /* Stop at the root; nothing above the rq's own cfs_rq. */
+ if (cfs_rq == &rq->cfs)
+ break;
+ }
+}
+
+/*
+ * Prepare the CFS data structures so that @next can be forced as the
+ * pick on @rq: unthrottle its hierarchy, account runtime for each level's
+ * currently running entity, and clear buddy hints that could steer the
+ * pick elsewhere. Always returns @next.
+ */
+struct task_struct *rpal_pick_task_fair(struct rq *rq, struct task_struct *next)
+{
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+
+ rpal_unthrottle(rq, next);
+
+ se = &next->se;
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq->curr && cfs_rq->curr->on_rq)
+ update_curr(cfs_rq);
+
+ /* Runtime just ran out at this level; skip the buddy clear. */
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ continue;
+
+ clear_buddies(cfs_rq, se);
+ }
+
+ return next;
+}
+
+/*
+ * Like pick_next_task_fair(), but unconditionally picks @next instead of
+ * letting CFS choose. Unthrottles @next's hierarchy and accounts runtime
+ * via rpal_pick_task_fair(), then performs the put_prev/set_next entity
+ * walk between @prev and @next, touching as few cfs_rqs as possible.
+ * @rf mirrors the pick_next_task_fair() interface and is currently
+ * unused here.
+ *
+ * Fix: @se was assigned only under CONFIG_FAIR_GROUP_SCHED but
+ * dereferenced unconditionally by WARN_ON_ONCE(se->sched_delayed), so
+ * !CONFIG_FAIR_GROUP_SCHED builds read an uninitialized pointer (and
+ * @cfs_rq was unused there, too). @se/@p are now initialized before the
+ * #ifdef and @cfs_rq is scoped to the group-walk branch.
+ */
+struct task_struct *rpal_pick_next_task_fair(struct task_struct *prev,
+ struct task_struct *next,
+ struct rq *rq, struct rq_flags *rf)
+{
+ struct sched_entity *se;
+ struct task_struct *p;
+
+ rpal_unthrottle(rq, next);
+
+ /* Accounts update_curr()/clear_buddies() along @next's hierarchy. */
+ p = rpal_pick_task_fair(rq, next);
+
+ if (!sched_fair_runnable(rq))
+ panic("rpal error: !sched_fair_runnable\n");
+
+ se = &next->se;
+ p = task_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ __put_prev_set_next_dl_server(rq, prev, next);
+
+ /*
+ * Since we haven't yet done put_prev_entity and if the selected task
+ * is a different task than we started out with, try and touch the
+ * least amount of cfs_rqs.
+ */
+ if (prev != p) {
+ struct sched_entity *pse = &prev->se;
+ struct cfs_rq *cfs_rq;
+
+ while (!(cfs_rq = is_same_group(se, pse))) {
+ int se_depth = se->depth;
+ int pse_depth = pse->depth;
+
+ if (se_depth <= pse_depth) {
+ put_prev_entity(cfs_rq_of(pse), pse);
+ pse = parent_entity(pse);
+ }
+ if (se_depth >= pse_depth) {
+ set_next_entity(cfs_rq_of(se), se);
+ se = parent_entity(se);
+ }
+ }
+
+ put_prev_entity(cfs_rq, pse);
+ set_next_entity(cfs_rq, se);
+ }
+#endif
+#ifdef CONFIG_SMP
+ /*
+ * Move the next running task to the front of
+ * the list, so our cfs_tasks list becomes MRU
+ * one.
+ */
+ list_move(&p->se.group_node, &rq->cfs_tasks);
+#endif
+
+ WARN_ON_ONCE(se->sched_delayed);
+
+ if (hrtick_enabled_fair(rq))
+ hrtick_start_fair(rq, p);
+
+ update_misfit_status(p, rq);
+ sched_fair_update_stop_tick(rq, p);
+
+ return p;
+}
+#endif
+
/*
* Account for a descheduled task:
*/
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c5a6a503eb6d..f8fd26b584c9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2575,6 +2575,14 @@ static inline bool sched_fair_runnable(struct rq *rq)

extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
extern struct task_struct *pick_task_idle(struct rq *rq);
+#ifdef CONFIG_RPAL
+extern struct task_struct *rpal_pick_task_fair(struct rq *rq,
+ struct task_struct *next);
+extern struct task_struct *rpal_pick_next_task_fair(struct task_struct *prev,
+ struct task_struct *next,
+ struct rq *rq,
+ struct rq_flags *rf);
+#endif

#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02
--
2.20.1


Return-Path: <linux-kernel+bounces-667884-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from ny.mirrors.kernel.org (ny.mirrors.kernel.org [147.75.199.223])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 1791341E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:36:28 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by ny.mirrors.kernel.org (Postfix) with ESMTPS id 541744E4866
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:36:29 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 2B62422F169;
Fri, 30 May 2025 09:33:08 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="RRJ8MM+W"
Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 85B10221566
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:33:05 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.133.124
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597587; cv=none; b=AtywJwGN9ZeIw31znCKp8hYZX4qiFCniohWFK5w/ZDSq0DUoiVqkRP/CK7aR58NiEtdRkCp/zRwSgXNl3mjtKZPgVdOelkiS8ojhEqlPTI1KpYg7m46qQQEU7g06Nfn6HMoY/PY9UyNKBqVbfLG0JQisAsTUgw0xO/2TeLG0NmM=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597587; c=relaxed/simple;
bh=HWBJBkrabhiA7Ms8ILFCAOOED9oIjHs3RLCzQ5lxjLQ=;
h=From:In-Reply-To:References:To:Cc:Subject:MIME-Version:
Content-Type:Date:Message-ID; b=maIVeyWRXAOagonsXT1x8ZyLFdva+bNdfVxv/LS7ozae3rTMZ2VPrrhP3YTKm+iE7dFanJKWhrUHECdf/eSYJNnKcVax3UmgnTT1kyu87Aj/YMCbSS5lpXHY8I3z1OE7oHhlZC+mxW7fszNFRHyHGq7p6iwIGRgM0DVgiTY2rLg=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=RRJ8MM+W; arc=none smtp.client-ip=170.10.133.124
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com;
s=mimecast20190719; t=1748597584;
h=from:from:reply-to:subject:subject:date:date:message-id:message-id:
to:to:cc:cc:mime-version:mime-version:content-type:content-type:
in-reply-to:in-reply-to:references:references;
bh=htv9cZewB1UVDzHczYFZHahQGakMX4ZHGulof9YJI7w=;
b=RRJ8MM+Wwo7VQjqzXqXnKyPf+6qlFPlqceBxkBC9DJUjeqRdv+tzPs74KTUqPU3bwBRHVF
we0vEUpsAWD8UyIaAnwkE7lqc1K8JmhChTG3FdtNTCfe4oebnznSXyRuzKC1gMAHJshsK2
AP00fZkGY0EUGyz3bjHUpnZVjlGqgXE=
Received: from mx-prod-mc-08.mail-002.prod.us-west-2.aws.redhat.com
(ec2-35-165-154-97.us-west-2.compute.amazonaws.com [35.165.154.97]) by
relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3,
cipher=TLS_AES_256_GCM_SHA384) id us-mta-154-IQ9cafIjNpmDcy6ghYPVkA-1; Fri,
30 May 2025 05:32:58 -0400
X-MC-Unique: IQ9cafIjNpmDcy6ghYPVkA-1
X-Mimecast-MFC-AGG-ID: IQ9cafIjNpmDcy6ghYPVkA_1748597576
Received: from mx-prod-int-03.mail-002.prod.us-west-2.aws.redhat.com (mx-prod-int-03.mail-002.prod.us-west-2.aws.redhat.com [10.30.177.12])
(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256)
(No client certificate requested)
by mx-prod-mc-08.mail-002.prod.us-west-2.aws.redhat.com (Postfix) with ESMTPS id 3A80A18001EA;
Fri, 30 May 2025 09:32:56 +0000 (UTC)
Received: from warthog.procyon.org.uk (unknown [10.42.28.2])
by mx-prod-int-03.mail-002.prod.us-west-2.aws.redhat.com (Postfix) with ESMTP id 136BD19560A7;
Fri, 30 May 2025 09:32:53 +0000 (UTC)
Organization: Red Hat UK Ltd. Registered Address: Red Hat UK Ltd, Amberley
Place, 107-111 Peascod Street, Windsor, Berkshire, SI4 1TE, United
Kingdom.
Registered in England and Wales under Company Registration No. 3798903
From: David Howells <dhowells@xxxxxxxxxx>
In-Reply-To: <20250527084916.1699109-1-suhui@xxxxxxxxxxxx>
References: <20250527084916.1699109-1-suhui@xxxxxxxxxxxx>
To: Su Hui <suhui@xxxxxxxxxxxx>
Cc: dhowells@xxxxxxxxxx, marc.dionne@xxxxxxxxxxxx,
linux-afs@xxxxxxxxxxxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx,
kernel-janitors@xxxxxxxxxxxxxxx
Subject: Re: [PATCH] afs: Replace simple_strtoul with kstrtoul in afs_parse_address
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-ID: <685977.1748597572.1@xxxxxxxxxxxxxxxxxxxxxx>
Date: Fri, 30 May 2025 10:32:52 +0100
Message-ID: <685978.1748597572@xxxxxxxxxxxxxxxxxxxxxx>
X-Scanned-By: MIMEDefang 3.0 on 10.30.177.12
X-Spam-Status: No, score=-6.3 required=5.0 tests=DKIMWL_WL_HIGH,DKIM_SIGNED,
DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
MAILING_LIST_MULTI,RCVD_IN_DNSWL_MED,
RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,RCVD_IN_VALIDITY_RPBL_BLOCKED,
SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

Su Hui <suhui@xxxxxxxxxxxx> wrote:

> kstrtoul() is better because simple_strtoul() ignores overflow which
> may lead to unexpected results.

Overflow in what sense? Are we talking about a mathematical overflow or not
checking the text beyond the end of the number?

David


Return-Path: <linux-kernel+bounces-667885-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from sy.mirrors.kernel.org (sy.mirrors.kernel.org [147.75.48.161])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 0AD6441E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:36:53 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by sy.mirrors.kernel.org (Postfix) with ESMTPS id A47757A3060
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:35:34 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 432A222A7F3;
Fri, 30 May 2025 09:33:22 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="bTSGMaBj"
Received: from mail-pl1-f182.google.com (mail-pl1-f182.google.com [209.85.214.182])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6E10D228CBC
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:33:19 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.214.182
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597601; cv=none; b=S+WiR7oVLH6WwVAaT5fHc+1Ggxd9QlIyynGGk6RwdTbwT6E88R9t3uH1IIsMqgeRL+IOkhzGrfc165SEkG4Y9kBZLzvwFovp0Te8RJNvdGCnmtlmzsqNIftDy+dDiE+Qusfz3FQtJstrhVgqNfVRifPIO54fDZYUq4i3PA+ul9Y=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597601; c=relaxed/simple;
bh=/coDkbmv4cf7oezrwY1QvpvI3EAobs0jwEpk59sloz8=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=sZtDHVlE9kTPy82BkNnK47NbWQ6mnNaoAjT62fN/Tb1obWgmA5H4cWxsv27xw6BtEqRAzwt6kzTzXKWbFIVmx5nF/VLOnfxKPOpM9dopaB3nVdmh+qnVfv80pTCuiNaPB8TmhXmMUuVS5g3lbtNyg9TUlEs8QAtS5h4agtUL1J0=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=bTSGMaBj; arc=none smtp.client-ip=209.85.214.182
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pl1-f182.google.com with SMTP id d9443c01a7336-234ade5a819so17420825ad.1
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:33:19 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597598; x=1749202398; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=nfNjslNRihNfiwS5akxUBJSCwiXFfcNf3iOCsj/nqzk=;
b=bTSGMaBjRBh1IgEiJ3JOdq5VzJVp+wZ5mTu5WKcDcBiSWf2sLqZDc14pzQ/2sC1ct8
v1HhpaVodkOP6ZirSUp9P3NxEbVso014oEzzpZWt1ejAeICD/0dugdhnDf/rpEN4LxoC
SoZjHkqnf0EwMv8Axpx4GbhrsS4+uUgFHbaVHYFSzzQZn8AYL2CaKVi7V0DTY3TUvcpK
OJeRvORwn/JeJPBpIN3ma+pgg58xWTnKDEMh27BumuRl546SqXf1HFG1p8n8Rk+qItRM
rhWY+Fnbd/hHJu+nPNDQOA8rP0yOUHWwOpiNVPpgdor09puCOzQxiDCNa+QSVMEJmmrt
AqFQ==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597598; x=1749202398;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=nfNjslNRihNfiwS5akxUBJSCwiXFfcNf3iOCsj/nqzk=;
b=a2ENBtz42Xat64pj4jib3Xvflh93IUbiz42QSuyx7aZ2zFNywTI3XUaHyECj00k2mx
QC6y5QcEfVpS5GgSWVlTRcQm3BnSv6nXCOel0zM6z8CKBr0HW+aMtMtAM6S4AX7E5gZv
+6NMwMa8xtNhHG+MLkM5a0tNI9RPc5JEqbi03gj7I5Q5jYunL/Ual/pmqaUpcsFyyMOk
3IItAUDtCA7ldLf7bmPotoP7p5s8TvmWjQLItYAi8JvM49j/tJM07npAg7jiDzpSRqBe
ceA4HvrEIgy4egfTaCIljYNMQgSjrBHm2lAfAVaBpgg0NUsRyghPqU3eW752s1g/8jbO
uyvA==
X-Forwarded-Encrypted: i=1; AJvYcCVJ6mgHWdMmUWBHveOcRP19AQRcMsAQOBvLcZIQcbksjhWF61qNb5ArmElRwhZytfpEWm3v+uof2kACXuM=@vger.kernel.org
X-Gm-Message-State: AOJu0YxJ9NUtckakNB3sOusjkUgJQ1qBuMBCgAf2oh+SSl7cPVo4lAAD
Tw8XDbJw09qtDtVnyT5aQgBH2ZQBs+Nd1dPtsgAFy/GjJwI+Oc2f2XHWKL2mlhPTZxo=
X-Gm-Gg: ASbGncuVgGWwycup5/McUloq/a5qXpt/AtzSoJFtBfowjJgfMKPpdR7UnhHBKE9aYpe
Wlq3TPbn3yqOe/93oBNYLWXaOs0XLiHz6m4xQwyzuvD4wtACBVxqZsTaITfyx95l0R+gPT4vYnq
N8rf+0LmgY1s0LYVOlEFRC5IIIcZleJ695487L6WKeUvA5dkHN+BswyCqOAus9fE/YCMJSblaMy
5IurHqHrQkDqjst2NQZT7m5FYcON8R1R/ZoJ1OL+pE5FNupmyEtZ2JzuZ3MZBLahQ8QYN1N23e3
ZZ+9X2xQLitKVzoHOndSlG0Dj8hK07zn1PrGWqEQ5EkiCBU6gCW7vhaj1YrONCGXgfIceOzV7MN
PiVB0GUve+g==
X-Google-Smtp-Source: AGHT+IEyXGYQ2GCz5bdSvzR+T/Z8enoWNYF8Thjcb9xcpCHkk9mGcXueyM6Xj7aL+U99FPOWwALWDw==
X-Received: by 2002:a17:90b:3e45:b0:311:a623:676c with SMTP id 98e67ed59e1d1-31241e8d325mr4349242a91.27.1748597598509;
Fri, 30 May 2025 02:33:18 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.33.03
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:33:18 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 19/35] RPAL: add lazy switch main logic
Date: Fri, 30 May 2025 17:27:47 +0800
Message-Id: <91e9db5ad4a3e1e58a666bd496e55d8f8db2c63c.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

The implementation of lazy switch differs from a regular schedule() in
three key aspects:

1. It occurs at the kernel entry with irq disabled.
2. The next task is explicitly pre-determined rather than selected by
the scheduler.
3. User-space context (excluding general-purpose registers) remains
unchanged across the switch.

This patch introduces the rpal_schedule() interface to address these
requirements. Firstly, the rpal_schedule() skips irq enabling in
finish_lock_switch(), preserving the irq-disabled state required
during kernel entry. Secondly, the rpal_pick_next_task() interface is
used to explicitly specify the target task, bypassing the default
scheduler's decision-making process. Thirdly, non-general-purpose
registers (e.g., FPU, vector units) are not restored during the switch,
ensuring user space context remains intact. Handling of general-purpose
registers will be addressed in a subsequent patch by RPAL before invoking
rpal_schedule().

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/kernel/process_64.c | 75 +++++++++++++++++++++
include/linux/rpal.h | 3 +
kernel/sched/core.c | 126 +++++++++++++++++++++++++++++++++++
3 files changed, 204 insertions(+)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4830e9215de7..efc3f238c486 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -753,6 +753,81 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
return prev_p;
}

+#ifdef CONFIG_RPAL
+/*
+ * __rpal_switch_to - arch context switch for an RPAL lazy switch.
+ *
+ * Variant of __switch_to() that deliberately skips work the lazy switch does
+ * not need: FPU register save/restore (ownership is just handed over and
+ * TIF_NEED_FPU_LOAD flipped), FS segment/base handling, PKRU, and the FPU
+ * state load for @next_p. User-space context (other than GPRs, handled by
+ * the RPAL caller) is intended to stay live across the switch.
+ *
+ * Returns @prev_p, mirroring the __switch_to() convention.
+ */
+__no_kmsan_checks
+__visible __notrace_funcgraph struct task_struct *
+__rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev = &prev_p->thread;
+ struct thread_struct *next = &next_p->thread;
+ int cpu = smp_processor_id();
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+ this_cpu_read(hardirq_stack_inuse));
+
+ /* no need to switch fpu */
+ /* __fpu_invalidate_fpregs_state() */
+ x86_task_fpu(prev_p)->last_cpu = -1;
+ /* fpregs_activate() */
+ __this_cpu_write(fpu_fpregs_owner_ctx, x86_task_fpu(next_p));
+ trace_x86_fpu_regs_activated(x86_task_fpu(next_p));
+ x86_task_fpu(next_p)->last_cpu = cpu;
+ set_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD);
+ clear_tsk_thread_flag(next_p, TIF_NEED_FPU_LOAD);
+
+ /* no need to save fs */
+ savesegment(gs, prev_p->thread.gsindex)
+ if (static_cpu_has(X86_FEATURE_FSGSBASE))
+ prev_p->thread.gsbase = __rdgsbase_inactive();
+ else
+ save_base_legacy(prev_p, prev_p->thread.gsindex, GS);
+
+ load_TLS(next, cpu);
+
+ arch_end_context_switch(next_p);
+
+ savesegment(es, prev->es);
+ if (unlikely(next->es | prev->es))
+ loadsegment(es, next->es);
+
+ savesegment(ds, prev->ds);
+ if (unlikely(next->ds | prev->ds))
+ loadsegment(ds, next->ds);
+
+ /* no need to load fs */
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ if (unlikely(prev->gsindex || next->gsindex))
+ loadseg(GS, next->gsindex);
+
+ __wrgsbase_inactive(next->gsbase);
+ } else {
+ load_seg_legacy(prev->gsindex, prev->gsbase, next->gsindex,
+ next->gsbase, GS);
+ }
+
+ /* skip pkru load as we will use pkru in RPAL */
+
+ this_cpu_write(current_task, next_p);
+ this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
+
+ /* no need to load fpu */
+
+ update_task_stack(next_p);
+ switch_to_extra(prev_p, next_p);
+
+ /* Keep SS sane for SYSRET on CPUs with the SS attribute erratum. */
+ if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
+ unsigned short ss_sel;
+
+ savesegment(ss, ss_sel);
+ if (ss_sel != __KERNEL_DS)
+ loadsegment(ss, __KERNEL_DS);
+ }
+ resctrl_sched_in(next_p);
+
+ return prev_p;
+}
+#endif
+
void set_personality_64bit(void)
{
/* inherit personality from parent */
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 45137770fac6..0813db4552c0 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -487,4 +487,7 @@ int rpal_try_to_wake_up(struct task_struct *p);
int rpal_init_thread_pending(struct rpal_common_data *rcd);
void rpal_free_thread_pending(struct rpal_common_data *rcd);
int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock);
+void rpal_schedule(struct task_struct *next);
+asmlinkage struct task_struct *
+__rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e76376c5172..760d88458b39 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6827,6 +6827,12 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
if (unlikely(is_special_task_state(task_state)))
flags |= DEQUEUE_SPECIAL;

+#ifdef CONFIG_RPAL
+ /* DELAY_DEQUEUE will cause CPU stalls after lazy switch, skip it */
+ if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT))
+ flags |= DEQUEUE_SPECIAL;
+#endif
+
/*
* __schedule() ttwu()
* prev_state = prev->state; if (p->on_rq && ...)
@@ -11005,6 +11011,62 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
#endif /* CONFIG_SCHED_CLASS_EXT */

#ifdef CONFIG_RPAL
+/*
+ * rpal_finish_task_switch - post-switch cleanup for a lazy switch.
+ *
+ * Mirror of finish_task_switch() with one key difference: the rq lock is
+ * released without re-enabling interrupts, preserving the irq-disabled
+ * state the lazy switch requires at kernel entry.
+ *
+ * NOTE(review): unlike finish_task_switch(), there is no TASK_DEAD /
+ * prev->__state handling here — presumably a lazy-switch @prev can never
+ * be exiting; confirm.
+ */
+static struct rq *rpal_finish_task_switch(struct task_struct *prev)
+ __releases(rq->lock)
+{
+ struct rq *rq = this_rq();
+ struct mm_struct *mm = rq->prev_mm;
+
+ /* Same sanity check as finish_task_switch(): the count must reflect
+ * exactly schedule's preempt_disable() plus the rq lock. */
+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+ "corrupted preempt_count: %s/%d/0x%x\n",
+ current->comm, current->pid, preempt_count()))
+ preempt_count_set(FORK_PREEMPT_COUNT);
+
+ rq->prev_mm = NULL;
+ vtime_task_switch(prev);
+ perf_event_task_sched_in(prev, current);
+ finish_task(prev);
+ tick_nohz_task_switch();
+
+ /* finish_lock_switch, not enable irq */
+ spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
+ __balance_callbacks(rq);
+ raw_spin_rq_unlock(rq);
+
+ finish_arch_post_lock_switch();
+ kcov_finish_switch(current);
+ kmap_local_sched_in();
+
+ fire_sched_in_preempt_notifiers(current);
+ /* Drop the mm reference stashed in rq->prev_mm, if any. */
+ if (mm) {
+ membarrier_mm_sync_core_before_usermode(mm);
+ mmdrop(mm);
+ }
+
+ return rq;
+}
+
+/*
+ * rpal_context_switch - context switch core for a lazy switch.
+ *
+ * Counterpart of context_switch(), called with irqs off. Both tasks are
+ * assumed to be user tasks: next->mm is dereferenced unconditionally (no
+ * kernel-thread / active_mm borrowing path exists here) — TODO confirm
+ * callers guarantee this.
+ *
+ * NOTE(review): unlike context_switch(), there is no switch_to() asm stack
+ * swap in this path; presumably the kernel stack switch is handled by the
+ * RPAL lazy-switch entry code around __rpal_switch_to() — confirm.
+ */
+static __always_inline struct rq *rpal_context_switch(struct rq *rq,
+ struct task_struct *prev,
+ struct task_struct *next,
+ struct rq_flags *rf)
+{
+ /* irq is off */
+ prepare_task_switch(rq, prev, next);
+ arch_start_context_switch(prev);
+
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_use_mm(next->mm);
+
+ switch_mm_cid(rq, prev, next);
+
+ prepare_lock_switch(rq, next, rf);
+ __rpal_switch_to(prev, next);
+ barrier();
+ return rpal_finish_task_switch(prev);
+}
+
#ifdef CONFIG_SCHED_CORE
static inline struct task_struct *
__rpal_pick_next_task(struct rq *rq, struct task_struct *prev,
@@ -11214,4 +11276,68 @@ rpal_pick_next_task(struct rq *rq, struct task_struct *prev,
BUG();
}
#endif
+
+/* enter and exit with irqs disabled() */
+/*
+ * rpal_schedule - schedule() variant for an RPAL lazy switch.
+ *
+ * Differences from __schedule(SM_NONE):
+ * - @next is pre-determined by the caller; rpal_pick_next_task() must
+ * return exactly @next or the kernel panics.
+ * - irqs stay disabled across the whole call (rpal_context_switch() /
+ * rpal_finish_task_switch() do not re-enable them).
+ *
+ * NOTE(review): there is no sched_submit_work()/sched_update_worker()
+ * bracketing as in the schedule() wrapper — presumably intentional since
+ * this runs at kernel entry, but confirm blocking @prev cannot hold
+ * plugged block I/O.
+ */
+void __sched notrace rpal_schedule(struct task_struct *next)
+{
+ struct task_struct *prev, *picked;
+ bool preempt = false;
+ unsigned long *switch_count;
+ unsigned long prev_state;
+ struct rq_flags rf;
+ struct rq *rq;
+ int cpu;
+
+ /* sched_mode = SM_NONE */
+
+ preempt_disable();
+
+ trace_sched_entry_tp(preempt, CALLER_ADDR0);
+
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ prev = rq->curr;
+
+ schedule_debug(prev, preempt);
+
+ if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
+ hrtick_clear(rq);
+
+ rcu_note_context_switch(preempt);
+ rq_lock(rq, &rf);
+ smp_mb__after_spinlock();
+
+ /* Promote the pending clock update and mark the clock as updated. */
+ rq->clock_update_flags <<= 1;
+ update_rq_clock(rq);
+ rq->clock_update_flags = RQCF_UPDATED;
+
+ /* Default to counting an involuntary switch; a blocking prev below
+ * switches this to the voluntary counter. */
+ switch_count = &prev->nivcsw;
+
+ prev_state = READ_ONCE(prev->__state);
+ if (prev_state) {
+ try_to_block_task(rq, prev, &prev_state);
+ switch_count = &prev->nvcsw;
+ }
+
+ picked = rpal_pick_next_task(rq, prev, next, &rf);
+ rq_set_donor(rq, next);
+ /* The lazy switch target is fixed; any disagreement is fatal. */
+ if (unlikely(next != picked))
+ panic("rpal error: next != picked\n");
+
+ clear_tsk_need_resched(prev);
+ clear_preempt_need_resched();
+ rq->last_seen_need_resched_ns = 0;
+
+ rq->nr_switches++;
+ RCU_INIT_POINTER(rq->curr, next);
+ ++*switch_count;
+ migrate_disable_switch(rq, prev);
+ psi_account_irqtime(rq, prev, next);
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
+ prev->se.sched_delayed);
+ trace_sched_switch(preempt, prev, next, prev_state);
+ /* Always switches: a same-task fast path does not exist here. */
+ rq = rpal_context_switch(rq, prev, next, &rf);
+ trace_sched_exit_tp(true, CALLER_ADDR0);
+ preempt_enable_no_resched();
+}
#endif
--
2.20.1


Return-Path: <linux-kernel+bounces-667886-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from ny.mirrors.kernel.org (ny.mirrors.kernel.org [147.75.199.223])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 87F7441E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:37:05 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by ny.mirrors.kernel.org (Postfix) with ESMTPS id BC8584E4715
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:37:06 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 939CC22AE5E;
Fri, 30 May 2025 09:33:37 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="hvzumzsQ"
Received: from mail-pg1-f171.google.com (mail-pg1-f171.google.com [209.85.215.171])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id D09F422A81F
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:33:34 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.215.171
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597616; cv=none; b=sdB+QOuMnsH7TSqymGs3I6E7A9s9JATgtQj5MPDMUbgG6nJ/meIl7dp6agGxcFrf/aDD0WFlf3zgORdPYIk0ebSjZ3n0DHvvOSpei8BgQXQ3Bhd9cr130DHY5Nc1T8f+v/rDz75i+Z5jNbbbkhuqt0IPnlwr/r2XKNFo/HbKfFE=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597616; c=relaxed/simple;
bh=46IfJD6O+6SqRHValc80DBFn7y6T8YogODhoWzSz5NE=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=DhzL+8mKKCmKN5XtflpdhCiDoI6fyTGxnS2r1cznn0VupA+C772gpKoohLQm6YBAsydkLd3aklwhkLviNuTHg1qRqg6GMNTpfbeffIc02VgFYLVPaCbdw8OnWkM74gc19qOHF+//9Yw8mWDtGy9Sbhf0II8ynsHR8lwgOtbnyM8=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=hvzumzsQ; arc=none smtp.client-ip=209.85.215.171
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pg1-f171.google.com with SMTP id 41be03b00d2f7-b2c3c689d20so1404760a12.3
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:33:34 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597614; x=1749202414; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=IlVLKhSr7F4Ter0cssioRNY+mLV+eE5Kw7Y7qzhdKJU=;
b=hvzumzsQd/1QOcNR9/uhM+UmOTvrOHgj5CQds8LFcKMO4yHbvfVDmRmkOK6fP31723
a2Dw4rKLbTbq4oEyVstg0aHaELNgWLSLwt/7MEYhDfeB0QEX+BXX1stuyUxLyqQGyo11
9aQUDCFWq+umcopDz9cMCi2RvOn24YCWUXNoMNZjBGJkZrXZPL5K+Dj9JRvaI1x+t6Vs
xDRt/xrw6eyF8CKrIHUAIZibeDJ1LTOWDp+mO2BGmW1JbiFtF9QYH/CW15U1wVOmukrA
UGCsxW1tIxdx89/BGZ2+wPJom/pKhj/+A792P8wSh0ggoSYui8F4AfSIBX2lgiNEx966
iTYw==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597614; x=1749202414;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=IlVLKhSr7F4Ter0cssioRNY+mLV+eE5Kw7Y7qzhdKJU=;
b=JAoCmkPast5VfRjLjDvReJneWxxxbvRT25y7EXOUagg0VJJDW3iL8+Q8e3I6VwCxvi
jq1NdzEiu0Q5f5MNs4xywDNmxeJhXMZGlaZO79rRBbnl8IUAIo17GZiDC6NmjwRdU32N
lxWvTWlNu3ZjfflB2AhfB0jbYUoF9rZOkWOBNVRoyghaUKR8JYn9I7EcbLHlyQsvgsr2
KmXUj9yeE8sOoFh25wDdAUAnGZnVK5bBQK55McAOGhhzudhCuLSYviGtfl4zOrMwpy5j
cPe3piA8RuixE6P9chj+SLctYJfYAD/2aYjJc1FNMNVFBYSQy862mLg0/fBKf+TvGoxH
DxMA==
X-Forwarded-Encrypted: i=1; AJvYcCWm422wvLekWVyVgmz4aUsh5DenvR0hCJj4jyxRM/szbI7HBBLzNTfkLzNSoyeQiZxSjModzpDRXkwd7qY=@vger.kernel.org
X-Gm-Message-State: AOJu0YxO8RZ2MU3og25dI+xp96xsl4J414pWI5peiC4/K03QD3lca9Ha
eVymSoAmVN5ad8Ack5sva301+cLIS4jWq5814a/wGG8DcZBTlljj0elWaoiNhrQzfMg=
X-Gm-Gg: ASbGncv+ypHjFCB/NCOW8gY22LMa1v0F9rW6ATrER9MxsoxUbSzeFXet1PK2bmApbQG
PI9B96oOUcdfwNO7I+R5VQuUAwjYLC9DRc5jDfO0abe1SdsOQG9AHzzaXQKiktcyjPuVn58Fk18
O4T3NtuD7ovRLppZOgNnxFDGeoozbSM+H8WCiI4Op1U7rBrHExxfYWAi11YGzZ2zIv8EtMnVNQY
LlNAFZjhJ3X2ZCS7OSmE5ki0rb+i/K/Rmp72YB+9WWtZB9fSlKEGN1uSvf+sEZPs/qR+wuVB1OA
HTCjtZO7pNE/dv6o4KrCscdDniAx+1fleX9Kk13foCfqunYtW0EZuMZfbGJ07G4eEGRUTRij0xD
ZwaxZ06qK+Q==
X-Google-Smtp-Source: AGHT+IF9UpDBed3ioNwTFOPq4QXc23on+j+5SNtglVD6Wkl9cnonPhoeqxqRBdAvjyhyCYWucdvIFA==
X-Received: by 2002:a17:90a:e708:b0:311:a314:c2d1 with SMTP id 98e67ed59e1d1-3124150e443mr4004525a91.6.1748597613983;
Fri, 30 May 2025 02:33:33 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.33.19
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:33:33 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 20/35] RPAL: add rpal_ret_from_lazy_switch
Date: Fri, 30 May 2025 17:27:48 +0800
Message-Id: <4cd58d0e989640f0c230196e81cec5cee0ceb476.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

After a lazy switch, the task that ran before the switch loses its user
mode context (which is passed to the task running after the switch).
Therefore, RPAL needs to handle the issue of the previous task losing its
user mode context.

After the lazy switch occurs, the sender can resume execution in two ways.
One way is to be scheduled by the scheduler. In this case, RPAL handles
this issue in a manner similar to ret_from_fork. The sender will enter
rpal_ret_from_lazy_switch through the stack frame constructed by the lazy
switch, execute the return logic, and finally return to the pre-defined
user mode (referred to as "kernel return"). The other way is to be switched
back to by the receiver through another lazy switch. In this case, the
receiver will pass the user mode context to the sender, so there is no need
to construct a user mode context for the sender. And the receiver can
return to the user mode through the kernel return method.

rpal_ret_from_lazy_switch primarily handles scheduler cleanup work, similar
to schedule_tail(), but does not perform set_child_tid — otherwise, it might
cause set_child_tid to be executed repeatedly. It then calls
rpal_kernel_ret(), which is primarily used to set the states of the sender
and receiver and attempt to unlock the CPU. Finally, it performs syscall
cleanup work and returns to user mode.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/entry/entry_64.S | 23 ++++++++++++++++++++
arch/x86/rpal/core.c | 45 +++++++++++++++++++++++++++++++++++++--
include/linux/rpal.h | 5 ++++-
kernel/sched/core.c | 25 +++++++++++++++++++++-
4 files changed, 94 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ed04a968cc7d..13b4d0684575 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -169,6 +169,29 @@ SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
int3
SYM_CODE_END(entry_SYSCALL_64)

+#ifdef CONFIG_RPAL
+SYM_CODE_START(rpal_ret_from_lazy_switch)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
+ movq %rax, %rdi
+ call rpal_schedule_tail
+
+ movq %rsp, %rdi
+ call rpal_kernel_ret
+
+ movq %rsp, %rdi
+ call syscall_exit_to_user_mode /* returns with IRQs disabled */
+
+ UNWIND_HINT_REGS
+#ifdef CONFIG_X86_FRED
+ ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \
+ "jmp asm_fred_exit_user", X86_FEATURE_FRED
+#else
+ jmp swapgs_restore_regs_and_return_to_usermode
+#endif
+SYM_CODE_END(rpal_ret_from_lazy_switch)
+#endif
+
/*
* %rdi: prev task
* %rsi: next task
diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 19c4ef38bca3..ed4c11e6838c 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -18,7 +18,7 @@ unsigned long rpal_cap;

static inline void rpal_lock_cpu(struct task_struct *tsk)
{
- rpal_set_cpus_allowed_ptr(tsk, true);
+ rpal_set_cpus_allowed_ptr(tsk, true, false);
if (unlikely(!irqs_disabled())) {
local_irq_disable();
rpal_err("%s: irq is enabled\n", __func__);
@@ -27,13 +27,54 @@ static inline void rpal_lock_cpu(struct task_struct *tsk)

static inline void rpal_unlock_cpu(struct task_struct *tsk)
{
- rpal_set_cpus_allowed_ptr(tsk, false);
+ rpal_set_cpus_allowed_ptr(tsk, false, false);
if (unlikely(!irqs_disabled())) {
local_irq_disable();
rpal_err("%s: irq is enabled\n", __func__);
}
}

+static inline void rpal_unlock_cpu_kernel_ret(struct task_struct *tsk)
+{
+ rpal_set_cpus_allowed_ptr(tsk, false, true);
+}
+
+void rpal_kernel_ret(struct pt_regs *regs)
+{
+ struct task_struct *tsk;
+ struct rpal_receiver_call_context *rcc;
+ int state;
+
+ if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
+ rcc = current->rpal_rd->rcc;
+ atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_KERNEL_RET);
+ } else {
+ tsk = current->rpal_sd->receiver;
+ rcc = tsk->rpal_rd->rcc;
+ rpal_clear_task_thread_flag(tsk, RPAL_LAZY_SWITCHED_BIT);
+ state = atomic_xchg(&rcc->sender_state, RPAL_SENDER_STATE_KERNEL_RET);
+ WARN_ON_ONCE(state != RPAL_SENDER_STATE_CALL);
+ /* make sure kernel return is finished */
+ smp_mb();
+ WRITE_ONCE(tsk->rpal_rd->sender, NULL);
+ /*
+ * We must unlock receiver first, otherwise we may unlock
+ * receiver which is already locked by another sender.
+ *
+ * Sender A Receiver B Sender C
+ * lazy switch (A->B)
+ * kernel return
+ * unlock cpu A
+ * epoll_wait
+ * lazy switch(C->B)
+ * lock cpu B
+ * unlock cpu B
+ * BUG() BUG()
+ */
+ rpal_unlock_cpu_kernel_ret(tsk);
+ rpal_unlock_cpu_kernel_ret(current);
+ }
+}

static inline struct task_struct *rpal_get_sender_task(void)
{
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 0813db4552c0..01b582fa821e 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -480,14 +480,17 @@ int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,
unsigned long addr, int error_code);
struct mm_struct *rpal_pf_get_real_mm(unsigned long address, int *rebuild);
struct task_struct *rpal_find_next_task(unsigned long fsbase);
+void rpal_kernel_ret(struct pt_regs *regs);

extern void rpal_pick_mmap_base(struct mm_struct *mm,
struct rlimit *rlim_stack);
int rpal_try_to_wake_up(struct task_struct *p);
int rpal_init_thread_pending(struct rpal_common_data *rcd);
void rpal_free_thread_pending(struct rpal_common_data *rcd);
-int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock);
+int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock,
+ bool is_kernel_ret);
void rpal_schedule(struct task_struct *next);
asmlinkage struct task_struct *
__rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
+asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 760d88458b39..0f9343698198 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3181,7 +3181,8 @@ void rpal_free_thread_pending(struct rpal_common_data *rcd)
/*
* CPU lock is forced and all cpumask will be ignored by RPAL temporary.
*/
-int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock)
+int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock,
+ bool is_kernel_ret)
{
const struct cpumask *cpu_valid_mask = cpu_active_mask;
struct set_affinity_pending *pending = p->rpal_cd->pending;
@@ -3210,6 +3211,9 @@ int rpal_set_cpus_allowed_ptr(struct task_struct *p, bool is_lock)
rpal_clear_task_thread_flag(p, RPAL_CPU_LOCKED_BIT);
}

+ if (is_kernel_ret)
+ return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
+
update_rq_clock(rq);

if (cpumask_equal(&p->cpus_mask, ac.new_mask))
@@ -11011,6 +11015,25 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
#endif /* CONFIG_SCHED_CLASS_EXT */

#ifdef CONFIG_RPAL
+asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev)
+ __releases(rq->lock)
+{
+ /*
+ * New tasks start with FORK_PREEMPT_COUNT, see there and
+ * finish_task_switch() for details.
+ *
+ * finish_task_switch() will drop rq->lock() and lower preempt_count
+ * and the preempt_enable() will end up enabling preemption (on
+ * PREEMPT_COUNT kernels).
+ */
+
+ finish_task_switch(prev);
+ trace_sched_exit_tp(true, CALLER_ADDR0);
+ preempt_enable();
+
+ calculate_sigpending();
+}
+
static struct rq *rpal_finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
--
2.20.1


Return-Path: <linux-kernel+bounces-667887-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id DDEC741E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:37:53 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id 9C3D4188FA4C
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:37:39 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id B793122A80C;
Fri, 30 May 2025 09:33:53 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="U7jWFD62"
Received: from mail-pj1-f53.google.com (mail-pj1-f53.google.com [209.85.216.53])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 498A222129F
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:33:50 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.216.53
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597632; cv=none; b=P7Unlc3WJ/9hzb8yxNyFahP4iCDhOt649Knhu240jcfoFdY+dJM312Fm9wjmh/suZOvQf25faRIKd2sy8HwxXlSApIfBPWQ/LMwRNFP4DKsXzSkUdTfNEp3TgLL2ewJq4GL9CHzYWMO2ExHj5pXYJzvwL+Hq/vk5qFktSPWH9IA=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597632; c=relaxed/simple;
bh=s8bvP/o1nUrwW7KF0DJ/pHvc3b43enUVUs73szJcvsA=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=GiVr5QKQDimoOPKqjeKaO1sRME6qUTVVgl5MhGrHYGtspNu53feOQqNtcpcZO5W6PnJsXJjQeVKW/jYF+LSKVIGH2PCKYqLKNv6UE/fmLXxf1ldygwu2VpF7SuAI5TtFeYgTwS9KunUyMxlyjEA6Eu1J7zi4b2QpxQ/gIkT49GI=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=U7jWFD62; arc=none smtp.client-ip=209.85.216.53
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pj1-f53.google.com with SMTP id 98e67ed59e1d1-3119822df05so1886086a91.0
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:33:50 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597629; x=1749202429; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=+2NFrx3BFqUZITaGnYPssFKiBOpe/dz6IJRWesRURms=;
b=U7jWFD62nxS8TfhhmrTlMIbWjZGLXbkA3Q1GVDdMyhiibiNlV0l5YWrgI+JavnBPpb
Gxq6GuT3tq2RN7mnaF9YyNih9RiSM57U7umDgRMf9gzCA7z4SZVNodtbn4WIDsRDWvcb
BDTiIIyLlwNpb72o/C/BqOfI6mMv++FKB0YtlMAqTK7QUP77Ct2MzDCe56gVPo2RnuIa
ybkovV0jXidA3NGQGQtlHGZDDaLJjNDCp8qJlsHjNJN6dALtCaW3w29v1/hm4x3Bs6B1
T6boHJn4kXyByPOr22bDDLKGgZ4JH8Z/S43M/bOkjz2DaPNk11bLJojgqn7XecTnHmd1
803g==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597629; x=1749202429;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=+2NFrx3BFqUZITaGnYPssFKiBOpe/dz6IJRWesRURms=;
b=kFvZ04zKeA6TfGUfZvNnzOqeTTtFG58fKASX1c4FDIgiohu/V8qzh1jFcpR+dulmsK
M0s80AF6jVsszaOc+ab6pJgbBiYjhZMvGTpW+P8aA5yqmwRJgX66r4UU9JAkxsVnF1D7
EMOuty2QIbOl3WHck0xZX5aSuv+x5Hwm/U3S5lrZXAtsAdQhz4vgZTKLyl5Y5lNDCI7Z
4FuG/yxXfif03JZZDiKHGCcnPW94yPIiJNx9FQ/UfkianssjATHIdo7r/QUsjOLitkPW
XsO4ijqf3ICXOTMnx7+A63b5hjuMIuVYb6CmaIa+bafFW43Un972vKT2vtiwKlwZLkGC
3IpQ==
X-Forwarded-Encrypted: i=1; AJvYcCXZ+u7P1kTRTdr9nXeGiOn8ofieA3iMthsObYWXbUu9ptemyjHwryfFCx3Di1jZlCfPww+Qz3iqabNpo24=@vger.kernel.org
X-Gm-Message-State: AOJu0YzteLB5PRcrk5b5FA0fLfW7sC9HQzrEtq5gzIum5NchWBROc6p5
l5FFHqvpYPjPmZR+VyxIOV0LjckXVOj6ZCoIk5G9NW8FrOrghgPHCLROUVgK8j525S8=
X-Gm-Gg: ASbGncu4mgnR2USzKoaHIKVjXbeN57lc0w6RYjdyAKToC2QsUJlk8c4jepaXY/tLjTS
tATK0wpTgNhTdq/YlRmPVPBpm07mMxpraeIax/ycimY/8kjJCEK8s6GE4kmxydUgBdNr6w6hVWV
7qg4Av17dJeKb0hj6DpvrrwZ5YgHlaqI3mIrQtWLVmVApMkowwlUa/VHsx7PXzsxJfpWA9B/q4l
d0r9BT7D5mJezq6+QcGKGxLyEli0Lkk/FT0CT6nzA/6gR5wvdiaQvjj1ODmoE7r0iN/puw9/cc7
cvE1iNLsWMv3ZUYni7zdkIuXzseYBO9ZbntFx+++WKlkcFXBiFUjgAgqNyQxCldJ4xolf3PCkk+
n1aYz2LgYNQ==
X-Google-Smtp-Source: AGHT+IHkejFlF7JacpAFc31d67Aa0abDbDTBPS+AvGZCJTX8xpAHAqZtwkMExSrIMlmiA7Dd0bS1gw==
X-Received: by 2002:a17:90b:3ec3:b0:310:cea4:e3b9 with SMTP id 98e67ed59e1d1-31250452c5fmr1772636a91.34.1748597629413;
Fri, 30 May 2025 02:33:49 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.33.34
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:33:49 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 21/35] RPAL: add kernel entry handling for lazy switch
Date: Fri, 30 May 2025 17:27:49 +0800
Message-Id: <924aa7959502c4c3271cb311632eb505e894e26e.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

At the kernel entry point, RPAL performs a lazy switch. Therefore, it is
necessary to hook all kernel entry points to execute the logic related to
the lazy switch. At the kernel entry, apart from some necessary operations
related to the lazy switch (such as ensuring that the general-purpose
registers remain unchanged before and after the lazy switch), the task
before the lazy switch will lose its user mode context (which is passed to
the task after the lazy switch). Therefore, the kernel entry also needs to
handle the issue of the previous task losing its user mode context.

This patch hooks all locations where the transition from user mode to
kernel mode occurs, including entry_SYSCALL_64, error_entry, and
asm_exc_nmi. When the kernel detects a mismatch between the kernel-mode and
user mode contexts, it executes the logic related to the lazy switch.
Taking the switch from the sender to the receiver as an example, the
receiver thread is first locked to the CPU where the sender is located.
Then, the receiver thread in the CALL state is woken up through
rpal_try_to_wake_up(). The general purpose register state (pt_regs) of the
sender is copied to the receiver, and rpal_schedule() is executed to
complete the lazy switch. Regarding the issue of the sender losing its
context, the kernel loads the pre-saved user mode context of the sender
into the sender's pt_regs and constructs the kernel stack frame of the
sender in a manner similar to the fork operation.

The handling of the switch from the receiver to the sender is similar,
except that the receiver will be unlocked from the current CPU, and the
receiver can only return to the user mode through the kernel return method.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/entry/entry_64.S | 137 ++++++++++++++++++++++++++++++++++
arch/x86/kernel/asm-offsets.c | 3 +
arch/x86/rpal/core.c | 137 ++++++++++++++++++++++++++++++++++
include/linux/rpal.h | 6 ++
4 files changed, 283 insertions(+)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 13b4d0684575..59c38627510d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -118,6 +118,20 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
UNTRAIN_RET
CLEAR_BRANCH_HISTORY

+#ifdef CONFIG_RPAL
+ /*
+ * We first check if it is a RPAL sender/receiver with
+ * current->rpal_cd. For non-RPAL task, we just skip it.
+ * For rpal task, We may need to check if it needs to do
+ * lazy switch.
+ */
+ movq PER_CPU_VAR(current_task), %r13
+ movq TASK_rpal_cd(%r13), %rax
+ testq %rax, %rax
+ jz _do_syscall
+ jmp do_rpal_syscall
+_do_syscall:
+#endif
call do_syscall_64 /* returns with IRQs disabled */

/*
@@ -190,6 +204,101 @@ SYM_CODE_START(rpal_ret_from_lazy_switch)
jmp swapgs_restore_regs_and_return_to_usermode
#endif
SYM_CODE_END(rpal_ret_from_lazy_switch)
+
+/* return address offset of stack frame */
+#define RPAL_FRAME_RET_ADDR_OFFSET -56
+
+SYM_CODE_START(do_rpal_syscall)
+ movq %rsp, %r14
+ call rpal_syscall_64_context_switch
+ testq %rax, %rax
+ jz 1f
+
+ /*
+ * When we come here, everything but stack switching is finished.
+ * This makes current task use another task's kernel stack. Thus,
+ * we need to do stack switching here.
+ *
+ * At the meanwhile, the previous task's stack content is corrupted,
+ * we also need to rebuild its stack frames, so that it will jump to
+ * rpal_ret_from_lazy_switch when it is scheduled in. This is inspired
+ * by ret_from_fork.
+ */
+ movq TASK_threadsp(%rax), %rsp
+#ifdef CONFIG_STACKPROTECTOR
+ movq TASK_stack_canary(%rax), %rbx
+ movq %rbx, PER_CPU_VAR(__stack_chk_guard)
+#endif
+ /* rebuild src's frame */
+ movq $rpal_ret_from_lazy_switch, -8(%r14)
+ leaq RPAL_FRAME_RET_ADDR_OFFSET(%r14), %rbx
+ movq %rbx, TASK_threadsp(%r13)
+
+ movq %r13, %rdi
+ /*
+ * Everything of task switch is done, but we still need to do
+ * a little extra things for lazy switch.
+ */
+ call rpal_lazy_switch_tail
+
+1:
+ movq ORIG_RAX(%rsp), %rsi
+ movq %rsp, %rdi
+ jmp _do_syscall
+SYM_CODE_END(do_rpal_syscall)
+
+SYM_CODE_START(do_rpal_error)
+ popq %r12
+ movq %rax, %rsp
+ movq %rax, %r14
+ movq %rax, %rdi
+ call rpal_exception_context_switch
+ testq %rax, %rax
+ jz 1f
+
+ movq TASK_threadsp(%rax), %rsp
+ ENCODE_FRAME_POINTER
+#ifdef CONFIG_STACKPROTECTOR
+ movq TASK_stack_canary(%rax), %rbx
+ movq %rbx, PER_CPU_VAR(__stack_chk_guard)
+#endif
+ /* rebuild src's frame */
+ movq $rpal_ret_from_lazy_switch, -8(%r14)
+ leaq RPAL_FRAME_RET_ADDR_OFFSET(%r14), %rbx
+ movq %rbx, TASK_threadsp(%r13)
+
+ movq %r13, %rdi
+ call rpal_lazy_switch_tail
+1:
+ movq %rsp, %rax
+ pushq %r12
+ jmp _do_error
+SYM_CODE_END(do_rpal_error)
+
+SYM_CODE_START(do_rpal_nmi)
+ movq %rsp, %r14
+ movq %rsp, %rdi
+ call rpal_nmi_context_switch
+ testq %rax, %rax
+ jz 1f
+
+ movq TASK_threadsp(%rax), %rsp
+ ENCODE_FRAME_POINTER
+#ifdef CONFIG_STACKPROTECTOR
+ movq TASK_stack_canary(%rax), %rbx
+ movq %rbx, PER_CPU_VAR(__stack_chk_guard)
+#endif
+ /* rebuild src's frame */
+ movq $rpal_ret_from_lazy_switch, -8(%r14)
+ leaq RPAL_FRAME_RET_ADDR_OFFSET(%r14), %rbx
+ movq %rbx, TASK_threadsp(%r13)
+
+ movq %r13, %rdi
+ call rpal_lazy_switch_tail
+
+1:
+ jmp _do_nmi
+SYM_CODE_END(do_rpal_nmi)
#endif

/*
@@ -1047,7 +1156,22 @@ SYM_CODE_START(error_entry)

leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
/* Put us onto the real thread stack. */
+#ifdef CONFIG_RPAL
+ call sync_regs
+ /*
+ * Check whether we need to perform lazy switch after we
+ * switch to the real thread stack.
+ */
+ movq PER_CPU_VAR(current_task), %r13
+ movq TASK_rpal_cd(%r13), %rdi
+ testq %rdi, %rdi
+ jz _do_error
+ jmp do_rpal_error
+_do_error:
+ RET
+#else
jmp sync_regs
+#endif

/*
* There are two places in the kernel that can potentially fault with
@@ -1206,6 +1330,19 @@ SYM_CODE_START(asm_exc_nmi)
IBRS_ENTER
UNTRAIN_RET

+#ifdef CONFIG_RPAL
+ /*
+ * Check whether we need to perform lazy switch only when
+ * we come from userspace.
+ */
+ movq PER_CPU_VAR(current_task), %r13
+ movq TASK_rpal_cd(%r13), %rax
+ testq %rax, %rax
+ jz _do_nmi
+ jmp do_rpal_nmi
+_do_nmi:
+#endif
+
/*
* At this point we no longer need to worry about stack damage
* due to nesting -- we're on the normal thread stack and we're
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 6259b474073b..010202c31b37 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -46,6 +46,9 @@ static void __used common(void)
#ifdef CONFIG_STACKPROTECTOR
OFFSET(TASK_stack_canary, task_struct, stack_canary);
#endif
+#ifdef CONFIG_RPAL
+ OFFSET(TASK_rpal_cd, task_struct, rpal_cd);
+#endif

BLANK();
OFFSET(pbe_address, pbe, address);
diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index ed4c11e6838c..c48df1ce4324 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -7,6 +7,7 @@
*/

#include <linux/rpal.h>
+#include <linux/sched/task_stack.h>
#include <asm/fsgsbase.h>

#include "internal.h"
@@ -39,6 +40,20 @@ static inline void rpal_unlock_cpu_kernel_ret(struct task_struct *tsk)
rpal_set_cpus_allowed_ptr(tsk, false, true);
}

+void rpal_lazy_switch_tail(struct task_struct *tsk)
+{
+ struct rpal_receiver_call_context *rcc;
+
+ if (rpal_test_task_thread_flag(current, RPAL_LAZY_SWITCHED_BIT)) {
+ rcc = current->rpal_rd->rcc;
+ atomic_cmpxchg(&rcc->receiver_state, rpal_build_call_state(tsk->rpal_sd),
+ RPAL_RECEIVER_STATE_LAZY_SWITCH);
+ } else {
+ rpal_unlock_cpu(tsk);
+ rpal_unlock_cpu(current);
+ }
+}
+
void rpal_kernel_ret(struct pt_regs *regs)
{
struct task_struct *tsk;
@@ -76,6 +91,87 @@ void rpal_kernel_ret(struct pt_regs *regs)
}
}

+static inline void rebuild_stack(struct rpal_task_context *ctx,
+ struct pt_regs *regs)
+{
+ regs->r12 = ctx->r12;
+ regs->r13 = ctx->r13;
+ regs->r14 = ctx->r14;
+ regs->r15 = ctx->r15;
+ regs->bx = ctx->rbx;
+ regs->bp = ctx->rbp;
+ regs->ip = ctx->rip;
+ regs->sp = ctx->rsp;
+}
+
+static inline void rebuild_sender_stack(struct rpal_sender_data *rsd,
+ struct pt_regs *regs)
+{
+ rebuild_stack(&rsd->scc->rtc, regs);
+}
+
+static inline void rebuild_receiver_stack(struct rpal_receiver_data *rrd,
+ struct pt_regs *regs)
+{
+ rebuild_stack(&rrd->rcc->rtc, regs);
+}
+
+static inline void update_dst_stack(struct task_struct *next,
+ struct pt_regs *src)
+{
+ struct pt_regs *dst;
+
+ dst = task_pt_regs(next);
+ *dst = *src;
+ next->thread.sp = (unsigned long)dst;
+}
+
+/*
+ * rpal_do_kernel_context_switch - the main routine of RPAL lazy switch
+ * @next: task to switch to
+ * @regs: the user pt_regs saved in kernel entry
+ *
+ * This function performs the lazy switch. When switch from sender to
+ * receiver, we need to lock both task to current CPU to avoid double
+ * control flow when we perform lazy switch and after then.
+ */
+static struct task_struct *
+rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
+{
+ struct task_struct *prev = current;
+
+ if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT)) {
+ current->rpal_sd->receiver = next;
+ rpal_lock_cpu(current);
+ rpal_lock_cpu(next);
+ rpal_try_to_wake_up(next);
+ update_dst_stack(next, regs);
+ /*
+ * When a lazy switch occurs, we need to set the sender's
+ * user-mode context to a predefined state by the sender.
+ * Otherwise, sender's user context will be corrupted.
+ */
+ rebuild_sender_stack(current->rpal_sd, regs);
+ rpal_schedule(next);
+ } else {
+ update_dst_stack(next, regs);
+ /*
+ * When a lazy switch occurs, we need to set the receiver's
+ * user-mode context to a predefined state by the receiver.
+ * Otherwise, sender's user context will be corrupted.
+ */
+ rebuild_receiver_stack(current->rpal_rd, regs);
+ rpal_schedule(next);
+ rpal_clear_task_thread_flag(prev, RPAL_LAZY_SWITCHED_BIT);
+ prev->rpal_rd->sender = NULL;
+ }
+ if (unlikely(!irqs_disabled())) {
+ local_irq_disable();
+ rpal_err("%s: irq is enabled\n", __func__);
+ }
+ return next;
+}
+
static inline struct task_struct *rpal_get_sender_task(void)
{
struct task_struct *next;
@@ -123,6 +219,18 @@ static inline struct task_struct *rpal_misidentify(void)
return next;
}

+static inline struct task_struct *
+rpal_kernel_context_switch(struct pt_regs *regs)
+{
+ struct task_struct *next = NULL;
+
+ next = rpal_misidentify();
+ if (unlikely(next != NULL))
+ next = rpal_do_kernel_context_switch(next, regs);
+
+ return next;
+}
+
struct task_struct *rpal_find_next_task(unsigned long fsbase)
{
struct rpal_service *cur = rpal_current_service();
@@ -147,6 +255,35 @@ struct task_struct *rpal_find_next_task(unsigned long fsbase)
return tsk;
}

+__visible struct task_struct *
+rpal_syscall_64_context_switch(struct pt_regs *regs, unsigned long nr)
+{
+ struct task_struct *next;
+
+ next = rpal_kernel_context_switch(regs);
+
+ return next;
+}
+
+__visible struct task_struct *
+rpal_exception_context_switch(struct pt_regs *regs)
+{
+ struct task_struct *next;
+
+ next = rpal_kernel_context_switch(regs);
+
+ return next;
+}
+
+__visible struct task_struct *rpal_nmi_context_switch(struct pt_regs *regs)
+{
+ struct task_struct *next;
+
+ next = rpal_kernel_context_switch(regs);
+
+ return next;
+}
+
static bool check_hardware_features(void)
{
if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) {
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 01b582fa821e..b24176f3f245 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -479,7 +479,13 @@ struct rpal_service *rpal_get_mapped_service_by_id(struct rpal_service *rs,
int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,
unsigned long addr, int error_code);
struct mm_struct *rpal_pf_get_real_mm(unsigned long address, int *rebuild);
+__visible struct task_struct *
+rpal_syscall_64_context_switch(struct pt_regs *regs, unsigned long nr);
+__visible struct task_struct *
+rpal_exception_context_switch(struct pt_regs *regs);
+__visible struct task_struct *rpal_nmi_context_switch(struct pt_regs *regs);
struct task_struct *rpal_find_next_task(unsigned long fsbase);
+void rpal_lazy_switch_tail(struct task_struct *tsk);
void rpal_kernel_ret(struct pt_regs *regs);

extern void rpal_pick_mmap_base(struct mm_struct *mm,
--
2.20.1


Return-Path: <linux-kernel+bounces-667888-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 199E041E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:38:10 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id 5FF361BA1345
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:37:52 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 1602522B5A5;
Fri, 30 May 2025 09:33:55 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="D+pAZ8Wu"
Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.129.124])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id E911522AE6B
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:33:51 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.129.124
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597634; cv=none; b=PNZJKXGoP6zI2ch0+/j8Fz1eF65AgFBHZ/l54qvv9qCfyJTMvZ2IdqhHniaGHkR2qk9l2c5ejfEoB0pvYNdnir0WUgxQSBK0ZfpDWuZZdMWMH55YiIPYlFwQRyzuChnMyaDO39a4IxTf7EkxCcbOmpHtpkvjtOnknvqhOO0fO2g=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597634; c=relaxed/simple;
bh=DB+1yl7APIInm8MCrBxZ4Qwh/WXcXEhVDIyFl5OmcI4=;
h=Message-ID:Date:MIME-Version:Subject:To:Cc:References:From:
In-Reply-To:Content-Type; b=oni6FwyTQy30YbYhUP/fr8VthbxxY8z4R1nRgyWOlTID6T7lgOBv5ekheNQzZF6TiQyZyIGdy3X06Fw0+DYyzhtSk7R03PV5IP9EHMJVfBPS/H0805SQOsqop2NiUpQYamoVDYNlge85eDh4hdF1gQARkrTF85LbNNPAWW2q3hA=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=D+pAZ8Wu; arc=none smtp.client-ip=170.10.129.124
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com;
s=mimecast20190719; t=1748597630;
h=from:from:reply-to:subject:subject:date:date:message-id:message-id:
to:to:cc:cc:mime-version:mime-version:content-type:content-type:
content-transfer-encoding:content-transfer-encoding:
in-reply-to:in-reply-to:references:references:autocrypt:autocrypt;
bh=3HsXX6R0HPP/biqLepMPHpr/VTNhNhquA8c3ePXroJI=;
b=D+pAZ8WuD9J24oomtH2M3ZVs3+xWmPjRXiblHrzlpCKBeOwF2MqrydB4nO5ecYBGGi7hA3
6uN3D98SD1AA3RATM83/XsTqi53eQQxdCKpvHZ5potyZwj2EkxTN8iKYOrdWm5v2hYDGJj
62hgVe/yS04jgXbO2hf8fBGtMpJ4VYo=
Received: from mail-wr1-f71.google.com (mail-wr1-f71.google.com
[209.85.221.71]) by relay.mimecast.com with ESMTP with STARTTLS
(version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id
us-mta-321-0Im-0xlhNzWE2dnzsEa40g-1; Fri, 30 May 2025 05:33:49 -0400
X-MC-Unique: 0Im-0xlhNzWE2dnzsEa40g-1
X-Mimecast-MFC-AGG-ID: 0Im-0xlhNzWE2dnzsEa40g_1748597628
Received: by mail-wr1-f71.google.com with SMTP id ffacd0b85a97d-3a4eee2398bso774231f8f.1
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:33:49 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597628; x=1749202428;
h=content-transfer-encoding:in-reply-to:organization:autocrypt
:content-language:from:references:cc:to:subject:user-agent
:mime-version:date:message-id:x-gm-message-state:from:to:cc:subject
:date:message-id:reply-to;
bh=3HsXX6R0HPP/biqLepMPHpr/VTNhNhquA8c3ePXroJI=;
b=Z7Cpu/a/eORPCww7zvUbGI2DjYNa7zmrmReZoJJpHJHbl5McWrXkSlE8/Ss2qvWBr2
qvMXlcv06b22+gPPWa4pdjLCcDTa8t7sZiZfiGQ5clLwbKxu2O02b6kmCqUeMOTHpQua
djdV1c5RAgCLkUC5wfSKkHCE4GEGdWgzwWz0dskVhnnxSa+pQ57Cigx8sSfCqL11dkAM
vS06EgcqCg/rxo0N5jjGC6/ub0RiRiTArxKjWCf2wvtyR0EMzWZihkpxFSic7S/jrzXp
N+JJpO5fJ6jlkODnxzilpfI8xlWxiXBZ16+xWLxOIkxCXFZrZOl+mHCE8lA6VqU8Y2h8
pq+Q==
X-Forwarded-Encrypted: i=1; AJvYcCWq6Fgm/p8d+aLYRhq5o9KaqYFHYbBR15S06Q9fzRUXAzSkxPYcomc1wHNqGYLzsjYSPZcRI/6jdMwoN3A=@vger.kernel.org
X-Gm-Message-State: AOJu0YxvQiDzyoxtJe3rb86CFtNNHb6v8zRt5flMna14G3bospvfC8g+
YqFs1hqUoS6JJv8rSSqvgrw8GGXWPIpTdwMXAbfNTafJFvjPW2KNCq6aufUDhYAx/Np612jc5Vv
x9imqKbHptoutPmB4b9XYT6+TDp29RjXRLXKigqIT7uxG/7UkZvgGEoJmhy939FrIEg==
X-Gm-Gg: ASbGncviETj0KcAzCc2iNuJ29mOb9pU3Ae19v3WnBQt2bAF3Af6MxrNhPnWpXe67ckd
1IFxE3X7BdANM4xmJ1qrQ5xwZVIXapgic4bHVUvKCKlvaVeu3UBUYbXiHowHeMYBnzSdesfHO6D
5hkiB/Z0SO5FRqEPCJzgwDI+Mi7di41032HjURE96m/H4b57+K24K62MBg0kmP52ryk/RErDZmE
geovAsCT4+mcTO7Z1xJSuokbm00kCco8eDL/HFKKcnnlEWiUNiPke8YQtXTQ7AVkRIW+ImurCYx
t9RAczheuvsBjnQGJ4C4IyEw1ZnDxwx5mjl2+kwepdJ1tnFFKPuelC/QpdoQDboZBjBKwM8pspT
qxnmJ3flQGi0M/VV7bNdHkVg58Cmek3GgRFHQSqs=
X-Received: by 2002:a05:6000:188b:b0:3a4:e393:11e2 with SMTP id ffacd0b85a97d-3a4f7a366bbmr2070887f8f.34.1748597628397;
Fri, 30 May 2025 02:33:48 -0700 (PDT)
X-Google-Smtp-Source: AGHT+IFPLwoEcfpG65ruS1QubjzGQLrQH9J6REe/EFj1LtU5rHML/KjwauHHFCNN+LVLagp2fysdSQ==
X-Received: by 2002:a05:6000:188b:b0:3a4:e393:11e2 with SMTP id ffacd0b85a97d-3a4f7a366bbmr2070847f8f.34.1748597627996;
Fri, 30 May 2025 02:33:47 -0700 (PDT)
Received: from ?IPV6:2003:d8:2f03:5b00:f549:a879:b2d3:73ee? (p200300d82f035b00f549a879b2d373ee.dip0.t-ipconnect.de. [2003:d8:2f03:5b00:f549:a879:b2d3:73ee])
by smtp.gmail.com with ESMTPSA id ffacd0b85a97d-3a4f00972d9sm4415633f8f.64.2025.05.30.02.33.46
(version=TLS1_3 cipher=TLS_AES_128_GCM_SHA256 bits=128/128);
Fri, 30 May 2025 02:33:47 -0700 (PDT)
Message-ID: <bbb19f59-54bc-4399-a387-1df9713fc621@xxxxxxxxxx>
Date: Fri, 30 May 2025 11:33:45 +0200
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
User-Agent: Mozilla Thunderbird
Subject: Re: [PATCH 01/12] mm: Remove PFN_MAP, PFN_SG_CHAIN and PFN_SG_LAST
To: Alistair Popple <apopple@xxxxxxxxxx>, linux-mm@xxxxxxxxx
Cc: gerald.schaefer@xxxxxxxxxxxxx, dan.j.williams@xxxxxxxxx, jgg@xxxxxxxx,
willy@xxxxxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx, nvdimm@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx, linux-ext4@xxxxxxxxxxxxxxx,
linux-xfs@xxxxxxxxxxxxxxx, jhubbard@xxxxxxxxxx, hch@xxxxxx,
zhang.lyra@xxxxxxxxx, debug@xxxxxxxxxxxx, bjorn@xxxxxxxxxx,
balbirs@xxxxxxxxxx, lorenzo.stoakes@xxxxxxxxxx,
linux-arm-kernel@xxxxxxxxxxxxxxxxxxx, loongarch@xxxxxxxxxxxxxxx,
linuxppc-dev@xxxxxxxxxxxxxxxx, linux-riscv@xxxxxxxxxxxxxxxxxxx,
linux-cxl@xxxxxxxxxxxxxxx, dri-devel@xxxxxxxxxxxxxxxxxxxxx, John@xxxxxxxxxx
References: <cover.541c2702181b7461b84f1a6967a3f0e823023fcc.1748500293.git-series.apopple@xxxxxxxxxx>
<cb45fa705b2eefa1228e262778e784e9b3646827.1748500293.git-series.apopple@xxxxxxxxxx>
From: David Hildenbrand <david@xxxxxxxxxx>
Content-Language: en-US
Autocrypt: addr=david@xxxxxxxxxx; keydata=
xsFNBFXLn5EBEAC+zYvAFJxCBY9Tr1xZgcESmxVNI/0ffzE/ZQOiHJl6mGkmA1R7/uUpiCjJ
dBrn+lhhOYjjNefFQou6478faXE6o2AhmebqT4KiQoUQFV4R7y1KMEKoSyy8hQaK1umALTdL
QZLQMzNE74ap+GDK0wnacPQFpcG1AE9RMq3aeErY5tujekBS32jfC/7AnH7I0v1v1TbbK3Gp
XNeiN4QroO+5qaSr0ID2sz5jtBLRb15RMre27E1ImpaIv2Jw8NJgW0k/D1RyKCwaTsgRdwuK
Kx/Y91XuSBdz0uOyU/S8kM1+ag0wvsGlpBVxRR/xw/E8M7TEwuCZQArqqTCmkG6HGcXFT0V9
PXFNNgV5jXMQRwU0O/ztJIQqsE5LsUomE//bLwzj9IVsaQpKDqW6TAPjcdBDPLHvriq7kGjt
WhVhdl0qEYB8lkBEU7V2Yb+SYhmhpDrti9Fq1EsmhiHSkxJcGREoMK/63r9WLZYI3+4W2rAc
UucZa4OT27U5ZISjNg3Ev0rxU5UH2/pT4wJCfxwocmqaRr6UYmrtZmND89X0KigoFD/XSeVv
jwBRNjPAubK9/k5NoRrYqztM9W6sJqrH8+UWZ1Idd/DdmogJh0gNC0+N42Za9yBRURfIdKSb
B3JfpUqcWwE7vUaYrHG1nw54pLUoPG6sAA7Mehl3nd4pZUALHwARAQABzSREYXZpZCBIaWxk
ZW5icmFuZCA8ZGF2aWRAcmVkaGF0LmNvbT7CwZgEEwEIAEICGwMGCwkIBwMCBhUIAgkKCwQW
AgMBAh4BAheAAhkBFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAl8Ox4kFCRKpKXgACgkQTd4Q
9wD/g1oHcA//a6Tj7SBNjFNM1iNhWUo1lxAja0lpSodSnB2g4FCZ4R61SBR4l/psBL73xktp
rDHrx4aSpwkRP6Epu6mLvhlfjmkRG4OynJ5HG1gfv7RJJfnUdUM1z5kdS8JBrOhMJS2c/gPf
wv1TGRq2XdMPnfY2o0CxRqpcLkx4vBODvJGl2mQyJF/gPepdDfcT8/PY9BJ7FL6Hrq1gnAo4
3Iv9qV0JiT2wmZciNyYQhmA1V6dyTRiQ4YAc31zOo2IM+xisPzeSHgw3ONY/XhYvfZ9r7W1l
pNQdc2G+o4Di9NPFHQQhDw3YTRR1opJaTlRDzxYxzU6ZnUUBghxt9cwUWTpfCktkMZiPSDGd
KgQBjnweV2jw9UOTxjb4LXqDjmSNkjDdQUOU69jGMUXgihvo4zhYcMX8F5gWdRtMR7DzW/YE
BgVcyxNkMIXoY1aYj6npHYiNQesQlqjU6azjbH70/SXKM5tNRplgW8TNprMDuntdvV9wNkFs
9TyM02V5aWxFfI42+aivc4KEw69SE9KXwC7FSf5wXzuTot97N9Phj/Z3+jx443jo2NR34XgF
89cct7wJMjOF7bBefo0fPPZQuIma0Zym71cP61OP/i11ahNye6HGKfxGCOcs5wW9kRQEk8P9
M/k2wt3mt/fCQnuP/mWutNPt95w9wSsUyATLmtNrwccz63XOwU0EVcufkQEQAOfX3n0g0fZz
Bgm/S2zF/kxQKCEKP8ID+Vz8sy2GpDvveBq4H2Y34XWsT1zLJdvqPI4af4ZSMxuerWjXbVWb
T6d4odQIG0fKx4F8NccDqbgHeZRNajXeeJ3R7gAzvWvQNLz4piHrO/B4tf8svmRBL0ZB5P5A
2uhdwLU3NZuK22zpNn4is87BPWF8HhY0L5fafgDMOqnf4guJVJPYNPhUFzXUbPqOKOkL8ojk
CXxkOFHAbjstSK5Ca3fKquY3rdX3DNo+EL7FvAiw1mUtS+5GeYE+RMnDCsVFm/C7kY8c2d0G
NWkB9pJM5+mnIoFNxy7YBcldYATVeOHoY4LyaUWNnAvFYWp08dHWfZo9WCiJMuTfgtH9tc75
7QanMVdPt6fDK8UUXIBLQ2TWr/sQKE9xtFuEmoQGlE1l6bGaDnnMLcYu+Asp3kDT0w4zYGsx
5r6XQVRH4+5N6eHZiaeYtFOujp5n+pjBaQK7wUUjDilPQ5QMzIuCL4YjVoylWiBNknvQWBXS
lQCWmavOT9sttGQXdPCC5ynI+1ymZC1ORZKANLnRAb0NH/UCzcsstw2TAkFnMEbo9Zu9w7Kv
AxBQXWeXhJI9XQssfrf4Gusdqx8nPEpfOqCtbbwJMATbHyqLt7/oz/5deGuwxgb65pWIzufa
N7eop7uh+6bezi+rugUI+w6DABEBAAHCwXwEGAEIACYCGwwWIQQb2cqtc1xMOkYN/MpN3hD3
AP+DWgUCXw7HsgUJEqkpoQAKCRBN3hD3AP+DWrrpD/4qS3dyVRxDcDHIlmguXjC1Q5tZTwNB
boaBTPHSy/Nksu0eY7x6HfQJ3xajVH32Ms6t1trDQmPx2iP5+7iDsb7OKAb5eOS8h+BEBDeq
3ecsQDv0fFJOA9ag5O3LLNk+3x3q7e0uo06XMaY7UHS341ozXUUI7wC7iKfoUTv03iO9El5f
XpNMx/YrIMduZ2+nd9Di7o5+KIwlb2mAB9sTNHdMrXesX8eBL6T9b+MZJk+mZuPxKNVfEQMQ
a5SxUEADIPQTPNvBewdeI80yeOCrN+Zzwy/Mrx9EPeu59Y5vSJOx/z6OUImD/GhX7Xvkt3kq
Er5KTrJz3++B6SH9pum9PuoE/k+nntJkNMmQpR4MCBaV/J9gIOPGodDKnjdng+mXliF3Ptu6
3oxc2RCyGzTlxyMwuc2U5Q7KtUNTdDe8T0uE+9b8BLMVQDDfJjqY0VVqSUwImzTDLX9S4g/8
kC4HRcclk8hpyhY2jKGluZO0awwTIMgVEzmTyBphDg/Gx7dZU1Xf8HFuE+UZ5UDHDTnwgv7E
th6RC9+WrhDNspZ9fJjKWRbveQgUFCpe1sa77LAw+XFrKmBHXp9ZVIe90RMe2tRL06BGiRZr
jPrnvUsUUsjRoRNJjKKA/REq+sAnhkNPPZ/NNMjaZ5b8Tovi8C0tmxiCHaQYqj7G2rgnT0kt
WNyWQQ==
Organization: Red Hat
In-Reply-To: <cb45fa705b2eefa1228e262778e784e9b3646827.1748500293.git-series.apopple@xxxxxxxxxx>
Content-Type: text/plain; charset=UTF-8; format=flowed
Content-Transfer-Encoding: 7bit
X-Spam-Status: No, score=-6.3 required=5.0 tests=DKIMWL_WL_HIGH,DKIM_SIGNED,
DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
MAILING_LIST_MULTI,RCVD_IN_DNSWL_MED,
RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,RCVD_IN_VALIDITY_RPBL_BLOCKED,
SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

On 29.05.25 08:32, Alistair Popple wrote:
> The PFN_MAP flag is no longer used for anything, so remove it. The
> PFN_SG_CHAIN and PFN_SG_LAST flags never appear to have been used so
> also remove them.
>
> Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
> Reviewed-by: Christoph Hellwig <hch@xxxxxx>
> ---

With SPECIAL mentioned as well

Acked-by: David Hildenbrand <david@xxxxxxxxxx>

--
Cheers,

David / dhildenb


Return-Path: <linux-kernel+bounces-667889-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id D222B41E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:38:36 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id 91B0A188A159
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:38:15 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 24412221F04;
Fri, 30 May 2025 09:34:08 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="Tl4Miuqq"
Received: from mail-pj1-f49.google.com (mail-pj1-f49.google.com [209.85.216.49])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8C65A220F2A
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:34:05 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.216.49
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597647; cv=none; b=RZka/JeQIh3HIC9Y4r74+qEI9vkiOo/lIku5YNZbgD9V9uDpwzPNHMez4lg3RKQ0ZPVkHuBwbYo0OBZHbBrjd8cFxCX0Wu1IhM3DaWDXGPzwYwqjrBD255rEFJ9TznyPS+UZZXFjmid0pkOOirrGcXwwfig89SAO6hZcuh1cWGs=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597647; c=relaxed/simple;
bh=UOL+ld6kJQ8MITB46M/S3E58D1n58tehOr68tDVTVCQ=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=D7IoJKCC8x27n/0dKFKyJBobTxPrKI9/elM5pVskzIt/9Mzvw8T6sW215gT7YqF0uM5vVs5HUM0m5CV60BlsWaiXFpj4co7dxudYWgoauRN7WwGlrlHDkefKIoAobfWDo0mIdvu2lUElVFXUEbRbVPfne3AMrpiJKm6MHDCyxT4=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=Tl4Miuqq; arc=none smtp.client-ip=209.85.216.49
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pj1-f49.google.com with SMTP id 98e67ed59e1d1-311d5fdf1f0so1703356a91.1
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:34:05 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597645; x=1749202445; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=JJCF21JaAHh50QQNMzAFoSnSXgccx/2injFNIwjbQFo=;
b=Tl4MiuqqrXmkBATq/jgK2TPawTNlYG/h9+Mlq3CjwrjDxBrPcQxTtygGQhLnjVkR9o
FJ5CFUvXkks9Y0N6XRX65o8H2hJj5Lnm9VpK6W3Qs3daNtEEl99YbLh++fGyyNRHfb2r
FVIhtgvR/hjYrMqYwujJd+Fj/71j7QLBylWBnJcX2AUOGNWNpdoDJiZ4vIjRJTy8VLRg
VIAa5VVs3rGdL6m8hzuKCzcs7w95d+QUje+kWxGzQ8fFBSLLUF2GCAXmmGngQlP+S2iK
gbYIKMWKgctZryn5cAeNWLNnx2WfhUDPYT/q0wC4VklpjUb8b0CCV8pmRXUcz7Nnp16O
C8Eg==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597645; x=1749202445;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=JJCF21JaAHh50QQNMzAFoSnSXgccx/2injFNIwjbQFo=;
b=k8mXd9a6Q+N77/gAOaXYC0Vln/jtsvoRVxPQrJ7Hx1fbEZFqfdxCaW1Kggeq+jd2nV
Fno9xnKDgR9bAklhJYzbrfmOIUvxxIqYkPNhT+6/yuE+gPlWYZpQcToSQB/VhPhREcuc
JaEhC4V8vaGyO1QwS6wkglahhEe6/8/HukmzkhvFPFIjtwxPBcd5o77Wu1PMBVjxHSbG
9NeojQ9yjWVPzGAw6K8C3HkbEyjNFXnMpdlt8IOgYzzJpigEl6ymDbWlhpWAQWp2cWFV
lOm/YZvTaRGySkaflKVuyEjVaUOVaJ6xp87xD2B59dI2/YiFsQxa0zDZ4Y8sK/93CwZC
PEcA==
X-Forwarded-Encrypted: i=1; AJvYcCVj5ZE+UEOM0hYApLbSvJ4xYuw2xTiZGhfQhC3BTKMu9IJX9KUwkMAxRw8yph87jQiTgaiYNviyn/bG4nM=@vger.kernel.org
X-Gm-Message-State: AOJu0YyS7hVk1oUjykZFE6gtou7tqVy92BRVYst9yKDXIcRuy6Pnk/5m
lUbl8OWfuBYw8IjOPQgR8bJKR7qQ+aAo0yTH7nTL76la7YQvFo0CMMdHDzgRKpiPMwKeBhSqvaM
2jpiv
X-Gm-Gg: ASbGncsRhKMJwgNhdDG1XTQs6m5x+XVAkbS6r1klEb7WfwhQNgUz+bu3eU+tvJxR/YM
094u28coLD8PdyexW6qvpbhpIv6f50VFC53HbTkOdpQSt0vVZjjQu853v6WsskaeFbjsPOitDeD
7MQaPy9r3c4kmFiyHR+JMb1m+rgSr1WC1V0Z99xRSK4Fr4VRgDnt2uTGgBbYDVEtFPKiFlOfCB2
n0cCXQ6tmZf0NCNy55JvwbNz9owpPP4OxVJk8xwNFasQfK0xwwDAYYVv3lEG6cEFlba48QZIpPP
faCLPeIzbw2FfCvR3dB0Wg9/MoF17QeRPNEpF+6MQCfQ+HfOPHAO5oj5nfk+pTQLY1V4bK/cu/R
wlibCI1+E0Q==
X-Google-Smtp-Source: AGHT+IHRhTQBcA7s5JBEKqc7+0a9k8cenFZsoQhX8jbpv2vo1Ygv4WOD6zUURQ2dI64sy1qmxMzdzQ==
X-Received: by 2002:a17:90a:e7cb:b0:311:b3e7:fb2c with SMTP id 98e67ed59e1d1-312503643c0mr2125489a91.13.1748597644660;
Fri, 30 May 2025 02:34:04 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.33.49
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:34:04 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 22/35] RPAL: rebuild receiver state
Date: Fri, 30 May 2025 17:27:50 +0800
Message-Id: <af2895170223142a8dc824c7096d986da57aeb96.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

When an RPAL call occurs, the sender modifies the receiver's state. If the
sender exits abnormally after modifying the state or encounters an
unhandled page fault and returns to a recovery point, the receiver's state
will remain as modified by the sender (e.g., in the CALL state). Since the
sender may have exited, the lazy switch will not occur, leaving the
receiver unrecoverable (unable to be woken up via try_to_wake_up()).
Therefore, the kernel must ensure the receiver's state remains valid in
these cases.

This patch addresses this by rebuilding the receiver's state during unhandled
page faults or sender exits. The kernel detects the fsbase value recorded by
the sender and uses it to locate the corresponding receiver. The kernel then
checks whether the receiver is in the CALL state set by the sender (using the
sender_id and service_id carried in the CALL state). If so, it transitions the
receiver from the CALL state to the WAIT state and notifies the receiver via
sender_state that the RPAL call has completed.

This ensures that even if the sender fails, the receiver can recover and
resume normal operation by resetting its state and avoiding permanent
blocking.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/rpal/thread.c | 44 +++++++++++++++++++++++++++++++++++++++++-
1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/arch/x86/rpal/thread.c b/arch/x86/rpal/thread.c
index db3b13ff82be..02c1a9c22dd7 100644
--- a/arch/x86/rpal/thread.c
+++ b/arch/x86/rpal/thread.c
@@ -224,6 +224,45 @@ int rpal_unregister_receiver(void)
return ret;
}

+/* sender may corrupt receiver's state if unexpectedly exited, rebuild it */
+static void rpal_rebuild_receiver_context_on_exit(void)
+{
+ struct task_struct *receiver = NULL;
+ struct rpal_sender_data *rsd = current->rpal_sd;
+ struct rpal_sender_call_context *scc = rsd->scc;
+ struct rpal_receiver_data *rrd;
+ struct rpal_receiver_call_context *rcc;
+ unsigned long fsbase;
+ int state = rpal_build_call_state(rsd);
+
+ if (scc->ec.magic != RPAL_ERROR_MAGIC)
+ goto out;
+
+ fsbase = scc->ec.fsbase;
+ if (rpal_is_correct_address(rpal_current_service(), fsbase))
+ goto out;
+
+ receiver = rpal_find_next_task(fsbase);
+ if (!receiver)
+ goto out;
+
+ rrd = receiver->rpal_rd;
+ if (!rrd)
+ goto out;
+
+ rcc = rrd->rcc;
+
+ if (atomic_read(&rcc->receiver_state) == state) {
+ atomic_cmpxchg(&rcc->sender_state, RPAL_SENDER_STATE_CALL,
+ RPAL_SENDER_STATE_KERNEL_RET);
+ atomic_cmpxchg(&rcc->receiver_state, state,
+ RPAL_RECEIVER_STATE_WAIT);
+ }
+
+out:
+ return;
+}
+
int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,
unsigned long addr, int error_code)
{
@@ -232,6 +271,7 @@ int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,
unsigned long erip, ersp;
int magic;

+ rpal_rebuild_receiver_context_on_exit();
erip = scc->ec.erip;
ersp = scc->ec.ersp;
magic = scc->ec.magic;
@@ -249,8 +289,10 @@ int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,

void exit_rpal_thread(void)
{
- if (rpal_test_current_thread_flag(RPAL_SENDER_BIT))
+ if (rpal_test_current_thread_flag(RPAL_SENDER_BIT)) {
+ rpal_rebuild_receiver_context_on_exit();
rpal_unregister_sender();
+ }

if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT))
rpal_unregister_receiver();
--
2.20.1


Return-Path: <linux-kernel+bounces-667890-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 98CAA41E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:38:56 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id E1C111BA1E8C
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:38:31 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 580CC22B5AD;
Fri, 30 May 2025 09:34:23 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="MzMp1xwL"
Received: from mail-pj1-f47.google.com (mail-pj1-f47.google.com [209.85.216.47])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id DEDF228E7
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:34:20 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.216.47
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597662; cv=none; b=ApUuD1snKakTA9Gc28Aps5tQFi8v0OB9uYzkRFTIWs2TCwLPHGBy5pte3VaTa91G3oI2HrLHgYU/YyxY9MenaJGrsBh5TTPdvp8TMivk5vzHZk7bVxuAox3Zui7NVF8Bf7PEPB6AGNWbGe+GaHKTAEb+1wbRh+zVZSxQ6yMqaQs=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597662; c=relaxed/simple;
bh=OoRqkHVJJA6Fnn08xoi8Z+2+DiJAYYu6EEBieJEgtFs=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=AmUejweQ2caiIH/YRfFlo6d44BHAK80+eJSnBqLYfDP9+sOU8R2jVcUcFCP8NEVHvVxsPGgZASQ9Z/DaA8ju0DDTosjcqiFp3rs/+AT/Fgai00KXbpCSS5JOuVW901UtLg4FV54ZB/blu2AtPWGwws4XKCRSo77h3HPZYfhEyWo=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=MzMp1xwL; arc=none smtp.client-ip=209.85.216.47
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pj1-f47.google.com with SMTP id 98e67ed59e1d1-3122a63201bso1073210a91.0
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:34:20 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597660; x=1749202460; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=J8WmlWSYokiEZq8ImcTa9QlYOraa+eQKx+5TbFt2hGQ=;
b=MzMp1xwLobGJ+PxhL9U31NFiYdW2e6Es6/fKGhgS9fdx/3RJIOx6Z7xgtAlekeM4OF
w2FHcZiGe/oaeAj4agy4q2T6ro7WHZAS1OUnyEjYeoN/KU3T2pJlXdCXQth147r09gg1
upJOCi/JaVg7pKSz9GEHtge688GgY5m+rOiNcbvLzR+b48C5+zHCu0ZzGAMHIY0AOGyC
ne70J2lkOTTmg/LXAciIlOeaD09k81C+JOCe78G+aFeCLKaqvXAoR5kvpqJHHEp/Nwhn
iauXi4dGeW6dZOMk+imu/OBgrpZdQKC1ZZcmC0ljhNIEhpfxQnUphQIRRg6qOjrfu/Kt
Dsvw==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597660; x=1749202460;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=J8WmlWSYokiEZq8ImcTa9QlYOraa+eQKx+5TbFt2hGQ=;
b=kg6fN7xbJvcjqYeYIE/bEdnSq3RiaOq9qoAxSsh1FIH7Hc8RmpHAanYGkdIbkbfxj2
jhwbAa0csEupHgOzI2nrAZQSHVQlI8JbNu6dMjLi20YksIVT6WsUQXtWQWAMSBLiGyDS
NfCaLk6e2e7as69l3JNhTns0FpsHIEnliAghffE0IGc0uWFxB5JdJS1tOPqZcc8os3rw
WBZIontnF/I7auV7Bh+Rb/IaALL0fhXAHigXDoDaiMfFTvgp3igpnoPL9oe+K1AlUT8+
CNX7x9ummq9c5h4sIrHaFLdBz6TpHjo46CR01oVD+ZeWkLJR3aL8PPLSeGXOCs3cFxtt
874Q==
X-Forwarded-Encrypted: i=1; AJvYcCWyX7/B2sbrbrYlE9Xp8B7+1w5cO8Sts7SYrfMw9H66oknqbl6EGitmNKSmoBSe+tnd47dTbi6aDog2Uwo=@vger.kernel.org
X-Gm-Message-State: AOJu0YxVPdSW9ck+kXQYYRVjlpmEj3RVF/5NGG3fPb9Y+i7OiDWQwzzW
gVY2V7p8id4tjlzG+wK49x+y2VlI2sVVL2f1LJNJltkTXq2JQNHuZI20GlmR0tgukwY=
X-Gm-Gg: ASbGncsF7pAnCn8fyGHKpaAY0DffTcb8B3V51u6qG4lulnXVck28izUMcpUZeL1PHBd
ZnSNA2c7uVlCn+QsUcw8ro+6GG1tCauBO3KSTS72ci6oX4kUQG8M70qoR0uscrapgzNWfjYnm76
GaqqUmuTN1LecLCWNhixczl/nuqD+E1ITPfmEb55erzL9HDsw7Wo9mzhdTtlhBoXUzGNXhbXjKz
Sw48wO1PceiK4QLQJshRS14GRloZgwhhxpZJSPiBZhidEN1g7bssLzD+3h+/EAqob7v9LSGBzDK
zNGsoeXZMtBqleiAsgas7bsJTmV1dRmHuUgHvdq2Qo860wsWvGr8b4lK56iAMFS71WJnSqLM8+r
Cz21Zm2pN+w==
X-Google-Smtp-Source: AGHT+IF4zFLLKAk8uepZIGBZM+3fXRB/hltSj95jGD1vqbYt83ikw4yeQ1JJHBQBbftK0SQTg3CQlA==
X-Received: by 2002:a17:90a:d2cf:b0:311:a314:c2c7 with SMTP id 98e67ed59e1d1-3124150e346mr4368890a91.2.1748597660011;
Fri, 30 May 2025 02:34:20 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.34.05
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:34:19 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 23/35] RPAL: resume cpumask when fork
Date: Fri, 30 May 2025 17:27:51 +0800
Message-Id: <45c1884aaf21256ed6fc66b4a4a716bffebb54e1.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

After a lazy switch occurs, RPAL locks the receiver to the current CPU by
modifying its cpumask. If the receiver performs a fork operation at this
point, the kernel will copy the modified cpumask to the new task, causing
the new task to be permanently locked on the current CPU.

This patch addresses this issue by detecting during fork whether the original
task is locked to the current CPU by RPAL. If it is locked, the kernel assigns
the new task the cpumask that existed before the lazy switch. This ensures
the new task will not be locked to the current CPU.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/kernel/process.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c1d2dac72b9c..be8845e2ca4d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -29,6 +29,7 @@
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <linux/entry-common.h>
+#include <linux/rpal.h>
#include <asm/cpu.h>
#include <asm/cpuid/api.h>
#include <asm/apic.h>
@@ -88,6 +89,19 @@ EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);

+#ifdef CONFIG_RPAL
+static void rpal_fix_task_dump(struct task_struct *dst,
+ struct task_struct *src)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&src->pi_lock, flags);
+ if (rpal_test_task_thread_flag(src, RPAL_CPU_LOCKED_BIT))
+ cpumask_copy(&dst->cpus_mask, &src->rpal_cd->old_mask);
+ raw_spin_unlock_irqrestore(&src->pi_lock, flags);
+}
+#endif
+
/*
* this gets called so that we can store lazy state into memory and copy the
* current task into the new thread.
@@ -100,6 +114,10 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
#ifdef CONFIG_VM86
dst->thread.vm86 = NULL;
#endif
+#ifdef CONFIG_RPAL
+ if (src->rpal_rs)
+ rpal_fix_task_dump(dst, src);
+#endif

return 0;
}
--
2.20.1


Return-Path: <linux-kernel+bounces-667891-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from sv.mirrors.kernel.org (sv.mirrors.kernel.org [139.178.88.99])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id AD52541E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:39:02 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by sv.mirrors.kernel.org (Postfix) with ESMTPS id 268869E752C
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:38:11 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id A17F322D9E5;
Fri, 30 May 2025 09:34:36 +0000 (UTC)
Received: from szxga04-in.huawei.com (szxga04-in.huawei.com [45.249.212.190])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id A9B0721E0BB
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:34:33 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=45.249.212.190
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597676; cv=none; b=USK67H+Iavm3yEsZDDk00x9wDvPYunDZiZzqxqj+v+t3wVjyhz10GT+yZl6E7VewmDkHZVL6hHIbrsTxpHCvmQX03BYxt/54PtREcktptOKO3Vcfq1wu6uqMiPHzuRMc3zCCW9/dsNAZe1o3eb3AYqNyBusgm75stqJGBjh5y2I=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597676; c=relaxed/simple;
bh=qN296d2kc28YBwiP7NJmVYDN4EdglpN4dAqYkcuO0IU=;
h=Message-ID:Date:MIME-Version:Subject:To:CC:References:From:
In-Reply-To:Content-Type; b=h+n/ME5FtaFBTGPUvpv6ivP+UrwpBfDZQ14EbuCDkFiXLLmfne7jxOGSKn83Z1f7djFRKA4JAFgbK/Wgd49vMlOqwpR9lHmTSA9JPbC9eLIltFxG2tlAit4dunljPvN9sN1oEp9vKOQbygLb3mPIY6flqEPFAWARHI2ufjvNrSs=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com; spf=pass smtp.mailfrom=huawei.com; arc=none smtp.client-ip=45.249.212.190
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=huawei.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=huawei.com
Received: from mail.maildlp.com (unknown [172.19.88.234])
by szxga04-in.huawei.com (SkyGuard) with ESMTP id 4b7ydw47t0z2CdW8;
Fri, 30 May 2025 17:30:44 +0800 (CST)
Received: from kwepemf100008.china.huawei.com (unknown [7.202.181.222])
by mail.maildlp.com (Postfix) with ESMTPS id 6EDB0140113;
Fri, 30 May 2025 17:34:30 +0800 (CST)
Received: from [10.174.178.24] (10.174.178.24) by
kwepemf100008.china.huawei.com (7.202.181.222) with Microsoft SMTP Server
(version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id
15.2.1544.11; Fri, 30 May 2025 17:34:29 +0800
Message-ID: <c94b97ea-4dd4-7575-2144-81e4272c8fee@xxxxxxxxxx>
Date: Fri, 30 May 2025 17:34:28 +0800
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101
Thunderbird/102.11.0
Subject: Re: [PATCH] fs/resctrl: Restore the missing rdt_last_cmd_clear()
Content-Language: en-US
To: Reinette Chatre <reinette.chatre@xxxxxxxxx>
CC: <bobo.shaobowang@xxxxxxxxxx>, <linux-kernel@xxxxxxxxxxxxxxx>,
<Dave.Martin@xxxxxxx>, <tony.luck@xxxxxxxxx>, <xiaochen.shen@xxxxxxxxx>,
<bp@xxxxxxx>, <fenghua.yu@xxxxxxxxx>, <james.morse@xxxxxxx>
References: <20250529113353.3275066-1-zengheng4@xxxxxxxxxx>
<44a4f211-6723-4fde-822c-d739fa2d603d@xxxxxxxxx>
From: Zeng Heng <zengheng4@xxxxxxxxxx>
In-Reply-To: <44a4f211-6723-4fde-822c-d739fa2d603d@xxxxxxxxx>
Content-Type: text/plain; charset="UTF-8"; format=flowed
Content-Transfer-Encoding: 8bit
X-ClientProxiedBy: dggems705-chm.china.huawei.com (10.3.19.182) To
kwepemf100008.china.huawei.com (7.202.181.222)
X-Spam-Status: No, score=-4.5 required=5.0 tests=HEADER_FROM_DIFFERENT_DOMAINS,
MAILING_LIST_MULTI,NICE_REPLY_A,RCVD_IN_DNSWL_MED,
RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,RCVD_IN_VALIDITY_RPBL_BLOCKED,
SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu



On 2025/5/30 6:01, Reinette Chatre wrote:
> Hi Zeng Heng,
>
> Thank you very much for catching this and providing a fix.
>
> On 5/29/25 4:33 AM, Zeng Heng wrote:
>> The fixes tag patch resolves the lockdep warning. However, directly
>> removing rdt_last_cmd_clear() would leave the last_cmd_status interface
>> with stale logs, which does not conform to the functional definition before
>> the fix. Therefore, the rdt_last_cmd_clear() operation is performed after
>> successfully acquiring the rdtgroup_mutex.
>
> I would like to suggest some rework to changelog to meet requirements from
> Documentation/process/maintainer-tip.rst. Specifically the rules about
> imperative tone and structure of the changelog. Below attempts to address
> those requirements but please feel free to rework after you considered the
> rules yourself:
>
> A lockdep fix removed two rdt_last_cmd_clear() calls that were used
> to clear the last_cmd_status buffer but called without holding the
> required rdtgroup_mutex. The impacted resctrl commands are:
> writing to the cpus or cpus_list files and creating a new monitor
> or control group. With stale data in the last_cmd_status buffer the
> impacted resctrl commands report the stale error on success, or append
> its own failure message to the stale error on failure.
>
> Restore the rdt_last_cmd_clear() calls after acquiring rdtgroup_mutex.

Thank you for the correction, I will review the requirements mentioned
in the documents above.

>
>>
>> Fixes: c8eafe149530 ("x86/resctrl: Fix potential lockdep warning")
>> Signed-off-by: Zeng Heng <zengheng4@xxxxxxxxxx>
>> ---
>> fs/resctrl/rdtgroup.c | 4 ++++
>> 1 file changed, 4 insertions(+)
>>
>> diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
>> index cc37f58b47dd..4aae9eb74215 100644
>> --- a/fs/resctrl/rdtgroup.c
>> +++ b/fs/resctrl/rdtgroup.c
>> @@ -536,6 +536,8 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
>> goto unlock;
>> }
>>
>> + rdt_last_cmd_clear();
>> +
>> if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
>> rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
>> ret = -EINVAL;
>> @@ -3481,6 +3483,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
>> goto out_unlock;
>> }
>>
>> + rdt_last_cmd_clear();
>> +
>
> Could you please move this to be right after acquiring the mutex? I think clearing
> last_cmd_status at beginning of a resctrl command's work is a good pattern to follow.
> Thus a change like:

The patch will be corrected in version v2. Thank you again.

Best regards,
Zeng Heng

Return-Path: <linux-kernel+bounces-667892-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from sv.mirrors.kernel.org (sv.mirrors.kernel.org [139.178.88.99])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 531E341E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:39:13 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by sv.mirrors.kernel.org (Postfix) with ESMTPS id 1A5799E6987
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:38:23 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 1C0892309B5;
Fri, 30 May 2025 09:34:39 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="HZMhmXP+"
Received: from mail-pg1-f179.google.com (mail-pg1-f179.google.com [209.85.215.179])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 82A9A22D9E1
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:34:36 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.215.179
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597678; cv=none; b=n86BDCxj1CGSV7YCd2T3PSJZjoWVdhVgmA3ieBvYv4yNtSLjHdU16yeMkBYxCniKmOasn9X8Hq156sA5SvZaXVUUiIddmOE7rSo90CyQiiOLZZLQnTfuyHEUva1798WDYF+LmAsMPjLpnz5+lEGPR7AD8NohaAIY8GHb5EKXsc4=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597678; c=relaxed/simple;
bh=AaX4tZdXtUo1twDok859Zwut8WsNeeyIc7hLqSJN5fQ=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=M9DCZY9384K0Rv4t8xooStYlINT9HyZ49KSS1MHkWinfxpO9OgYnI4SDisowBbZFMhnm0f8UevzCVgIliJVgMUmjgxKEe8hNudPXD2KFA6KCaiYTx9i89s1adn19Sxs4J9clhlW0h4gKsybFudXU9QWOvUIi7Zz1+1xzrdBsSSI=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=HZMhmXP+; arc=none smtp.client-ip=209.85.215.179
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pg1-f179.google.com with SMTP id 41be03b00d2f7-b271f3ae786so1307585a12.3
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:34:36 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597676; x=1749202476; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=Gd4M6twRFQoCr4fYLEaBSZS3XlcwhQLeBelobi/ni6A=;
b=HZMhmXP+JCNWRJqSd3RPI0w5J/yKoXpICiXXM63dslgXSQCYFZQlNgUy1aj/GGEUZz
2sJr/bbPYkYApZ2CTjK/3k9WQkwH+Oa9CTbVopfxMhWD0W/NnMWUP3CKq2ges1++vfvB
bZvHRKYKEp6YotqTTeazXCc1+LrGsG6UpY1theDhxSTnhJCJGiJI93TXuG04TAoJp3Hc
4dwVBNESqs1+xqcNMoLih6S0pS99gk1XkcFXDniAYBXP4zEcMSFHBlFad/4tI2yEO3QA
J9cW+JVrLP3HAhCapw1rC/dtLab9bMTkGghearsaGmLnXYtzTxoubOlqkgiSm2OXkZZ2
+DWA==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597676; x=1749202476;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=Gd4M6twRFQoCr4fYLEaBSZS3XlcwhQLeBelobi/ni6A=;
b=hohCW3QuQ4Dncryqme5rVy+hQT412zReJO1s7+YUWArPIQPOzj2QC1ZPic001hE81n
9pHL/Jj4L2/lktw03TcxHbK+/OOzisikEKuRoZ2UHDe/JlgGrMLiREON/DL8cfO+KkMn
BtwOaJzDyd8ako7k5pZf7wkyAWPnlXs0Oo0Y8WzAMgfI0QSJ0kUzee0YOwXzi3Ww75Kj
iIWr4R0vL7tzej433sWNy3T9Hv39zm9ToEXg6oK6PT/slES+wDgnzgCMzKIf9i/ILTy4
vPt2js8Ntnsw+JU/sKz4Q/Po2by+doWUHyhFn1RCpR2hKB4juyskZErrkKYuE+DgPvtr
QDLg==
X-Forwarded-Encrypted: i=1; AJvYcCVq5EzyD5eVrX8Ye8iqS1aovSBvy3Nya8ziUdna6aS5FD1ik+SldBJRDmM/VUghqZmer/n80A9Lxtluc6c=@vger.kernel.org
X-Gm-Message-State: AOJu0YzejVol+y8caldLj/WTzov6ggx7D48jtC07wgGQpT4et8CaTzK2
zMab805m+hzwGfFeBEK78229RQpHS4yX0IjMXLAGM0iPLyc+rmAgqN/ordayBHE5giM=
X-Gm-Gg: ASbGncsU7QWIP5Vj/7nr2Shbapl7y1M1BKVpSf/9dFnKnTEsghfvY50HuZmOWQtANUJ
60oH+tYDKaPOImI2SK8IxDvNPDMeZx4ThL1kjwmYsyJTOzR/YDHj1paXbQqawKNtry+N7VxeDhV
JPy0VJrpauk6U+O5XmepkDNpEU0OdLipakhDgx1wB9Ds5xcImdY50FNGgCPZFWfXdQPJ+EeV5qj
z3S0VxzyB3VxdWeZ+nUsq8ViJg6Y/TDMqmYe/+X1peFIjPCezpjB/BbDtN15GxWHsYfgbDJ97X0
IFbGpXqiQ6cLLIoeRTOAxDUx+e3VaotjnKukQqMcYRikbVoROaSRzSntWC4rox3z2o81Xfe9jzN
QAQwfK9I7mg==
X-Google-Smtp-Source: AGHT+IFObkOA1vqgzfl2J7tHJ7MiXmVGrPtynvFE/NiK8PUmWtoYGs6g/wJAuSU3bu11FutgVffcuA==
X-Received: by 2002:a17:90a:d2ce:b0:311:b3e7:fb3c with SMTP id 98e67ed59e1d1-31241e97f30mr3743435a91.31.1748597675528;
Fri, 30 May 2025 02:34:35 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.34.20
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:34:35 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 24/35] RPAL: critical section optimization
Date: Fri, 30 May 2025 17:27:52 +0800
Message-Id: <47c919a7d65cb5def07c561e29305d39d9df925f.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

The critical section is defined as the user mode code segment within the
receiver that executes when control returns from the receiver to the
sender. This code segment, located in the receiver, involves operations
such as switching the fsbase register and changing the stack pointer.
Handling the critical section can be categorized into two scenarios:

- First Scenario: If no lazy switch has occurred prior to the return and
the fsbase switch is incomplete, a lazy switch is triggered to
transition the kernel context from the sender to the receiver. After
the fsbase is updated in user mode, another lazy switch occurs to revert
the kernel context from the receiver back to the sender. This results in
two unnecessary lazy switches.

- Second Scenario: If a lazy switch has already occurred during execution
of the critical section, the lazy switch can be preemptively triggered.
This avoids re-entering the kernel solely to initiate another lazy
switch.

The implementation of the critical section involves modifying the fsbase
register in kernel mode and setting the sender's user mode context to a
predefined state. These steps minimize redundant user/kernel transitions
and lazy switches.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/rpal/core.c | 88 ++++++++++++++++++++++++++++++++++++++++-
arch/x86/rpal/service.c | 12 ++++++
include/linux/rpal.h | 6 +++
3 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index c48df1ce4324..406d54788bac 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -219,14 +219,98 @@ static inline struct task_struct *rpal_misidentify(void)
return next;
}

+static bool in_ret_section(struct rpal_service *rs, unsigned long ip)
+{
+ return ip >= rs->rsm.rcs.ret_begin && ip < rs->rsm.rcs.ret_end;
+}
+
+/*
+ * rpal_update_fsbase - fastpath when RPAL call returns
+ * @regs: pt_regs saved in kernel entry
+ *
+ * If the user is executing rpal call return code and it does
+ * not update fsbase yet, force fsbase update to perform a
+ * lazy switch immediately.
+ */
+static inline void rpal_update_fsbase(struct pt_regs *regs)
+{
+ struct rpal_service *cur = rpal_current_service();
+ struct task_struct *sender = current->rpal_rd->sender;
+
+ if (in_ret_section(cur, regs->ip))
+ wrfsbase(sender->thread.fsbase);
+}
+
+/*
+ * rpal_skip_receiver_code - skip rpal call return code
+ * @next: the next task to be lazy switched to.
+ * @regs: pt_regs saved in kernel entry
+ *
+ * If the user is executing rpal call return code and we are about
+ * to perform a lazy switch, skip the remaining return code to
+ * release the receiver's stack. This avoids stack conflicts when
+ * more than one sender calls the receiver.
+ */
+static inline void rpal_skip_receiver_code(struct task_struct *next,
+ struct pt_regs *regs)
+{
+ rebuild_sender_stack(next->rpal_sd, regs);
+}
+
+/*
+ * rpal_skip_lazy_switch - skip lazy switch when rpal call returns
+ * @next: the next task to be lazy switched to.
+ * @regs: pt_regs saved in kernel entry
+ *
+ * If the user is executing rpal call return code and we have not
+ * performed a lazy switch, there is no need to perform lazy switch
+ * now. Update fsbase and other states to avoid lazy switch.
+ */
+static inline struct task_struct *
+rpal_skip_lazy_switch(struct task_struct *next, struct pt_regs *regs)
+{
+ struct rpal_service *tgt;
+
+ tgt = next->rpal_rs;
+ if (in_ret_section(tgt, regs->ip)) {
+ wrfsbase(current->thread.fsbase);
+ rebuild_sender_stack(current->rpal_sd, regs);
+ rpal_clear_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT);
+ next->rpal_rd->sender = NULL;
+ next = NULL;
+ }
+ return next;
+}
+
+static struct task_struct *rpal_fix_critical_section(struct task_struct *next,
+ struct pt_regs *regs)
+{
+ struct rpal_service *cur = rpal_current_service();
+
+ /* sender->receiver */
+ if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT))
+ next = rpal_skip_lazy_switch(next, regs);
+ /* receiver->sender */
+ else if (rpal_is_correct_address(cur, regs->ip))
+ rpal_skip_receiver_code(next, regs);
+
+ return next;
+}
+
static inline struct task_struct *
rpal_kernel_context_switch(struct pt_regs *regs)
{
struct task_struct *next = NULL;

+ if (rpal_test_current_thread_flag(RPAL_LAZY_SWITCHED_BIT))
+ rpal_update_fsbase(regs);
+
next = rpal_misidentify();
- if (unlikely(next != NULL))
- next = rpal_do_kernel_context_switch(next, regs);
+ if (unlikely(next != NULL)) {
+ next = rpal_fix_critical_section(next, regs);
+ if (next)
+ next = rpal_do_kernel_context_switch(next, regs);
+ }

return next;
}
diff --git a/arch/x86/rpal/service.c b/arch/x86/rpal/service.c
index 49458321e7dc..16e94d710445 100644
--- a/arch/x86/rpal/service.c
+++ b/arch/x86/rpal/service.c
@@ -545,6 +545,13 @@ int rpal_release_service(u64 key)
return ret;
}

+static bool rpal_check_critical_section(struct rpal_service *rs,
+ struct rpal_critical_section *rcs)
+{
+ return rpal_is_correct_address(rs, rcs->ret_begin) &&
+ rpal_is_correct_address(rs, rcs->ret_end);
+}
+
int rpal_enable_service(unsigned long arg)
{
struct rpal_service *cur = rpal_current_service();
@@ -562,6 +569,11 @@ int rpal_enable_service(unsigned long arg)
goto out;
}

+ if (!rpal_check_critical_section(cur, &rsm.rcs)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
mutex_lock(&cur->mutex);
if (!cur->enabled) {
cur->rsm = rsm;
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index b24176f3f245..4f1d92053818 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -122,12 +122,18 @@ enum rpal_sender_state {
RPAL_SENDER_STATE_KERNEL_RET,
};

+struct rpal_critical_section {
+ unsigned long ret_begin;
+ unsigned long ret_end;
+};
+
/*
* user_meta will be sent to other service when requested.
*/
struct rpal_service_metadata {
unsigned long version;
void __user *user_meta;
+ struct rpal_critical_section rcs;
};

struct rpal_request_arg {
--
2.20.1


Return-Path: <linux-kernel+bounces-667893-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from ny.mirrors.kernel.org (ny.mirrors.kernel.org [147.75.199.223])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 4DE6441E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:39:17 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by ny.mirrors.kernel.org (Postfix) with ESMTPS id 0B38B17FFE6
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:39:01 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 84DDF2309B2;
Fri, 30 May 2025 09:34:42 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (1024-bit key) header.d=suse.cz header.i=@suse.cz header.b="YGybCRk+";
dkim=permerror (0-bit key) header.d=suse.cz header.i=@suse.cz header.b="QXF936EA";
dkim=pass (1024-bit key) header.d=suse.cz header.i=@suse.cz header.b="kMoFVsNA";
dkim=permerror (0-bit key) header.d=suse.cz header.i=@suse.cz header.b="XbeHvZ6j"
Received: from smtp-out1.suse.de (smtp-out1.suse.de [195.135.223.130])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id ED1D822D9F8
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:34:39 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=195.135.223.130
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597681; cv=none; b=QRiVigpv0c3U71JuRNtbcVBzvls+pCo2Un3q1NAHSo0ttXpog4mSqsl1tOZYfYlSPJGbv1nz8o89yl1TvFC4H/BX1a9QgMl+VcXLlWnoOqBQyijXbrDKJTwtOpodyxV5hhQLcNFIx7liNnOpgYw8D+pFAleuNaL200gB20qCC24=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597681; c=relaxed/simple;
bh=b+wG00NAHNH2WQRgMMTmBkjITAFnm0pZWIaTBBFJZh4=;
h=Date:From:To:Cc:Subject:Message-ID:References:MIME-Version:
Content-Type:Content-Disposition:In-Reply-To; b=igsTwGFsnPGlyGRPMUINhFX5g0cVkjwfXZx0K3XmhyCmSk4/6RqqOW5NmlptW4FuX5nItPNh+VwDNtIHwoZVdPCmyFo5ffN3XShvKmY2G0LretnGj+KOjwjHMIgt2uQGUr9tOxSveKJkt5mzXY3I93OLxfpPck6k9644Ix8Vouk=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=suse.cz; spf=pass smtp.mailfrom=suse.cz; dkim=pass (1024-bit key) header.d=suse.cz header.i=@suse.cz header.b=YGybCRk+; dkim=permerror (0-bit key) header.d=suse.cz header.i=@suse.cz header.b=QXF936EA; dkim=pass (1024-bit key) header.d=suse.cz header.i=@suse.cz header.b=kMoFVsNA; dkim=permerror (0-bit key) header.d=suse.cz header.i=@suse.cz header.b=XbeHvZ6j; arc=none smtp.client-ip=195.135.223.130
Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=suse.cz
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=suse.cz
Received: from localhost (unknown [10.100.12.32])
by smtp-out1.suse.de (Postfix) with ESMTP id 0496721199;
Fri, 30 May 2025 09:34:37 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=suse.cz; s=susede2_rsa;
t=1748597678; h=from:from:reply-to:date:date:message-id:message-id:to:to:cc:cc:
mime-version:mime-version:content-type:content-type:
in-reply-to:in-reply-to:references:references;
bh=ItuZODbQUON55ZfNqCBg9/7wO6q1b21kKNU9knhkEfc=;
b=YGybCRk+GUMDo18XQr/74+0as7Za6cvWedFbBN0eXABoTliEi8gtfARRQzsoQxWiXZ/lZH
jfgFLS4UWXExVFbrbyh09WWV+bQ4ZWC3dEtBjtyyUOHyZgPQLux/gRsBBDuiTN2EceffWJ
PfwieiqzSav0/RJspFcSk862/LETdRU=
DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=suse.cz;
s=susede2_ed25519; t=1748597678;
h=from:from:reply-to:date:date:message-id:message-id:to:to:cc:cc:
mime-version:mime-version:content-type:content-type:
in-reply-to:in-reply-to:references:references;
bh=ItuZODbQUON55ZfNqCBg9/7wO6q1b21kKNU9knhkEfc=;
b=QXF936EAdkErhGve4niFPkYmLyG5oz8HHFVS6y3t9bQl2Kse0glawCIuulJsMnlOIPEdXx
aMST8TzTVdMmM5DA==
Authentication-Results: smtp-out1.suse.de;
none
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=suse.cz; s=susede2_rsa;
t=1748597677; h=from:from:reply-to:date:date:message-id:message-id:to:to:cc:cc:
mime-version:mime-version:content-type:content-type:
in-reply-to:in-reply-to:references:references;
bh=ItuZODbQUON55ZfNqCBg9/7wO6q1b21kKNU9knhkEfc=;
b=kMoFVsNAX9J0KE3foXRbf5+dCc6zzp8RemYVFIyCNQlXT0bOGA1eN1R4LkFPRArUsB9/GL
nMHAi/zkWITPJ7t+qZqIeCQT88ymc5HR3GJB4VfHBnA902ZV6mB766lGH2GX9o81uR5/u5
b5c7ZLZN4KizPO8cjMOY12YVUf8t4iI=
DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=suse.cz;
s=susede2_ed25519; t=1748597677;
h=from:from:reply-to:date:date:message-id:message-id:to:to:cc:cc:
mime-version:mime-version:content-type:content-type:
in-reply-to:in-reply-to:references:references;
bh=ItuZODbQUON55ZfNqCBg9/7wO6q1b21kKNU9knhkEfc=;
b=XbeHvZ6j21kgYeJ9uEQAmtKlapnWk6Gk0uFr1d5BKkSqquQ/jbw1S7pZ3tbWsSqZPpeYW5
aXL+vWk1rcwnvHBw==
Date: Fri, 30 May 2025 11:34:36 +0200
From: Jiri Bohac <jbohac@xxxxxxx>
To: David Hildenbrand <david@xxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>, Baoquan He <bhe@xxxxxxxxxx>,
Donald Dutile <ddutile@xxxxxxxxxx>, Vivek Goyal <vgoyal@xxxxxxxxxx>,
Dave Young <dyoung@xxxxxxxxxx>, kexec@xxxxxxxxxxxxxxxxxxx,
Philipp Rudo <prudo@xxxxxxxxxx>, Pingfan Liu <piliu@xxxxxxxxxx>,
Tao Liu <ltao@xxxxxxxxxx>, linux-kernel@xxxxxxxxxxxxxxx,
David Hildenbrand <dhildenb@xxxxxxxxxx>
Subject: Re: [PATCH v2 0/5] kdump: crashkernel reservation from CMA
Message-ID: <aDl7rHb34zIXEf6j@xxxxxxxxxxxxx>
References: <04904e86-5b5f-4aa1-a120-428dac119189@xxxxxxxxxx>
<427fec88-2a74-471e-aeb6-a108ca8c4336@xxxxxxxxxx>
<Z8Z/gnbtiXT9QAZr@MiWiFi-R3L-srv>
<e9c5c247-85fb-43f1-9aa8-47d62321f37b@xxxxxxxxxx>
<aDgQ0lbt1h5v0lgE@tiehlicka>
<a1a5af90-bc8a-448a-81fa-485624d592f3@xxxxxxxxxx>
<aDlsF5tAcUxo4VgT@tiehlicka>
<e0f7fc1e-2227-4c6b-985a-34a697a52679@xxxxxxxxxx>
<aDl1ViMpK_6q_z06@tiehlicka>
<04a49de5-eb79-431b-ba5b-eae2536781c6@xxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <04a49de5-eb79-431b-ba5b-eae2536781c6@xxxxxxxxxx>
X-Spam-Score: -4.30
X-Spamd-Result: default: False [-4.30 / 50.00];
BAYES_HAM(-3.00)[100.00%];
NEURAL_HAM_LONG(-1.00)[-1.000];
NEURAL_HAM_SHORT(-0.20)[-1.000];
MIME_GOOD(-0.10)[text/plain];
MISSING_XM_UA(0.00)[];
FROM_HAS_DN(0.00)[];
MIME_TRACE(0.00)[0:+];
RCPT_COUNT_TWELVE(0.00)[12];
ARC_NA(0.00)[];
RCVD_COUNT_ZERO(0.00)[0];
MID_RHS_MATCH_FROMTLD(0.00)[];
DKIM_SIGNED(0.00)[suse.cz:s=susede2_rsa,suse.cz:s=susede2_ed25519];
FROM_EQ_ENVFROM(0.00)[];
TO_DN_SOME(0.00)[];
TO_MATCH_ENVRCPT_ALL(0.00)[];
FUZZY_BLOCKED(0.00)[rspamd.com];
DBL_BLOCKED_OPENRESOLVER(0.00)[dwarf.suse.cz:mid,localhost:helo]
X-Spam-Level:
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

On Fri, May 30, 2025 at 11:11:40AM +0200, David Hildenbrand wrote:
> On 30.05.25 11:07, Michal Hocko wrote:
> > On Fri 30-05-25 10:39:39, David Hildenbrand wrote:
> > > On 30.05.25 10:28, Michal Hocko wrote:
> > [...]
> > > > All that being said I would go with an additional parameter to the
> > > > kdump cma setup - e.g. cma_sane_dma that would skip waiting and use 10s
> > > > otherwise. That would make the optimized behavior opt in, we do not need
> > > > to support all sorts of timeouts and also learn if this is not
> > > > sufficient.
> > > >
> > > > Makes sense?
> > >
> > > Just so I understand correctly, you mean extending the "crashkernel=" option
> > > with a boolean parameter? If set, e.g., wait 1s, otherwise magic number 10?
> >
> > crashkernel=1G,cma,cma_sane_dma # no wait on transition
>
> But is no wait ok? I mean, any O_DIRECT with any device would at least take
> a bit, no?
>
> Of course, there is a short time between the crash and actually triggerying
> kdump.
>
> > crashkernel=1G,cma # wait on transition with e.g. 10s timeout
>
> In general, would work for me.

I don't like extending the crashkernel= syntax like this.
It would make hooking into the generic parsing code in
parse_crashkernel() really ugly. The syntax is already
convoluted as is and hard enough to explain in the documentation.

Also I don't see how adding a boolean knob is better than adding
one that allows setting any arbitrary timeout. It has less
flexibility and all the drawbacks of having an extra knob.

I am inclined to just setting the fixed delay to 10s for now and
adding a sysfs knob later if someone asks for it.

Would that work for you?

If you don't have other objections to the v3 series,
I'll just update it for v6.15 and post again a v4
with the 10s timeout...

Thanks for your input!

--
Jiri Bohac <jbohac@xxxxxxx>
SUSE Labs, Prague, Czechia


Return-Path: <linux-kernel+bounces-667894-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from sy.mirrors.kernel.org (sy.mirrors.kernel.org [147.75.48.161])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id A513741E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:39:25 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by sy.mirrors.kernel.org (Postfix) with ESMTPS id 5DD567A319A
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:37:56 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 35B0A22331B;
Fri, 30 May 2025 09:34:53 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=fail reason="signature verification failed" (2048-bit key) header.d=hmeau.com header.i=@hmeau.com header.b="ra1IbCWz"
Received: from abb.hmeau.com (abb.hmeau.com [144.6.53.87])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 25427230BD0
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:34:40 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=144.6.53.87
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597688; cv=none; b=j2JSK9iu/p4cBtoknOzAG/c8mi0AqY5dkz6aa406dOFSIpA73Tijd3lfJy8R7kwbdM9HcSkUIGwmFkvEn8nM4wamzO8pF002JoZrM3ok8kR7PixNew59G0duMevNeJwEyGqdt2i4c6xkLtcMRrmleCC7dTnfwAia6/vBcRYdre4=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597688; c=relaxed/simple;
bh=EoT2dKpjRWSuMyiN59l+2L7JeKYfQR+Tq6keNX2utZc=;
h=Date:From:To:Cc:Subject:Message-ID:MIME-Version:Content-Type:
Content-Disposition:In-Reply-To; b=bp9iDoJB+WclVoOR/agzVoReH6xeO584rgkEnMWMs1TB5XsaoetGt4Ikz73EM/yZu+z4AKOViDEOGmoznUdihQqmAx9RB3dqEAk4RJ/T3GbwCtJ+x66nyaBE4kpUrKURuRkmw/vVTBdZZLrjq+MjA+OX0kbWl1bfQ25GvAlYY98=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au; spf=pass smtp.mailfrom=gondor.apana.org.au; dkim=pass (2048-bit key) header.d=hmeau.com header.i=@hmeau.com header.b=ra1IbCWz; arc=none smtp.client-ip=144.6.53.87
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=gondor.apana.org.au
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=hmeau.com;
s=formenos; h=In-Reply-To:Content-Type:MIME-Version:Message-ID:Subject:Cc:To:
From:Date:Sender:Reply-To:Content-Transfer-Encoding:Content-ID:
Content-Description:Resent-Date:Resent-From:Resent-Sender:Resent-To:Resent-Cc
:Resent-Message-ID:References:List-Id:List-Help:List-Unsubscribe:
List-Subscribe:List-Post:List-Owner:List-Archive;
bh=tSBVL+2tXg2frEyJDCxYdCIb+/FbCVTtW/DFFR+NWi4=; b=ra1IbCWzYjXAeGcf5LUjRsW/Tc
d31sOF+bs/BjFXRTAkEwHpsUvISLMde6zzUb1/w00iMMuNcYJYwB4GnG/qZBfBHvdwCu5R+OQ9PPk
cyEeHjR9m0XLb6IEsd4T2sNXOqcDB9Zl66vakwjnM92cIdlZn9jmyIWEJGJCRwv1PAquvZY2V5Bvo
dqH2Sh4p0VT0inNUlXKw898NXJy1LiC0bYMZZNJ3yMqKTDODjitNgkFviOk1zbGHXak4rHHg4sQCn
YHtL9PUsGyThmsmAlDdia+aE6L6/odL1XBluYB1+xKLyBY8plqN7FpcRsjStA8Ca1u7lcvKO8fOkO
ZlSZD+1Q==;
Received: from loth.rohan.me.apana.org.au ([192.168.167.2])
by formenos.hmeau.com with smtp (Exim 4.96 #2 (Debian))
id 1uKw8H-009kfY-1N;
Fri, 30 May 2025 17:34:34 +0800
Received: by loth.rohan.me.apana.org.au (sSMTP sendmail emulation); Fri, 30 May 2025 17:34:33 +0800
Date: Fri, 30 May 2025 17:34:33 +0800
From: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
To: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx, dietmar.eggemann@xxxxxxx,
torvalds@xxxxxxxxxxxxxxxxxxxx, peterz@xxxxxxxxxxxxx,
sshegde@xxxxxxxxxxxxx, tglx@xxxxxxxxxxxxx, vschneid@xxxxxxxxxx,
rostedt@xxxxxxxxxxx, mgorman@xxxxxxx, vincent.guittot@xxxxxxxxxx,
bigeasy@xxxxxxxxxxxxx, mingo@xxxxxxxxxx
Subject: [PATCH] sched: Break dependency loop between sched.h and preempt.h
Message-ID: <aDl7qWiO94tGz6X9@xxxxxxxxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20250528080924.2273858-1-mingo@xxxxxxxxxx>
X-Newsgroups: apana.lists.os.linux.kernel
X-Spam-Status: No, score=-3.1 required=5.0 tests=DKIM_INVALID,DKIM_SIGNED,
HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,RCVD_IN_DNSWL_MED,
RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,RCVD_IN_VALIDITY_RPBL_BLOCKED,
SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

Ingo Molnar <mingo@xxxxxxxxxx> wrote:
>
> The latest version of this series can be found at:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/mingo/tip.git WIP.sched/core

I'm trying to break a dependency loop between linux/sched.h and
linux/preempt.h and your tree seems to be a good base for my
patch.

---8<---
There is a dependency loop between linux/preempt.h and
linux/sched.h:

https://patchwork.kernel.org/project/linux-crypto/patch/20250530041658.909576-1-chenhuacai@xxxxxxxxxxx/

In particular, sched.h relies on PREEMPT_LOCK_OFFSET from preempt.h,
introduced by commit fe32d3cd5e8e; while preempt.h relies on sched.h
for current->softirq_disable_cnt, introduced by commit 728b478d2d35.

sched.h actually includes preempt.h, while preempt.h does not include
sched.h, which causes build failures for users who include preempt.h
without sched.h.

Fix this by splitting struct task_struct out of sched.h and into
sched/types.h. Then preempt.h can include sched/types.h and sched.h
can continue to include preempt.h without creating a loop.

Note that the struct seq_file forward declaration has been kept
in linux/sched.h despite there being no users in either sched.h
or sched/types.h. This is because multiple header files rely on
that forward declaration.

Reported-by: Huacai Chen <chenhuacai@xxxxxxxxxxx>
Signed-off-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 1fad1c8a4c76..abb0d982396b 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -7,8 +7,10 @@
* preempt_count (used for kernel preemption, interrupt count, etc.)
*/

+#include <asm/current.h>
#include <linux/linkage.h>
#include <linux/cleanup.h>
+#include <linux/sched/types.h>
#include <linux/types.h>

/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d3167059675..44bfd980f620 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -10,80 +10,21 @@
#include <uapi/linux/sched.h>

#include <asm/current.h>
-#include <asm/processor.h>
#include <linux/thread_info.h>
#include <linux/preempt.h>
#include <linux/cpumask_types.h>

-#include <linux/cache.h>
-#include <linux/irqflags_types.h>
-#include <linux/smp_types.h>
-#include <linux/pid_types.h>
-#include <linux/sem_types.h>
-#include <linux/shm.h>
-#include <linux/kmsan_types.h>
-#include <linux/mutex_types.h>
-#include <linux/plist_types.h>
-#include <linux/hrtimer_types.h>
-#include <linux/timer_types.h>
-#include <linux/seccomp_types.h>
-#include <linux/nodemask_types.h>
-#include <linux/refcount_types.h>
#include <linux/resource.h>
-#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
-#include <linux/signal_types.h>
-#include <linux/syscall_user_dispatch_types.h>
-#include <linux/mm_types_task.h>
-#include <linux/netdevice_xmit.h>
-#include <linux/task_io_accounting.h>
-#include <linux/posix-timers_types.h>
-#include <linux/restart_block.h>
-#include <uapi/linux/rseq.h>
-#include <linux/seqlock_types.h>
-#include <linux/kcsan.h>
-#include <linux/rv.h>
-#include <linux/uidgid_types.h>
#include <linux/tracepoint-defs.h>
#include <asm/kmap_size.h>

-/* task_struct member predeclarations (sorted alphabetically): */
-struct audit_context;
-struct bio_list;
-struct blk_plug;
-struct bpf_local_storage;
-struct bpf_run_ctx;
-struct bpf_net_context;
-struct capture_control;
-struct cfs_rq;
-struct fs_struct;
-struct futex_pi_state;
-struct io_context;
-struct io_uring_task;
-struct mempolicy;
-struct nameidata;
-struct nsproxy;
-struct perf_event_context;
-struct perf_ctx_data;
+/* struct predeclarations (sorted alphabetically): */
struct pid_namespace;
-struct pipe_inode_info;
-struct rcu_node;
-struct reclaim_state;
-struct robust_list_head;
struct root_domain;
-struct rq;
struct sched_attr;
-struct sched_dl_entity;
struct seq_file;
-struct sighand_struct;
-struct signal_struct;
-struct task_delay_info;
-struct task_group;
-struct task_struct;
-struct user_event_mm;
-
-#include <linux/sched/ext.h>

/*
* Task state bitmask. NOTE! These bits are also
@@ -310,14 +251,6 @@ struct user_event_mm;

#define get_current_state() READ_ONCE(current->__state)

-/*
- * Define the task command name length as enum, then it can be visible to
- * BPF programs.
- */
-enum {
- TASK_COMM_LEN = 16,
-};
-
extern void sched_tick(void);

#define MAX_SCHEDULE_TIMEOUT LONG_MAX
@@ -343,46 +276,6 @@ extern void io_schedule(void);
DECLARE_TRACEPOINT(sched_set_state_tp);
extern void __trace_set_current_state(int state_value);

-/**
- * struct prev_cputime - snapshot of system and user cputime
- * @utime: time spent in user mode
- * @stime: time spent in system mode
- * @lock: protects the above two fields
- *
- * Stores previous user/system time values such that we can guarantee
- * monotonicity.
- */
-struct prev_cputime {
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- u64 utime;
- u64 stime;
- raw_spinlock_t lock;
-#endif
-};
-
-enum vtime_state {
- /* Task is sleeping or running in a CPU with VTIME inactive: */
- VTIME_INACTIVE = 0,
- /* Task is idle */
- VTIME_IDLE,
- /* Task runs in kernelspace in a CPU with VTIME active: */
- VTIME_SYS,
- /* Task runs in userspace in a CPU with VTIME active: */
- VTIME_USER,
- /* Task runs as guests in a CPU with VTIME active: */
- VTIME_GUEST,
-};
-
-struct vtime {
- seqcount_t seqcount;
- unsigned long long starttime;
- enum vtime_state state;
- unsigned int cpu;
- u64 utime;
- u64 stime;
- u64 gtime;
-};
-
/*
* Utilization clamp constraints.
* @UCLAMP_MIN: Minimum utilization
@@ -404,380 +297,6 @@ struct sched_param {
int sched_priority;
};

-struct sched_info {
-#ifdef CONFIG_SCHED_INFO
- /* Cumulative counters: */
-
- /* # of times we have run on this CPU: */
- unsigned long pcount;
-
- /* Time spent waiting on a runqueue: */
- unsigned long long run_delay;
-
- /* Max time spent waiting on a runqueue: */
- unsigned long long max_run_delay;
-
- /* Min time spent waiting on a runqueue: */
- unsigned long long min_run_delay;
-
- /* Timestamps: */
-
- /* When did we last run on a CPU? */
- unsigned long long last_arrival;
-
- /* When were we last queued to run? */
- unsigned long long last_queued;
-
-#endif /* CONFIG_SCHED_INFO */
-};
-
-/*
- * Integer metrics need fixed point arithmetic, e.g., sched/fair
- * has a few: load, load_avg, util_avg, freq, and capacity.
- *
- * We define a basic fixed point arithmetic range, and then formalize
- * all these metrics based on that basic range.
- */
-# define SCHED_FIXEDPOINT_SHIFT 10
-# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)
-
-/* Increase resolution of cpu_capacity calculations */
-# define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
-# define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
-
-struct load_weight {
- unsigned long weight;
- u32 inv_weight;
-};
-
-/*
- * The load/runnable/util_avg accumulates an infinite geometric series
- * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
- *
- * [load_avg definition]
- *
- * load_avg = runnable% * scale_load_down(load)
- *
- * [runnable_avg definition]
- *
- * runnable_avg = runnable% * SCHED_CAPACITY_SCALE
- *
- * [util_avg definition]
- *
- * util_avg = running% * SCHED_CAPACITY_SCALE
- *
- * where runnable% is the time ratio that a sched_entity is runnable and
- * running% the time ratio that a sched_entity is running.
- *
- * For cfs_rq, they are the aggregated values of all runnable and blocked
- * sched_entities.
- *
- * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
- * capacity scaling. The scaling is done through the rq_clock_pelt that is used
- * for computing those signals (see update_rq_clock_pelt())
- *
- * N.B., the above ratios (runnable% and running%) themselves are in the
- * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
- * to as large a range as necessary. This is for example reflected by
- * util_avg's SCHED_CAPACITY_SCALE.
- *
- * [Overflow issue]
- *
- * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
- * with the highest load (=88761), always runnable on a single cfs_rq,
- * and should not overflow as the number already hits PID_MAX_LIMIT.
- *
- * For all other cases (including 32-bit kernels), struct load_weight's
- * weight will overflow first before we do, because:
- *
- * Max(load_avg) <= Max(load.weight)
- *
- * Then it is the load_weight's responsibility to consider overflow
- * issues.
- */
-struct sched_avg {
- u64 last_update_time;
- u64 load_sum;
- u64 runnable_sum;
- u32 util_sum;
- u32 period_contrib;
- unsigned long load_avg;
- unsigned long runnable_avg;
- unsigned long util_avg;
- unsigned int util_est;
-} ____cacheline_aligned;
-
-/*
- * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
- * updates. When a task is dequeued, its util_est should not be updated if its
- * util_avg has not been updated in the meantime.
- * This information is mapped into the MSB bit of util_est at dequeue time.
- * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
- * it is safe to use MSB.
- */
-#define UTIL_EST_WEIGHT_SHIFT 2
-#define UTIL_AVG_UNCHANGED 0x80000000
-
-struct sched_statistics {
-#ifdef CONFIG_SCHEDSTATS
- u64 wait_start;
- u64 wait_max;
- u64 wait_count;
- u64 wait_sum;
- u64 iowait_count;
- u64 iowait_sum;
-
- u64 sleep_start;
- u64 sleep_max;
- s64 sum_sleep_runtime;
-
- u64 block_start;
- u64 block_max;
- s64 sum_block_runtime;
-
- s64 exec_max;
- u64 slice_max;
-
- u64 nr_migrations_cold;
- u64 nr_failed_migrations_affine;
- u64 nr_failed_migrations_running;
- u64 nr_failed_migrations_hot;
- u64 nr_forced_migrations;
-
- u64 nr_wakeups;
- u64 nr_wakeups_sync;
- u64 nr_wakeups_migrate;
- u64 nr_wakeups_local;
- u64 nr_wakeups_remote;
- u64 nr_wakeups_affine;
- u64 nr_wakeups_affine_attempts;
- u64 nr_wakeups_passive;
- u64 nr_wakeups_idle;
-
-#ifdef CONFIG_SCHED_CORE
- u64 core_forceidle_sum;
-#endif
-#endif /* CONFIG_SCHEDSTATS */
-} ____cacheline_aligned;
-
-struct sched_entity {
- /* For load-balancing: */
- struct load_weight load;
- struct rb_node run_node;
- u64 deadline;
- u64 min_vruntime;
- u64 min_slice;
-
- struct list_head group_node;
- unsigned char on_rq;
- unsigned char sched_delayed;
- unsigned char rel_deadline;
- unsigned char custom_slice;
- /* hole */
-
- u64 exec_start;
- u64 sum_exec_runtime;
- u64 prev_sum_exec_runtime;
- u64 vruntime;
- s64 vlag;
- u64 slice;
-
- u64 nr_migrations;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- int depth;
- struct sched_entity *parent;
- /* rq on which this entity is (to be) queued: */
- struct cfs_rq *cfs_rq;
- /* rq "owned" by this entity/group: */
- struct cfs_rq *my_q;
- /* cached value of my_q->h_nr_running */
- unsigned long runnable_weight;
-#endif
-
- /*
- * Per entity load average tracking.
- *
- * Put into separate cache line so it does not
- * collide with read-mostly values above.
- */
- struct sched_avg avg;
-};
-
-struct sched_rt_entity {
- struct list_head run_list;
- unsigned long timeout;
- unsigned long watchdog_stamp;
- unsigned int time_slice;
- unsigned short on_rq;
- unsigned short on_list;
-
- struct sched_rt_entity *back;
-#ifdef CONFIG_RT_GROUP_SCHED
- struct sched_rt_entity *parent;
- /* rq on which this entity is (to be) queued: */
- struct rt_rq *rt_rq;
- /* rq "owned" by this entity/group: */
- struct rt_rq *my_q;
-#endif
-} __randomize_layout;
-
-typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
-typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
-
-struct sched_dl_entity {
- struct rb_node rb_node;
-
- /*
- * Original scheduling parameters. Copied here from sched_attr
- * during sched_setattr(), they will remain the same until
- * the next sched_setattr().
- */
- u64 dl_runtime; /* Maximum runtime for each instance */
- u64 dl_deadline; /* Relative deadline of each instance */
- u64 dl_period; /* Separation of two instances (period) */
- u64 dl_bw; /* dl_runtime / dl_period */
- u64 dl_density; /* dl_runtime / dl_deadline */
-
- /*
- * Actual scheduling parameters. Initialized with the values above,
- * they are continuously updated during task execution. Note that
- * the remaining runtime could be < 0 in case we are in overrun.
- */
- s64 runtime; /* Remaining runtime for this instance */
- u64 deadline; /* Absolute deadline for this instance */
- unsigned int flags; /* Specifying the scheduler behaviour */
-
- /*
- * Some bool flags:
- *
- * @dl_throttled tells if we exhausted the runtime. If so, the
- * task has to wait for a replenishment to be performed at the
- * next firing of dl_timer.
- *
- * @dl_yielded tells if task gave up the CPU before consuming
- * all its available runtime during the last job.
- *
- * @dl_non_contending tells if the task is inactive while still
- * contributing to the active utilization. In other words, it
- * indicates if the inactive timer has been armed and its handler
- * has not been executed yet. This flag is useful to avoid race
- * conditions between the inactive timer handler and the wakeup
- * code.
- *
- * @dl_overrun tells if the task asked to be informed about runtime
- * overruns.
- *
- * @dl_server tells if this is a server entity.
- *
- * @dl_defer tells if this is a deferred or regular server. For
- * now only defer server exists.
- *
- * @dl_defer_armed tells if the deferrable server is waiting
- * for the replenishment timer to activate it.
- *
- * @dl_server_active tells if the dlserver is active(started).
- * dlserver is started on first cfs enqueue on an idle runqueue
- * and is stopped when a dequeue results in 0 cfs tasks on the
- * runqueue. In other words, dlserver is active only when cpu's
- * runqueue has atleast one cfs task.
- *
- * @dl_defer_running tells if the deferrable server is actually
- * running, skipping the defer phase.
- */
- unsigned int dl_throttled : 1;
- unsigned int dl_yielded : 1;
- unsigned int dl_non_contending : 1;
- unsigned int dl_overrun : 1;
- unsigned int dl_server : 1;
- unsigned int dl_server_active : 1;
- unsigned int dl_defer : 1;
- unsigned int dl_defer_armed : 1;
- unsigned int dl_defer_running : 1;
-
- /*
- * Bandwidth enforcement timer. Each -deadline task has its
- * own bandwidth to be enforced, thus we need one timer per task.
- */
- struct hrtimer dl_timer;
-
- /*
- * Inactive timer, responsible for decreasing the active utilization
- * at the "0-lag time". When a -deadline task blocks, it contributes
- * to GRUB's active utilization until the "0-lag time", hence a
- * timer is needed to decrease the active utilization at the correct
- * time.
- */
- struct hrtimer inactive_timer;
-
- /*
- * Bits for DL-server functionality. Also see the comment near
- * dl_server_update().
- *
- * @rq the runqueue this server is for
- *
- * @server_has_tasks() returns true if @server_pick return a
- * runnable task.
- */
- struct rq *rq;
- dl_server_has_tasks_f server_has_tasks;
- dl_server_pick_f server_pick_task;
-
-#ifdef CONFIG_RT_MUTEXES
- /*
- * Priority Inheritance. When a DEADLINE scheduling entity is boosted
- * pi_se points to the donor, otherwise points to the dl_se it belongs
- * to (the original one/itself).
- */
- struct sched_dl_entity *pi_se;
-#endif
-};
-
-#ifdef CONFIG_UCLAMP_TASK
-/* Number of utilization clamp buckets (shorter alias) */
-#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
-
-/*
- * Utilization clamp for a scheduling entity
- * @value: clamp value "assigned" to a se
- * @bucket_id: bucket index corresponding to the "assigned" value
- * @active: the se is currently refcounted in a rq's bucket
- * @user_defined: the requested clamp value comes from user-space
- *
- * The bucket_id is the index of the clamp bucket matching the clamp value
- * which is pre-computed and stored to avoid expensive integer divisions from
- * the fast path.
- *
- * The active bit is set whenever a task has got an "effective" value assigned,
- * which can be different from the clamp value "requested" from user-space.
- * This allows to know a task is refcounted in the rq's bucket corresponding
- * to the "effective" bucket_id.
- *
- * The user_defined bit is set whenever a task has got a task-specific clamp
- * value requested from userspace, i.e. the system defaults apply to this task
- * just as a restriction. This allows to relax default clamps when a less
- * restrictive task-specific value has been requested, thus allowing to
- * implement a "nice" semantic. For example, a task running with a 20%
- * default boost can still drop its own boosting to 0%.
- */
-struct uclamp_se {
- unsigned int value : bits_per(SCHED_CAPACITY_SCALE);
- unsigned int bucket_id : bits_per(UCLAMP_BUCKETS);
- unsigned int active : 1;
- unsigned int user_defined : 1;
-};
-#endif /* CONFIG_UCLAMP_TASK */
-
-union rcu_special {
- struct {
- u8 blocked;
- u8 need_qs;
- u8 exp_hint; /* Hint for performance. */
- u8 need_mb; /* Readers need smp_mb(). */
- } b; /* Bits. */
- u32 s; /* Set of bits. */
-};
-
enum perf_event_task_context {
perf_invalid_context = -1,
perf_hw_context = 0,
@@ -785,863 +304,6 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};

-/*
- * Number of contexts where an event can trigger:
- * task, softirq, hardirq, nmi.
- */
-#define PERF_NR_CONTEXTS 4
-
-struct wake_q_node {
- struct wake_q_node *next;
-};
-
-struct kmap_ctrl {
-#ifdef CONFIG_KMAP_LOCAL
- int idx;
- pte_t pteval[KM_MAX_IDX];
-#endif
-};
-
-struct task_struct {
-#ifdef CONFIG_THREAD_INFO_IN_TASK
- /*
- * For reasons of header soup (see current_thread_info()), this
- * must be the first element of task_struct.
- */
- struct thread_info thread_info;
-#endif
- unsigned int __state;
-
- /* saved state for "spinlock sleepers" */
- unsigned int saved_state;
-
- /*
- * This begins the randomizable portion of task_struct. Only
- * scheduling-critical items should be added above here.
- */
- randomized_struct_fields_start
-
- void *stack;
- refcount_t usage;
- /* Per task flags (PF_*), defined further below: */
- unsigned int flags;
- unsigned int ptrace;
-
-#ifdef CONFIG_MEM_ALLOC_PROFILING
- struct alloc_tag *alloc_tag;
-#endif
-
- int on_cpu;
- struct __call_single_node wake_entry;
- unsigned int wakee_flips;
- unsigned long wakee_flip_decay_ts;
- struct task_struct *last_wakee;
-
- /*
- * recent_used_cpu is initially set as the last CPU used by a task
- * that wakes affine another task. Waker/wakee relationships can
- * push tasks around a CPU where each wakeup moves to the next one.
- * Tracking a recently used CPU allows a quick search for a recently
- * used CPU that may be idle.
- */
- int recent_used_cpu;
- int wake_cpu;
- int on_rq;
-
- int prio;
- int static_prio;
- int normal_prio;
- unsigned int rt_priority;
-
- struct sched_entity se;
- struct sched_rt_entity rt;
- struct sched_dl_entity dl;
- struct sched_dl_entity *dl_server;
-#ifdef CONFIG_SCHED_CLASS_EXT
- struct sched_ext_entity scx;
-#endif
- const struct sched_class *sched_class;
-
-#ifdef CONFIG_SCHED_CORE
- struct rb_node core_node;
- unsigned long core_cookie;
- unsigned int core_occupation;
-#endif
-
-#ifdef CONFIG_CGROUP_SCHED
- struct task_group *sched_task_group;
-#endif
-
-
-#ifdef CONFIG_UCLAMP_TASK
- /*
- * Clamp values requested for a scheduling entity.
- * Must be updated with task_rq_lock() held.
- */
- struct uclamp_se uclamp_req[UCLAMP_CNT];
- /*
- * Effective clamp values used for a scheduling entity.
- * Must be updated with task_rq_lock() held.
- */
- struct uclamp_se uclamp[UCLAMP_CNT];
-#endif
-
- struct sched_statistics stats;
-
-#ifdef CONFIG_PREEMPT_NOTIFIERS
- /* List of struct preempt_notifier: */
- struct hlist_head preempt_notifiers;
-#endif
-
-#ifdef CONFIG_BLK_DEV_IO_TRACE
- unsigned int btrace_seq;
-#endif
-
- unsigned int policy;
- unsigned long max_allowed_capacity;
- int nr_cpus_allowed;
- const cpumask_t *cpus_ptr;
- cpumask_t *user_cpus_ptr;
- cpumask_t cpus_mask;
- void *migration_pending;
- unsigned short migration_disabled;
- unsigned short migration_flags;
-
-#ifdef CONFIG_PREEMPT_RCU
- int rcu_read_lock_nesting;
- union rcu_special rcu_read_unlock_special;
- struct list_head rcu_node_entry;
- struct rcu_node *rcu_blocked_node;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-
-#ifdef CONFIG_TASKS_RCU
- unsigned long rcu_tasks_nvcsw;
- u8 rcu_tasks_holdout;
- u8 rcu_tasks_idx;
- int rcu_tasks_idle_cpu;
- struct list_head rcu_tasks_holdout_list;
- int rcu_tasks_exit_cpu;
- struct list_head rcu_tasks_exit_list;
-#endif /* #ifdef CONFIG_TASKS_RCU */
-
-#ifdef CONFIG_TASKS_TRACE_RCU
- int trc_reader_nesting;
- int trc_ipi_to_cpu;
- union rcu_special trc_reader_special;
- struct list_head trc_holdout_list;
- struct list_head trc_blkd_node;
- int trc_blkd_cpu;
-#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
-
- struct sched_info sched_info;
-
- struct list_head tasks;
- struct plist_node pushable_tasks;
- struct rb_node pushable_dl_tasks;
-
- struct mm_struct *mm;
- struct mm_struct *active_mm;
- struct address_space *faults_disabled_mapping;
-
- int exit_state;
- int exit_code;
- int exit_signal;
- /* The signal sent when the parent dies: */
- int pdeath_signal;
- /* JOBCTL_*, siglock protected: */
- unsigned long jobctl;
-
- /* Used for emulating ABI behavior of previous Linux versions: */
- unsigned int personality;
-
- /* Scheduler bits, serialized by scheduler locks: */
- unsigned sched_reset_on_fork:1;
- unsigned sched_contributes_to_load:1;
- unsigned sched_migrated:1;
- unsigned sched_task_hot:1;
-
- /* Force alignment to the next boundary: */
- unsigned :0;
-
- /* Unserialized, strictly 'current' */
-
- /*
- * This field must not be in the scheduler word above due to wakelist
- * queueing no longer being serialized by p->on_cpu. However:
- *
- * p->XXX = X; ttwu()
- * schedule() if (p->on_rq && ..) // false
- * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true
- * deactivate_task() ttwu_queue_wakelist())
- * p->on_rq = 0; p->sched_remote_wakeup = Y;
- *
- * guarantees all stores of 'current' are visible before
- * ->sched_remote_wakeup gets used, so it can be in this word.
- */
- unsigned sched_remote_wakeup:1;
-#ifdef CONFIG_RT_MUTEXES
- unsigned sched_rt_mutex:1;
-#endif
-
- /* Bit to tell TOMOYO we're in execve(): */
- unsigned in_execve:1;
- unsigned in_iowait:1;
-#ifndef TIF_RESTORE_SIGMASK
- unsigned restore_sigmask:1;
-#endif
-#ifdef CONFIG_MEMCG_V1
- unsigned in_user_fault:1;
-#endif
-#ifdef CONFIG_LRU_GEN
- /* whether the LRU algorithm may apply to this access */
- unsigned in_lru_fault:1;
-#endif
-#ifdef CONFIG_COMPAT_BRK
- unsigned brk_randomized:1;
-#endif
-#ifdef CONFIG_CGROUPS
- /* disallow userland-initiated cgroup migration */
- unsigned no_cgroup_migration:1;
- /* task is frozen/stopped (used by the cgroup freezer) */
- unsigned frozen:1;
-#endif
-#ifdef CONFIG_BLK_CGROUP
- unsigned use_memdelay:1;
-#endif
-#ifdef CONFIG_PSI
- /* Stalled due to lack of memory */
- unsigned in_memstall:1;
-#endif
-#ifdef CONFIG_PAGE_OWNER
- /* Used by page_owner=on to detect recursion in page tracking. */
- unsigned in_page_owner:1;
-#endif
-#ifdef CONFIG_EVENTFD
- /* Recursion prevention for eventfd_signal() */
- unsigned in_eventfd:1;
-#endif
-#ifdef CONFIG_ARCH_HAS_CPU_PASID
- unsigned pasid_activated:1;
-#endif
-#ifdef CONFIG_X86_BUS_LOCK_DETECT
- unsigned reported_split_lock:1;
-#endif
-#ifdef CONFIG_TASK_DELAY_ACCT
- /* delay due to memory thrashing */
- unsigned in_thrashing:1;
-#endif
-#ifdef CONFIG_PREEMPT_RT
- struct netdev_xmit net_xmit;
-#endif
- unsigned long atomic_flags; /* Flags requiring atomic access. */
-
- struct restart_block restart_block;
-
- pid_t pid;
- pid_t tgid;
-
-#ifdef CONFIG_STACKPROTECTOR
- /* Canary value for the -fstack-protector GCC feature: */
- unsigned long stack_canary;
-#endif
- /*
- * Pointers to the (original) parent process, youngest child, younger sibling,
- * older sibling, respectively. (p->father can be replaced with
- * p->real_parent->pid)
- */
-
- /* Real parent process: */
- struct task_struct __rcu *real_parent;
-
- /* Recipient of SIGCHLD, wait4() reports: */
- struct task_struct __rcu *parent;
-
- /*
- * Children/sibling form the list of natural children:
- */
- struct list_head children;
- struct list_head sibling;
- struct task_struct *group_leader;
-
- /*
- * 'ptraced' is the list of tasks this task is using ptrace() on.
- *
- * This includes both natural children and PTRACE_ATTACH targets.
- * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
- */
- struct list_head ptraced;
- struct list_head ptrace_entry;
-
- /* PID/PID hash table linkage. */
- struct pid *thread_pid;
- struct hlist_node pid_links[PIDTYPE_MAX];
- struct list_head thread_node;
-
- struct completion *vfork_done;
-
- /* CLONE_CHILD_SETTID: */
- int __user *set_child_tid;
-
- /* CLONE_CHILD_CLEARTID: */
- int __user *clear_child_tid;
-
- /* PF_KTHREAD | PF_IO_WORKER */
- void *worker_private;
-
- u64 utime;
- u64 stime;
-#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
- u64 utimescaled;
- u64 stimescaled;
-#endif
- u64 gtime;
- struct prev_cputime prev_cputime;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- struct vtime vtime;
-#endif
-
-#ifdef CONFIG_NO_HZ_FULL
- atomic_t tick_dep_mask;
-#endif
- /* Context switch counts: */
- unsigned long nvcsw;
- unsigned long nivcsw;
-
- /* Monotonic time in nsecs: */
- u64 start_time;
-
- /* Boot based time in nsecs: */
- u64 start_boottime;
-
- /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
- unsigned long min_flt;
- unsigned long maj_flt;
-
- /* Empty if CONFIG_POSIX_CPUTIMERS=n */
- struct posix_cputimers posix_cputimers;
-
-#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
- struct posix_cputimers_work posix_cputimers_work;
-#endif
-
- /* Process credentials: */
-
- /* Tracer's credentials at attach: */
- const struct cred __rcu *ptracer_cred;
-
- /* Objective and real subjective task credentials (COW): */
- const struct cred __rcu *real_cred;
-
- /* Effective (overridable) subjective task credentials (COW): */
- const struct cred __rcu *cred;
-
-#ifdef CONFIG_KEYS
- /* Cached requested key. */
- struct key *cached_requested_key;
-#endif
-
- /*
- * executable name, excluding path.
- *
- * - normally initialized begin_new_exec()
- * - set it with set_task_comm()
- * - strscpy_pad() to ensure it is always NUL-terminated and
- * zero-padded
- * - task_lock() to ensure the operation is atomic and the name is
- * fully updated.
- */
- char comm[TASK_COMM_LEN];
-
- struct nameidata *nameidata;
-
-#ifdef CONFIG_SYSVIPC
- struct sysv_sem sysvsem;
- struct sysv_shm sysvshm;
-#endif
-#ifdef CONFIG_DETECT_HUNG_TASK
- unsigned long last_switch_count;
- unsigned long last_switch_time;
-#endif
- /* Filesystem information: */
- struct fs_struct *fs;
-
- /* Open file information: */
- struct files_struct *files;
-
-#ifdef CONFIG_IO_URING
- struct io_uring_task *io_uring;
-#endif
-
- /* Namespaces: */
- struct nsproxy *nsproxy;
-
- /* Signal handlers: */
- struct signal_struct *signal;
- struct sighand_struct __rcu *sighand;
- sigset_t blocked;
- sigset_t real_blocked;
- /* Restored if set_restore_sigmask() was used: */
- sigset_t saved_sigmask;
- struct sigpending pending;
- unsigned long sas_ss_sp;
- size_t sas_ss_size;
- unsigned int sas_ss_flags;
-
- struct callback_head *task_works;
-
-#ifdef CONFIG_AUDIT
-#ifdef CONFIG_AUDITSYSCALL
- struct audit_context *audit_context;
-#endif
- kuid_t loginuid;
- unsigned int sessionid;
-#endif
- struct seccomp seccomp;
- struct syscall_user_dispatch syscall_dispatch;
-
- /* Thread group tracking: */
- u64 parent_exec_id;
- u64 self_exec_id;
-
- /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
- spinlock_t alloc_lock;
-
- /* Protection of the PI data structures: */
- raw_spinlock_t pi_lock;
-
- struct wake_q_node wake_q;
-
-#ifdef CONFIG_RT_MUTEXES
- /* PI waiters blocked on a rt_mutex held by this task: */
- struct rb_root_cached pi_waiters;
- /* Updated under owner's pi_lock and rq lock */
- struct task_struct *pi_top_task;
- /* Deadlock detection and priority inheritance handling: */
- struct rt_mutex_waiter *pi_blocked_on;
-#endif
-
-#ifdef CONFIG_DEBUG_MUTEXES
- /* Mutex deadlock detection: */
- struct mutex_waiter *blocked_on;
-#endif
-
-#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
- struct mutex *blocker_mutex;
-#endif
-
-#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
- int non_block_count;
-#endif
-
-#ifdef CONFIG_TRACE_IRQFLAGS
- struct irqtrace_events irqtrace;
- unsigned int hardirq_threaded;
- u64 hardirq_chain_key;
- int softirqs_enabled;
- int softirq_context;
- int irq_config;
-#endif
-#ifdef CONFIG_PREEMPT_RT
- int softirq_disable_cnt;
-#endif
-
-#ifdef CONFIG_LOCKDEP
-# define MAX_LOCK_DEPTH 48UL
- u64 curr_chain_key;
- int lockdep_depth;
- unsigned int lockdep_recursion;
- struct held_lock held_locks[MAX_LOCK_DEPTH];
-#endif
-
-#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
- unsigned int in_ubsan;
-#endif
-
- /* Journalling filesystem info: */
- void *journal_info;
-
- /* Stacked block device info: */
- struct bio_list *bio_list;
-
- /* Stack plugging: */
- struct blk_plug *plug;
-
- /* VM state: */
- struct reclaim_state *reclaim_state;
-
- struct io_context *io_context;
-
-#ifdef CONFIG_COMPACTION
- struct capture_control *capture_control;
-#endif
- /* Ptrace state: */
- unsigned long ptrace_message;
- kernel_siginfo_t *last_siginfo;
-
- struct task_io_accounting ioac;
-#ifdef CONFIG_PSI
- /* Pressure stall state */
- unsigned int psi_flags;
-#endif
-#ifdef CONFIG_TASK_XACCT
- /* Accumulated RSS usage: */
- u64 acct_rss_mem1;
- /* Accumulated virtual memory usage: */
- u64 acct_vm_mem1;
- /* stime + utime since last update: */
- u64 acct_timexpd;
-#endif
-#ifdef CONFIG_CPUSETS
- /* Protected by ->alloc_lock: */
- nodemask_t mems_allowed;
- /* Sequence number to catch updates: */
- seqcount_spinlock_t mems_allowed_seq;
- int cpuset_mem_spread_rotor;
-#endif
-#ifdef CONFIG_CGROUPS
- /* Control Group info protected by css_set_lock: */
- struct css_set __rcu *cgroups;
- /* cg_list protected by css_set_lock and tsk->alloc_lock: */
- struct list_head cg_list;
-#endif
-#ifdef CONFIG_X86_CPU_RESCTRL
- u32 closid;
- u32 rmid;
-#endif
-#ifdef CONFIG_FUTEX
- struct robust_list_head __user *robust_list;
-#ifdef CONFIG_COMPAT
- struct compat_robust_list_head __user *compat_robust_list;
-#endif
- struct list_head pi_state_list;
- struct futex_pi_state *pi_state_cache;
- struct mutex futex_exit_mutex;
- unsigned int futex_state;
-#endif
-#ifdef CONFIG_PERF_EVENTS
- u8 perf_recursion[PERF_NR_CONTEXTS];
- struct perf_event_context *perf_event_ctxp;
- struct mutex perf_event_mutex;
- struct list_head perf_event_list;
- struct perf_ctx_data __rcu *perf_ctx_data;
-#endif
-#ifdef CONFIG_DEBUG_PREEMPT
- unsigned long preempt_disable_ip;
-#endif
-#ifdef CONFIG_NUMA
- /* Protected by alloc_lock: */
- struct mempolicy *mempolicy;
- short il_prev;
- u8 il_weight;
- short pref_node_fork;
-#endif
-#ifdef CONFIG_NUMA_BALANCING
- int numa_scan_seq;
- unsigned int numa_scan_period;
- unsigned int numa_scan_period_max;
- int numa_preferred_nid;
- unsigned long numa_migrate_retry;
- /* Migration stamp: */
- u64 node_stamp;
- u64 last_task_numa_placement;
- u64 last_sum_exec_runtime;
- struct callback_head numa_work;
-
- /*
- * This pointer is only modified for current in syscall and
- * pagefault context (and for tasks being destroyed), so it can be read
- * from any of the following contexts:
- * - RCU read-side critical section
- * - current->numa_group from everywhere
- * - task's runqueue locked, task not running
- */
- struct numa_group __rcu *numa_group;
-
- /*
- * numa_faults is an array split into four regions:
- * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
- * in this precise order.
- *
- * faults_memory: Exponential decaying average of faults on a per-node
- * basis. Scheduling placement decisions are made based on these
- * counts. The values remain static for the duration of a PTE scan.
- * faults_cpu: Track the nodes the process was running on when a NUMA
- * hinting fault was incurred.
- * faults_memory_buffer and faults_cpu_buffer: Record faults per node
- * during the current scan window. When the scan completes, the counts
- * in faults_memory and faults_cpu decay and these values are copied.
- */
- unsigned long *numa_faults;
- unsigned long total_numa_faults;
-
- /*
- * numa_faults_locality tracks if faults recorded during the last
- * scan window were remote/local or failed to migrate. The task scan
- * period is adapted based on the locality of the faults with different
- * weights depending on whether they were shared or private faults
- */
- unsigned long numa_faults_locality[3];
-
- unsigned long numa_pages_migrated;
-#endif /* CONFIG_NUMA_BALANCING */
-
-#ifdef CONFIG_RSEQ
- struct rseq __user *rseq;
- u32 rseq_len;
- u32 rseq_sig;
- /*
- * RmW on rseq_event_mask must be performed atomically
- * with respect to preemption.
- */
- unsigned long rseq_event_mask;
-# ifdef CONFIG_DEBUG_RSEQ
- /*
- * This is a place holder to save a copy of the rseq fields for
- * validation of read-only fields. The struct rseq has a
- * variable-length array at the end, so it cannot be used
- * directly. Reserve a size large enough for the known fields.
- */
- char rseq_fields[sizeof(struct rseq)];
-# endif
-#endif
-
-#ifdef CONFIG_SCHED_MM_CID
- int mm_cid; /* Current cid in mm */
- int last_mm_cid; /* Most recent cid in mm */
- int migrate_from_cpu;
- int mm_cid_active; /* Whether cid bitmap is active */
- struct callback_head cid_work;
-#endif
-
- struct tlbflush_unmap_batch tlb_ubc;
-
- /* Cache last used pipe for splice(): */
- struct pipe_inode_info *splice_pipe;
-
- struct page_frag task_frag;
-
-#ifdef CONFIG_TASK_DELAY_ACCT
- struct task_delay_info *delays;
-#endif
-
-#ifdef CONFIG_FAULT_INJECTION
- int make_it_fail;
- unsigned int fail_nth;
-#endif
- /*
- * When (nr_dirtied >= nr_dirtied_pause), it's time to call
- * balance_dirty_pages() for a dirty throttling pause:
- */
- int nr_dirtied;
- int nr_dirtied_pause;
- /* Start of a write-and-pause period: */
- unsigned long dirty_paused_when;
-
-#ifdef CONFIG_LATENCYTOP
- int latency_record_count;
- struct latency_record latency_record[LT_SAVECOUNT];
-#endif
- /*
- * Time slack values; these are used to round up poll() and
- * select() etc timeout values. These are in nanoseconds.
- */
- u64 timer_slack_ns;
- u64 default_timer_slack_ns;
-
-#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
- unsigned int kasan_depth;
-#endif
-
-#ifdef CONFIG_KCSAN
- struct kcsan_ctx kcsan_ctx;
-#ifdef CONFIG_TRACE_IRQFLAGS
- struct irqtrace_events kcsan_save_irqtrace;
-#endif
-#ifdef CONFIG_KCSAN_WEAK_MEMORY
- int kcsan_stack_depth;
-#endif
-#endif
-
-#ifdef CONFIG_KMSAN
- struct kmsan_ctx kmsan_ctx;
-#endif
-
-#if IS_ENABLED(CONFIG_KUNIT)
- struct kunit *kunit_test;
-#endif
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- /* Index of current stored address in ret_stack: */
- int curr_ret_stack;
- int curr_ret_depth;
-
- /* Stack of return addresses for return function tracing: */
- unsigned long *ret_stack;
-
- /* Timestamp for last schedule: */
- unsigned long long ftrace_timestamp;
- unsigned long long ftrace_sleeptime;
-
- /*
- * Number of functions that haven't been traced
- * because of depth overrun:
- */
- atomic_t trace_overrun;
-
- /* Pause tracing: */
- atomic_t tracing_graph_pause;
-#endif
-
-#ifdef CONFIG_TRACING
- /* Bitmask and counter of trace recursion: */
- unsigned long trace_recursion;
-#endif /* CONFIG_TRACING */
-
-#ifdef CONFIG_KCOV
- /* See kernel/kcov.c for more details. */
-
- /* Coverage collection mode enabled for this task (0 if disabled): */
- unsigned int kcov_mode;
-
- /* Size of the kcov_area: */
- unsigned int kcov_size;
-
- /* Buffer for coverage collection: */
- void *kcov_area;
-
- /* KCOV descriptor wired with this task or NULL: */
- struct kcov *kcov;
-
- /* KCOV common handle for remote coverage collection: */
- u64 kcov_handle;
-
- /* KCOV sequence number: */
- int kcov_sequence;
-
- /* Collect coverage from softirq context: */
- unsigned int kcov_softirq;
-#endif
-
-#ifdef CONFIG_MEMCG_V1
- struct mem_cgroup *memcg_in_oom;
-#endif
-
-#ifdef CONFIG_MEMCG
- /* Number of pages to reclaim on returning to userland: */
- unsigned int memcg_nr_pages_over_high;
-
- /* Used by memcontrol for targeted memcg charge: */
- struct mem_cgroup *active_memcg;
-
- /* Cache for current->cgroups->memcg->objcg lookups: */
- struct obj_cgroup *objcg;
-#endif
-
-#ifdef CONFIG_BLK_CGROUP
- struct gendisk *throttle_disk;
-#endif
-
-#ifdef CONFIG_UPROBES
- struct uprobe_task *utask;
-#endif
-#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
- unsigned int sequential_io;
- unsigned int sequential_io_avg;
-#endif
- struct kmap_ctrl kmap_ctrl;
-#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
- unsigned long task_state_change;
-# ifdef CONFIG_PREEMPT_RT
- unsigned long saved_state_change;
-# endif
-#endif
- struct rcu_head rcu;
- refcount_t rcu_users;
- int pagefault_disabled;
-#ifdef CONFIG_MMU
- struct task_struct *oom_reaper_list;
- struct timer_list oom_reaper_timer;
-#endif
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *stack_vm_area;
-#endif
-#ifdef CONFIG_THREAD_INFO_IN_TASK
- /* A live task holds one reference: */
- refcount_t stack_refcount;
-#endif
-#ifdef CONFIG_LIVEPATCH
- int patch_state;
-#endif
-#ifdef CONFIG_SECURITY
- /* Used by LSM modules for access restriction: */
- void *security;
-#endif
-#ifdef CONFIG_BPF_SYSCALL
- /* Used by BPF task local storage */
- struct bpf_local_storage __rcu *bpf_storage;
- /* Used for BPF run context */
- struct bpf_run_ctx *bpf_ctx;
-#endif
- /* Used by BPF for per-TASK xdp storage */
- struct bpf_net_context *bpf_net_context;
-
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
- unsigned long lowest_stack;
- unsigned long prev_lowest_stack;
-#endif
-
-#ifdef CONFIG_X86_MCE
- void __user *mce_vaddr;
- __u64 mce_kflags;
- u64 mce_addr;
- __u64 mce_ripv : 1,
- mce_whole_page : 1,
- __mce_reserved : 62;
- struct callback_head mce_kill_me;
- int mce_count;
-#endif
-
-#ifdef CONFIG_KRETPROBES
- struct llist_head kretprobe_instances;
-#endif
-#ifdef CONFIG_RETHOOK
- struct llist_head rethooks;
-#endif
-
-#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
- /*
- * If L1D flush is supported on mm context switch
- * then we use this callback head to queue kill work
- * to kill tasks that are not running on SMT disabled
- * cores
- */
- struct callback_head l1d_flush_kill;
-#endif
-
-#ifdef CONFIG_RV
- /*
- * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS.
- * If we find justification for more monitors, we can think
- * about adding more or developing a dynamic method. So far,
- * none of these are justified.
- */
- union rv_task_monitor rv[RV_PER_TASK_MONITORS];
-#endif
-
-#ifdef CONFIG_USER_EVENTS
- struct user_event_mm *user_event_mm;
-#endif
-
- /* CPU-specific state of this task: */
- struct thread_struct thread;
-
- /*
- * New fields for task_struct should be added above here, so that
- * they are included in the randomized portion of task_struct.
- */
- randomized_struct_fields_end
-} __attribute__ ((aligned (64)));
-
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)

diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h
index 969aaf5ef9d6..e99751be1c15 100644
--- a/include/linux/sched/types.h
+++ b/include/linux/sched/types.h
@@ -2,7 +2,72 @@
#ifndef _LINUX_SCHED_TYPES_H
#define _LINUX_SCHED_TYPES_H

+#include <asm/processor.h>
+#include <linux/cache.h>
+#include <linux/compiler_types.h>
+#include <linux/cpumask_types.h>
+#include <linux/hrtimer_types.h>
+#include <linux/irqflags_types.h>
+#include <linux/kcsan.h>
+#include <linux/kmsan_types.h>
+#include <linux/latencytop.h>
+#include <linux/llist.h>
+#include <linux/lockdep_types.h>
+#include <linux/mm_types_task.h>
+#include <linux/mutex_types.h>
+#include <linux/netdevice_xmit.h>
+#include <linux/nodemask_types.h>
+#include <linux/pid_types.h>
+#include <linux/plist_types.h>
+#include <linux/posix-timers_types.h>
+#include <linux/rbtree_types.h>
+#include <linux/refcount_types.h>
+#include <linux/restart_block.h>
+#include <linux/rv.h>
+#include <linux/sched/ext.h>
+#include <linux/seccomp_types.h>
+#include <linux/sem_types.h>
+#include <linux/seqlock_types.h>
+#include <linux/shm.h>
+#include <linux/signal_types.h>
+#include <linux/smp_types.h>
+#include <linux/spinlock_types.h>
+#include <linux/syscall_user_dispatch_types.h>
+#include <linux/task_io_accounting.h>
+#include <linux/thread_info.h>
+#include <linux/timer_types.h>
#include <linux/types.h>
+#include <linux/uidgid_types.h>
+#include <uapi/linux/rseq.h>
+
+/* task_struct member predeclarations (sorted alphabetically): */
+struct audit_context;
+struct bio_list;
+struct blk_plug;
+struct bpf_local_storage;
+struct bpf_net_context;
+struct bpf_run_ctx;
+struct capture_control;
+struct cfs_rq;
+struct fs_struct;
+struct futex_pi_state;
+struct io_context;
+struct io_uring_task;
+struct mempolicy;
+struct nameidata;
+struct nsproxy;
+struct perf_ctx_data;
+struct perf_event_context;
+struct pipe_inode_info;
+struct reclaim_state;
+struct robust_list_head;
+struct rq;
+struct sched_dl_entity;
+struct sighand_struct;
+struct signal_struct;
+struct task_delay_info;
+struct task_struct;
+struct user_event_mm;

/**
* struct task_cputime - collected CPU time counts
@@ -20,4 +85,1283 @@ struct task_cputime {
unsigned long long sum_exec_runtime;
};

+/*
+ * Define the task command name length as enum, then it can be visible to
+ * BPF programs.
+ */
+enum {
+ TASK_COMM_LEN = 16,
+};
+
+/**
+ * struct prev_cputime - snapshot of system and user cputime
+ * @utime: time spent in user mode
+ * @stime: time spent in system mode
+ * @lock: protects the above two fields
+ *
+ * Stores previous user/system time values such that we can guarantee
+ * monotonicity.
+ */
+struct prev_cputime {
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ u64 utime;
+ u64 stime;
+ raw_spinlock_t lock;
+#endif
+};
+
+enum vtime_state {
+ /* Task is sleeping or running in a CPU with VTIME inactive: */
+ VTIME_INACTIVE = 0,
+ /* Task is idle */
+ VTIME_IDLE,
+ /* Task runs in kernelspace in a CPU with VTIME active: */
+ VTIME_SYS,
+ /* Task runs in userspace in a CPU with VTIME active: */
+ VTIME_USER,
+ /* Task runs as guests in a CPU with VTIME active: */
+ VTIME_GUEST,
+};
+
+struct vtime {
+ seqcount_t seqcount;
+ unsigned long long starttime;
+ enum vtime_state state;
+ unsigned int cpu;
+ u64 utime;
+ u64 stime;
+ u64 gtime;
+};
+
+struct sched_info {
+#ifdef CONFIG_SCHED_INFO
+ /* Cumulative counters: */
+
+ /* # of times we have run on this CPU: */
+ unsigned long pcount;
+
+ /* Time spent waiting on a runqueue: */
+ unsigned long long run_delay;
+
+ /* Max time spent waiting on a runqueue: */
+ unsigned long long max_run_delay;
+
+ /* Min time spent waiting on a runqueue: */
+ unsigned long long min_run_delay;
+
+ /* Timestamps: */
+
+ /* When did we last run on a CPU? */
+ unsigned long long last_arrival;
+
+ /* When were we last queued to run? */
+ unsigned long long last_queued;
+
+#endif /* CONFIG_SCHED_INFO */
+};
+
+/*
+ * Integer metrics need fixed point arithmetic, e.g., sched/fair
+ * has a few: load, load_avg, util_avg, freq, and capacity.
+ *
+ * We define a basic fixed point arithmetic range, and then formalize
+ * all these metrics based on that basic range.
+ */
+# define SCHED_FIXEDPOINT_SHIFT 10
+# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)
+
+/* Increase resolution of cpu_capacity calculations */
+# define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
+# define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
+
+struct load_weight {
+ unsigned long weight;
+ u32 inv_weight;
+};
+
+/*
+ * The load/runnable/util_avg accumulates an infinite geometric series
+ * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
+ *
+ * [load_avg definition]
+ *
+ * load_avg = runnable% * scale_load_down(load)
+ *
+ * [runnable_avg definition]
+ *
+ * runnable_avg = runnable% * SCHED_CAPACITY_SCALE
+ *
+ * [util_avg definition]
+ *
+ * util_avg = running% * SCHED_CAPACITY_SCALE
+ *
+ * where runnable% is the time ratio that a sched_entity is runnable and
+ * running% the time ratio that a sched_entity is running.
+ *
+ * For cfs_rq, they are the aggregated values of all runnable and blocked
+ * sched_entities.
+ *
+ * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
+ * capacity scaling. The scaling is done through the rq_clock_pelt that is used
+ * for computing those signals (see update_rq_clock_pelt())
+ *
+ * N.B., the above ratios (runnable% and running%) themselves are in the
+ * range of [0, 1]. To do fixed point arithmetic, we therefore scale them
+ * to as large a range as necessary. This is for example reflected by
+ * util_avg's SCHED_CAPACITY_SCALE.
+ *
+ * [Overflow issue]
+ *
+ * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
+ * with the highest load (=88761), always runnable on a single cfs_rq,
+ * and should not overflow as the number already hits PID_MAX_LIMIT.
+ *
+ * For all other cases (including 32-bit kernels), struct load_weight's
+ * weight will overflow first before we do, because:
+ *
+ * Max(load_avg) <= Max(load.weight)
+ *
+ * Then it is the load_weight's responsibility to consider overflow
+ * issues.
+ */
+struct sched_avg {
+ u64 last_update_time;
+ u64 load_sum;
+ u64 runnable_sum;
+ u32 util_sum;
+ u32 period_contrib;
+ unsigned long load_avg;
+ unsigned long runnable_avg;
+ unsigned long util_avg;
+ unsigned int util_est;
+} ____cacheline_aligned;
+
+/*
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est at dequeue time.
+ * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
+ * it is safe to use MSB.
+ */
+#define UTIL_EST_WEIGHT_SHIFT 2
+#define UTIL_AVG_UNCHANGED 0x80000000
+
+struct sched_statistics {
+#ifdef CONFIG_SCHEDSTATS
+ u64 wait_start;
+ u64 wait_max;
+ u64 wait_count;
+ u64 wait_sum;
+ u64 iowait_count;
+ u64 iowait_sum;
+
+ u64 sleep_start;
+ u64 sleep_max;
+ s64 sum_sleep_runtime;
+
+ u64 block_start;
+ u64 block_max;
+ s64 sum_block_runtime;
+
+ s64 exec_max;
+ u64 slice_max;
+
+ u64 nr_migrations_cold;
+ u64 nr_failed_migrations_affine;
+ u64 nr_failed_migrations_running;
+ u64 nr_failed_migrations_hot;
+ u64 nr_forced_migrations;
+
+ u64 nr_wakeups;
+ u64 nr_wakeups_sync;
+ u64 nr_wakeups_migrate;
+ u64 nr_wakeups_local;
+ u64 nr_wakeups_remote;
+ u64 nr_wakeups_affine;
+ u64 nr_wakeups_affine_attempts;
+ u64 nr_wakeups_passive;
+ u64 nr_wakeups_idle;
+
+#ifdef CONFIG_SCHED_CORE
+ u64 core_forceidle_sum;
+#endif
+#endif /* CONFIG_SCHEDSTATS */
+} ____cacheline_aligned;
+
+struct sched_entity {
+ /* For load-balancing: */
+ struct load_weight load;
+ struct rb_node run_node;
+ u64 deadline;
+ u64 min_vruntime;
+ u64 min_slice;
+
+ struct list_head group_node;
+ unsigned char on_rq;
+ unsigned char sched_delayed;
+ unsigned char rel_deadline;
+ unsigned char custom_slice;
+ /* hole */
+
+ u64 exec_start;
+ u64 sum_exec_runtime;
+ u64 prev_sum_exec_runtime;
+ u64 vruntime;
+ s64 vlag;
+ u64 slice;
+
+ u64 nr_migrations;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ int depth;
+ struct sched_entity *parent;
+ /* rq on which this entity is (to be) queued: */
+ struct cfs_rq *cfs_rq;
+ /* rq "owned" by this entity/group: */
+ struct cfs_rq *my_q;
+ /* cached value of my_q->h_nr_running */
+ unsigned long runnable_weight;
+#endif
+
+ /*
+ * Per entity load average tracking.
+ *
+ * Put into separate cache line so it does not
+ * collide with read-mostly values above.
+ */
+ struct sched_avg avg;
+};
+
+struct sched_rt_entity {
+ struct list_head run_list;
+ unsigned long timeout;
+ unsigned long watchdog_stamp;
+ unsigned int time_slice;
+ unsigned short on_rq;
+ unsigned short on_list;
+
+ struct sched_rt_entity *back;
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct sched_rt_entity *parent;
+ /* rq on which this entity is (to be) queued: */
+ struct rt_rq *rt_rq;
+ /* rq "owned" by this entity/group: */
+ struct rt_rq *my_q;
+#endif
+} __randomize_layout;
+
+typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+
+struct sched_dl_entity {
+ struct rb_node rb_node;
+
+ /*
+ * Original scheduling parameters. Copied here from sched_attr
+ * during sched_setattr(), they will remain the same until
+ * the next sched_setattr().
+ */
+ u64 dl_runtime; /* Maximum runtime for each instance */
+ u64 dl_deadline; /* Relative deadline of each instance */
+ u64 dl_period; /* Separation of two instances (period) */
+ u64 dl_bw; /* dl_runtime / dl_period */
+ u64 dl_density; /* dl_runtime / dl_deadline */
+
+ /*
+ * Actual scheduling parameters. Initialized with the values above,
+ * they are continuously updated during task execution. Note that
+ * the remaining runtime could be < 0 in case we are in overrun.
+ */
+ s64 runtime; /* Remaining runtime for this instance */
+ u64 deadline; /* Absolute deadline for this instance */
+ unsigned int flags; /* Specifying the scheduler behaviour */
+
+ /*
+ * Some bool flags:
+ *
+ * @dl_throttled tells if we exhausted the runtime. If so, the
+ * task has to wait for a replenishment to be performed at the
+ * next firing of dl_timer.
+ *
+ * @dl_yielded tells if task gave up the CPU before consuming
+ * all its available runtime during the last job.
+ *
+ * @dl_non_contending tells if the task is inactive while still
+ * contributing to the active utilization. In other words, it
+ * indicates if the inactive timer has been armed and its handler
+ * has not been executed yet. This flag is useful to avoid race
+ * conditions between the inactive timer handler and the wakeup
+ * code.
+ *
+ * @dl_overrun tells if the task asked to be informed about runtime
+ * overruns.
+ *
+ * @dl_server tells if this is a server entity.
+ *
+ * @dl_defer tells if this is a deferred or regular server. For
+ * now only defer server exists.
+ *
+ * @dl_defer_armed tells if the deferrable server is waiting
+ * for the replenishment timer to activate it.
+ *
+ * @dl_server_active tells if the dlserver is active(started).
+ * dlserver is started on first cfs enqueue on an idle runqueue
+ * and is stopped when a dequeue results in 0 cfs tasks on the
+ * runqueue. In other words, dlserver is active only when cpu's
+ * runqueue has at least one cfs task.
+ *
+ * @dl_defer_running tells if the deferrable server is actually
+ * running, skipping the defer phase.
+ */
+ unsigned int dl_throttled : 1;
+ unsigned int dl_yielded : 1;
+ unsigned int dl_non_contending : 1;
+ unsigned int dl_overrun : 1;
+ unsigned int dl_server : 1;
+ unsigned int dl_server_active : 1;
+ unsigned int dl_defer : 1;
+ unsigned int dl_defer_armed : 1;
+ unsigned int dl_defer_running : 1;
+
+ /*
+ * Bandwidth enforcement timer. Each -deadline task has its
+ * own bandwidth to be enforced, thus we need one timer per task.
+ */
+ struct hrtimer dl_timer;
+
+ /*
+ * Inactive timer, responsible for decreasing the active utilization
+ * at the "0-lag time". When a -deadline task blocks, it contributes
+ * to GRUB's active utilization until the "0-lag time", hence a
+ * timer is needed to decrease the active utilization at the correct
+ * time.
+ */
+ struct hrtimer inactive_timer;
+
+ /*
+ * Bits for DL-server functionality. Also see the comment near
+ * dl_server_update().
+ *
+ * @rq the runqueue this server is for
+ *
+ * @server_has_tasks() returns true if @server_pick return a
+ * runnable task.
+ */
+ struct rq *rq;
+ dl_server_has_tasks_f server_has_tasks;
+ dl_server_pick_f server_pick_task;
+
+#ifdef CONFIG_RT_MUTEXES
+ /*
+ * Priority Inheritance. When a DEADLINE scheduling entity is boosted
+ * pi_se points to the donor, otherwise points to the dl_se it belongs
+ * to (the original one/itself).
+ */
+ struct sched_dl_entity *pi_se;
+#endif
+};
+
+#ifdef CONFIG_UCLAMP_TASK
+/* Number of utilization clamp buckets (shorter alias) */
+#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
+
+/*
+ * Utilization clamp for a scheduling entity
+ * @value: clamp value "assigned" to a se
+ * @bucket_id: bucket index corresponding to the "assigned" value
+ * @active: the se is currently refcounted in a rq's bucket
+ * @user_defined: the requested clamp value comes from user-space
+ *
+ * The bucket_id is the index of the clamp bucket matching the clamp value
+ * which is pre-computed and stored to avoid expensive integer divisions from
+ * the fast path.
+ *
+ * The active bit is set whenever a task has got an "effective" value assigned,
+ * which can be different from the clamp value "requested" from user-space.
+ * This allows to know a task is refcounted in the rq's bucket corresponding
+ * to the "effective" bucket_id.
+ *
+ * The user_defined bit is set whenever a task has got a task-specific clamp
+ * value requested from userspace, i.e. the system defaults apply to this task
+ * just as a restriction. This allows to relax default clamps when a less
+ * restrictive task-specific value has been requested, thus allowing to
+ * implement a "nice" semantic. For example, a task running with a 20%
+ * default boost can still drop its own boosting to 0%.
+ */
+struct uclamp_se {
+ unsigned int value : bits_per(SCHED_CAPACITY_SCALE);
+ unsigned int bucket_id : bits_per(UCLAMP_BUCKETS);
+ unsigned int active : 1;
+ unsigned int user_defined : 1;
+};
+#endif /* CONFIG_UCLAMP_TASK */
+
+union rcu_special {
+ struct {
+ u8 blocked;
+ u8 need_qs;
+ u8 exp_hint; /* Hint for performance. */
+ u8 need_mb; /* Readers need smp_mb(). */
+ } b; /* Bits. */
+ u32 s; /* Set of bits. */
+};
+
+/*
+ * Number of contexts where an event can trigger:
+ * task, softirq, hardirq, nmi.
+ */
+#define PERF_NR_CONTEXTS 4
+
+struct wake_q_node {
+ struct wake_q_node *next;
+};
+
+struct kmap_ctrl {
+#ifdef CONFIG_KMAP_LOCAL
+ int idx;
+ pte_t pteval[KM_MAX_IDX];
+#endif
+};
+
+struct task_struct {
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ /*
+ * For reasons of header soup (see current_thread_info()), this
+ * must be the first element of task_struct.
+ */
+ struct thread_info thread_info;
+#endif
+ unsigned int __state;
+
+ /* saved state for "spinlock sleepers" */
+ unsigned int saved_state;
+
+ /*
+ * This begins the randomizable portion of task_struct. Only
+ * scheduling-critical items should be added above here.
+ */
+ randomized_struct_fields_start
+
+ void *stack;
+ refcount_t usage;
+ /* Per task flags (PF_*), defined further below: */
+ unsigned int flags;
+ unsigned int ptrace;
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+ struct alloc_tag *alloc_tag;
+#endif
+
+ int on_cpu;
+ struct __call_single_node wake_entry;
+ unsigned int wakee_flips;
+ unsigned long wakee_flip_decay_ts;
+ struct task_struct *last_wakee;
+
+ /*
+ * recent_used_cpu is initially set as the last CPU used by a task
+ * that wakes affine another task. Waker/wakee relationships can
+ * push tasks around a CPU where each wakeup moves to the next one.
+ * Tracking a recently used CPU allows a quick search for a recently
+ * used CPU that may be idle.
+ */
+ int recent_used_cpu;
+ int wake_cpu;
+ int on_rq;
+
+ int prio;
+ int static_prio;
+ int normal_prio;
+ unsigned int rt_priority;
+
+ struct sched_entity se;
+ struct sched_rt_entity rt;
+ struct sched_dl_entity dl;
+ struct sched_dl_entity *dl_server;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ struct sched_ext_entity scx;
+#endif
+ const struct sched_class *sched_class;
+
+#ifdef CONFIG_SCHED_CORE
+ struct rb_node core_node;
+ unsigned long core_cookie;
+ unsigned int core_occupation;
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+ struct task_group *sched_task_group;
+#endif
+
+
+#ifdef CONFIG_UCLAMP_TASK
+ /*
+ * Clamp values requested for a scheduling entity.
+ * Must be updated with task_rq_lock() held.
+ */
+ struct uclamp_se uclamp_req[UCLAMP_CNT];
+ /*
+ * Effective clamp values used for a scheduling entity.
+ * Must be updated with task_rq_lock() held.
+ */
+ struct uclamp_se uclamp[UCLAMP_CNT];
+#endif
+
+ struct sched_statistics stats;
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ /* List of struct preempt_notifier: */
+ struct hlist_head preempt_notifiers;
+#endif
+
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+ unsigned int btrace_seq;
+#endif
+
+ unsigned int policy;
+ unsigned long max_allowed_capacity;
+ int nr_cpus_allowed;
+ const cpumask_t *cpus_ptr;
+ cpumask_t *user_cpus_ptr;
+ cpumask_t cpus_mask;
+ void *migration_pending;
+ unsigned short migration_disabled;
+ unsigned short migration_flags;
+
+#ifdef CONFIG_PREEMPT_RCU
+ int rcu_read_lock_nesting;
+ union rcu_special rcu_read_unlock_special;
+ struct list_head rcu_node_entry;
+ struct rcu_node *rcu_blocked_node;
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+
+#ifdef CONFIG_TASKS_RCU
+ unsigned long rcu_tasks_nvcsw;
+ u8 rcu_tasks_holdout;
+ u8 rcu_tasks_idx;
+ int rcu_tasks_idle_cpu;
+ struct list_head rcu_tasks_holdout_list;
+ int rcu_tasks_exit_cpu;
+ struct list_head rcu_tasks_exit_list;
+#endif /* #ifdef CONFIG_TASKS_RCU */
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ int trc_reader_nesting;
+ int trc_ipi_to_cpu;
+ union rcu_special trc_reader_special;
+ struct list_head trc_holdout_list;
+ struct list_head trc_blkd_node;
+ int trc_blkd_cpu;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+
+ struct sched_info sched_info;
+
+ struct list_head tasks;
+ struct plist_node pushable_tasks;
+ struct rb_node pushable_dl_tasks;
+
+ struct mm_struct *mm;
+ struct mm_struct *active_mm;
+ struct address_space *faults_disabled_mapping;
+
+ int exit_state;
+ int exit_code;
+ int exit_signal;
+ /* The signal sent when the parent dies: */
+ int pdeath_signal;
+ /* JOBCTL_*, siglock protected: */
+ unsigned long jobctl;
+
+ /* Used for emulating ABI behavior of previous Linux versions: */
+ unsigned int personality;
+
+ /* Scheduler bits, serialized by scheduler locks: */
+ unsigned sched_reset_on_fork:1;
+ unsigned sched_contributes_to_load:1;
+ unsigned sched_migrated:1;
+ unsigned sched_task_hot:1;
+
+ /* Force alignment to the next boundary: */
+ unsigned :0;
+
+ /* Unserialized, strictly 'current' */
+
+ /*
+ * This field must not be in the scheduler word above due to wakelist
+ * queueing no longer being serialized by p->on_cpu. However:
+ *
+ * p->XXX = X; ttwu()
+ * schedule() if (p->on_rq && ..) // false
+ * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true
+ * deactivate_task() ttwu_queue_wakelist())
+ * p->on_rq = 0; p->sched_remote_wakeup = Y;
+ *
+ * guarantees all stores of 'current' are visible before
+ * ->sched_remote_wakeup gets used, so it can be in this word.
+ */
+ unsigned sched_remote_wakeup:1;
+#ifdef CONFIG_RT_MUTEXES
+ unsigned sched_rt_mutex:1;
+#endif
+
+ /* Bit to tell TOMOYO we're in execve(): */
+ unsigned in_execve:1;
+ unsigned in_iowait:1;
+#ifndef TIF_RESTORE_SIGMASK
+ unsigned restore_sigmask:1;
+#endif
+#ifdef CONFIG_MEMCG_V1
+ unsigned in_user_fault:1;
+#endif
+#ifdef CONFIG_LRU_GEN
+ /* whether the LRU algorithm may apply to this access */
+ unsigned in_lru_fault:1;
+#endif
+#ifdef CONFIG_COMPAT_BRK
+ unsigned brk_randomized:1;
+#endif
+#ifdef CONFIG_CGROUPS
+ /* disallow userland-initiated cgroup migration */
+ unsigned no_cgroup_migration:1;
+ /* task is frozen/stopped (used by the cgroup freezer) */
+ unsigned frozen:1;
+#endif
+#ifdef CONFIG_BLK_CGROUP
+ unsigned use_memdelay:1;
+#endif
+#ifdef CONFIG_PSI
+ /* Stalled due to lack of memory */
+ unsigned in_memstall:1;
+#endif
+#ifdef CONFIG_PAGE_OWNER
+ /* Used by page_owner=on to detect recursion in page tracking. */
+ unsigned in_page_owner:1;
+#endif
+#ifdef CONFIG_EVENTFD
+ /* Recursion prevention for eventfd_signal() */
+ unsigned in_eventfd:1;
+#endif
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
+ unsigned pasid_activated:1;
+#endif
+#ifdef CONFIG_X86_BUS_LOCK_DETECT
+ unsigned reported_split_lock:1;
+#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+ /* delay due to memory thrashing */
+ unsigned in_thrashing:1;
+#endif
+#ifdef CONFIG_PREEMPT_RT
+ struct netdev_xmit net_xmit;
+#endif
+ unsigned long atomic_flags; /* Flags requiring atomic access. */
+
+ struct restart_block restart_block;
+
+ pid_t pid;
+ pid_t tgid;
+
+#ifdef CONFIG_STACKPROTECTOR
+ /* Canary value for the -fstack-protector GCC feature: */
+ unsigned long stack_canary;
+#endif
+ /*
+ * Pointers to the (original) parent process, youngest child, younger sibling,
+ * older sibling, respectively. (p->father can be replaced with
+ * p->real_parent->pid)
+ */
+
+ /* Real parent process: */
+ struct task_struct __rcu *real_parent;
+
+ /* Recipient of SIGCHLD, wait4() reports: */
+ struct task_struct __rcu *parent;
+
+ /*
+ * Children/sibling form the list of natural children:
+ */
+ struct list_head children;
+ struct list_head sibling;
+ struct task_struct *group_leader;
+
+ /*
+ * 'ptraced' is the list of tasks this task is using ptrace() on.
+ *
+ * This includes both natural children and PTRACE_ATTACH targets.
+ * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
+ */
+ struct list_head ptraced;
+ struct list_head ptrace_entry;
+
+ /* PID/PID hash table linkage. */
+ struct pid *thread_pid;
+ struct hlist_node pid_links[PIDTYPE_MAX];
+ struct list_head thread_node;
+
+ struct completion *vfork_done;
+
+ /* CLONE_CHILD_SETTID: */
+ int __user *set_child_tid;
+
+ /* CLONE_CHILD_CLEARTID: */
+ int __user *clear_child_tid;
+
+ /* PF_KTHREAD | PF_IO_WORKER */
+ void *worker_private;
+
+ u64 utime;
+ u64 stime;
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
+ u64 utimescaled;
+ u64 stimescaled;
+#endif
+ u64 gtime;
+ struct prev_cputime prev_cputime;
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ struct vtime vtime;
+#endif
+
+#ifdef CONFIG_NO_HZ_FULL
+ atomic_t tick_dep_mask;
+#endif
+ /* Context switch counts: */
+ unsigned long nvcsw;
+ unsigned long nivcsw;
+
+ /* Monotonic time in nsecs: */
+ u64 start_time;
+
+ /* Boot based time in nsecs: */
+ u64 start_boottime;
+
+ /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
+ unsigned long min_flt;
+ unsigned long maj_flt;
+
+ /* Empty if CONFIG_POSIX_CPUTIMERS=n */
+ struct posix_cputimers posix_cputimers;
+
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+ struct posix_cputimers_work posix_cputimers_work;
+#endif
+
+ /* Process credentials: */
+
+ /* Tracer's credentials at attach: */
+ const struct cred __rcu *ptracer_cred;
+
+ /* Objective and real subjective task credentials (COW): */
+ const struct cred __rcu *real_cred;
+
+ /* Effective (overridable) subjective task credentials (COW): */
+ const struct cred __rcu *cred;
+
+#ifdef CONFIG_KEYS
+ /* Cached requested key. */
+ struct key *cached_requested_key;
+#endif
+
+ /*
+ * executable name, excluding path.
+ *
+ * - normally initialized begin_new_exec()
+ * - set it with set_task_comm()
+ * - strscpy_pad() to ensure it is always NUL-terminated and
+ * zero-padded
+ * - task_lock() to ensure the operation is atomic and the name is
+ * fully updated.
+ */
+ char comm[TASK_COMM_LEN];
+
+ struct nameidata *nameidata;
+
+#ifdef CONFIG_SYSVIPC
+ struct sysv_sem sysvsem;
+ struct sysv_shm sysvshm;
+#endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+ unsigned long last_switch_count;
+ unsigned long last_switch_time;
+#endif
+ /* Filesystem information: */
+ struct fs_struct *fs;
+
+ /* Open file information: */
+ struct files_struct *files;
+
+#ifdef CONFIG_IO_URING
+ struct io_uring_task *io_uring;
+#endif
+
+ /* Namespaces: */
+ struct nsproxy *nsproxy;
+
+ /* Signal handlers: */
+ struct signal_struct *signal;
+ struct sighand_struct __rcu *sighand;
+ sigset_t blocked;
+ sigset_t real_blocked;
+ /* Restored if set_restore_sigmask() was used: */
+ sigset_t saved_sigmask;
+ struct sigpending pending;
+ unsigned long sas_ss_sp;
+ size_t sas_ss_size;
+ unsigned int sas_ss_flags;
+
+ struct callback_head *task_works;
+
+#ifdef CONFIG_AUDIT
+#ifdef CONFIG_AUDITSYSCALL
+ struct audit_context *audit_context;
+#endif
+ kuid_t loginuid;
+ unsigned int sessionid;
+#endif
+ struct seccomp seccomp;
+ struct syscall_user_dispatch syscall_dispatch;
+
+ /* Thread group tracking: */
+ u64 parent_exec_id;
+ u64 self_exec_id;
+
+ /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
+ spinlock_t alloc_lock;
+
+ /* Protection of the PI data structures: */
+ raw_spinlock_t pi_lock;
+
+ struct wake_q_node wake_q;
+
+#ifdef CONFIG_RT_MUTEXES
+ /* PI waiters blocked on a rt_mutex held by this task: */
+ struct rb_root_cached pi_waiters;
+ /* Updated under owner's pi_lock and rq lock */
+ struct task_struct *pi_top_task;
+ /* Deadlock detection and priority inheritance handling: */
+ struct rt_mutex_waiter *pi_blocked_on;
+#endif
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ /* Mutex deadlock detection: */
+ struct mutex_waiter *blocked_on;
+#endif
+
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+ struct mutex *blocker_mutex;
+#endif
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ int non_block_count;
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ struct irqtrace_events irqtrace;
+ unsigned int hardirq_threaded;
+ u64 hardirq_chain_key;
+ int softirqs_enabled;
+ int softirq_context;
+ int irq_config;
+#endif
+#ifdef CONFIG_PREEMPT_RT
+ int softirq_disable_cnt;
+#endif
+
+#ifdef CONFIG_LOCKDEP
+# define MAX_LOCK_DEPTH 48UL
+ u64 curr_chain_key;
+ int lockdep_depth;
+ unsigned int lockdep_recursion;
+ struct held_lock held_locks[MAX_LOCK_DEPTH];
+#endif
+
+#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
+ unsigned int in_ubsan;
+#endif
+
+ /* Journalling filesystem info: */
+ void *journal_info;
+
+ /* Stacked block device info: */
+ struct bio_list *bio_list;
+
+ /* Stack plugging: */
+ struct blk_plug *plug;
+
+ /* VM state: */
+ struct reclaim_state *reclaim_state;
+
+ struct io_context *io_context;
+
+#ifdef CONFIG_COMPACTION
+ struct capture_control *capture_control;
+#endif
+ /* Ptrace state: */
+ unsigned long ptrace_message;
+ kernel_siginfo_t *last_siginfo;
+
+ struct task_io_accounting ioac;
+#ifdef CONFIG_PSI
+ /* Pressure stall state */
+ unsigned int psi_flags;
+#endif
+#ifdef CONFIG_TASK_XACCT
+ /* Accumulated RSS usage: */
+ u64 acct_rss_mem1;
+ /* Accumulated virtual memory usage: */
+ u64 acct_vm_mem1;
+ /* stime + utime since last update: */
+ u64 acct_timexpd;
+#endif
+#ifdef CONFIG_CPUSETS
+ /* Protected by ->alloc_lock: */
+ nodemask_t mems_allowed;
+ /* Sequence number to catch updates: */
+ seqcount_spinlock_t mems_allowed_seq;
+ int cpuset_mem_spread_rotor;
+#endif
+#ifdef CONFIG_CGROUPS
+ /* Control Group info protected by css_set_lock: */
+ struct css_set __rcu *cgroups;
+ /* cg_list protected by css_set_lock and tsk->alloc_lock: */
+ struct list_head cg_list;
+#endif
+#ifdef CONFIG_X86_CPU_RESCTRL
+ u32 closid;
+ u32 rmid;
+#endif
+#ifdef CONFIG_FUTEX
+ struct robust_list_head __user *robust_list;
+#ifdef CONFIG_COMPAT
+ struct compat_robust_list_head __user *compat_robust_list;
+#endif
+ struct list_head pi_state_list;
+ struct futex_pi_state *pi_state_cache;
+ struct mutex futex_exit_mutex;
+ unsigned int futex_state;
+#endif
+#ifdef CONFIG_PERF_EVENTS
+ u8 perf_recursion[PERF_NR_CONTEXTS];
+ struct perf_event_context *perf_event_ctxp;
+ struct mutex perf_event_mutex;
+ struct list_head perf_event_list;
+ struct perf_ctx_data __rcu *perf_ctx_data;
+#endif
+#ifdef CONFIG_DEBUG_PREEMPT
+ unsigned long preempt_disable_ip;
+#endif
+#ifdef CONFIG_NUMA
+ /* Protected by alloc_lock: */
+ struct mempolicy *mempolicy;
+ short il_prev;
+ u8 il_weight;
+ short pref_node_fork;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ int numa_scan_seq;
+ unsigned int numa_scan_period;
+ unsigned int numa_scan_period_max;
+ int numa_preferred_nid;
+ unsigned long numa_migrate_retry;
+ /* Migration stamp: */
+ u64 node_stamp;
+ u64 last_task_numa_placement;
+ u64 last_sum_exec_runtime;
+ struct callback_head numa_work;
+
+ /*
+ * This pointer is only modified for current in syscall and
+ * pagefault context (and for tasks being destroyed), so it can be read
+ * from any of the following contexts:
+ * - RCU read-side critical section
+ * - current->numa_group from everywhere
+ * - task's runqueue locked, task not running
+ */
+ struct numa_group __rcu *numa_group;
+
+ /*
+ * numa_faults is an array split into four regions:
+ * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
+ * in this precise order.
+ *
+ * faults_memory: Exponential decaying average of faults on a per-node
+ * basis. Scheduling placement decisions are made based on these
+ * counts. The values remain static for the duration of a PTE scan.
+ * faults_cpu: Track the nodes the process was running on when a NUMA
+ * hinting fault was incurred.
+ * faults_memory_buffer and faults_cpu_buffer: Record faults per node
+ * during the current scan window. When the scan completes, the counts
+ * in faults_memory and faults_cpu decay and these values are copied.
+ */
+ unsigned long *numa_faults;
+ unsigned long total_numa_faults;
+
+ /*
+ * numa_faults_locality tracks if faults recorded during the last
+ * scan window were remote/local or failed to migrate. The task scan
+ * period is adapted based on the locality of the faults with different
+ * weights depending on whether they were shared or private faults
+ */
+ unsigned long numa_faults_locality[3];
+
+ unsigned long numa_pages_migrated;
+#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_RSEQ
+ struct rseq __user *rseq;
+ u32 rseq_len;
+ u32 rseq_sig;
+ /*
+ * RmW on rseq_event_mask must be performed atomically
+ * with respect to preemption.
+ */
+ unsigned long rseq_event_mask;
+# ifdef CONFIG_DEBUG_RSEQ
+ /*
+ * This is a place holder to save a copy of the rseq fields for
+ * validation of read-only fields. The struct rseq has a
+ * variable-length array at the end, so it cannot be used
+ * directly. Reserve a size large enough for the known fields.
+ */
+ char rseq_fields[sizeof(struct rseq)];
+# endif
+#endif
+
+#ifdef CONFIG_SCHED_MM_CID
+ int mm_cid; /* Current cid in mm */
+ int last_mm_cid; /* Most recent cid in mm */
+ int migrate_from_cpu;
+ int mm_cid_active; /* Whether cid bitmap is active */
+ struct callback_head cid_work;
+#endif
+
+ struct tlbflush_unmap_batch tlb_ubc;
+
+ /* Cache last used pipe for splice(): */
+ struct pipe_inode_info *splice_pipe;
+
+ struct page_frag task_frag;
+
+#ifdef CONFIG_TASK_DELAY_ACCT
+ struct task_delay_info *delays;
+#endif
+
+#ifdef CONFIG_FAULT_INJECTION
+ int make_it_fail;
+ unsigned int fail_nth;
+#endif
+ /*
+ * When (nr_dirtied >= nr_dirtied_pause), it's time to call
+ * balance_dirty_pages() for a dirty throttling pause:
+ */
+ int nr_dirtied;
+ int nr_dirtied_pause;
+ /* Start of a write-and-pause period: */
+ unsigned long dirty_paused_when;
+
+#ifdef CONFIG_LATENCYTOP
+ int latency_record_count;
+ struct latency_record latency_record[LT_SAVECOUNT];
+#endif
+ /*
+ * Time slack values; these are used to round up poll() and
+ * select() etc timeout values. These are in nanoseconds.
+ */
+ u64 timer_slack_ns;
+ u64 default_timer_slack_ns;
+
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+ unsigned int kasan_depth;
+#endif
+
+#ifdef CONFIG_KCSAN
+ struct kcsan_ctx kcsan_ctx;
+#ifdef CONFIG_TRACE_IRQFLAGS
+ struct irqtrace_events kcsan_save_irqtrace;
+#endif
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ int kcsan_stack_depth;
+#endif
+#endif
+
+#ifdef CONFIG_KMSAN
+ struct kmsan_ctx kmsan_ctx;
+#endif
+
+#if IS_ENABLED(CONFIG_KUNIT)
+ struct kunit *kunit_test;
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /* Index of current stored address in ret_stack: */
+ int curr_ret_stack;
+ int curr_ret_depth;
+
+ /* Stack of return addresses for return function tracing: */
+ unsigned long *ret_stack;
+
+ /* Timestamp for last schedule: */
+ unsigned long long ftrace_timestamp;
+ unsigned long long ftrace_sleeptime;
+
+ /*
+ * Number of functions that haven't been traced
+ * because of depth overrun:
+ */
+ atomic_t trace_overrun;
+
+ /* Pause tracing: */
+ atomic_t tracing_graph_pause;
+#endif
+
+#ifdef CONFIG_TRACING
+ /* Bitmask and counter of trace recursion: */
+ unsigned long trace_recursion;
+#endif /* CONFIG_TRACING */
+
+#ifdef CONFIG_KCOV
+ /* See kernel/kcov.c for more details. */
+
+ /* Coverage collection mode enabled for this task (0 if disabled): */
+ unsigned int kcov_mode;
+
+ /* Size of the kcov_area: */
+ unsigned int kcov_size;
+
+ /* Buffer for coverage collection: */
+ void *kcov_area;
+
+ /* KCOV descriptor wired with this task or NULL: */
+ struct kcov *kcov;
+
+ /* KCOV common handle for remote coverage collection: */
+ u64 kcov_handle;
+
+ /* KCOV sequence number: */
+ int kcov_sequence;
+
+ /* Collect coverage from softirq context: */
+ unsigned int kcov_softirq;
+#endif
+
+#ifdef CONFIG_MEMCG_V1
+ struct mem_cgroup *memcg_in_oom;
+#endif
+
+#ifdef CONFIG_MEMCG
+ /* Number of pages to reclaim on returning to userland: */
+ unsigned int memcg_nr_pages_over_high;
+
+ /* Used by memcontrol for targeted memcg charge: */
+ struct mem_cgroup *active_memcg;
+
+ /* Cache for current->cgroups->memcg->objcg lookups: */
+ struct obj_cgroup *objcg;
+#endif
+
+#ifdef CONFIG_BLK_CGROUP
+ struct gendisk *throttle_disk;
+#endif
+
+#ifdef CONFIG_UPROBES
+ struct uprobe_task *utask;
+#endif
+#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
+ unsigned int sequential_io;
+ unsigned int sequential_io_avg;
+#endif
+ struct kmap_ctrl kmap_ctrl;
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ unsigned long task_state_change;
+# ifdef CONFIG_PREEMPT_RT
+ unsigned long saved_state_change;
+# endif
+#endif
+ struct rcu_head rcu;
+ refcount_t rcu_users;
+ int pagefault_disabled;
+#ifdef CONFIG_MMU
+ struct task_struct *oom_reaper_list;
+ struct timer_list oom_reaper_timer;
+#endif
+#ifdef CONFIG_VMAP_STACK
+ struct vm_struct *stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ /* A live task holds one reference: */
+ refcount_t stack_refcount;
+#endif
+#ifdef CONFIG_LIVEPATCH
+ int patch_state;
+#endif
+#ifdef CONFIG_SECURITY
+ /* Used by LSM modules for access restriction: */
+ void *security;
+#endif
+#ifdef CONFIG_BPF_SYSCALL
+ /* Used by BPF task local storage */
+ struct bpf_local_storage __rcu *bpf_storage;
+ /* Used for BPF run context */
+ struct bpf_run_ctx *bpf_ctx;
+#endif
+ /* Used by BPF for per-TASK xdp storage */
+ struct bpf_net_context *bpf_net_context;
+
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+ unsigned long lowest_stack;
+ unsigned long prev_lowest_stack;
+#endif
+
+#ifdef CONFIG_X86_MCE
+ void __user *mce_vaddr;
+ __u64 mce_kflags;
+ u64 mce_addr;
+ __u64 mce_ripv : 1,
+ mce_whole_page : 1,
+ __mce_reserved : 62;
+ struct callback_head mce_kill_me;
+ int mce_count;
+#endif
+
+#ifdef CONFIG_KRETPROBES
+ struct llist_head kretprobe_instances;
+#endif
+#ifdef CONFIG_RETHOOK
+ struct llist_head rethooks;
+#endif
+
+#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
+ /*
+ * If L1D flush is supported on mm context switch
+ * then we use this callback head to queue kill work
+ * to kill tasks that are not running on SMT disabled
+ * cores
+ */
+ struct callback_head l1d_flush_kill;
+#endif
+
+#ifdef CONFIG_RV
+ /*
+ * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS.
+ * If we find justification for more monitors, we can think
+ * about adding more or developing a dynamic method. So far,
+ * none of these are justified.
+ */
+ union rv_task_monitor rv[RV_PER_TASK_MONITORS];
+#endif
+
+#ifdef CONFIG_USER_EVENTS
+ struct user_event_mm *user_event_mm;
+#endif
+
+ /* CPU-specific state of this task: */
+ struct thread_struct thread;
+
+ /*
+ * New fields for task_struct should be added above here, so that
+ * they are included in the randomized portion of task_struct.
+ */
+ randomized_struct_fields_end
+} __attribute__ ((aligned (64)));
+
#endif /* _LINUX_SCHED_TYPES_H */
--
Email: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Return-Path: <linux-kernel+bounces-667896-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from sy.mirrors.kernel.org (sy.mirrors.kernel.org [147.75.48.161])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id C537641E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:39:44 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by sy.mirrors.kernel.org (Postfix) with ESMTPS id 0CC587A530F
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:38:26 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id E43332222DF;
Fri, 30 May 2025 09:35:00 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=oracle.com header.i=@oracle.com header.b="btDkUdUp";
dkim=pass (1024-bit key) header.d=oracle.onmicrosoft.com header.i=@oracle.onmicrosoft.com header.b="cCEBbPYu"
Received: from mx0a-00069f02.pphosted.com (mx0a-00069f02.pphosted.com [205.220.165.32])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 23DD5223329;
Fri, 30 May 2025 09:34:57 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=fail smtp.client-ip=205.220.165.32
ARC-Seal:i=2; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597699; cv=fail; b=dxiq7D/NXMuZ47zY9+nabT1zOzc5zl6cCmCsAlJqvSmlIoEs4UYzaszr3IDkA9vepBz06LoAyZiGOVsFVF6NXy3FbGvxE/gEqywmoQEMA16QDk4RBTECMIf5aXDWocUG1SqDDy41w5/UXt2TBOiVg3ig2EllPoPkXfgPxi+BVk4=
ARC-Message-Signature:i=2; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597699; c=relaxed/simple;
bh=q+VHYhTE9aFecDltfYda6QQGYbdez3jnFxvXanydKDk=;
h=Date:From:To:Cc:Subject:Message-ID:References:Content-Type:
Content-Disposition:In-Reply-To:MIME-Version; b=SlVEWPMWwwkSUHcezkiKot/axGn8fqqUK8LX43iHZ504i/icTN/vIIvgfzZiW/aPdzGMJfPsP+bOokBxPvYuvJI6JdvvWy5Os2iYcz5OhSOQkHUwDTUc8FFkAZ7lLltb4LKmiqt2+fInlSudZHwsc7MWZf9T37N2QWl/DYUEv+o=
ARC-Authentication-Results:i=2; smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=oracle.com; spf=pass smtp.mailfrom=oracle.com; dkim=pass (2048-bit key) header.d=oracle.com header.i=@oracle.com header.b=btDkUdUp; dkim=pass (1024-bit key) header.d=oracle.onmicrosoft.com header.i=@oracle.onmicrosoft.com header.b=cCEBbPYu; arc=fail smtp.client-ip=205.220.165.32
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=oracle.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=oracle.com
Received: from pps.filterd (m0246627.ppops.net [127.0.0.1])
by mx0b-00069f02.pphosted.com (8.18.1.2/8.18.1.2) with ESMTP id 54U6u1HR004285;
Fri, 30 May 2025 09:34:42 GMT
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=oracle.com; h=cc
:content-type:date:from:in-reply-to:message-id:mime-version
:references:subject:to; s=corp-2025-04-25; bh=q+VHYhTE9aFecDltfY
da6QQGYbdez3jnFxvXanydKDk=; b=btDkUdUprQ+0w8L7bLFKmdgcqf/ksiNTTs
Dc+5q4Rj+XnY2zc/xFYHgHDDOKMKsfogSSYMYYFCgKayJRWK4zQ+7zjtb9VSzkWG
Axu8uju5GjuezsSjQcKnJ36hKSNyzPYgqt7dsTNzU8TSUlh7fJbFwZIEr0fwvb+m
CADTSRIGKLbGdFyF/aBI4yDXHDDGIeVOvlqozOTQz5RpHtMVL+l8YzXZbCEZyHCD
ik3ZrD6U4zPL4xaDyhAo19ADpVBNrg4t2yiOiDppgiuxF6LHw/71d833vK2Q5Nm2
MzIdC5OsUqCvG+fT6YRW4OF4uaK2YKMF9aohIFj2XgQljPN/P5iw==
Received: from phxpaimrmta03.imrmtpd1.prodappphxaev1.oraclevcn.com (phxpaimrmta03.appoci.oracle.com [138.1.37.129])
by mx0b-00069f02.pphosted.com (PPS) with ESMTPS id 46v0ym21en-1
(version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK);
Fri, 30 May 2025 09:34:42 +0000 (GMT)
Received: from pps.filterd (phxpaimrmta03.imrmtpd1.prodappphxaev1.oraclevcn.com [127.0.0.1])
by phxpaimrmta03.imrmtpd1.prodappphxaev1.oraclevcn.com (8.18.1.2/8.18.1.2) with ESMTP id 54U8fjsG026679;
Fri, 30 May 2025 09:34:42 GMT
Received: from bl2pr02cu003.outbound.protection.outlook.com (mail-eastusazon11010043.outbound.protection.outlook.com [52.101.51.43])
by phxpaimrmta03.imrmtpd1.prodappphxaev1.oraclevcn.com (PPS) with ESMTPS id 46u4jd29s9-1
(version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK);
Fri, 30 May 2025 09:34:41 +0000
ARC-Seal: i=1; a=rsa-sha256; s=arcselector10001; d=microsoft.com; cv=none;
b=fskhCaPucbnEzy1vnbqYMP5RT9hhJ5PXdYbw21rRhAlqWDe45KXCXkoH0mll5520LcNJBZpT+fz/86kUuB7MKPdYvC4xxr4ni1kJcbKmmplI9jYxnwrGw5D4A9+vq8uBJSer6zqVKY84shKkofDZ35pIioqtp0VOd7jI/RK4LUib35rRBfmIOeC3CvsLZhv9yJUGeERg8mGHx1blZjDdRVz4WuvM4ic1AWEa3CPMmjjn+Y7xyLc31zd5/f1K/0YAuXWqsfwdvEPQTPzblMBGgeomlHs09RoYF5Z5rtXS1GtoGhNXX+aB0UFwaGCm7Bgp7fFVjNFLwsAVmq+8xIzBTQ==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com;
s=arcselector10001;
h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1;
bh=q+VHYhTE9aFecDltfYda6QQGYbdez3jnFxvXanydKDk=;
b=s1xicqXkeLfheoZaB2/U+TANiv1vUsJH1S7Mnnp9YrE4ytEzUxiOAtCqZ6V4WJk2hkgmEG8qgesut3VLbO2Y+nLc/EAIrit+AiPYfeRR1Qvdwh5z6MdKZC3dcXisN8j/4csZ1TjOrIBwOmNwG+BXTZlKmFKgQMUBm2mZUfQ6Nll8VQQRbB/BgjWuvHjQeZ7xSS0Zyo/JnrV8/NuVlIqe1Hs7Rz/gmyMSh28wA/VG8Zf3ehJTduLDN2WUuXyxfq8ebSqxr+yu0Py5rw/5ewY9Os8p7ayWILwJ4ObCJ313R8xxWgclMJ6usCv6BKBAb74fM3cbEfgAVWoaSTInwYXgdQ==
ARC-Authentication-Results: i=1; mx.microsoft.com 1; spf=pass
smtp.mailfrom=oracle.com; dmarc=pass action=none header.from=oracle.com;
dkim=pass header.d=oracle.com; arc=none
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=oracle.onmicrosoft.com; s=selector2-oracle-onmicrosoft-com;
h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck;
bh=q+VHYhTE9aFecDltfYda6QQGYbdez3jnFxvXanydKDk=;
b=cCEBbPYu6ukzXZdxXzWSYnYiQjTjlSHlx/BXdMvxn5rJPOpVfFWkDIB9tyHJxriyAx4f3q8iZ2nn9CWC69lnh6CsbskOQGNHhTaD360C0XtG6wfo8dmfbcSe/Cekk1a8baPzMe1JSKWqWN3cPD63tgHV5/anAS36i22GFZMS2K4=
Received: from DM4PR10MB8218.namprd10.prod.outlook.com (2603:10b6:8:1cc::16)
by IA4PR10MB8494.namprd10.prod.outlook.com (2603:10b6:208:564::17) with
Microsoft SMTP Server (version=TLS1_2,
cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.8746.40; Fri, 30 May
2025 09:34:39 +0000
Received: from DM4PR10MB8218.namprd10.prod.outlook.com
([fe80::2650:55cf:2816:5f2]) by DM4PR10MB8218.namprd10.prod.outlook.com
([fe80::2650:55cf:2816:5f2%5]) with mapi id 15.20.8746.030; Fri, 30 May 2025
09:34:39 +0000
Date: Fri, 30 May 2025 10:34:36 +0100
From: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
To: David Hildenbrand <david@xxxxxxxxxx>
Cc: Ryan Roberts <ryan.roberts@xxxxxxx>,
Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx>, akpm@xxxxxxxxxxxxxxxxxxxx,
hughd@xxxxxxxxxx, Liam.Howlett@xxxxxxxxxx, npache@xxxxxxxxxx,
dev.jain@xxxxxxx, ziy@xxxxxxxxxx, linux-mm@xxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx
Subject: Re: [PATCH 0/2] fix MADV_COLLAPSE issue if THP settings are disabled
Message-ID: <60a81a60-b7da-4439-b177-9d1bca82828d@lucifer.local>
References: <cover.1748506520.git.baolin.wang@xxxxxxxxxxxxxxxxx>
<05d60e72-3113-41f0-b81f-225397f06c81@xxxxxxx>
<f3dad5b5-143d-4896-b315-38e1d7bb1248@xxxxxxxxxx>
<9b1bac6c-fd9f-4dc1-8c94-c4da0cbb9e7f@xxxxxxx>
<abe284a4-db5c-4a5f-b2fd-e28e1ab93ed1@xxxxxxxxxx>
<6caefe0b-c909-4692-a006-7f8b9c0299a6@xxxxxxxxxx>
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <6caefe0b-c909-4692-a006-7f8b9c0299a6@xxxxxxxxxx>
X-ClientProxiedBy: LO2P265CA0433.GBRP265.PROD.OUTLOOK.COM
(2603:10a6:600:e::13) To DM4PR10MB8218.namprd10.prod.outlook.com
(2603:10b6:8:1cc::16)
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
X-MS-PublicTrafficType: Email
X-MS-TrafficTypeDiagnostic: DM4PR10MB8218:EE_|IA4PR10MB8494:EE_
X-MS-Office365-Filtering-Correlation-Id: 00a0c160-4915-4450-a431-08dd9f5d330e
X-MS-Exchange-SenderADCheck: 1
X-MS-Exchange-AntiSpam-Relay: 0
X-Microsoft-Antispam: BCL:0;ARA:13230040|1800799024|7416014|366016|376014;
X-Microsoft-Antispam-Message-Info:
=?us-ascii?Q?Qa9nM3JPhTfPXWdnRpRGAlF4+LbQtDfY1qCECYuQ6n/feROz10K1KWgJ/EhR?=
=?us-ascii?Q?pYVHW8tK9scDlp1ck/ym3pavi1wmxVnqbWkuXfyfRPOBVUYPHOitsBnTq30u?=
=?us-ascii?Q?nwVBVYBF+bhQm4cQwHYibZbK+tsl2RnfaFuwnrVSgNjUvAesHup5mkOmve1O?=
=?us-ascii?Q?t7RSrWKerqtsbCZU2thwQrxwo5by28FBZ0D33A8oMV8qVSbnRw5SJYGJA3io?=
=?us-ascii?Q?CzwP4y91rL1td/BrPZdEQJ3I3bzo75sf2IMc5y0TkI4Tc90IPwEYTcJo/z3d?=
=?us-ascii?Q?cfSv8POypxECyTa6FmkM3eNDCDHqfpxJSVMXs8/4yoPeyLr1ssCmuO/VhzhR?=
=?us-ascii?Q?td0mFVDe0S5PPae3YV4bO5DQLuTg3XSjmHzgKbbYU6LZbGnKJBPl5ea1QjZV?=
=?us-ascii?Q?tGZbz+8Lgu05/gzZmFOz+3hjI9M/LzIi7ftI1J1brQaDJM9ToelYfsdqg7d/?=
=?us-ascii?Q?EWu8GfiC5RS/nFYYUetPhyJo1lvhpFCG8vsblamahCpNTG64Mh1cFOGW9AfO?=
=?us-ascii?Q?foheUnrWG5hkvmmG0wFRB23QmV9ls+ZdDx2yXvmXsJ5EyI88rlv7qPiMnNA3?=
=?us-ascii?Q?F9xiIoNVgA/4SW7ChFzgoZweaN83LYMWCglIwgSs70HhtvTYYLiwPs4PRtfV?=
=?us-ascii?Q?sMmJ3gKeRjfYr5D1FIebR9/vybnBoytjuckKFojEleAcTgqJurF3g7WvKyVk?=
=?us-ascii?Q?D883Bs3f4/BLk3x18NNfuF+mIAkcUdyjIC7+vh/G2oaIDalH5qzV3qjYMblG?=
=?us-ascii?Q?vMI9qW+eBEZzCX1Cp5Uw7APKTUi/e7skOl9D0mOJXlxGel8QqNaowGbG019i?=
=?us-ascii?Q?IcrXE9Vm67pAAsjvKqCFsq5Kb0k4n6/KM35qHaaa5tLcoqzfBwNXidy7MBbD?=
=?us-ascii?Q?YaGWHRupuLqQded/jbAjMOSPqFIuWXyPAkpCvfXROG2cR5tgOCK9Q3+W+NMR?=
=?us-ascii?Q?3zmS7blpfC4Ikvr1C5I7DX+Jm/04ne2Pe8vcIp2Id3Ip0l+lMGs8zXKXTPUo?=
=?us-ascii?Q?hdnVIk36cOWeOHjAvvDIMLZY5DXynIV3IhbGPqhpN90bUGZNgB+fqUj1VwGD?=
=?us-ascii?Q?yB3EGl5bPAQ61MGC9M3ONsBnUxp2Dqcz+kRIuFg/O7sgWg5QbCqPtVPe+PcE?=
=?us-ascii?Q?T/dM+fwWNtiiAqRrAARk2TfyRnMLcwlGnm9PiEyPa809k4AFQM3QrnfnBJfl?=
=?us-ascii?Q?6cKzw+74zs0sKqU4glhgia5AChIMG7H4eVZ6z79FGkr2+Jq0qPIv+CSD46Up?=
=?us-ascii?Q?O4A1E3Hk4m9VKmDZCgfV5uTh2Kuibtwu7ChTcG9Y+dGtUoWIg2qS9rbPRN81?=
=?us-ascii?Q?+DWPnLdy27DeMObcbhhy7hyWbSN+aKihrXrb6WAeYI3mUXZd0g0/K+4pKPQP?=
=?us-ascii?Q?zxG7eX90R5BHo4/8IGuxgqPFOjUH+v4QBs06aB1f7KX/3xQOnRrN3o+10I0i?=
=?us-ascii?Q?uwL0Hl18za0=3D?=
X-Forefront-Antispam-Report:
CIP:255.255.255.255;CTRY:;LANG:en;SCL:1;SRV:;IPV:NLI;SFV:NSPM;H:DM4PR10MB8218.namprd10.prod.outlook.com;PTR:;CAT:NONE;SFS:(13230040)(1800799024)(7416014)(366016)(376014);DIR:OUT;SFP:1101;
X-MS-Exchange-AntiSpam-MessageData-ChunkCount: 1
X-MS-Exchange-AntiSpam-MessageData-0:
=?us-ascii?Q?kphRsHfP3LjMvJ2NaUoeSnobD18dXFT1He/1dDTZmbLyyRRxoqXYl3y1JcKL?=
=?us-ascii?Q?8Zt9v2EJRb3dWjBbo1zDpF8K1HU9F1uBfjd4zbFPVharEnRTlyNIX4ztFwR4?=
=?us-ascii?Q?Jkt86cMxK95jU8iaQFtNdvOJfu7+CeZQ6xseRRsluC+FC6m4tw6BGOEa3eYg?=
=?us-ascii?Q?XxGcFOCuygWF8v4Tcy/U92AYlL/t5BqKoD+SnTdFr2FKqeO+mpvnedOPpkEe?=
=?us-ascii?Q?z5bGgSDd30ylU+sd6jX8M48N7DfV3c/Kae13K4YCqtfaLZxGKUfidbH4MA/9?=
=?us-ascii?Q?irbdKXxGP5sSFGRXF2GMR1GrBO9B4zUecBZOKO2zN0kUv55HiQtECp0bMKe1?=
=?us-ascii?Q?JQOBZNJg2QfebjjGCv7U4r74Pjel71MQOUZaEsYB2ylNJIVLUkdzfUDwNNIu?=
=?us-ascii?Q?IcsmHkwpwM1RiivThgbo6NPE6tEjY+Ll1hQIak0GINyAOdc9F8fZtrSVsSam?=
=?us-ascii?Q?TLCiEz1AD6PVQ0cKtEIEs2C8TZY/7k2eW3wKLIGGFp6kZfDziCp3INttEqfW?=
=?us-ascii?Q?cnbsHaSArdCijoDVFY060jKZy4D02U4jl3CLbRBW980R8TOwjY6/LrVON6rg?=
=?us-ascii?Q?J3qtL/0O7yTBwTjyqG/L+V6XniDrB6YlJlKBKoHWAwhvrd9W9fN41+KojnJW?=
=?us-ascii?Q?sIpQXzGt3TFZQwNyVZjqvpwEszqA58WmPgB/J9dy0tjXf5+OD8pl9ARQx4zn?=
=?us-ascii?Q?5BOon2JPeER140DVQ3twuVWRn3sg51ntMxF0uetaWXBBrxRnhhqxYPVFdjjT?=
=?us-ascii?Q?tSpif956Y7DkY8jvk61NGaD1xFWviZNORkw+Zr7doObgqES+HFUn0HnEA2Q4?=
=?us-ascii?Q?vUMzuQ6/0ixZdr1QPvJKDhnSj3ggadinRjhqc4oOKRcJrQycWHGipUR70FV2?=
=?us-ascii?Q?kFYxHsAwHUSWcPOXhOGuhOCDF/eP3aszDi3aqkrcXriv6wafp4K4/Cjekark?=
=?us-ascii?Q?BYSmetOafCgrDtwKDcy4tzxITzi6LpXeKY8wNYXk5FC+jAVl7ehqm2vOSMnO?=
=?us-ascii?Q?9esCWtL1p+VSZeMcma3FLGED76a6Hq4ztfOvtVuc7ME9ve5Dwka2LODWj/aE?=
=?us-ascii?Q?jt+yM12rls+Xca666y2cZ/DHrnaCRrwzkLXg3A2HsXi6ALIcf3oIwESswOUt?=
=?us-ascii?Q?YDp8uzYl/TSsIm7S9zi0WUgz0pfQAm8NwlMfQDmft7kZek9yRDk3lCTXW5KB?=
=?us-ascii?Q?gZ1WC8OMrzNeBL3stYFMpQyFa4r8nxIteHsakFWZkrvip90nIOegN2GEu0qb?=
=?us-ascii?Q?Sy82TqQGRdBN8jFlXruBcCehgIyloiSGKUUpI7TEF+jBM7tEkOVWp8uL1dRl?=
=?us-ascii?Q?tFFoD4qrEsotn/XA50ZjbCgqGidCSYzJSDF43g/dddmZ+N8wwnuu/otx8kXo?=
=?us-ascii?Q?SR6zaeslOf3dRKFLwPOeVNUdf2onqsbVvEYSqV8cKi/IkO0HFhTZ/JyRAs7G?=
=?us-ascii?Q?P5CNgTXPUu3jK0/3X5KUf1ZafzwpnrWFq7L4YOlLD+uuTw0hBEXuszTiQhhN?=
=?us-ascii?Q?0uMgJJkHyJPtzu/m5WVG+s4f7Dux7b3gv+mE4X6xCfLYPwE4HmWhFtpS2O88?=
=?us-ascii?Q?MFO+vwQOehqko7K89TI/ZRCHLSZDSlNXflsNjIGnufB8kP784rJZ70Y27+iQ?=
=?us-ascii?Q?UQ=3D=3D?=
X-MS-Exchange-AntiSpam-ExternalHop-MessageData-ChunkCount: 1
X-MS-Exchange-AntiSpam-ExternalHop-MessageData-0:
gS77m2/MXI3ZMz7S+htv8nu6MttAvGeYt1UQI27iABb+/ta2tVl5HEUtd3yQsCLuwwDHL43fnI4nR2sKrCLUELBrKQQ5995n18NxU1Vl5l0W2IvaOa0MsBVW6cz1gbcfZNIog0Hb7pWCCjlFM27hNUndErRNe4Kv8805ikxj8c/FdXfDt8FjveRzLFs5v2oIH9FD0htmo8Wibq2Tz8ZITzDduzny+WULUzRPkVmEBq0pv488yPqMlCVQ7BoaLUQ7W5mdwKIE/fupUUwU8YNgKlZA5trfoJ8dczB6TUYuWhnfi72IEWLQKlPo4xL/gHs307pKECoiqfjfcu1EzouCy/avUicyrxSxMDCiKX58ctsmL1DjfbVlNr7HX7JSCW6T9Pc2SmhpsVwODOsmcRUGaS/synOrOhGMlRz+3G81vcvt9bAN5BHqcB/W1xA0pL03jaKTcbRkOUlQR7d607Ec0hokN/S/jWER/FU7F5gCVDmzEQjmVXsWg3LqTYeHn0acy7SwxAmRxuZfaYyq+fdiQUlZZj2Ciro5whD98y7PTeLygxH7TIZhAh5tAK1zAZUR5hjeG5lXKwoVS/cDYnDvzsp3dl7OhrR1w8teL2wQRvY=
X-OriginatorOrg: oracle.com
X-MS-Exchange-CrossTenant-Network-Message-Id: 00a0c160-4915-4450-a431-08dd9f5d330e
X-MS-Exchange-CrossTenant-AuthSource: DM4PR10MB8218.namprd10.prod.outlook.com
X-MS-Exchange-CrossTenant-AuthAs: Internal
X-MS-Exchange-CrossTenant-OriginalArrivalTime: 30 May 2025 09:34:39.4449
(UTC)
X-MS-Exchange-CrossTenant-FromEntityHeader: Hosted
X-MS-Exchange-CrossTenant-Id: 4e2c6054-71cb-48f1-bd6c-3a9705aca71b
X-MS-Exchange-CrossTenant-MailboxType: HOSTED
X-MS-Exchange-CrossTenant-UserPrincipalName: BbYbmNTpJ2vP1gWDX2o2uKyAZuwjovQC9yX28E2TVlJMIz3dfRAtaznpTriQw4j588aKfk+f3psA/+XgqMDwUhzglhEpu8GMm6rgMdojzzM=
X-MS-Exchange-Transport-CrossTenantHeadersStamped: IA4PR10MB8494
X-Proofpoint-Virus-Version: vendor=baseguard
engine=ICAP:2.0.293,Aquarius:18.0.1099,Hydra:6.0.736,FMLib:17.12.80.40
definitions=2025-05-30_04,2025-05-29_01,2025-03-28_01
X-Proofpoint-Spam-Details: rule=notspam policy=default score=0 suspectscore=0 mlxscore=0 adultscore=0
malwarescore=0 mlxlogscore=999 bulkscore=0 spamscore=0 phishscore=0
classifier=spam adjust=0 reason=mlx scancount=1 engine=8.12.0-2505160000
definitions=main-2505300081
X-Proofpoint-GUID: fFoTNhnHmj4SwObrP3acnijz9WOOWkQC
X-Proofpoint-Spam-Details-Enc: AW1haW4tMjUwNTMwMDA4MSBTYWx0ZWRfX/We3qfA9ftAG cstON/TwYTqlchwo6N8zoxu1rd4ClawR6nGcPqehIq5pMwD+1IEOsDgnsWXS/vIjMAr8j6OknsZ cDisDKGmX5/mqIPj9WKPuYhxFwfZVpL86HiyhG8CFVvzaB5TyyW4C+N5dt7/nnpeie4FPGrF9c5
Idt2DLOwcyQoeYURfPq8AzbAbgA4NeA5+mQFgxE8Spzwj/oQjH3+c+/GDFEkX3ItzKIKqXBUhUX kc7AlmYgtkgfqjMmfLRiv6UNWZc8nKiVI+D4JFHeALlU6rNf3jQdDeiZ7peJQspcJsCiFVO/cL8 MSS/aUa09AovC7J94DP1JY8EZeFQwCiRrp/JUQV5Ff/0V03eEnclLbFJDFY+dGgrqdqU20YPUGm
fWZZDcaxhvanlu3+aMWzXz+8NWTqe+8dwh4dUTO3yMckTeBMC5FwU7p3ODz3kyQ2OOgIU0IT
X-Proofpoint-ORIG-GUID: fFoTNhnHmj4SwObrP3acnijz9WOOWkQC
X-Authority-Analysis: v=2.4 cv=N7MpF39B c=1 sm=1 tr=0 ts=68397bb2 b=1 cx=c_pps a=WeWmnZmh0fydH62SvGsd2A==:117 a=WeWmnZmh0fydH62SvGsd2A==:17 a=6eWqkTHjU83fiwn7nKZWdM+Sl24=:19 a=lCpzRmAYbLLaTzLvsPZ7Mbvzbb8=:19 a=wKuvFiaSGQ0qltdbU6+NXLB8nM8=:19
a=Ol13hO9ccFRV9qXi2t6ftBPywas=:19 a=xqWC_Br6kY4A:10 a=kj9zAlcOel0A:10 a=dt9VzEwgFbYA:10 a=GoEa3M9JfhUA:10 a=NFPYR10K5HdowB0G7XMA:9 a=CjuIK1q_8ugA:10
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

On Fri, May 30, 2025 at 11:16:51AM +0200, David Hildenbrand wrote:
[snip]
> It kind-of contradicts the linked
> Documentation/admin-guide/mm/transhuge.rst, where we have this *beautiful*
> comment
>
> "Transparent Hugepage Support for anonymous memory can be entirely disable
> (mostly for debugging purposes".
>
> I mean, "entirely" is also pretty clear to me.
>
> I would assume that the man page of MADV_COLLAPSE should have talked about
> ignoring *khugepaged* toggles (max_ptes_none ...), at least that's what I
> recall from the discussions back then.

Sorry, I don't want to turn this stuff into too much of a mega-thread, but
just a small comment here - I think we should go and update the
documentation/man pages to be clearer and more consistent.

There is enough confusion around this as it is...

>
> --
> Cheers,
>
> David / dhildenb
>

Return-Path: <linux-kernel+bounces-667895-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id E837241E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:39:47 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id 7557E1894812
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:39:36 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id D9E4322DA0D;
Fri, 30 May 2025 09:34:55 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="lIRmN2ee"
Received: from mail-pj1-f44.google.com (mail-pj1-f44.google.com [209.85.216.44])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id F34E52222DF
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:34:51 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.216.44
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597694; cv=none; b=JICzyLglG0nwnLhQmUgC3wmA143VYZucXMbMuTKTup6F6GuGt6X7kKylf4lt5i8hB446GPc0r0QIlzlRweD3kI47He9YnxhnzQdAYRct98YRAEN6xwohQ+WRBqf+Zx1QTJlSGQs5zp50f1E3p5nGHJ7qqq29j21YdP8ZCF8FSxs=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597694; c=relaxed/simple;
bh=huOWwdeBYR2ktVOxaLw5MQ08zC/qZOEF1LgzmHEU8B4=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=BGN94+HwkEpGGZ8E8nz7TghW3QLx2ewE0Yy9Htyzqp2Fr3qZtCfFqjIsWAM9C7K5Q+W1R8QsSGqgPbc9WDm1rXiDUF8geKgakV15MkWc+IfQQDkFgjyXKr9M1lmhPbYopZi2zNxOXA7tvzolNr2hFqzn9ZItMtV0lFduQFW95/o=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=lIRmN2ee; arc=none smtp.client-ip=209.85.216.44
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pj1-f44.google.com with SMTP id 98e67ed59e1d1-3109fb9f941so2028108a91.3
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:34:51 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597691; x=1749202491; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=4r9koD1NPFSh7WkJOunYhnXV7czMJGlSW2YfbvvIqZo=;
b=lIRmN2eeZ8hSHOyD39mDvrocaoB8bYLso4269wCfjXQI0Z8JKAB6gX6E1JAqesqZ/l
zaiBhXOow9Usxm0YRWoacFzS88MYj527OdXbqSVqJ1/1BO+xLI32fpqghiDeoKeOCtCU
u8iM/Bfx+qO4chcVapQLofBcqDwsjrgzxfXdYb676HxXu+4OoSAEtNhOwekhobMWxe5b
VrmEQwZP8W8pjANYe8ZNtzAHPMNLXeC1Sv3tW4hu7PDctYmDkiIKsRP0vycUImdyIOns
w/0C7zy0BB/CD1W6apOtKJLlYzXpQ1a6enbXVSMYtpU7Um7GyF2uvmS9ZZMjLd3ah+Uw
zd0A==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597691; x=1749202491;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=4r9koD1NPFSh7WkJOunYhnXV7czMJGlSW2YfbvvIqZo=;
b=fThKqwJbXgYmaFEmcIHQhoe4CYEvhqFvNk/SYYyOcX2nC96ORyOixTqMu33VKmwNxj
UUjZe3emCSDL9/CL5mttwy0dE31VcLGzAzzcTWxCOJV+a3VA8ExoMYvfhVmBH++t/OIv
fuXYIXAPDHCmNZR+YZmSy+nvZDka7Z8gAWz3c0V8d9HC4K12MOzqAQqzRd6K6fPs2a03
V3uFdXy2aWMORTP3HmBfHxMaZDE1bb2m1VTj54SDlvhef9YHfV9LrIT5eW2PDn1OYcmI
udmKfsFCJtAe56uC+YMGY/PjC/9+NI3Cyc+EId7yBKLgwv3kPOcgeGolE2y0aANbXz+p
xWKA==
X-Forwarded-Encrypted: i=1; AJvYcCVTJlAM56kypMsUY5Tl56dIodGa76esH4he2D3YNPOJFyIFmg8qTf1WFiETh1ugUxDRc3Gwr4EG3pVoHeM=@vger.kernel.org
X-Gm-Message-State: AOJu0YxQ9hRgevlnk1xMcUv4a5qTNhSfEmBjGs4fzzOEZT+wGJ2erwbY
eNfn6fwythp4wwu9AWLnE1fA1LA1gvT2fziqVc87efUHw8J+j6LxYzy46I2DITPYCEo=
X-Gm-Gg: ASbGncvwtYxPvjoU1BnBFS1u/6hb1u4ab8d0AQGHwXPLPJRaFbhMwngiYyZy8clQB11
AQR2ByFsUCjXSRoic/9T4dIkopEI+dHgZh3/jxGX3XGXAqaVob50SAESj+SaIdtMIlDZ7jF6U4j
5SsNu9yhI6tCXAtbjMDlawzIlrsOGs1lrHaVcilpS6sb0krgd+eo96MX3VKO0I0xSHZRnHrIvGQ
+x57QgePNFneBEAHu+dGQ3JxsT9WqdP23+5E7VZDygtIbCDnyT25DFUwUkVWjcw+dypIhzuF354
sSGTQp43tLSCRyNVxCQ8On/KXxTustV6YhTivO0Hm56Bj133ErHizTJuvCn1bPi6MoKcMOeqzPy
bL1TuyvWMWA==
X-Google-Smtp-Source: AGHT+IEYFSuwVcZ5gMKEd63K0LSZFm1at7KJSWQADYXZzWbymIXeS5JhogJZ9S4360x8945TmkY35w==
X-Received: by 2002:a17:90b:2e8b:b0:311:e637:287b with SMTP id 98e67ed59e1d1-3124198a8e2mr3896259a91.29.1748597690878;
Fri, 30 May 2025 02:34:50 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.34.35
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:34:50 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 25/35] RPAL: add MPK initialization and interface
Date: Fri, 30 May 2025 17:27:53 +0800
Message-Id: <569387db40571a03a71506cbec12813c1e5dde62.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

RPAL uses MPK (Memory Protection Keys) to protect memory. Therefore, RPAL
needs to perform MPK initialization, allocation, and other related tasks,
while providing corresponding user-mode interfaces.

This patch executes MPK initialization operations, including feature
detection, implementation of user mode interfaces for setting and
retrieving pkeys, and development of utility functions. For pkey
allocation, RPAL prioritizes using pkeys provided by user mode, with user
mode responsible for preventing pkey collisions between different services.
If user mode does not provide a valid pkey, RPAL generates a pkey via
id % arch_max_pkey() to minimize the likelihood of pkey collisions.
Additionally, RPAL does not permit services to manipulate pkeys
independently; thus, all pkeys are marked as allocated, and services are
prohibited from releasing pkeys.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/rpal/Kconfig | 12 +++++++-
arch/x86/rpal/Makefile | 1 +
arch/x86/rpal/core.c | 13 ++++++++
arch/x86/rpal/internal.h | 5 +++
arch/x86/rpal/pku.c | 47 ++++++++++++++++++++++++++++
arch/x86/rpal/proc.c | 5 +++
arch/x86/rpal/service.c | 24 +++++++++++++++
include/linux/rpal.h | 66 ++++++++++++++++++++++++++++++++++++++++
mm/mprotect.c | 9 ++++++
9 files changed, 181 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/rpal/pku.c

diff --git a/arch/x86/rpal/Kconfig b/arch/x86/rpal/Kconfig
index e5e6996553ea..5434fdb2940d 100644
--- a/arch/x86/rpal/Kconfig
+++ b/arch/x86/rpal/Kconfig
@@ -8,4 +8,14 @@ config RPAL
depends on X86_64
help
This option enables system support for Run Process As
- library (RPAL).
\ No newline at end of file
+ library (RPAL).
+
+config RPAL_PKU
+ bool "mpk protection for RPAL"
+ default y
+ depends on RPAL
+ help
+ Memory protection key (MPK) can achieve intra-process
+ memory separation, which is broken by RPAL. Always keep
+ it on when using RPAL. The CPU feature will be detected
+ at boot time, as some CPUs do not support it.
\ No newline at end of file
diff --git a/arch/x86/rpal/Makefile b/arch/x86/rpal/Makefile
index 89f745382c51..42a42b0393be 100644
--- a/arch/x86/rpal/Makefile
+++ b/arch/x86/rpal/Makefile
@@ -3,3 +3,4 @@
obj-$(CONFIG_RPAL) += rpal.o

rpal-y := service.o core.o mm.o proc.o thread.o
+rpal-$(CONFIG_RPAL_PKU) += pku.o
\ No newline at end of file
diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 406d54788bac..41111d693994 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -8,6 +8,7 @@

#include <linux/rpal.h>
#include <linux/sched/task_stack.h>
+#include <linux/pkeys.h>
#include <asm/fsgsbase.h>

#include "internal.h"
@@ -374,6 +375,14 @@ static bool check_hardware_features(void)
rpal_err("no fsgsbase feature\n");
return false;
}
+
+#ifdef CONFIG_RPAL_PKU
+ if (!arch_pkeys_enabled()) {
+ rpal_err("MPK is not enabled\n");
+ return false;
+ }
+#endif
+
return true;
}

@@ -390,6 +399,10 @@ int __init rpal_init(void)
if (ret)
goto fail;

+#ifdef CONFIG_RPAL_PKU
+ rpal_set_cap(RPAL_CAP_PKU);
+#endif
+
rpal_inited = true;
return 0;

diff --git a/arch/x86/rpal/internal.h b/arch/x86/rpal/internal.h
index 6256172bb79e..71afa8225450 100644
--- a/arch/x86/rpal/internal.h
+++ b/arch/x86/rpal/internal.h
@@ -54,3 +54,8 @@ rpal_build_call_state(const struct rpal_sender_data *rsd)
return ((rsd->rcd.service_id << RPAL_SID_SHIFT) |
(rsd->scc->sender_id << RPAL_ID_SHIFT) | RPAL_RECEIVER_STATE_CALL);
}
+
+/* pkey.c */
+int rpal_alloc_pkey(struct rpal_service *rs, int pkey);
+int rpal_pkey_setup(struct rpal_service *rs, int pkey);
+void rpal_service_pku_init(void);
diff --git a/arch/x86/rpal/pku.c b/arch/x86/rpal/pku.c
new file mode 100644
index 000000000000..4c5151ca5b8b
--- /dev/null
+++ b/arch/x86/rpal/pku.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * RPAL service level operations
+ * Copyright (c) 2025, ByteDance. All rights reserved.
+ *
+ * Author: Jiadong Sun <sunjiadong.lff@xxxxxxxxxxxxx>
+ */
+
+#include <linux/rpal.h>
+#include <linux/pkeys.h>
+
+#include "internal.h"
+
+void rpal_service_pku_init(void)
+{
+ u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);
+ struct mm_struct *mm = current->mm;
+
+ /* We consume all pkeys so that no pkeys will be allocated by others */
+ mmap_write_lock(mm);
+ if (mm->context.pkey_allocation_map != 0x1)
+ rpal_err("pkey has been allocated: %u\n",
+ mm->context.pkey_allocation_map);
+ mm->context.pkey_allocation_map = all_pkeys_mask;
+ mmap_write_unlock(mm);
+}
+
+int rpal_pkey_setup(struct rpal_service *rs, int pkey)
+{
+ int val;
+
+ val = rpal_pkey_to_pkru(pkey);
+ rs->pkey = pkey;
+ return 0;
+}
+
+int rpal_alloc_pkey(struct rpal_service *rs, int pkey)
+{
+ int ret;
+
+ if (pkey >= 0 && pkey < arch_max_pkey())
+ return pkey;
+
+ ret = rs->id % arch_max_pkey();
+
+ return ret;
+}
diff --git a/arch/x86/rpal/proc.c b/arch/x86/rpal/proc.c
index 16ac9612bfc5..2f9cceec4992 100644
--- a/arch/x86/rpal/proc.c
+++ b/arch/x86/rpal/proc.c
@@ -76,6 +76,11 @@ static long rpal_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case RPAL_IOCTL_RELEASE_SERVICE:
ret = rpal_release_service(arg);
break;
+#ifdef CONFIG_RPAL_PKU
+ case RPAL_IOCTL_GET_SERVICE_PKEY:
+ ret = put_user(cur->pkey, (int __user *)arg);
+ break;
+#endif
default:
return -EINVAL;
}
diff --git a/arch/x86/rpal/service.c b/arch/x86/rpal/service.c
index 16e94d710445..ca795dacc90d 100644
--- a/arch/x86/rpal/service.c
+++ b/arch/x86/rpal/service.c
@@ -208,6 +208,10 @@ struct rpal_service *rpal_register_service(void)
spin_lock_init(&rs->rpd.poll_lock);
bitmap_zero(rs->rpd.dead_key_bitmap, RPAL_NR_ID);
init_waitqueue_head(&rs->rpd.rpal_waitqueue);
+#ifdef CONFIG_RPAL_PKU
+ rs->pkey = -1;
+ rpal_service_pku_init();
+#endif

rs->bad_service = false;
rs->base = calculate_base_address(rs->id);
@@ -288,6 +292,9 @@ static int add_mapped_service(struct rpal_service *rs, struct rpal_service *tgt,
if (node->rs == NULL) {
node->rs = rpal_get_service(tgt);
set_bit(type_bit, &node->type);
+#ifdef CONFIG_RPAL_PKU
+ node->pkey = tgt->pkey;
+#endif
} else {
if (node->rs != tgt) {
ret = -EINVAL;
@@ -397,6 +404,19 @@ int rpal_request_service(unsigned long arg)
goto put_service;
}

+#ifdef CONFIG_RPAL_PKU
+ if (cur->pkey == tgt->pkey) {
+ ret = -EINVAL;
+ goto put_service;
+ }
+
+ ret = put_user(tgt->pkey, rra.pkey);
+ if (ret) {
+ ret = -EFAULT;
+ goto put_service;
+ }
+#endif
+
ret = put_user((unsigned long)(tgt->rsm.user_meta), rra.user_metap);
if (ret) {
ret = -EFAULT;
@@ -577,6 +597,10 @@ int rpal_enable_service(unsigned long arg)
mutex_lock(&cur->mutex);
if (!cur->enabled) {
cur->rsm = rsm;
+#ifdef CONFIG_RPAL_PKU
+ rsm.pkey = rpal_alloc_pkey(cur, rsm.pkey);
+ rpal_pkey_setup(cur, rsm.pkey);
+#endif
cur->enabled = true;
}
mutex_unlock(&cur->mutex);
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 4f1d92053818..2f2982d281cc 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -97,6 +97,12 @@ enum {
#define RPAL_ID_MASK (~(0 | RPAL_RECEIVER_STATE_MASK | RPAL_SID_MASK))
#define RPAL_MAX_ID ((1 << (RPAL_SID_SHIFT - RPAL_ID_SHIFT)) - 1)

+#define RPAL_PKRU_BASE_CODE_READ 0xAAAAAAAA
+#define RPAL_PKRU_BASE_CODE 0xFFFFFFFF
+#define RPAL_PKRU_SET 0
+#define RPAL_PKRU_UNION 1
+#define RPAL_PKRU_INTERSECT 2
+
extern unsigned long rpal_cap;

enum rpal_task_flag_bits {
@@ -122,6 +128,10 @@ enum rpal_sender_state {
RPAL_SENDER_STATE_KERNEL_RET,
};

+enum rpal_capability {
+ RPAL_CAP_PKU
+};
+
struct rpal_critical_section {
unsigned long ret_begin;
unsigned long ret_end;
@@ -134,6 +144,7 @@ struct rpal_service_metadata {
unsigned long version;
void __user *user_meta;
struct rpal_critical_section rcs;
+ int pkey;
};

struct rpal_request_arg {
@@ -141,11 +152,17 @@ struct rpal_request_arg {
u64 key;
unsigned long __user *user_metap;
int __user *id;
+#ifdef CONFIG_RPAL_PKU
+ int __user *pkey;
+#endif
};

struct rpal_mapped_service {
unsigned long type;
struct rpal_service *rs;
+#ifdef CONFIG_RPAL_PKU
+ int pkey;
+#endif
};

struct rpal_poll_data {
@@ -220,6 +237,11 @@ struct rpal_service {
/* fsbase / pid map */
struct rpal_fsbase_tsk_map fs_tsk_map[RPAL_MAX_RECEIVER_NUM];

+#ifdef CONFIG_RPAL_PKU
+ /* pkey */
+ int pkey;
+#endif
+
/* delayed service put work */
struct delayed_work delayed_put_work;

@@ -323,6 +345,7 @@ enum rpal_command_type {
RPAL_CMD_DISABLE_SERVICE,
RPAL_CMD_REQUEST_SERVICE,
RPAL_CMD_RELEASE_SERVICE,
+ RPAL_CMD_GET_SERVICE_PKEY,
RPAL_NR_CMD,
};

@@ -351,6 +374,8 @@ enum rpal_command_type {
_IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_REQUEST_SERVICE, unsigned long)
#define RPAL_IOCTL_RELEASE_SERVICE \
_IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_RELEASE_SERVICE, unsigned long)
+#define RPAL_IOCTL_GET_SERVICE_PKEY \
+ _IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_GET_SERVICE_PKEY, int *)

#define rpal_for_each_requested_service(rs, idx) \
for (idx = find_first_bit(rs->requested_service_bitmap, RPAL_NR_ID); \
@@ -420,6 +445,47 @@ static inline bool rpal_is_correct_address(struct rpal_service *rs, unsigned lon
return true;
}

+static inline void rpal_set_cap(unsigned long cap)
+{
+ set_bit(cap, &rpal_cap);
+}
+
+static inline void rpal_clear_cap(unsigned long cap)
+{
+ clear_bit(cap, &rpal_cap);
+}
+
+static inline bool rpal_has_cap(unsigned long cap)
+{
+ return test_bit(cap, &rpal_cap);
+}
+
+static inline u32 rpal_pkey_to_pkru(int pkey)
+{
+ int offset = pkey * 2;
+ u32 mask = 0x3 << offset;
+
+ return RPAL_PKRU_BASE_CODE & ~mask;
+}
+
+static inline u32 rpal_pkey_to_pkru_read(int pkey)
+{
+ int offset = pkey * 2;
+ u32 mask = 0x3 << offset;
+
+ return RPAL_PKRU_BASE_CODE_READ & ~mask;
+}
+
+static inline u32 rpal_pkru_union(u32 pkru0, u32 pkru1)
+{
+ return pkru0 & pkru1;
+}
+
+static inline u32 rpal_pkru_intersect(u32 pkru0, u32 pkru1)
+{
+ return pkru0 | pkru1;
+}
+
#ifdef CONFIG_RPAL
static inline struct rpal_service *rpal_current_service(void)
{
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 62c1f7945741..982f911ffaba 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -33,6 +33,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/memory-tiers.h>
#include <uapi/linux/mman.h>
+#include <linux/rpal.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -895,6 +896,14 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
{
int ret;

+#ifdef CONFIG_RPAL_PKU
+ if (rpal_current_service()) {
+ rpal_err("try_to_free pkey: %d %s\n", current->pid,
+ current->comm);
+ return -EINVAL;
+ }
+#endif
+
mmap_write_lock(current->mm);
ret = mm_pkey_free(current->mm, pkey);
mmap_write_unlock(current->mm);
--
2.20.1


Return-Path: <linux-kernel+bounces-667898-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from sy.mirrors.kernel.org (sy.mirrors.kernel.org [147.75.48.161])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 1B45841E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:40:20 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by sy.mirrors.kernel.org (Postfix) with ESMTPS id B96C47AAEEC
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:39:01 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id BCFAF233151;
Fri, 30 May 2025 09:35:10 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="d4wJuq1F"
Received: from mail-pl1-f179.google.com (mail-pl1-f179.google.com [209.85.214.179])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 25BEA232395
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:35:06 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.214.179
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597709; cv=none; b=Y0+mONyOM3I5PrcyhqCGNOfLYN11BUJhc1lZooFfTWZ0qTdL5N2bD9Aa5zl5xb9SOijg4SRG1MAdIlh+i+ih5iG7ms+kWZ6Ug19Km62yC3wYlVhf9DS67lQ+AX8+yw1i2t1JdkSKaA4ckObQyhfvl+T3gxO0TIf1stI2W72VMro=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597709; c=relaxed/simple;
bh=CSBcJAAaPu1EwMn2t/TMYKgMYiCUawOQ8pC1PZaoOVQ=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version:Content-Type; b=GIEysaoR+BYfjezTy58Bg4Y043RKdt22FHBpBhAH/eBp8E9XPOMawhz/LyRdwg2IJbUXUiajP8WY1ORwNerq/jxqW2UJFIIJNSGUbBAXBZL7jwF2pSFmeqty58xKWB4JNAMc/dtiAWDCbuWbsJDuaR23o3/bCM4I5pUK7NS9jHw=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=d4wJuq1F; arc=none smtp.client-ip=209.85.214.179
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pl1-f179.google.com with SMTP id d9443c01a7336-234d366e5f2so23780915ad.1
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:35:06 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597706; x=1749202506; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=L5lpZBJN8KO9Wi2JkiZbNrbX1Mk+tAQNdIUoCRiUIVY=;
b=d4wJuq1FXHu4nTQhd567rVBNLwpQu/vPTHTSMzJELXe8r8QMRyWPKI6UHMdfwJXOvr
C1GoXVRbTVaRSYD0nmNXphDcvSH//5te5HB1lEE0EvlBtG1sssy6fSGhqZUTMLkzrpK+
LSGOGzc5RSFiXfbt4+oXm4yIvJYYWyoItXd/4IPDDMlPFoABm0JQji3CRu0pNwGLiMdg
G2+GO3t1l7vwd3WtPSIONakx6NnCy6LaQYQbPc1urPCtwWSXG1dSCAvdYQIXjoffdKoe
8eAsBL9+hETeapwiFw5ycojUfAKNjsZiLmf0P6KQfEKFnzs4Wp2+rE9ANwE6A9Gl0gLg
UGEA==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597706; x=1749202506;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=L5lpZBJN8KO9Wi2JkiZbNrbX1Mk+tAQNdIUoCRiUIVY=;
b=tRszNgIKYfcjApQ7bilUf2NbrXeaWqh6jKnnLO8tQhBqJDTIPmFuiKGQbIPH9cEHGo
WpfNtwBPZT9lAcyBWub/ayZo9uxYcw22sDfZLmseuYl8oSRZYFBJYg+f/lcHs8AhR/Mx
MCQo4ImcbnsZyJ1Cs6reivhI4qdoeZh30yuIsG+21nlHIKV8bUt2q7PVRyh15ZIQiJro
MM6vnf7ybf72Icz4njP0zXXe80ScrfDQZJUTgVgQ3QizCRI+lxicC+xaV5CUh3fABjiA
WysVaeLknkP4Ox7vdddPq+m/CDi/pYpHj7iRz8cOiGqdIPUS57+03H6MJ9xnhUX9ymIt
Fl9w==
X-Forwarded-Encrypted: i=1; AJvYcCUFgGr0ieRwyLKh7f3e+pbKKb+MGELVnSYrZOqEi4wNx159bxZqcw2P7YkMv5OpVMw71VWj/c7zbasWUcY=@vger.kernel.org
X-Gm-Message-State: AOJu0YyTtSYJN9vn34Te73YSTrl+duRNgEwFrXOLjZjG+djSUvtieM4y
x4RoyjtxV1O/p2D5QSNDQ5Z2enAS+y4b68adIWBNwVElU8n8JG46pTjlKchUUzL6WoY=
X-Gm-Gg: ASbGncuFhjWOLjFkv30SENVubZu30G3DWdbeB8xW+lk25VwBSjO7y/ZG/tyl+nlB3xz
i1JrjQ/wXkmT9zJi5MfNxA/7jJbDvavPob8ZJH1AdVTwLOUYrLi8Q0eRpm/O/Bn5BriPk0WOVgQ
r3wV0Gxq+fjIBwhvO/wEuGnJO6sGWw3IstIcN56Hc+OgS70Qwxr0kUOPXLrdqWEU2y8rNUHw/Zp
DzOju324ZoUt0unLBsxWb50IzlpTaC18Ma2K25PIgDZEdn57Im3gbkwe6muN9daJMWJQ/S61X9O
BcqT16UtYCVOT7nCx1uWitQV+K6VOLrUuiUdoI+GOFRm9Vte3dTe9OJ0dmaqda+rmOkmzWFPz/W
UxDcOiecr7z1R8ZrOgSLC
X-Google-Smtp-Source: AGHT+IHAptJ8gzy+riOF1QxfTIK2HHrgqPaaAtuaRZ0rpmj0urvpGAIWcEQ578w6XYLBnp/Jp17aLw==
X-Received: by 2002:a17:902:ccce:b0:234:bca7:2940 with SMTP id d9443c01a7336-23529a2c094mr43718025ad.38.1748597706235;
Fri, 30 May 2025 02:35:06 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.34.51
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:35:05 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 26/35] RPAL: enable MPK support
Date: Fri, 30 May 2025 17:27:54 +0800
Message-Id: <a7da7fe131b0ce6582dbb77903745673d83a6195.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

RPAL leverages Memory Protection Keys (MPK) to safeguard shared memory
from illegal access and corruption by other processes. MPK-based memory
protection involves two key mechanisms: First, for already allocated
memory, when RPAL is enabled, the protection key fields in all page tables
must be set to the process's corresponding pkey value. Second, for newly
allocated memory, when the kernel detects that the process is an RPAL
service, it sets the corresponding pkey flag in the relevant memory data
structures. Together, these measures ensure that all memory belonging to
the current process is protected by its own pkey.

For MPK initialization, RPAL needs to set the pkeys of all allocated page
table pages to the pkeys assigned by RPAL to the service. This is
completed in three steps: First, enable permissions for all pkeys of the
service, allowing it to access memory protected by any pkey. Then, update
the pkeys in the page tables. Since permissions for all pkeys are already
enabled at this stage, even if old and new pkeys coexist during the page
table update, the service's memory access remains unaffected. Finally,
after the page table update is complete, set the service's pkey permissions
to the corresponding values, thereby achieving memory protection.

Additionally, RPAL must manage the values of the PKRU register during
lazy switch operations and signal handling. This ensures the process
avoids coredumps caused by MPK.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/kernel/cpu/common.c | 8 +-
arch/x86/kernel/fpu/core.c | 8 +-
arch/x86/kernel/process.c | 7 +-
arch/x86/rpal/core.c | 14 +++-
arch/x86/rpal/internal.h | 1 +
arch/x86/rpal/pku.c | 139 ++++++++++++++++++++++++++++++++++-
arch/x86/rpal/service.c | 1 +
arch/x86/rpal/thread.c | 5 ++
include/linux/rpal.h | 3 +
kernel/sched/core.c | 3 +
mm/mmap.c | 12 +++
mm/mprotect.c | 96 ++++++++++++++++++++++++
mm/vma.c | 18 +++++
13 files changed, 310 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8feb8fd2957a..2678453cdf76 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -26,6 +26,7 @@
#include <linux/pgtable.h>
#include <linux/stackprotector.h>
#include <linux/utsname.h>
+#include <linux/rpal.h>

#include <asm/alternative.h>
#include <asm/cmdline.h>
@@ -532,7 +533,12 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c)

cr4_set_bits(X86_CR4_PKE);
/* Load the default PKRU value */
- pkru_write_default();
+#ifdef CONFIG_RPAL_PKU
+ if (rpal_current_service() && rpal_current_service()->pku_on)
+ write_pkru(rpal_pkey_to_pkru(rpal_current_service()->pkey));
+ else
+#endif
+ pkru_write_default();
}

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index ea138583dd92..251b1ddee726 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -20,6 +20,7 @@
#include <linux/hardirq.h>
#include <linux/pkeys.h>
#include <linux/vmalloc.h>
+#include <linux/rpal.h>

#include "context.h"
#include "internal.h"
@@ -746,7 +747,12 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
else
frstor(&init_fpstate.regs.fsave);

- pkru_write_default();
+#ifdef CONFIG_RPAL_PKU
+ if (rpal_current_service() && rpal_current_service()->pku_on)
+ write_pkru(rpal_pkey_to_pkru(rpal_current_service()->pkey));
+ else
+#endif
+ pkru_write_default();
}

/*
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index be8845e2ca4d..b74de35218f9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -285,7 +285,12 @@ static void pkru_flush_thread(void)
* If PKRU is enabled the default PKRU value has to be loaded into
* the hardware right here (similar to context switch).
*/
- pkru_write_default();
+#ifdef CONFIG_RPAL_PKU
+ if (rpal_current_service() && rpal_current_service()->pku_on)
+ write_pkru(rpal_pkey_to_pkru(rpal_current_service()->pkey));
+ else
+#endif
+ pkru_write_default();
}

void flush_thread(void)
diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 41111d693994..47c9e551344e 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -275,6 +275,13 @@ rpal_skip_lazy_switch(struct task_struct *next, struct pt_regs *regs)
tgt = next->rpal_rs;
if (in_ret_section(tgt, regs->ip)) {
wrfsbase(current->thread.fsbase);
+#ifdef CONFIG_RPAL_PKU
+ rpal_set_current_pkru(
+ rpal_pkru_union(
+ rpal_pkey_to_pkru(rpal_current_service()->pkey),
+ rpal_pkey_to_pkru(next->rpal_rs->pkey)),
+ RPAL_PKRU_SET);
+#endif
rebuild_sender_stack(current->rpal_sd, regs);
rpal_clear_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT);
next->rpal_rd->sender = NULL;
@@ -292,8 +299,13 @@ static struct task_struct *rpal_fix_critical_section(struct task_struct *next,
if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT))
next = rpal_skip_lazy_switch(next, regs);
/* receiver->sender */
- else if (rpal_is_correct_address(cur, regs->ip))
+ else if (rpal_is_correct_address(cur, regs->ip)) {
rpal_skip_receiver_code(next, regs);
+#ifdef CONFIG_RPAL_PKU
+ write_pkru(rpal_pkru_union(
+ rpal_pkey_to_pkru(next->rpal_rs->pkey), rdpkru()));
+#endif
+ }

return next;
}
diff --git a/arch/x86/rpal/internal.h b/arch/x86/rpal/internal.h
index 71afa8225450..e49febce8645 100644
--- a/arch/x86/rpal/internal.h
+++ b/arch/x86/rpal/internal.h
@@ -58,4 +58,5 @@ rpal_build_call_state(const struct rpal_sender_data *rsd)
/* pkey.c */
int rpal_alloc_pkey(struct rpal_service *rs, int pkey);
int rpal_pkey_setup(struct rpal_service *rs, int pkey);
+void rpal_set_current_pkru(u32 val, int mode);
void rpal_service_pku_init(void);
diff --git a/arch/x86/rpal/pku.c b/arch/x86/rpal/pku.c
index 4c5151ca5b8b..26cef324f41f 100644
--- a/arch/x86/rpal/pku.c
+++ b/arch/x86/rpal/pku.c
@@ -25,12 +25,149 @@ void rpal_service_pku_init(void)
mmap_write_unlock(mm);
}

+void rpal_set_pku_schedule_tail(struct task_struct *prev)
+{
+ if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
+ struct rpal_service *cur = rpal_current_service();
+ u32 val = rpal_pkey_to_pkru(cur->pkey);
+
+ rpal_set_current_pkru(val, RPAL_PKRU_SET);
+ } else {
+ struct rpal_service *cur = rpal_current_service();
+ u32 val = rpal_pkey_to_pkru(cur->pkey);
+
+ val = rpal_pkru_union(
+ val,
+ rpal_pkey_to_pkru(
+ current->rpal_sd->receiver->rpal_rs->pkey));
+ rpal_set_current_pkru(val, RPAL_PKRU_SET);
+ }
+}
+
+static inline u32 rpal_get_new_val(u32 old_val, u32 new_val, int mode)
+{
+ switch (mode) {
+ case RPAL_PKRU_SET:
+ return new_val;
+ case RPAL_PKRU_UNION:
+ return rpal_pkru_union(old_val, new_val);
+ case RPAL_PKRU_INTERSECT:
+ return rpal_pkru_intersect(old_val, new_val);
+ default:
+ rpal_err("%s: invalid mode: %d\n", __func__, mode);
+ return old_val;
+ }
+}
+
+static int rpal_set_task_fpu_pkru(struct task_struct *task, u32 val, int mode)
+{
+ struct thread_struct *t = &task->thread;
+
+ val = rpal_get_new_val(t->pkru, val, mode);
+ t->pkru = val;
+
+ return 0;
+}
+
+void rpal_set_current_pkru(u32 val, int mode)
+{
+ u32 new_val;
+
+ new_val = rpal_get_new_val(rdpkru(), val, mode);
+ write_pkru(new_val);
+}
+
+struct task_function_data {
+ struct task_struct *task;
+ u32 val;
+ int mode;
+ int ret;
+};
+
+static void rpal_set_remote_pkru(void *data)
+{
+ struct task_function_data *tfd = data;
+ struct task_struct *task = tfd->task;
+
+ if (task) {
+ /* -EAGAIN */
+ if (task_cpu(task) != smp_processor_id())
+ return;
+
+ tfd->ret = -ESRCH;
+ if (task == current) {
+ rpal_set_current_pkru(tfd->val, tfd->mode);
+ tfd->ret = 0;
+ } else {
+ tfd->ret = rpal_set_task_fpu_pkru(task, tfd->val,
+ tfd->mode);
+ }
+ return;
+ }
+}
+
+static int rpal_task_function_call(struct task_struct *task, u32 val, int mode)
+{
+ struct task_function_data data = {
+ .task = task,
+ .val = val,
+ .mode = mode,
+ .ret = -EAGAIN,
+ };
+ int ret;
+
+ for (;;) {
+ smp_call_function_single(task_cpu(task), rpal_set_remote_pkru,
+ &data, 1);
+ ret = data.ret;
+
+ if (ret != -EAGAIN)
+ break;
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
+static void rpal_set_task_pkru(struct task_struct *task, u32 val, int mode)
+{
+ if (task == current)
+ rpal_set_current_pkru(val, mode);
+ else
+ rpal_task_function_call(task, val, mode);
+}
+
+static void rpal_set_group_pkru(u32 val, int mode)
+{
+ struct task_struct *p;
+
+ for_each_thread(current, p) {
+ rpal_set_task_pkru(p, val, mode);
+ }
+}
+
int rpal_pkey_setup(struct rpal_service *rs, int pkey)
{
- int val;
+ int err, val;

val = rpal_pkey_to_pkru(pkey);
+
+ mmap_write_lock(current->mm);
+ if (rs->pku_on) {
+ mmap_write_unlock(current->mm);
+ return 0;
+ }
rs->pkey = pkey;
+ /* others must see rs->pkey before rs->pku_on */
+ barrier();
+ rs->pku_on = true;
+ mmap_write_unlock(current->mm);
+ rpal_set_group_pkru(val, RPAL_PKRU_UNION);
+ err = do_rpal_mprotect_pkey(rs->base, RPAL_ADDR_SPACE_SIZE, pkey);
+ if (unlikely(err))
+ rpal_err("do_rpal_mprotect_key error: %d\n", err);
+ rpal_set_group_pkru(val, RPAL_PKRU_SET);
return 0;
}

diff --git a/arch/x86/rpal/service.c b/arch/x86/rpal/service.c
index ca795dacc90d..7a83e85cf096 100644
--- a/arch/x86/rpal/service.c
+++ b/arch/x86/rpal/service.c
@@ -210,6 +210,7 @@ struct rpal_service *rpal_register_service(void)
init_waitqueue_head(&rs->rpd.rpal_waitqueue);
#ifdef CONFIG_RPAL_PKU
rs->pkey = -1;
+ rs->pku_on = false;
rpal_service_pku_init();
#endif

diff --git a/arch/x86/rpal/thread.c b/arch/x86/rpal/thread.c
index 02c1a9c22dd7..fcc592baaac0 100644
--- a/arch/x86/rpal/thread.c
+++ b/arch/x86/rpal/thread.c
@@ -281,6 +281,11 @@ int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,
regs->sp = ersp;
/* avoid rebuild again */
scc->ec.magic = 0;
+#ifdef CONFIG_RPAL_PKU
+ rpal_set_current_pkru(
+ rpal_pkey_to_pkru(rpal_current_service()->pkey),
+ RPAL_PKRU_SET);
+#endif
return 0;
}
}
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 2f2982d281cc..f2474cb53abe 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -239,6 +239,7 @@ struct rpal_service {

#ifdef CONFIG_RPAL_PKU
/* pkey */
+ bool pku_on;
int pkey;
#endif

@@ -571,4 +572,6 @@ void rpal_schedule(struct task_struct *next);
asmlinkage struct task_struct *
__rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev);
+int do_rpal_mprotect_pkey(unsigned long start, size_t len, int pkey);
+void rpal_set_pku_schedule_tail(struct task_struct *prev);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0f9343698198..eb5d5bd51597 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11029,6 +11029,9 @@ asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev)

finish_task_switch(prev);
trace_sched_exit_tp(true, CALLER_ADDR0);
+#ifdef CONFIG_RPAL_PKU
+ rpal_set_pku_schedule_tail(prev);
+#endif
preempt_enable();

calculate_sigpending();
diff --git a/mm/mmap.c b/mm/mmap.c
index 98bb33d2091e..d36ea4ea2bd0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -396,6 +396,18 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (pkey < 0)
pkey = 0;
}
+#ifdef CONFIG_RPAL_PKU
+ /*
+ * For RPAL process, if pku is enabled, we always use
+ * its service pkey for new vma.
+ */
+ do {
+ struct rpal_service *cur = rpal_current_service();
+
+ if (cur && cur->pku_on)
+ pkey = cur->pkey;
+ } while (0);
+#endif

/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 982f911ffaba..e9ae828e377d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -713,6 +713,18 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
struct mmu_gather tlb;
struct vma_iterator vmi;

+#ifdef CONFIG_RPAL_PKU
+ if (pkey != -1) {
+ struct rpal_service *cur = rpal_current_service();
+
+ if (unlikely(cur) && cur->pku_on) {
+ rpal_err("%s, pid: %d, try to change pkey\n",
+ current->comm, current->pid);
+ return -EINVAL;
+ }
+ }
+#endif
+
start = untagged_addr(start);

prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
@@ -848,6 +860,90 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
return error;
}

+#ifdef CONFIG_RPAL_PKU
+int do_rpal_mprotect_pkey(unsigned long start, size_t len, int pkey)
+{
+ unsigned long nstart, end, tmp;
+ struct vm_area_struct *vma, *prev;
+ struct rpal_service *cur = rpal_current_service();
+ int error = -EINVAL;
+ struct mmu_gather tlb;
+ struct vma_iterator vmi;
+
+ start = untagged_addr(start);
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ if (!len)
+ return 0;
+ len = PAGE_ALIGN(len);
+ end = start + len;
+ if (end <= start)
+ return -ENOMEM;
+
+ if (mmap_write_lock_killable(current->mm))
+ return -EINTR;
+
+ /*
+ * If userspace did not allocate the pkey, do not let
+ * them use it here.
+ */
+ error = -EINVAL;
+ if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
+ goto out;
+
+ vma_iter_init(&vmi, current->mm, start);
+ vma = vma_find(&vmi, end);
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+
+ prev = vma_prev(&vmi);
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+
+ if (start > vma->vm_start)
+ prev = vma;
+
+ tlb_gather_mmu(&tlb, current->mm);
+ nstart = start;
+ tmp = vma->vm_start;
+ for_each_vma_range(vmi, vma, end) {
+ unsigned long vma_pkey_mask;
+ unsigned long newflags;
+
+ tmp = vma->vm_start;
+ nstart = tmp;
+
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+ vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 |
+ VM_PKEY_BIT3;
+ newflags = vma->vm_flags;
+ newflags &= ~vma_pkey_mask;
+ newflags |= ((unsigned long)cur->pkey) << VM_PKEY_SHIFT;
+
+ tmp = vma->vm_end;
+ if (tmp > end)
+ tmp = end;
+
+ if (vma->vm_ops && vma->vm_ops->mprotect) {
+ error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
+ if (error)
+ break;
+ }
+
+ error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags);
+ if (error)
+ break;
+ }
+ tlb_finish_mmu(&tlb);
+
+out:
+ mmap_write_unlock(current->mm);
+ return error;
+}
+#endif
+
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
unsigned long, prot)
{
diff --git a/mm/vma.c b/mm/vma.c
index a468d4c29c0c..fa9d8f694e6e 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -4,6 +4,8 @@
* VMA-specific functions.
*/

+#include <linux/rpal.h>
+
#include "vma_internal.h"
#include "vma.h"

@@ -2622,6 +2624,22 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
{
struct mm_struct *mm = current->mm;

+#ifdef CONFIG_RPAL_PKU
+ /*
+ * Any memory need to use RPAL service pkey
+ * once service is enabled.
+ */
+ struct rpal_service *cur = rpal_current_service();
+ unsigned long vma_pkey_mask;
+
+ if (cur && cur->pku_on) {
+ vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 |
+ VM_PKEY_BIT3;
+ flags &= ~vma_pkey_mask;
+ flags |= ((unsigned long)cur->pkey) << VM_PKEY_SHIFT;
+ }
+#endif
+
/*
* Check against address space limits by the changed size
* Note: This happens *after* clearing old mappings in some code paths.
--
2.20.1


Return-Path: <linux-kernel+bounces-667897-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id A7C0941E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:40:22 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id 671A61889F31
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:40:18 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id E0ABF22DA1B;
Fri, 30 May 2025 09:35:06 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=oracle.com header.i=@oracle.com header.b="MvErhWpu";
dkim=pass (1024-bit key) header.d=oracle.onmicrosoft.com header.i=@oracle.onmicrosoft.com header.b="GLgzQ+m+"
Received: from mx0b-00069f02.pphosted.com (mx0b-00069f02.pphosted.com [205.220.177.32])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1C7C5231856;
Fri, 30 May 2025 09:35:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=fail smtp.client-ip=205.220.177.32
ARC-Seal:i=2; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597704; cv=fail; b=pvjHv6BMzqnTsrfE9AJfuNhrKR1YNDrzc9yQaX0E0hDw/yeiYPHQSq8A5Zh9NEeDtnThMQMFj7OUm4jFyw8lclGYh4iqQ6DxKAJw5Ul7oKm5RK98+JxBkZZ1EYDdmrzv2NXBNFwLXyqG2Lulxxek5GDRTW3r0Q4dzLfpL3vUVH4=
ARC-Message-Signature:i=2; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597704; c=relaxed/simple;
bh=RDWu/ko2MZBRawXwTAO+RsrKGdbqUcHxthtGMo4E41Y=;
h=Date:From:To:Cc:Subject:Message-ID:References:Content-Type:
Content-Disposition:In-Reply-To:MIME-Version; b=hhT1gg1ekhRMBid5Q+HveT0UZryxoGLZEXKcIP1f0qKkOkw5RwJzSvTALEDJ4uFH0KwBgFpx+vM4K4GP08Qs4pfh3hFCVdQMWaUf1MjjR9idl9GR1+kO6VQNXQsjNmvuY9CIjBZBdyTD0AlFOBeO6iNovdNJFyKq7e0sxbRpOrc=
ARC-Authentication-Results:i=2; smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=oracle.com; spf=pass smtp.mailfrom=oracle.com; dkim=pass (2048-bit key) header.d=oracle.com header.i=@oracle.com header.b=MvErhWpu; dkim=pass (1024-bit key) header.d=oracle.onmicrosoft.com header.i=@oracle.onmicrosoft.com header.b=GLgzQ+m+; arc=fail smtp.client-ip=205.220.177.32
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=oracle.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=oracle.com
Received: from pps.filterd (m0246631.ppops.net [127.0.0.1])
by mx0b-00069f02.pphosted.com (8.18.1.2/8.18.1.2) with ESMTP id 54U6uX5o005875;
Fri, 30 May 2025 09:33:40 GMT
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=oracle.com; h=cc
:content-transfer-encoding:content-type:date:from:in-reply-to
:message-id:mime-version:references:subject:to; s=
corp-2025-04-25; bh=Pnlj08ZOn0iM7uNe8bP45c5Q7whDWBOZbum8CPz0E3o=; b=
MvErhWpu5ADM0JMfvLhVM+c2RQm92fQrSUMKD9wpikjEPx9nimDUxmtUrjHcMHg+
F8r/X4B+WRqlAtCigh8ir4F57wlMxCHMTM6tUE1WnxSFG41XsAQ8uP6Jo3GhQYlA
qdAanpH24be7qHzf3ILPmTf22Pb34BJNyk5k/f+L12MVtU4Tc/JZK968BqeiEyiW
eIQfN9CFACL/RFa6JvlqmLFuAjyrnxc/a8bwicVh7naQBlazPnJscrxKVw/K5AGR
vnpp7nLQ6DflqVE7mdtrLWJCgkFEzQpCp0/3xh7TaxnSxb50qaTlLkmjnKlofmqG
mWYpK3kTFDR1aGG7A86Ebg==
Received: from phxpaimrmta02.imrmtpd1.prodappphxaev1.oraclevcn.com (phxpaimrmta02.appoci.oracle.com [147.154.114.232])
by mx0b-00069f02.pphosted.com (PPS) with ESMTPS id 46wjbcnvud-1
(version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK);
Fri, 30 May 2025 09:33:40 +0000 (GMT)
Received: from pps.filterd (phxpaimrmta02.imrmtpd1.prodappphxaev1.oraclevcn.com [127.0.0.1])
by phxpaimrmta02.imrmtpd1.prodappphxaev1.oraclevcn.com (8.18.1.2/8.18.1.2) with ESMTP id 54U8lu4w008196;
Fri, 30 May 2025 09:33:39 GMT
Received: from bn1pr04cu002.outbound.protection.outlook.com (mail-eastus2azon11010048.outbound.protection.outlook.com [52.101.56.48])
by phxpaimrmta02.imrmtpd1.prodappphxaev1.oraclevcn.com (PPS) with ESMTPS id 46u4jdt5pp-1
(version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK);
Fri, 30 May 2025 09:33:38 +0000
ARC-Seal: i=1; a=rsa-sha256; s=arcselector10001; d=microsoft.com; cv=none;
b=R1eHxSFw68sYH5oMm/CCT+Z0HCrV8pZk6T0ZKRbL3cCixpzg0NLb4T/0JMOmvCzj0Qguw+/pNLOLkIAyV61aVgZTWYx8MSQ0boCvnHPuNzCugJNDBYXJkR2xR0XtqACKPB7yvgXoczJn0ZzCGKpS+tdyoHOkFT6r0Bo9l0jb7RvlXvgxY8dzdVhQCLn0IuJTyvPbKFfN48VQ6/lPSyy0+QHuBxppdIHULOAJ7T0mqciSWfwYUtWWhercYLAfaXkNxBr2262ZqpB45Nr/QPoH5485p/guY0b6lblnyuIJD8jcRjVgzliCcS/uSRwJOnR7nhM1J0Xcvq2c7ew4dI/W3g==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com;
s=arcselector10001;
h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1;
bh=Pnlj08ZOn0iM7uNe8bP45c5Q7whDWBOZbum8CPz0E3o=;
b=UGtVOYhzos5AXV5PC0IMhTIvdfO4GOmwVG+H6LgXtrLOENtEQsZcDAYlkmaYTWrNh47YlV3pvLqxAy/IeJFXBnlBhv0U7FZlZardpb+/Tb8ysgZhvsSHeeeFHgZk/ThalWHG6SXNzRzC0HEp/8LHSkcWGksdyVx792py+L4r2hGWrV9xOEcFsx9MDhJ/3IFtl8M9bbiyuBPCuumtVcPCaMXBPHFuX9fSxny67FRnpGM8ENfoeL+TcLJTJELN1vLd+Vn4weQR9fIbQFUJUNHLE0rW/YsJqI+WBnZDsSap1kPCXenXUznTQi9JkuuOM2CNWPdM2iEoqaQffEqlBqwhdw==
ARC-Authentication-Results: i=1; mx.microsoft.com 1; spf=pass
smtp.mailfrom=oracle.com; dmarc=pass action=none header.from=oracle.com;
dkim=pass header.d=oracle.com; arc=none
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=oracle.onmicrosoft.com; s=selector2-oracle-onmicrosoft-com;
h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck;
bh=Pnlj08ZOn0iM7uNe8bP45c5Q7whDWBOZbum8CPz0E3o=;
b=GLgzQ+m+/WZqiPRc4cCwUEuMQrYKqFKqkvwMvcFRPLFtZq2WiCLrXAVAxD2PfPkRbXrYPqlpINDRjAO7gR2rvbyhyzke9bskCQfBHgUqt61L/o9vHSAjNhAwzHrtuuEZ5dnP/FwKO7Qt3bivYX+3OSiUEKqD8+KRYFaljCf9+90=
Received: from DM4PR10MB8218.namprd10.prod.outlook.com (2603:10b6:8:1cc::16)
by IA4PR10MB8494.namprd10.prod.outlook.com (2603:10b6:208:564::17) with
Microsoft SMTP Server (version=TLS1_2,
cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.8746.40; Fri, 30 May
2025 09:33:35 +0000
Received: from DM4PR10MB8218.namprd10.prod.outlook.com
([fe80::2650:55cf:2816:5f2]) by DM4PR10MB8218.namprd10.prod.outlook.com
([fe80::2650:55cf:2816:5f2%5]) with mapi id 15.20.8746.030; Fri, 30 May 2025
09:33:35 +0000
Date: Fri, 30 May 2025 10:33:31 +0100
From: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
To: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Cc: tglx@xxxxxxxxxxxxx, mingo@xxxxxxxxxx, bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx, x86@xxxxxxxxxx, luto@xxxxxxxxxx,
kees@xxxxxxxxxx, akpm@xxxxxxxxxxxxxxxxxxxx, david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx, vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx, dietmar.eggemann@xxxxxxx, hpa@xxxxxxxxx,
acme@xxxxxxxxxx, namhyung@xxxxxxxxxx, mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx, jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx, adrian.hunter@xxxxxxxxx, kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx, brauner@xxxxxxxxxx, jack@xxxxxxx,
Liam.Howlett@xxxxxxxxxx, vbabka@xxxxxxx, rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx, mhocko@xxxxxxxx, rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx, mgorman@xxxxxxx, vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx, pfalcato@xxxxxxx, riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx, linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx, duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx, dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx, chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx, yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx, sunjiadong.lff@xxxxxxxxxxxxx
Subject: Re: [RFC v2 00/35] optimize cost of inter-process communication
Message-ID: <8c98c8e0-95e1-4292-8116-79d803962d5f@lucifer.local>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Content-Type: text/plain; charset=utf-8
Content-Disposition: inline
Content-Transfer-Encoding: 8bit
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
X-ClientProxiedBy: LO2P265CA0038.GBRP265.PROD.OUTLOOK.COM
(2603:10a6:600:61::26) To DM4PR10MB8218.namprd10.prod.outlook.com
(2603:10b6:8:1cc::16)
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
X-MS-PublicTrafficType: Email
X-MS-TrafficTypeDiagnostic: DM4PR10MB8218:EE_|IA4PR10MB8494:EE_
X-MS-Office365-Filtering-Correlation-Id: 9000b9bc-7768-43f0-cadb-08dd9f5d0ca9
X-MS-Exchange-SenderADCheck: 1
X-MS-Exchange-AntiSpam-Relay: 0
X-Microsoft-Antispam: BCL:0;ARA:13230040|1800799024|7416014|366016|376014;
X-Microsoft-Antispam-Message-Info:
=?utf-8?B?dFNDSDRHN0ZMdGMzb09wdWRxaDNIeExrMVRKbTFhNGxIUzdlZ3A2VWtrOUlo?=
=?utf-8?B?WU1OVkZNSWhGQUhaenNDSzFiYzJ5SVc0Y0o2dXg5TG9oSHFhZjc1bUJPY1Ns?=
=?utf-8?B?RUNrZzlNeWJnNzhqT3ZHT2l4cDcxNXBCMXJmUmp5bUJ6eGhvNHpTOUM5NDZn?=
=?utf-8?B?VTd5aWFxNlpRTzh1Tm81OVRhTDNGM3ZqY0F4cVRFM1ZFY3Z2WWYrQlhQY0N4?=
=?utf-8?B?K29tcHhaM1EzQk9jVzhXN1F4MkNpUk5GUERydE04Y3ltMjU5d0dHWUlUSkZN?=
=?utf-8?B?U2J2d3VCejJ6WHBTRkxDYjNsZnBvWXBOcksvbHBPcDZXcW9HbW9oalNKY0pR?=
=?utf-8?B?bE9WQ1lGM0ljR3FGR3RTRHgrdnYvU0l6dC9pd0doQyt6ZVRBdFhHVWhXZ0E4?=
=?utf-8?B?MHkwSTA4QzNDekdJNzRoTGJEYTVTUU10elBCWjJtVGdhVUVmWU9uSTlzeU10?=
=?utf-8?B?NHVKV2xlWHZxdVROSnVmRWptc0VEbUoxS0c5QWlxcENpdUptSTlmZmZBeFQ3?=
=?utf-8?B?dWxGVUNtM0l3ZXh0OTFXK2hSalRIdUxHNEQzM291ZUhxUWtUMDVtcHBKaVBz?=
=?utf-8?B?UEJ4Q2hlSFRCamNpY0xJV0U2M2dsQnM1U0ZubDVLekMvQmZuVzhtQ0FGOUp3?=
=?utf-8?B?dEUrL1Znc3NHMHVwdmtpaDdYSGo3ZmNZY0xTWkNpYUsvZ0hEMUxLb0VQTnFI?=
=?utf-8?B?NGZTREp1R3IxNG1yYlg3b2VEVjRaK2IyZVhIVVdtbXcwbW51akg2SFJUWGRJ?=
=?utf-8?B?cFhWTmFQbEhmVWpvZjA3L3JIcWl0KzdxYWxkaVVWYlV1Z0JPYWhnTDJYUlhs?=
=?utf-8?B?blRhMTNKY1RYNmt5aWtjWTlEQUlhT3p5bktnQ1JSRThzS3VDU01ubWtCZGl3?=
=?utf-8?B?ZVJVQUlxLy9WR2R3aUp2UXQ3MTBGeTUxYWhWRTVkREJ3VlJJSm0ydk1PMkRS?=
=?utf-8?B?KzZCRTFYYW5ocFBickd4ZTZ2N3E5YVNHcVYwRHYvZ0QwbXk0eGNkL0o4bXFX?=
=?utf-8?B?RFBEbzZraXUrMjlYV1hOb3EvQ1lCNHJjVzdUODFBSWVLVnZXUU1yWGhBL3c4?=
=?utf-8?B?OEdET1cwZ3hmQmM3NG9vbEVVMFoxMkNTVTVSbVpCTDZoVXBNd0g5ME13TWhO?=
=?utf-8?B?eHhZSTBGVVlsTzdBQThMdXo3cXdBVTQ3dzVQTVpBNkJRMGRHTWx2VEZJTmds?=
=?utf-8?B?ZXo2Z1FZMWNQU0wxaFkzR1lvR0lHUEdiejQ5SEFCTGtkek5vOUFOeEpnaTRx?=
=?utf-8?B?aU0wM0xrb2tucGtyOVIzVE1NVVE4QVNDY0U3S2VEUE1zL3V1N1lMQXI0UnJs?=
=?utf-8?B?RGFjY0Uzak9BZDBmOTFZZ2FsZjU5M2J4MEsyUkd4cHE3TEhNYnJoaUVrQmVN?=
=?utf-8?B?a09BYmFhcVF0MS8yMzNpQWFKVDRhZFBIQnVNRmp4Qkp3U0QrNUVDdlBMNkgr?=
=?utf-8?B?ZHgrWFNzdldzN3djZGtPVlNXOEJJT3EyeDJoUTlrcy9UUFhWcDh2c3pVR2gr?=
=?utf-8?B?cVVYNnNseU96b0JMNy9nc0JpVjNScXAxaDZIb2R1eFVwbW1kZlQvYlpHMmZ1?=
=?utf-8?B?bkt6blpuRkxXTXBxb0tRb0crVUYxbytPT1BsTWYyL0JkbGMwSVNtTXlISnVT?=
=?utf-8?B?aCsreFR4N2cvYkpMY0FscW9FcHJHTjVmMXhjVVQxS2lnNER3NzZmcng3Vjh3?=
=?utf-8?B?ZXNiV0J2c1RLZDVBT0lqdy8xd2pqeFhvaE9UNHBFSmY0dGRhQXdETElrN0U2?=
=?utf-8?B?M3ZpTnBYOHZnZDUzTzh3b1VHeWQvWEZsRU4wRk45ZE5XYkUyRGZQZGNQamRF?=
=?utf-8?B?MG9GakFBWVhSQXM2SUFjUzViVEw1QkQvanVTcDgvMkhuYTU2T3lSc05OTmE5?=
=?utf-8?B?TFRVeUFqWCtpc3BRdHNGdnpwZ0JNdXNxeXRnOFdRSHlpaWQxRGU5MTNpbU5W?=
=?utf-8?Q?nl3ISw/dk84=3D?=
X-Forefront-Antispam-Report:
CIP:255.255.255.255;CTRY:;LANG:en;SCL:1;SRV:;IPV:NLI;SFV:NSPM;H:DM4PR10MB8218.namprd10.prod.outlook.com;PTR:;CAT:NONE;SFS:(13230040)(1800799024)(7416014)(366016)(376014);DIR:OUT;SFP:1101;
X-MS-Exchange-AntiSpam-MessageData-ChunkCount: 1
X-MS-Exchange-AntiSpam-MessageData-0:
=?utf-8?B?OG54bFFoTklaQjV5ZllQT3piRTBGU0VjalVqNmxIRlh0aVFoREl1dkt5Wit6?=
=?utf-8?B?bzJ0S0xVS3ppbE9sUGhWcXpWUmVjbnBFdzlZUzQ2YThnL3N0aHQrQS9Rc0RD?=
=?utf-8?B?RG9EQ2M0cFJaZHJ2K1FqbEp5SnNpbW9LRVlXb0dDV1kxV2RnYy9XUG1EWVIv?=
=?utf-8?B?SFdMdUxKWElBZ3cxaXg0T3NzcUJXTTFTbUs0L2VtcFRhOWFQSyt6SnJEc2tE?=
=?utf-8?B?QWxweE1xSXFLbjBIQTlkTkErWlQ0ZXVKK0VHYXBLQTlOZWVSTkJ0ZTd5SEdo?=
=?utf-8?B?OUJad2lFREY4T0FxOG80ZGZFNUxFZm1oeFA3TFdweENFREtsYm1lWWZNK3Zi?=
=?utf-8?B?bzdVV0hwU3NsWjNRTk53SDA1ZFU0VlpMNnc4RXo3dmFzdFFVQkh4ZUU5VEdY?=
=?utf-8?B?SEJmejBreCtRWDlrOHhraHVvUTBMbU5YSHBqeU1kZVowR1ZnckgrSklackdn?=
=?utf-8?B?Q1RWODB0TGJPOUY2blo5SEk3Qm91eDY2aGdSMjRlcXdXS1lXUWNHeCtVK243?=
=?utf-8?B?dzJZOTM2ODVZbEpRMktQeVAwblgrQklXWGppekhuY1hrcGk3WFhsQnVOWXVN?=
=?utf-8?B?djl3YnZETHA3RGZwTzA3VVlCWlRoN29ueDBjd0d4YVRMbW04bVA3dEtkdndJ?=
=?utf-8?B?R2FYUUpRMHd1dnlyU3MzSVFiaXFPUmhIdlRwaWpiWHlvUzNNSnc3K0ZhK3Nl?=
=?utf-8?B?bEN3d3RYVlJVTDlPK1FTdEN1WTNQaGVJRHgyNzlzOGtwa3Y4ZVBIaFVHNTY1?=
=?utf-8?B?YmFJRnErVjNHaTJHYXZCOVpOWFpzRnZCV3doamF6Q094MmNEODJYT0F6ZHBW?=
=?utf-8?B?YWVXVWhYSGN6ZU9LV0NxZkFlZWlYbWRaVGZqZ0pHaFRhYTkzZkVERzd0V1gy?=
=?utf-8?B?eW9taExYdzdJNjZmZ0Erbzcva0N0ajEyd2ZTSGR6WUJKK1dwcmY5cStzcTlB?=
=?utf-8?B?RzJVQlNhMXZpSVREdWhSL1IrUk81UTg4YWI3U1g0M3NwZ2g1ZkNLTUhFNlFC?=
=?utf-8?B?ZE8wZU5NYURPd04vSUhybkxDTFFaamp6QVV5WVFLUGcweVBGRVBIQWJvWVpG?=
=?utf-8?B?b0UwZUNhV2lZNnNaendLamlHUTZrcmFEU082R1pKSXE4bUVjdWV5RnEwYnFO?=
=?utf-8?B?NVVLTENNNTVJSDk2VGZvUXE2aXFHWXAvSFA1NTVsWDRtbHFOTjdqOThVTmFj?=
=?utf-8?B?VWxJZUtXVVdIcEZBV0lZS0RCN0s1emtWZXFFVWVDaC9FMitzNlZwQVFISEN3?=
=?utf-8?B?ZW54aU1TM2hHWFl3SmpUdzgxNCtCSkNhQURhS1pUZmUvTHQybUlubC9xbUh4?=
=?utf-8?B?T0dhZTRQVTNxNFkwMWc5R2doR0h5enppWC9MRFA4M0o2THVvYS9sZkl0UDhh?=
=?utf-8?B?dGs5SHVjbS8wS0RDYzAvMHlmYkFyd3pSOE5KaldCTEloYkJheCsxRjlpWGdq?=
=?utf-8?B?dFpVZ002c1VDQWsrRkoyblR5cWZic095QVZOYmNlWHBhcVB3QlgzbXMzRmsr?=
=?utf-8?B?U0xFQkxMdllUcGVTaTYxbFh5UHErdDFtSUdqSE5tY0tPakZBUW9jdStPVmsw?=
=?utf-8?B?aS82QTUzSWxSREkyUDFmd3JUSFBNcGwxYzZWaTY0UjVBaGk5eVBFdVBLRWoy?=
=?utf-8?B?dys2TExUVS9FZjdtamE4OW1oTE12SDExK2VVRW5WNmNqM1YyVTNWcXJkMlRH?=
=?utf-8?B?VDNhdkdTUWRBZEh5Qys2S0VFVE93YlBNUUtBUTJkRmdmZzNmUlFKMmRISnpQ?=
=?utf-8?B?RjJYeEkzb0pmL3MwK0k5S3lENnhQcjl5TCsrR2U3RDcrTGVJTU5rODFlMU54?=
=?utf-8?B?VlJoSURtbDkrbUxjYmF2QnhQZ2RLZGZXRjZ3V2dOVnJocmxjVWwzWDUzQkVU?=
=?utf-8?B?Z1cwREY5eXlCWWN5YW5aLzBOdWc2ZmZRWjBtVnJSVHVWeXlmNS9GUXRKLzNR?=
=?utf-8?B?ZkVwMXJhcDdzT3huUUZ0L29WbVcxS0ovV2pidnFQeVlyc3lPMXJJWFlVeTE4?=
=?utf-8?B?V2hXc0t6SjQ0bTdqU0NzVVNPbGxXTWxLTUcvWXYvbkdMRTNSTk8wazJ0T1FF?=
=?utf-8?B?TDByQXkzYVlnUGhwUFNwRzZFS0diVkZxUVlyUXBhYTBZZzdyMEJGaXNaTHJ5?=
=?utf-8?B?Uis5SjQxS2Z5Q25MdnpXTG5WQzl4MDI5YStFZUVybDhlU1pLN0NIUUZUeDZB?=
=?utf-8?B?Qmc9PQ==?=
X-MS-Exchange-AntiSpam-ExternalHop-MessageData-ChunkCount: 1
X-MS-Exchange-AntiSpam-ExternalHop-MessageData-0:
lGEHurlHXFB5DzfCZnMKM1CCLaGCI9/NKfmTbvEWD51UCDC7viRwo17edsJbqE6FmwBfFdkluFS0m+EDIOtn07U+/1bFbjS2EY8TSUxs5CD7lER+L1CX+9fLtl893nm5HOb9+Xk8GJ1vg27UKIG5MYPCoKY3Lo3EN3ZeEkSusLEv1a0ADHALmpQlIxgNIdQlGMGsTGhBryAP3jfecOmz223mFI5tigcRLxRWiigsSmzK3g1DiQ/LOsj14enIs19B20jTtZw+kcCIkX34vjnFiuHT6GsU7Je23xQ6qWjOSmYBp0HIVc+OitWL/LtdzqEjga9WN1g8k6LiEk+SSPv+fQKCk7cD2NXxDooFm+RL9P01Jfrw4F7umIrDumulbsy2ZnhSHxBw1kVbNHiE9cO3Qbr9j0r6U8j0dLxcT5vbbCnI1N3adMBUaJNWd/aAX2mkvej9O2x6Lk0XaAhDIjEc0FPd6IWdMjMMybKDpbeqgpR1xeev7sZtcHqeNhi6lsQ2qNpEL8OijNQ+oDB6TNN7fD9ep0MT1pHEURUKChIh/0Cg1W00dpEFNNDZJgSgrGEIyWnFLXFuzQJDjZzxoTBM0e/4x0CYd02mh2VuoO26fmc=
X-OriginatorOrg: oracle.com
X-MS-Exchange-CrossTenant-Network-Message-Id: 9000b9bc-7768-43f0-cadb-08dd9f5d0ca9
X-MS-Exchange-CrossTenant-AuthSource: DM4PR10MB8218.namprd10.prod.outlook.com
X-MS-Exchange-CrossTenant-AuthAs: Internal
X-MS-Exchange-CrossTenant-OriginalArrivalTime: 30 May 2025 09:33:35.1538
(UTC)
X-MS-Exchange-CrossTenant-FromEntityHeader: Hosted
X-MS-Exchange-CrossTenant-Id: 4e2c6054-71cb-48f1-bd6c-3a9705aca71b
X-MS-Exchange-CrossTenant-MailboxType: HOSTED
X-MS-Exchange-CrossTenant-UserPrincipalName: /skI3LRfsWC29SNRw4j9pVzaOk2MN80XYg02x+NhNvYbRdB7plAg3T1rkvKWnZwIG4TF/70TQf7mUG91hdjygV/F1tiQS/1JXfZK7QT3j1Y=
X-MS-Exchange-Transport-CrossTenantHeadersStamped: IA4PR10MB8494
X-Proofpoint-Virus-Version: vendor=baseguard
engine=ICAP:2.0.293,Aquarius:18.0.1099,Hydra:6.0.736,FMLib:17.12.80.40
definitions=2025-05-30_04,2025-05-29_01,2025-03-28_01
X-Proofpoint-Spam-Details: rule=notspam policy=default score=0 suspectscore=0 mlxlogscore=999
phishscore=0 malwarescore=0 bulkscore=0 adultscore=0 spamscore=0
mlxscore=0 classifier=spam adjust=0 reason=mlx scancount=1
engine=8.12.0-2505160000 definitions=main-2505300081
X-Proofpoint-GUID: _xMH1j0yD73JZBr2tkXXeAX7gc_23PBW
X-Proofpoint-ORIG-GUID: _xMH1j0yD73JZBr2tkXXeAX7gc_23PBW
X-Proofpoint-Spam-Details-Enc: AW1haW4tMjUwNTMwMDA4MSBTYWx0ZWRfX163T0WpfpINj CzmSwQh6lO7xeCkXFYq1SkD8/cn25mbtmBYDqyFUaeY8hGCtBRpk8L5rut+BFpQ9LAba0PkZ1WW I/0qcAiCbk8kuCijhxTme7DhsFmNmAljqdS6Rzs94cauvLKmAkO5guus1RqzJeNCaKsf4W/2DYt
STgvuzzYPHQ5sCCa/Jd29IpzfUQ2qYABWRsGXy7G/iRtM32caeba4jW/4J9NgRL52NcuQGWQft7 XEXY8OrN7ptMg75A0Ak0EG+8kslAs15sxSbGBbTnnn7eXbaioKESgPuIPzhbIJkMnw4k4Z8iSve xPhYBJw/maFjGmLYlMgWy9uo7hVyJV/f0gcbZhFuYDreNZgXh2iikpahn7yJPC+Mms7wHp0qMCw
ag/CBBRy6oumL7UlWGaQrk2tp7j9kTxb9l7zffz1EsZF7Qz4QLpyNaNtigkHm+U9gSIb43+c
X-Authority-Analysis: v=2.4 cv=c8qrQQ9l c=1 sm=1 tr=0 ts=68397b74 cx=c_pps a=OOZaFjgC48PWsiFpTAqLcw==:117 a=OOZaFjgC48PWsiFpTAqLcw==:17 a=6eWqkTHjU83fiwn7nKZWdM+Sl24=:19 a=lCpzRmAYbLLaTzLvsPZ7Mbvzbb8=:19 a=wKuvFiaSGQ0qltdbU6+NXLB8nM8=:19
a=Ol13hO9ccFRV9qXi2t6ftBPywas=:19 a=xqWC_Br6kY4A:10 a=IkcTkHD0fZMA:10 a=dt9VzEwgFbYA:10 a=GoEa3M9JfhUA:10 a=VwQbUJbxAAAA:8 a=pGLkceISAAAA:8 a=ZQb2-ej06_51c_fjXvYA:9 a=8xWE5lwX36eHxTgC:21 a=3ZKOabzyN94A:10 a=QEXdDO2ut3YA:10
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

Bo,

You have outstanding feedback on your v1 from me and Dave Hansen. I'm not
quite sure why you're sending a v2 without responding to that.

This isn't how the upstream kernel works...

Thanks, Lorenzo

On Fri, May 30, 2025 at 05:27:28PM +0800, Bo Li wrote:
> Changelog:
>
> v2:
> - Port the RPAL functions to the latest v6.15 kernel.
> - Add a supplementary introduction to the application scenarios and
> security considerations of RPAL.
>
> link to v1:
> https://lore.kernel.org/lkml/CAP2HCOmAkRVTci0ObtyW=3v6GFOrt9zCn2NwLUbZ+Di49xkBiw@xxxxxxxxxxxxxx/
>
> --------------------------------------------------------------------------
>
> # Introduction
>
> We mainly apply RPAL to the service mesh architecture widely adopted in
> modern cloud-native data centers. Before the rise of the service mesh
> architecture, network functions were usually integrated into monolithic
> applications as libraries, and the main business programs invoked them
> through function calls. However, to facilitate the independent development
> and operation and maintenance of the main business programs and network
> functions, the service mesh removed the network functions from the main
> business programs and made them independent processes (called sidecars).
> Inter-process communication (IPC) is used for interaction between the main
> business program and the sidecar, and the introduced inter-process
> communication has led to a sharp increase in resource consumption in
> cloud-native data centers, and may even occupy more than 10% of the CPU of
> the entire microservice cluster.
>
> To achieve the efficient function call mechanism of the monolithic
> architecture under the service mesh architecture, we introduced the RPAL
> (Running Process As Library) architecture, which implements the sharing of
> the virtual address space of processes and the switching threads in user
> mode. Through the analysis of the service mesh architecture, we found that
> the process memory isolation between the main business program and the
> sidecar is not particularly important because they are split from one
> application and were an integral part of the original monolithic
> application. It is more important for the two processes to be independent
> of each other because they need to be independently developed and
> maintained to ensure the architectural advantages of the service mesh.
> Therefore, RPAL breaks the isolation between processes while preserving the
> independence between them. We think that RPAL can also be applied to other
> scenarios featuring sidecar-like architectures, such as distributed file
> storage systems in LLM infra.
>
> In RPAL architecture, multiple processes share a virtual address space, so
> this architecture can be regarded as an advanced version of the Linux
> shared memory mechanism:
>
> 1. Traditional shared memory requires two processes to negotiate to ensure
> the mapping of the same piece of memory. In RPAL architecture, two RPAL
> processes still need to reach a consensus before they can successfully
> invoke the relevant system calls of RPAL to share the virtual address
> space.
> 2. Traditional shared memory only shares part of the data. However, in RPAL
> architecture, processes that have established an RPAL communication
> relationship share a virtual address space, and all user memory (such as
> data segments and code segments) of each RPAL process is shared among these
> processes. However, a process cannot access the memory of other processes
> at any time. We use the MPK mechanism to ensure that the memory of other
> processes can only be accessed when special RPAL functions are called.
> Otherwise, a page fault will be triggered.
> 3. In RPAL architecture, to ensure the consistency of the execution context
> of the shared code (such as the stack and thread local storage), we further
> implement the thread context switching in user mode based on the ability to
> share the virtual address space of different processes, enabling the
> threads of different processes to directly perform fast switching in user
> mode without falling into kernel mode for slow switching.
>
> # Background
>
> In traditional inter-process communication (IPC) scenarios, Unix domain
> sockets are commonly used in conjunction with the epoll() family for event
> multiplexing. IPC operations involve system calls on both the data and
> control planes, thereby imposing a non-trivial overhead on the interacting
> processes. Even when shared memory is employed to optimize the data plane,
> two data copies still remain. Specifically, data is initially copied from
> a process's private memory space into the shared memory area, and then it
> is copied from the shared memory into the private memory of another
> process.
>
> This poses a question: Is it possible to reduce the overhead of IPC with
> only minimal modifications at the application level? To address this, we
> observed that the functionality of IPC, which encompasses data transfer
> and invocation of the target thread, is similar to a function call, where
> arguments are passed and the callee function is invoked to process them.
> Inspired by this analogy, we introduce RPAL (Run Process As Library), a
> framework designed to enable one process to invoke another as if making
> a local function call, all without going through the kernel.
>
> # Design
>
> First, let's formalize RPAL's core objectives:
>
> 1. Data-plane efficiency: Reduce the number of data copies from two (in the
> shared memory solution) to one.
> 2. Control-plane optimization: Eliminate the overhead of system calls and
> kernel's thread switches.
> 3. Application compatibility: Minimize the modifications to existing
> applications that utilize Unix domain sockets and the epoll() family.
>
> To attain the first objective, processes that use RPAL share the same
> virtual address space. So one process can access another's data directly
> via a data pointer. This means data can be transferred from one process to
> another with just one copy operation.
>
> To meet the second goal, RPAL relies on the shared address space to do
> lightweight context switching in user space, which we call an "RPAL call".
> This allows one process to execute another process's code just like a
> local function call.
>
> To achieve the third target, RPAL stays compatible with the epoll family
> of functions, like epoll_create(), epoll_wait(), and epoll_ctl(). If an
> application uses epoll for IPC, developers can switch to RPAL with just a
> few small changes. For instance, you can just replace epoll_wait() with
> rpal_epoll_wait(). The basic epoll procedure, where a process waits for
> another to write to a monitored descriptor using an epoll file descriptor,
> still works fine with RPAL.
>
> ## Address space sharing
>
> For address space sharing, RPAL partitions the entire userspace virtual
> address space and allocates non-overlapping memory ranges to each process.
> On x86_64 architectures, RPAL uses a memory range size covered by a
> single PUD (Page Upper Directory) entry, which is 512GB. This restricts
> each process's virtual address space to 512GB on x86_64, sufficient for
> most applications in our scenario. The rationale is straightforward:
> address space sharing can be simply achieved by copying the PUD from one
> process's page table to another's. So one process can directly use the
> data pointer to access another's memory.
>
>
> |------------| <- 0
> |------------| <- 512 GB
> | Process A |
> |------------| <- 2*512 GB
> |------------| <- n*512 GB
> | Process B |
> |------------| <- (n+1)*512 GB
> |------------| <- STACK_TOP
> | Kernel |
> |------------|
>
> ## RPAL call
>
> We refer to the lightweight userspace context switching mechanism as RPAL
> call. It enables the caller (or sender) thread of one process to directly
> switch to the callee (or receiver) thread of another process.
>
> When Process A's caller thread initiates an RPAL call to Process B's
> callee thread, the CPU saves the caller's context and loads the callee's
> context. This enables direct userspace control flow transfer from the
> caller to the callee. After the callee finishes data processing, the CPU
> saves Process B's callee context and switches back to Process A's caller
> context, completing a full IPC cycle.
>
>
> |------------| |---------------------|
> | Process A | | Process B |
> | |-------| | | |-------| |
> | | caller| --- RPAL call --> | | callee| handle |
> | | thread| <------------------ | thread| -> event |
> | |-------| | | |-------| |
> |------------| |---------------------|
>
> # Security and compatibility with kernel subsystems
>
> ## Memory protection between processes
>
> Since processes using RPAL share the address space, unintended
> cross-process memory access may occur and corrupt the data of another
> process. To mitigate this, we leverage Memory Protection Keys (MPK) on x86
> architectures.
>
> MPK assigns 4 bits in each page table entry to a "protection key", which
> is paired with a userspace register (PKRU). The PKRU register defines
> access permissions for memory regions protected by specific keys (for
> detailed implementation, refer to the kernel documentation "Memory
> Protection Keys"). With MPK, even though the address space is shared
> among processes, cross-process access is restricted: a process can only
> access the memory protected by a key if its PKRU register is configured
> with the corresponding permission. This ensures that processes cannot
> access each other's memory unless an explicit PKRU configuration is set.
>
> ## Page fault handling and TLB flushing
>
> Due to the shared address space architecture, both page fault handling and
> TLB flushing require careful consideration. For instance, when Process A
> accesses Process B's memory, a page fault may occur in Process A's
> context, but the faulting address belongs to Process B. In this case, we
> must pass Process B's mm_struct to the page fault handler.
>
> TLB flushing is more complex. When a thread flushes the TLB, since the
> address space is shared, not only other threads in the current process but
> also other processes that share the address space may access the
> corresponding memory (related to the TLB flush). Therefore, the cpuset used
> for TLB flushing should be the union of the mm_cpumasks of all processes
> that share the address space.
>
> ## Lazy switch of kernel context
>
> In RPAL, a mismatch may arise between the user context and the kernel
> context. The RPAL call is designed solely to switch the user context,
> leaving the kernel context unchanged. For instance, when a RPAL call takes
> place, transitioning from caller thread to callee thread, and subsequently
> a system call is initiated within callee thread, the kernel will
> incorrectly utilize the caller's kernel context (such as the kernel stack)
> to process the system call.
>
> To resolve context mismatch issues, a kernel context switch is triggered at
> the kernel entry point when the callee initiates a syscall or an
> exception/interrupt occurs. This mechanism ensures context consistency
> before processing system calls, interrupts, or exceptions. We refer to this
> kernel context switch as a "lazy switch" because it defers the switching
> operation from the traditional thread switch point to the next kernel entry
> point.
>
> Lazy switch should be minimized as much as possible, as it significantly
> degrades performance. We currently utilize RPAL in an RPC framework, in
> which the RPC sender thread relies on the RPAL call to invoke the RPC
> receiver thread entirely in user space. In most cases, the receiver
> thread is free of system calls and the code execution time is relatively
> short. This characteristic effectively reduces the probability of a lazy
> switch occurring.
>
> ## Time slice correction
>
> After an RPAL call, the callee's user mode code executes. However, the
> kernel incorrectly attributes this CPU time to the caller due to the
> unchanged kernel context.
>
> To resolve this, we use the Time Stamp Counter (TSC) register to measure
> CPU time consumed by the callee thread in user space. The kernel then uses
> this user-reported timing data to adjust the CPU accounting for both the
> caller and callee thread, similar to how CPU steal time is implemented.
>
> ## Process recovery
>
> Since processes can access each other's memory, there is a risk that the
> target process's memory may become invalid at the access time (e.g., if
> the target process has exited unexpectedly). The kernel must handle such
> cases; otherwise, the accessing process could be terminated due to
> failures originating from another process.
>
> To address this issue, each thread of the process should pre-establish a
> recovery point when accessing the memory of other processes. When such an
> invalid access occurs, the thread traps into the kernel. Inside the page
> fault handler, the kernel restores the user context of the thread to the
> recovery point. This mechanism ensures that processes maintain mutual
> independence, preventing cascading failures caused by cross-process memory
> issues.
>
> # Performance
>
> To quantify the performance improvements driven by RPAL, we measured
> latency both before and after its deployment. Experiments were conducted on
> a server equipped with two Intel(R) Xeon(R) Platinum 8336C CPUs (2.30 GHz)
> and 1 TB of memory. Latency was defined as the duration from when the
> client thread initiates a message to when the server thread is invoked and
> receives it.
>
> During testing, the client transmitted 1 million 32-byte messages, and we
> computed the per-message average latency. The results are as follows:
>
> *****************
> Without RPAL: Message length: 32 bytes, Total TSC cycles: 19616222534,
> Message count: 1000000, Average latency: 19616 cycles
> With RPAL: Message length: 32 bytes, Total TSC cycles: 1703459326,
> Message count: 1000000, Average latency: 1703 cycles
> *****************
>
> These results confirm that RPAL delivers substantial latency improvements
> over the current epoll implementation—achieving a 17,913-cycle reduction
> (an ~91.3% improvement) for 32-byte messages.
>
> We have applied RPAL to an RPC framework that is widely used in our data
> center. With RPAL, we have successfully achieved up to 15.5% reduction in
> the CPU utilization of processes in real-world microservice scenario. The
> gains primarily stem from minimizing control plane overhead through the
> utilization of userspace context switches. Additionally, by leveraging
> address space sharing, the number of memory copies is significantly
> reduced.
>
> # Future Work
>
> Currently, RPAL requires the MPK (Memory Protection Key) hardware feature,
> which is supported by a range of Intel CPUs. For AMD architectures, MPK is
> supported only on the latest processor, specifically, 3rd Generation AMD
> EPYC™ Processors and subsequent generations. Patch sets that extend RPAL
> support to systems lacking MPK hardware will be provided later.
>
> Accompanying test programs are also provided in the samples/rpal/
> directory. And the user-mode RPAL library, which realizes user-space RPAL
> call, is in the samples/rpal/librpal directory.
>
> We hope to get some community discussions and feedback on RPAL's
> optimization approaches and architecture.
>
> Look forward to your comments.
>
> Bo Li (35):
> Kbuild: rpal support
> RPAL: add struct rpal_service
> RPAL: add service registration interface
> RPAL: add member to task_struct and mm_struct
> RPAL: enable virtual address space partitions
> RPAL: add user interface
> RPAL: enable shared page mmap
> RPAL: enable sender/receiver registration
> RPAL: enable address space sharing
> RPAL: allow service enable/disable
> RPAL: add service request/release
> RPAL: enable service disable notification
> RPAL: add tlb flushing support
> RPAL: enable page fault handling
> RPAL: add sender/receiver state
> RPAL: add cpu lock interface
> RPAL: add a mapping between fsbase and tasks
> sched: pick a specified task
> RPAL: add lazy switch main logic
> RPAL: add rpal_ret_from_lazy_switch
> RPAL: add kernel entry handling for lazy switch
> RPAL: rebuild receiver state
> RPAL: resume cpumask when fork
> RPAL: critical section optimization
> RPAL: add MPK initialization and interface
> RPAL: enable MPK support
> RPAL: add epoll support
> RPAL: add rpal_uds_fdmap() support
> RPAL: fix race condition in pkru update
> RPAL: fix pkru setup when fork
> RPAL: add receiver waker
> RPAL: fix unknown nmi on AMD CPU
> RPAL: enable time slice correction
> RPAL: enable fast epoll wait
> samples/rpal: add RPAL samples
>
> arch/x86/Kbuild | 2 +
> arch/x86/Kconfig | 2 +
> arch/x86/entry/entry_64.S | 160 ++
> arch/x86/events/amd/core.c | 14 +
> arch/x86/include/asm/pgtable.h | 25 +
> arch/x86/include/asm/pgtable_types.h | 11 +
> arch/x86/include/asm/tlbflush.h | 10 +
> arch/x86/kernel/asm-offsets.c | 3 +
> arch/x86/kernel/cpu/common.c | 8 +-
> arch/x86/kernel/fpu/core.c | 8 +-
> arch/x86/kernel/nmi.c | 20 +
> arch/x86/kernel/process.c | 25 +-
> arch/x86/kernel/process_64.c | 118 +
> arch/x86/mm/fault.c | 271 ++
> arch/x86/mm/mmap.c | 10 +
> arch/x86/mm/tlb.c | 172 ++
> arch/x86/rpal/Kconfig | 21 +
> arch/x86/rpal/Makefile | 6 +
> arch/x86/rpal/core.c | 477 ++++
> arch/x86/rpal/internal.h | 69 +
> arch/x86/rpal/mm.c | 426 +++
> arch/x86/rpal/pku.c | 196 ++
> arch/x86/rpal/proc.c | 279 ++
> arch/x86/rpal/service.c | 776 ++++++
> arch/x86/rpal/thread.c | 313 +++
> fs/binfmt_elf.c | 98 +-
> fs/eventpoll.c | 320 +++
> fs/exec.c | 11 +
> include/linux/mm_types.h | 3 +
> include/linux/rpal.h | 633 +++++
> include/linux/sched.h | 21 +
> init/init_task.c | 6 +
> kernel/exit.c | 5 +
> kernel/fork.c | 32 +
> kernel/sched/core.c | 676 +++++
> kernel/sched/fair.c | 109 +
> kernel/sched/sched.h | 8 +
> mm/mmap.c | 16 +
> mm/mprotect.c | 106 +
> mm/rmap.c | 4 +
> mm/vma.c | 18 +
> samples/rpal/Makefile | 17 +
> samples/rpal/asm_define.c | 14 +
> samples/rpal/client.c | 178 ++
> samples/rpal/librpal/asm_define.h | 6 +
> samples/rpal/librpal/asm_x86_64_rpal_call.S | 57 +
> samples/rpal/librpal/debug.h | 12 +
> samples/rpal/librpal/fiber.c | 119 +
> samples/rpal/librpal/fiber.h | 64 +
> .../rpal/librpal/jump_x86_64_sysv_elf_gas.S | 81 +
> .../rpal/librpal/make_x86_64_sysv_elf_gas.S | 82 +
> .../rpal/librpal/ontop_x86_64_sysv_elf_gas.S | 84 +
> samples/rpal/librpal/private.h | 341 +++
> samples/rpal/librpal/rpal.c | 2351 +++++++++++++++++
> samples/rpal/librpal/rpal.h | 149 ++
> samples/rpal/librpal/rpal_pkru.h | 78 +
> samples/rpal/librpal/rpal_queue.c | 239 ++
> samples/rpal/librpal/rpal_queue.h | 55 +
> samples/rpal/librpal/rpal_x86_64_call_ret.S | 45 +
> samples/rpal/offset.sh | 5 +
> samples/rpal/server.c | 249 ++
> 61 files changed, 9710 insertions(+), 4 deletions(-)
> create mode 100644 arch/x86/rpal/Kconfig
> create mode 100644 arch/x86/rpal/Makefile
> create mode 100644 arch/x86/rpal/core.c
> create mode 100644 arch/x86/rpal/internal.h
> create mode 100644 arch/x86/rpal/mm.c
> create mode 100644 arch/x86/rpal/pku.c
> create mode 100644 arch/x86/rpal/proc.c
> create mode 100644 arch/x86/rpal/service.c
> create mode 100644 arch/x86/rpal/thread.c
> create mode 100644 include/linux/rpal.h
> create mode 100644 samples/rpal/Makefile
> create mode 100644 samples/rpal/asm_define.c
> create mode 100644 samples/rpal/client.c
> create mode 100644 samples/rpal/librpal/asm_define.h
> create mode 100644 samples/rpal/librpal/asm_x86_64_rpal_call.S
> create mode 100644 samples/rpal/librpal/debug.h
> create mode 100644 samples/rpal/librpal/fiber.c
> create mode 100644 samples/rpal/librpal/fiber.h
> create mode 100644 samples/rpal/librpal/jump_x86_64_sysv_elf_gas.S
> create mode 100644 samples/rpal/librpal/make_x86_64_sysv_elf_gas.S
> create mode 100644 samples/rpal/librpal/ontop_x86_64_sysv_elf_gas.S
> create mode 100644 samples/rpal/librpal/private.h
> create mode 100644 samples/rpal/librpal/rpal.c
> create mode 100644 samples/rpal/librpal/rpal.h
> create mode 100644 samples/rpal/librpal/rpal_pkru.h
> create mode 100644 samples/rpal/librpal/rpal_queue.c
> create mode 100644 samples/rpal/librpal/rpal_queue.h
> create mode 100644 samples/rpal/librpal/rpal_x86_64_call_ret.S
> create mode 100755 samples/rpal/offset.sh
> create mode 100644 samples/rpal/server.c
>
> --
> 2.20.1
>

Return-Path: <linux-kernel+bounces-667899-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from sy.mirrors.kernel.org (sy.mirrors.kernel.org [147.75.48.161])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 0EC5041E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:40:43 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by sy.mirrors.kernel.org (Postfix) with ESMTPS id A40C97B0E55
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:39:24 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 453AD22B8BD;
Fri, 30 May 2025 09:35:25 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="f0RARZhZ"
Received: from mail-pg1-f182.google.com (mail-pg1-f182.google.com [209.85.215.182])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 505D022B8B3
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:35:22 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.215.182
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597724; cv=none; b=g2wx+7NJWmXj8c+hum1IfHHekJsPDjG6uYzbg1ygp/cqEzw2PWNqjSqF39/3lUi+LIlq2ey326z1Y/KOlB3aVqBdHdjeC5XADo47VvxUTmhgCLj84p8MlaFnx7RLWimwWHngkecC3bLUJvbFre6PQVlINOKZnG+qI6+WfF5Eb+g=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597724; c=relaxed/simple;
bh=9CHW/F/J9wY0zWhmMIE46/gcJf++4JvdpcdoqukKOXQ=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=qfsCpBvWhh6/DfbeAGCvu29U4M2+DU13I4dLl2+zWEYvFGFUbs2Cy41tWhJiqp8OVS9vYqDTvcobr6ko4faaoXJl3SQXU8alqmxRHBsVTqjp+S9JB8sOUYIRpubFzemq28cL5Bq9oSuHoGXmAfLZFmbETQ0k0uMIyLns7QwXIsg=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=f0RARZhZ; arc=none smtp.client-ip=209.85.215.182
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pg1-f182.google.com with SMTP id 41be03b00d2f7-b200047a6a5so2491168a12.0
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:35:22 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597722; x=1749202522; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=VFn4X8pNflxKPFbQMCWiED+xW2QzJgp0gVRveKZQMVQ=;
b=f0RARZhZLjfW/aWmcoFtrJUt9YE2tMtU6hW+dQbnlucaiQQxvRKFiRnO3h/iMy/s2v
omUDtrkqOEt/ktuvHS7n8o/6kqJxThZBTjaanLdHcfE1r68tQrAeOHxsfRfLW3NJ1Hyd
nd/Aive6GBoywupUoDPQdMrRdo+l8n/7/P60qKSKROcXtd+U80e0W3nNU68j1qoyhmnr
mn1MiVkLYfoxSFIz6NDcTU0XKh6O44BoWmqcKTXh53e35VdFfXUyYkby9sBRdZExDvD8
3ou16CG9Vt4ycR+xYcIbyYYpe5bEbsuJZozsHxU4FYXZ5gsNlVcyG2FwKzZAZMjxVrhE
iPLA==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597722; x=1749202522;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=VFn4X8pNflxKPFbQMCWiED+xW2QzJgp0gVRveKZQMVQ=;
b=cO7B5OrVRnkB+KFchpungkFJEglXAMpGrp7TP0FeoyU2vJmRzsoY8btgZ2cF/Skw4K
gepzHQ75c0HresKK/CbQDTpZKOUuM8x8c8L/bT6TLcTK1BP4Pm0e21lg6a/xWUTt8bEy
r6b55Rx3G0pBZqB9lP8cwT4odUCzbloslt5uuSwrImMS4lk8tl9FQv5rctvjIa6/nAEK
dB1RFbUsMcuvai3fBZPlTnihNXQF0Eea4gCtfRCeUrcrnNzOAbERSgwmOMXPMG2MK6O9
Cw6TqoW83DPCpjbIj8JcJ+7yr3xtdYDVSTFJr0iZaNBm7/aRmoBqL85TSjCeml1Qokk/
MgQA==
X-Forwarded-Encrypted: i=1; AJvYcCX4Y0vjP5TvUhZrCpgCA3TpvsuVUivLw7180O2nesaZjhbhiJjkSGp3do46DrxNH3KSLowvPA8sYm5iHQc=@vger.kernel.org
X-Gm-Message-State: AOJu0YwB2TpaTfjPweQXQxiPny7J5WIpAbRu5Ndq06J4peolK3pcZk7u
fjdf2hfRqu5kXD5uEqYVZ9xoNLchdny1oCHfC2vVIsd1MN7zvkRmIN1dG5AjvUnKlfg=
X-Gm-Gg: ASbGncuZStyn61YIL6ziFiimPGAr4dQp4FlbSmtx/yrP8NZQ4TntrCsAWgAspMRzQLm
cTIDACCjcE5FgsfyGJSRUIjTHU8LVvrWy1J6rpuXlE0QRtlHjPRJluYEfJ+jc3AT9oAmDwjycgV
TkDyhfnwxtjuhg3+EC8p+k04/vzkLram927tQyj8MdkcR31NdHWsk+/vLQGjjmVL/gqLhFGWdL5
ZH+hvqJ53tNjK5+rRB27n+NBAwFIdhufg+VQFPnk0AmHXuAfu/JdK43NMbvnExYXHWNaVpUoRQQ
q+oNckFyY9hQo/oKJp2U/PK9SY4cTO5WTYNOEQdMGLw6AKDBymdCR9gMJoyqLHkdQx0PO4+rNDk
Fs7WC4BrKdy8IjnXPSUoc
X-Google-Smtp-Source: AGHT+IH46+bZCrX7K5xEEg6cSKu1TR1I7SkSliOWsHgcBqoA6CG8wyIvf12Zvpihk7VZmfu1IuwYXQ==
X-Received: by 2002:a17:90b:164b:b0:310:8d4a:4a97 with SMTP id 98e67ed59e1d1-31214ee68f2mr10223135a91.15.1748597721380;
Fri, 30 May 2025 02:35:21 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.35.06
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:35:21 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 27/35] RPAL: add epoll support
Date: Fri, 30 May 2025 17:27:55 +0800
Message-Id: <7eb30a577e2c6a4f582515357aea25260105eb18.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

To support the epoll family, RPAL needs to add new logic for RPAL services
to the existing epoll logic, ensuring that user mode can execute RPAL
service-related logic through identical interfaces.

When the receiver thread calls epoll_wait(), it can set RPAL_EP_POLL_MAGIC
to notify the kernel to invoke RPAL-related logic. The kernel then sets the
receiver's state to RPAL_RECEIVER_STATE_READY and transitions it to
RPAL_RECEIVER_STATE_WAIT when the receiver is actually removed from the
runqueue, allowing the sender to perform RPAL calls on the receiver thread.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/rpal/core.c | 4 +
fs/eventpoll.c | 200 +++++++++++++++++++++++++++++++++++++++++++
include/linux/rpal.h | 21 +++++
kernel/sched/core.c | 17 ++++
4 files changed, 242 insertions(+)

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 47c9e551344e..6a22b9faa100 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -9,6 +9,7 @@
#include <linux/rpal.h>
#include <linux/sched/task_stack.h>
#include <linux/pkeys.h>
+#include <linux/file.h>
#include <asm/fsgsbase.h>

#include "internal.h"
@@ -63,6 +64,7 @@ void rpal_kernel_ret(struct pt_regs *regs)

if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
rcc = current->rpal_rd->rcc;
+ regs->ax = rpal_try_send_events(current->rpal_rd->ep, rcc);
atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_KERNEL_RET);
} else {
tsk = current->rpal_sd->receiver;
@@ -142,6 +144,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
struct task_struct *prev = current;

if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT)) {
+ rpal_resume_ep(next);
current->rpal_sd->receiver = next;
rpal_lock_cpu(current);
rpal_lock_cpu(next);
@@ -154,6 +157,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
*/
rebuild_sender_stack(current->rpal_sd, regs);
rpal_schedule(next);
+ fdput(next->rpal_rd->f);
} else {
update_dst_stack(next, regs);
/*
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d4dbffdedd08..437cd5764c03 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,7 @@
#include <linux/compat.h>
#include <linux/rculist.h>
#include <linux/capability.h>
+#include <linux/rpal.h>
#include <net/busy_poll.h>

/*
@@ -2141,6 +2142,187 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
}
}

+#ifdef CONFIG_RPAL
+
+void rpal_resume_ep(struct task_struct *tsk)
+{
+ struct rpal_receiver_data *rrd = tsk->rpal_rd;
+ struct eventpoll *ep = (struct eventpoll *)rrd->ep;
+ struct rpal_receiver_call_context *rcc = rrd->rcc;
+
+ if (rcc->timeout > 0) {
+ hrtimer_cancel(&rrd->ep_sleeper.timer);
+ destroy_hrtimer_on_stack(&rrd->ep_sleeper.timer);
+ }
+ if (!list_empty_careful(&rrd->ep_wait.entry)) {
+ write_lock(&ep->lock);
+ __remove_wait_queue(&ep->wq, &rrd->ep_wait);
+ write_unlock(&ep->lock);
+ }
+}
+
+int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc)
+{
+ int eavail;
+ int res = 0;
+
+ res = ep_send_events(ep, rcc->events, rcc->maxevents);
+ if (res > 0)
+ ep_suspend_napi_irqs(ep);
+
+ eavail = ep_events_available(ep);
+ if (!eavail) {
+ atomic_and(~RPAL_KERNEL_PENDING, &rcc->ep_pending);
+ /* check again to avoid data race on RPAL_KERNEL_PENDING */
+ eavail = ep_events_available(ep);
+ if (eavail)
+ atomic_or(RPAL_KERNEL_PENDING, &rcc->ep_pending);
+ }
+ return res;
+}
+
+static int rpal_schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
+ const enum hrtimer_mode mode,
+ clockid_t clock_id)
+{
+ struct hrtimer_sleeper *t = &current->rpal_rd->ep_sleeper;
+
+ /*
+ * Optimize when a zero timeout value is given. It does not
+ * matter whether this is an absolute or a relative time.
+ */
+ if (expires && *expires == 0) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ /*
+ * A NULL parameter means "infinite"
+ */
+ if (!expires) {
+ schedule();
+ return -EINTR;
+ }
+
+ hrtimer_setup_sleeper_on_stack(t, clock_id, mode);
+ hrtimer_set_expires_range_ns(&t->timer, *expires, delta);
+ hrtimer_sleeper_start_expires(t, mode);
+
+ if (likely(t->task))
+ schedule();
+
+ hrtimer_cancel(&t->timer);
+ destroy_hrtimer_on_stack(&t->timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ return !t->task ? 0 : -EINTR;
+}
+
+static int rpal_ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
+ int maxevents, struct timespec64 *timeout)
+{
+ int res = 0, eavail, timed_out = 0;
+ u64 slack = 0;
+ struct rpal_receiver_data *rrd = current->rpal_rd;
+ wait_queue_entry_t *wait = &rrd->ep_wait;
+ ktime_t expires, *to = NULL;
+
+ rrd->ep = ep;
+
+ lockdep_assert_irqs_enabled();
+
+ if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
+ slack = select_estimate_accuracy(timeout);
+ to = &expires;
+ *to = timespec64_to_ktime(*timeout);
+ } else if (timeout) {
+ timed_out = 1;
+ }
+
+ eavail = ep_events_available(ep);
+
+ while (1) {
+ if (eavail) {
+ res = rpal_try_send_events(ep, rrd->rcc);
+ if (res) {
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ return res;
+ }
+ }
+
+ if (timed_out) {
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ return 0;
+ }
+
+ eavail = ep_busy_loop(ep);
+ if (eavail)
+ continue;
+
+ if (signal_pending(current)) {
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_RUNNING);
+ return -EINTR;
+ }
+
+ init_wait(wait);
+ wait->func = rpal_ep_autoremove_wake_function;
+ wait->private = rrd;
+ write_lock_irq(&ep->lock);
+
+ atomic_xchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_READY);
+ __set_current_state(TASK_INTERRUPTIBLE);
+
+ eavail = ep_events_available(ep);
+ if (!eavail)
+ __add_wait_queue_exclusive(&ep->wq, wait);
+
+ write_unlock_irq(&ep->lock);
+
+ if (!eavail && ep_schedule_timeout(to)) {
+ if (RPAL_USER_PENDING & atomic_read(&rrd->rcc->ep_pending)) {
+ timed_out = 1;
+ } else {
+ timed_out =
+ !rpal_schedule_hrtimeout_range_clock(
+ to, slack, HRTIMER_MODE_ABS,
+ CLOCK_MONOTONIC);
+ }
+ }
+ atomic_cmpxchg(&rrd->rcc->receiver_state,
+ RPAL_RECEIVER_STATE_READY,
+ RPAL_RECEIVER_STATE_RUNNING);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * We were woken up, thus go and try to harvest some events.
+ * If timed out and still on the wait queue, recheck eavail
+ * carefully under lock, below.
+ */
+ eavail = 1;
+
+ if (!list_empty_careful(&wait->entry)) {
+ write_lock_irq(&ep->lock);
+ /*
+ * If the thread timed out and is not on the wait queue,
+ * it means that the thread was woken up after its
+ * timeout expired before it could reacquire the lock.
+ * Thus, when wait.entry is empty, it needs to harvest
+ * events.
+ */
+ if (timed_out)
+ eavail = list_empty(&wait->entry);
+ __remove_wait_queue(&ep->wq, wait);
+ write_unlock_irq(&ep->lock);
+ }
+ }
+}
+#endif
+
/**
* ep_loop_check_proc - verify that adding an epoll file inside another
* epoll structure does not violate the constraints, in
@@ -2529,7 +2711,25 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
ep = fd_file(f)->private_data;

/* Time to fish for events ... */
+#ifdef CONFIG_RPAL
+ /*
+ * For RPAL task, if it is a receiver and it set MAGIC in shared memory,
+ * We think it is prepared for rpal calls. Therefore, we need to handle
+ * it differently.
+ *
+ * In other cases, RPAL task always plays like a normal task.
+ */
+ if (rpal_current_service() &&
+ rpal_test_current_thread_flag(RPAL_RECEIVER_BIT) &&
+ current->rpal_rd->rcc->rpal_ep_poll_magic == RPAL_EP_POLL_MAGIC) {
+ current->rpal_rd->f = f;
+ return rpal_ep_poll(ep, events, maxevents, to);
+ } else {
+ return ep_poll(ep, events, maxevents, to);
+ }
+#else
return ep_poll(ep, events, maxevents, to);
+#endif
}

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index f2474cb53abe..5912ffec6e28 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -16,6 +16,8 @@
#include <linux/hashtable.h>
#include <linux/atomic.h>
#include <linux/sizes.h>
+#include <linux/file.h>
+#include <linux/hrtimer.h>

#define RPAL_ERROR_MSG "rpal error: "
#define rpal_err(x...) pr_err(RPAL_ERROR_MSG x)
@@ -89,6 +91,7 @@ enum {
};

#define RPAL_ERROR_MAGIC 0x98CC98CC
+#define RPAL_EP_POLL_MAGIC 0xCC98CC98

#define RPAL_SID_SHIFT 24
#define RPAL_ID_SHIFT 8
@@ -103,6 +106,9 @@ enum {
#define RPAL_PKRU_UNION 1
#define RPAL_PKRU_INTERSECT 2

+#define RPAL_KERNEL_PENDING 0x1
+#define RPAL_USER_PENDING 0x2
+
extern unsigned long rpal_cap;

enum rpal_task_flag_bits {
@@ -282,6 +288,12 @@ struct rpal_receiver_call_context {
int receiver_id;
atomic_t receiver_state;
atomic_t sender_state;
+ atomic_t ep_pending;
+ int rpal_ep_poll_magic;
+ int epfd;
+ void __user *events;
+ int maxevents;
+ int timeout;
};

/* recovery point for sender */
@@ -325,6 +337,10 @@ struct rpal_receiver_data {
struct rpal_shared_page *rsp;
struct rpal_receiver_call_context *rcc;
struct task_struct *sender;
+ void *ep;
+ struct fd f;
+ struct hrtimer_sleeper ep_sleeper;
+ wait_queue_entry_t ep_wait;
};

struct rpal_sender_data {
@@ -574,4 +590,9 @@ __rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev);
int do_rpal_mprotect_pkey(unsigned long start, size_t len, int pkey);
void rpal_set_pku_schedule_tail(struct task_struct *prev);
+int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
+ unsigned int mode, int wake_flags,
+ void *key);
+void rpal_resume_ep(struct task_struct *tsk);
+int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb5d5bd51597..486d59bdd3fc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6794,6 +6794,23 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#define SM_RTLOCK_WAIT 2

#ifdef CONFIG_RPAL
+int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
+ unsigned int mode, int wake_flags,
+ void *key)
+{
+ struct rpal_receiver_data *rrd = curr->private;
+ struct task_struct *tsk = rrd->rcd.bp_task;
+ int ret;
+
+ ret = try_to_wake_up(tsk, mode, wake_flags);
+
+ list_del_init_careful(&curr->entry);
+ if (!ret)
+ atomic_or(RPAL_KERNEL_PENDING, &rrd->rcc->ep_pending);
+
+ return 1;
+}
+
static inline void rpal_check_ready_state(struct task_struct *tsk, int state)
{
if (rpal_test_task_thread_flag(tsk, RPAL_RECEIVER_BIT)) {
--
2.20.1


Return-Path: <linux-kernel+bounces-667900-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from ny.mirrors.kernel.org (ny.mirrors.kernel.org [147.75.199.223])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id B466641E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:40:57 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by ny.mirrors.kernel.org (Postfix) with ESMTPS id F29B3171FE9
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:40:58 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id DFBED22B8CC;
Fri, 30 May 2025 09:35:39 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="KC7PSjPx"
Received: from mail-pg1-f174.google.com (mail-pg1-f174.google.com [209.85.215.174])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4BE08224225
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:35:37 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.215.174
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597738; cv=none; b=mRrFxvfjCNaSchnkVkWxr3k7Z+fDKDSTu4kmSBd8HWz+O1qUvKd/v7LRRwX5BLoIZzzlzxjmGrRMvZyj/d75q3MiBNcUQ4m216jvZEqLuU1YOdUp1jhnfPU1tCyB1tDDzGDSG/ohp21FHQkw1p/ljPkHYsh9F5HBX8QM4j29qhk=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597738; c=relaxed/simple;
bh=Rt/SGj48eEmheWhl/pKyHYlndzSJCIR513AnD89nuEY=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=d2uFUAUI7TqbFHh6WPj/MoW5hQAzx1xwxJGTxuy9Yc5E0b7R/4aFoHHh8+W2YvSdSjepAJ1ztCbnajqCZlA1zEV6kbFkcGtPY8vjUS3EJYDblKAnPwNXMwH+eaNTmO80uskJnqBeSXR5wWwihWpO4/0XIk9uKgn0IobhfJlA3w4=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=KC7PSjPx; arc=none smtp.client-ip=209.85.215.174
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pg1-f174.google.com with SMTP id 41be03b00d2f7-b26ee6be1ecso1251860a12.0
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:35:37 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597736; x=1749202536; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=NQYeXWECBZU61mIlblKiCexij9wbxTcb3SJTjEcJrDE=;
b=KC7PSjPx3y+5TbRZPN2v6pQ3ditZFyB/WcVnzDp3Ee3O22P/pUykeel1KcUWkUJiKT
nhTuK9kvMmGUQDggJbaQe6VKWFwuqrN3UEsKUZxUueNbSXse/TdWkeX3Z0Vt2Nxqzhe0
HqtvGc1AMocFIRLSXajgQ1k8PdrpFRhHZkI4cm/LWKhNr2oI2rLtAKjVQe89L756Kmmg
cncPxK4OWdHhqGNLXqNzRIaaT8BEdoAPSpp8PnVplsrGlsehAxG7kiGY5jb0O2MULmG9
RJoQS2VAgukjfRnHr82khH8CYVmj8yAHvb7AlO4XeARN+iAzEoZr+tcNN9pbJ1WuWv2S
4x2A==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597736; x=1749202536;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=NQYeXWECBZU61mIlblKiCexij9wbxTcb3SJTjEcJrDE=;
b=uf6s0YsNWTyrWFxKLN2FvzefccLA3K5l7UFVIEblG/H3OiuPGTlf4L2SysSxRmwPKC
Z+JHeMVfSqEJg9Y+tVdx7YZCWDdOsoe405JG9ZZdvEIU30vmavCxMkTfXwjfaQO4eJJi
k1RPaXkx0+9jQ1ehm+KFYCi/hUoFAR84MIO0Tw4nnKzdzfc+xcmVqWfjKNeSUdaednYD
Rp6Fi53QqUq5ViAe0HVcqyU3RrndcjoX1cANauwsLd3IGqnVelO1AXIpSg3ABmpGmN6v
PHfFyTsiJ2F+8Cmb7Gu/kfDzSe3EKvjEAsS2RyJZ0tJJ5WTsSfMcT+P3xMcFGRMVi8A7
p8Pg==
X-Forwarded-Encrypted: i=1; AJvYcCXUMC6C14LipX5ap6sFpTmeeNW86UUE0ia0tnUf59w3EELZL1fRXItydDuWDUGIGSgLHTatyfrg0r5sP3c=@vger.kernel.org
X-Gm-Message-State: AOJu0YymoL8ieGkjrPfVTNBh9ZFQ0ASwpIOUxhw+GvmeMkeG4ydtIald
JtW4t71I5buzDm8rMUnzGXkrvltEOAFg3JpMjIWNWhus7R2aSHmV/E+8fJViVZvhs/Q=
X-Gm-Gg: ASbGncsnrMKeB5TljicoRioFu0shRZrf9S+2FgPEzXsYPz4vVtFtOWesKH83RMS8mXs
5z4tJyWbAKFMIJsC+PD3NVxXc3kwpePh28XUq22VMXUcQ02vu1dlp9HmRh6rXvzkW3v93h9U+uD
xBCfJ01hx6oVl+erxfBMvuHQFr3mqLo0bdCYVmc0GtthPTReD+mF7pPaHk94AbowiTynLJzkOUK
PgJbhd/3NJtRgaBoQuDxOlUXIebJxE60tGaOD/4aOQdupk8LwcB0CQVa8iG1usCYzB1zme/d4FH
75ctTRzqMoA1lbYUf9gTGkp5saizwt5SPku6Wc9g8GG3qLwakz/1f5BV14Tpi8CbsAkkdmPjoYF
LD+tJdaBARQ==
X-Google-Smtp-Source: AGHT+IG6nnxH2FVgtgvPODfS+5ObcESlWEhm+JqnaRiZqSXrOdJI4HZuM5tUxbidkIm+8B4kCWHa5w==
X-Received: by 2002:a17:90b:3c49:b0:312:ec:4123 with SMTP id 98e67ed59e1d1-3125036bb61mr2199095a91.13.1748597736482;
Fri, 30 May 2025 02:35:36 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.35.21
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:35:36 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 28/35] RPAL: add rpal_uds_fdmap() support
Date: Fri, 30 May 2025 17:27:56 +0800
Message-Id: <7d9d805dcfe80358c06f0a02fadd31a7288500b4.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

For a UDS connection between a sender and a receiver, neither side knows
which file descriptor (fd) the other uses to manage the connection. The
sender therefore cannot determine which of the receiver's user-space fd
buffers it should write data to, so both sides must go through a complex
process to inform each other of their fd mappings. This process incurs
significant overhead when a large number of connections is managed, and
therefore needs optimization.

This patch introduces the RPAL_IOCTL_UDS_FDMAP interface, which simplifies
the establishment of fd mappings between sender and receiver processes for
files monitored by epoll. This avoids the need for a complex setup process
each time a new connection is created.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/rpal/internal.h | 3 +
arch/x86/rpal/proc.c | 117 +++++++++++++++++++++++++++++++++++++++
fs/eventpoll.c | 19 +++++++
include/linux/rpal.h | 11 ++++
4 files changed, 150 insertions(+)

diff --git a/arch/x86/rpal/internal.h b/arch/x86/rpal/internal.h
index e49febce8645..e03f8a90619d 100644
--- a/arch/x86/rpal/internal.h
+++ b/arch/x86/rpal/internal.h
@@ -11,6 +11,7 @@

#include <linux/mm.h>
#include <linux/file.h>
+#include <net/af_unix.h>

extern bool rpal_inited;

@@ -60,3 +61,5 @@ int rpal_alloc_pkey(struct rpal_service *rs, int pkey);
int rpal_pkey_setup(struct rpal_service *rs, int pkey);
void rpal_set_current_pkru(u32 val, int mode);
void rpal_service_pku_init(void);
+
+extern struct sock *unix_peer_get(struct sock *sk);
diff --git a/arch/x86/rpal/proc.c b/arch/x86/rpal/proc.c
index 2f9cceec4992..b60c099c4a92 100644
--- a/arch/x86/rpal/proc.c
+++ b/arch/x86/rpal/proc.c
@@ -9,6 +9,8 @@
#include <linux/rpal.h>
#include <linux/proc_fs.h>
#include <linux/poll.h>
+#include <net/sock.h>
+#include <net/af_unix.h>

#include "internal.h"

@@ -34,6 +36,118 @@ static int rpal_get_api_version_and_cap(void __user *p)
return 0;
}

+static void *rpal_uds_peer_data(struct sock *psk, int *pfd)
+{
+ void *ep = NULL;
+ unsigned long flags;
+ struct socket_wq *wq;
+ wait_queue_entry_t *entry;
+ wait_queue_head_t *whead;
+
+ rcu_read_lock();
+ wq = rcu_dereference(psk->sk_wq);
+ if (!skwq_has_sleeper(wq))
+ goto unlock_rcu;
+
+ whead = &wq->wait;
+
+ spin_lock_irqsave(&whead->lock, flags);
+ if (list_empty(&whead->head)) {
+ pr_debug("rpal debug: [%d] cannot find epitem entry\n",
+ current->pid);
+ goto unlock_spin;
+ }
+ entry = list_first_entry(&whead->head, wait_queue_entry_t, entry);
+ *pfd = rpal_get_epitemfd(entry);
+ if (*pfd < 0) {
+ pr_debug("rpal debug: [%d] cannot find epitem fd\n",
+ current->pid);
+ goto unlock_spin;
+ }
+ ep = rpal_get_epitemep(entry);
+
+unlock_spin:
+ spin_unlock_irqrestore(&whead->lock, flags);
+unlock_rcu:
+ rcu_read_unlock();
+ return ep;
+}
+
+static int rpal_find_receiver_rid(int id, void *ep)
+{
+	struct task_struct *tsk;
+	struct rpal_service *cur = rpal_current_service();
+	struct rpal_service *tgt;
+	int rid = -1;
+
+	tgt = rpal_get_mapped_service_by_id(cur, id);
+	if (tgt == NULL)
+		goto out;
+	/* for_each_thread() requires RCU (or tasklist_lock) vs. task exit */
+	rcu_read_lock();
+	for_each_thread(tgt->group_leader, tsk) {
+		if (!rpal_test_task_thread_flag(tsk, RPAL_RECEIVER_BIT))
+			continue;
+		if (tsk->rpal_rd->ep == ep) {
+			rid = tsk->rpal_rd->rcc->receiver_id;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	rpal_put_service(tgt);
+out:
+	return rid;
+}
+
+static long rpal_uds_fdmap(unsigned long uarg)
+{
+	struct rpal_uds_fdmap_arg arg;
+	struct socket *sock;
+	struct sock *peer_sk;
+	void *ep;
+	int sfd, rid;
+	struct fd f;
+	long res;
+	int ret = -EINVAL;
+
+	/* copy_from_user() returns bytes not copied, not an errno */
+	if (copy_from_user(&arg, (void __user *)uarg, sizeof(arg)))
+		return -EFAULT;
+
+	f = fdget(arg.cfd);
+	if (!fd_file(f))
+		goto fd_put;
+
+	sock = sock_from_file(fd_file(f));
+	if (!sock)
+		goto fd_put;
+
+	peer_sk = unix_peer_get(sock->sk);
+	if (peer_sk == NULL)
+		goto fd_put;
+	ep = rpal_uds_peer_data(peer_sk, &sfd);
+	if (ep == NULL) {
+		pr_debug("rpal debug: [%d] cannot find epitem ep\n",
+			current->pid);
+		goto peer_sock_put;
+	}
+	rid = rpal_find_receiver_rid(arg.service_id, ep);
+	if (rid < 0) {
+		pr_debug("rpal debug: [%d] cannot find epitem rid\n",
+			current->pid);
+		goto peer_sock_put;
+	}
+	res = (long)rid << 32 | (long)sfd;
+	ret = put_user(res, arg.res);
+
+peer_sock_put:
+	sock_put(peer_sk);
+fd_put:
+	if (fd_file(f))
+		fdput(f);
+	return ret;
+}
+
static long rpal_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct rpal_service *cur = rpal_current_service();
@@ -81,6 +195,9 @@ static long rpal_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
ret = put_user(cur->pkey, (int __user *)arg);
break;
#endif
+ case RPAL_IOCTL_UDS_FDMAP:
+ ret = rpal_uds_fdmap(arg);
+ break;
default:
return -EINVAL;
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 437cd5764c03..791321639561 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2143,6 +2143,25 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
}

#ifdef CONFIG_RPAL
+void *rpal_get_epitemep(wait_queue_entry_t *wait)
+{
+ struct epitem *epi = ep_item_from_wait(wait);
+
+ if (!epi)
+ return NULL;
+
+ return epi->ep;
+}
+
+int rpal_get_epitemfd(wait_queue_entry_t *wait)
+{
+ struct epitem *epi = ep_item_from_wait(wait);
+
+ if (!epi)
+ return -1;
+
+ return epi->ffd.fd;
+}

void rpal_resume_ep(struct task_struct *tsk)
{
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 5912ffec6e28..7657e6c6393b 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -350,6 +350,12 @@ struct rpal_sender_data {
struct task_struct *receiver;
};

+struct rpal_uds_fdmap_arg {
+ int service_id;
+ int cfd;
+ unsigned long *res;
+};
+
enum rpal_command_type {
RPAL_CMD_GET_API_VERSION_AND_CAP,
RPAL_CMD_GET_SERVICE_KEY,
@@ -363,6 +369,7 @@ enum rpal_command_type {
RPAL_CMD_REQUEST_SERVICE,
RPAL_CMD_RELEASE_SERVICE,
RPAL_CMD_GET_SERVICE_PKEY,
+ RPAL_CMD_UDS_FDMAP,
RPAL_NR_CMD,
};

@@ -393,6 +400,8 @@ enum rpal_command_type {
_IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_RELEASE_SERVICE, unsigned long)
#define RPAL_IOCTL_GET_SERVICE_PKEY \
_IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_GET_SERVICE_PKEY, int *)
+#define RPAL_IOCTL_UDS_FDMAP \
+ _IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_UDS_FDMAP, unsigned long)

#define rpal_for_each_requested_service(rs, idx) \
for (idx = find_first_bit(rs->requested_service_bitmap, RPAL_NR_ID); \
@@ -594,5 +603,7 @@ int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
unsigned int mode, int wake_flags,
void *key);
void rpal_resume_ep(struct task_struct *tsk);
+void *rpal_get_epitemep(wait_queue_entry_t *wait);
+int rpal_get_epitemfd(wait_queue_entry_t *wait);
int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc);
#endif
--
2.20.1


Return-Path: <linux-kernel+bounces-667901-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from ny.mirrors.kernel.org (ny.mirrors.kernel.org [147.75.199.223])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 444E541E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:41:14 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by ny.mirrors.kernel.org (Postfix) with ESMTPS id 835DD1777BC
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:41:15 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 1D3CA22E00A;
Fri, 30 May 2025 09:35:55 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="NYaqh2v8"
Received: from mail-pg1-f169.google.com (mail-pg1-f169.google.com [209.85.215.169])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6D07B224B1C
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:35:52 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.215.169
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597754; cv=none; b=oiyp6GgSo5d77p7SmnhMv+++TaG09nxTWI+6LQ972WnonfV66L7eQZAVH2HeTB3Xrlh+QWUItnm063KVz63yGveozOSa6Tl0jyDpECbbm+wlZDc8JQXFnlPvwfiuPwDxeTq4aVCrbe4AjEoxtA8q3nqnK1zDkmoz/xyoT3GTr3Q=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597754; c=relaxed/simple;
bh=f9zzXMNbxFfZbGJyBMa0POHUK33Y39jv+mYtKpjEoMk=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=Bv/BmIzhN12arfeTvrI6wMpIq0oblK5O9cTEYjXElbVGnifBPn4qRpAFvDsaxfvugBPqRWvGwRWuh1+FPmdxr7AVkVbu8CYhRS3IeXWFzp4OBPuaG0Iwye+qIcYZG2y+mZ+JUuwkYlHCEHxCUVGGac9Pg6FlZ9TRZFulM7VLi0U=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=NYaqh2v8; arc=none smtp.client-ip=209.85.215.169
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pg1-f169.google.com with SMTP id 41be03b00d2f7-afc857702d1so1469093a12.3
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:35:52 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597751; x=1749202551; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=ruJXCVL33HlYZ/8pvuQkFgYyPjMywOaCJ6OilWvGVoA=;
b=NYaqh2v8WiwtvUtfIP8l6dheuVdr6/jF6arATD9keqVx+WpYUV6fkYXqodO9Y3hnNt
rp7zHfBulLlNbMg5TFKvqILTzBrh8S7gSK355JgXCx7jRGp8bVvbWVSBDLdDz2Xu6PBM
qoFsq2qQUrsZ1rxPQNKydFbrEtgoRQATlkiKjVrENN2/PbqNyD3poYKgHtajBX1ODIRg
5NdnvZqH0mhneUPWc25UHe+YHcC1UOw+SsV7Yl0dEyR0TgwurP7VFTDhJ+zsRWpo5O6T
4bX1uRVDdGXzxk+WixQYEpaeoXe+/QWQ9OzwB+NrpunIFQLx1IDnHcBWAheFU0OcGmSP
26bQ==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597751; x=1749202551;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=ruJXCVL33HlYZ/8pvuQkFgYyPjMywOaCJ6OilWvGVoA=;
b=BvTaJzwHL+ix1L7fgcuJVAlAiOr14reKF+KeQz6UBVLOLaj6DbySE2Z811y/v7xue5
k6YshdfO34B7+5UN888z5gs6BQtcUA9eVn6CM/3Jh06X05R3Jx9igwRKg3unh+lk7DQf
3mmb0RuoxCdk50kp2e21p56CadQzmRzH3YXyuV5xkTuvypQiz0chag5XHRy02QbeFgp0
E401g3afwq78Ji/IVZXGiEyK4ODFp4F5TIumDPwbZVDxK9b1Sg5BnSe/fOXlGCEixAlD
zO4lXJiM2SLKSsAfo45LquaY6G9R8FCCOqix1SR5GRdTo784UdsESoEB9VSDy67lqz/9
ODEw==
X-Forwarded-Encrypted: i=1; AJvYcCXm866PIJFmJsd1b0qG8P9u6veOgd1t5AWKIjhx5m74pkTVqz9GwD+fhfGwurk9v/KZuf3mEbWvJlJOGbc=@vger.kernel.org
X-Gm-Message-State: AOJu0YwOLMokQHHayPdcG6gZ4LokaqGo1NWEpxCaiS11OHa/mhgwG8ju
N87xIK1PD4vvrLQSGM/gjzIVTndGHXekbdEuffJ/zNVUDZ8iQeJW6E/wdawsUuxVdM8=
X-Gm-Gg: ASbGncuciDlOjCusoGj7IJeCGp3gAE67PB0CxSllWoln2ObYEh2Jq+mfNwCtY7Qok6t
MYOeKKUAoCBZU2ORl1IwSXFlhG4qc4JJiRVM2g0mxApW9nzVJhwHSu9b18OFjGN1ge2UIJL/SGl
8re1IJ3GMDNtCTxIXcV0gZA36IytXcxA6E9w+LA+cEFjf09A0JVazIpWvFGpDAg7bAYMcHA14sy
n00DxPKv/YR1EKxp8P3gP8PefSuMv3B+mKKRpDMahZ8S4B8cA++DRSp9YYnUdi2V2dQL8aKef2a
wNg//V+hLJss2SnwTgnXeK/R30EWZOJFDYkeZMn2OM+NEQFplGIIeFqzXbToztEM27IDOuYVb/S
fu0daGcx0VA==
X-Google-Smtp-Source: AGHT+IEvXrgn/ooMRW35qDhy0ngiIGcMy7za8V80iv/SemfT2y7R5n/KmWGhxdI6JxqwxTCjuIZGRA==
X-Received: by 2002:a17:90b:1dc4:b0:311:afaa:5e25 with SMTP id 98e67ed59e1d1-31241865ecdmr4397369a91.24.1748597751453;
Fri, 30 May 2025 02:35:51 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.35.36
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:35:51 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 29/35] RPAL: fix race condition in pkru update
Date: Fri, 30 May 2025 17:27:57 +0800
Message-Id: <7fbb84a57fc8046738c7196031a3fd97ea8334e2.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

When setting up MPK, RPAL uses IPIs to notify the thread group's tasks
running on each core to modify their PKRU values, and it updates the PKEY
fields in the page tables of all VMAs. A race condition exists here: when
a task updates its PKRU, the page table updates may not yet be complete.
In that window, code paths that call pkru_write_default() (e.g., signal
handling) must not restrict the PKRU to a single PKEY, because the
resulting PKRU permissions could not accommodate both the old and the new
page table PKEY settings.

This patch introduces a pku_on state with values PKU_ON_FALSE, PKU_ON_INIT,
and PKU_ON_FINISH, representing the states before, during, and after page
table PKEY updates, respectively. For RPAL services, all calls to
pkru_write_default() are replaced with rpal_pkru_write_default().

- Before page table setup (PKU_ON_FALSE), rpal_pkru_write_default()
directly calls pkru_write_default().
- During page table setup (PKU_ON_INIT), rpal_pkru_write_default() enables
permissions for all PKEYs, ensuring the task can access both old and new
page tables simultaneously.
- After page table setup completes (PKU_ON_FINISH),
rpal_pkru_write_default() tightens permissions to match the updated page
tables.

For newly allocated page tables, the new PKEY is only used when pku_on is
PKU_ON_FINISH. The mmap lock is used to ensure no race conditions occur
during this process.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/kernel/cpu/common.c | 4 ++--
arch/x86/kernel/fpu/core.c | 4 ++--
arch/x86/kernel/process.c | 4 ++--
arch/x86/rpal/pku.c | 14 +++++++++++++-
arch/x86/rpal/service.c | 2 +-
include/linux/rpal.h | 9 ++++++++-
mm/mmap.c | 2 +-
mm/mprotect.c | 1 +
mm/vma.c | 2 +-
9 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2678453cdf76..d21f44873b86 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -534,8 +534,8 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c)
cr4_set_bits(X86_CR4_PKE);
/* Load the default PKRU value */
#ifdef CONFIG_RPAL_PKU
- if (rpal_current_service() && rpal_current_service()->pku_on)
- write_pkru(rpal_pkey_to_pkru(rpal_current_service()->pkey));
+ if (rpal_current_service())
+ rpal_pkru_write_default();
else
#endif
pkru_write_default();
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 251b1ddee726..4b413af0b179 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -748,8 +748,8 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
frstor(&init_fpstate.regs.fsave);

#ifdef CONFIG_RPAL_PKU
- if (rpal_current_service() && rpal_current_service()->pku_on)
- write_pkru(rpal_pkey_to_pkru(rpal_current_service()->pkey));
+ if (rpal_current_service())
+ rpal_pkru_write_default();
else
#endif
pkru_write_default();
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b74de35218f9..898a9e0b23e7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -286,8 +286,8 @@ static void pkru_flush_thread(void)
* the hardware right here (similar to context switch).
*/
#ifdef CONFIG_RPAL_PKU
- if (rpal_current_service() && rpal_current_service()->pku_on)
- write_pkru(rpal_pkey_to_pkru(rpal_current_service()->pkey));
+ if (rpal_current_service())
+ rpal_pkru_write_default();
else
#endif
pkru_write_default();
diff --git a/arch/x86/rpal/pku.c b/arch/x86/rpal/pku.c
index 26cef324f41f..8e530931fb23 100644
--- a/arch/x86/rpal/pku.c
+++ b/arch/x86/rpal/pku.c
@@ -161,7 +161,7 @@ int rpal_pkey_setup(struct rpal_service *rs, int pkey)
rs->pkey = pkey;
/* others must see rs->pkey before rs->pku_on */
barrier();
- rs->pku_on = true;
+ rs->pku_on = PKU_ON_INIT;
mmap_write_unlock(current->mm);
rpal_set_group_pkru(val, RPAL_PKRU_UNION);
err = do_rpal_mprotect_pkey(rs->base, RPAL_ADDR_SPACE_SIZE, pkey);
@@ -182,3 +182,15 @@ int rpal_alloc_pkey(struct rpal_service *rs, int pkey)

return ret;
}
+
+void rpal_pkru_write_default(void)
+{
+	struct rpal_service *cur = rpal_current_service();
+
+	if (cur->pku_on == PKU_ON_INIT)
+		write_pkru(0);	/* all pkeys accessible while PTEs update */
+	else if (cur->pku_on == PKU_ON_FINISH)
+		write_pkru(rpal_pkey_to_pkru(cur->pkey));
+	else
+		pkru_write_default();
+}
diff --git a/arch/x86/rpal/service.c b/arch/x86/rpal/service.c
index 7a83e85cf096..9fd568fa9a29 100644
--- a/arch/x86/rpal/service.c
+++ b/arch/x86/rpal/service.c
@@ -210,7 +210,7 @@ struct rpal_service *rpal_register_service(void)
init_waitqueue_head(&rs->rpd.rpal_waitqueue);
#ifdef CONFIG_RPAL_PKU
rs->pkey = -1;
- rs->pku_on = false;
+ rs->pku_on = PKU_ON_FALSE;
rpal_service_pku_init();
#endif

diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 7657e6c6393b..16a3c80383f7 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -138,6 +138,12 @@ enum rpal_capability {
RPAL_CAP_PKU
};

+enum {
+ PKU_ON_FALSE,
+ PKU_ON_INIT,
+ PKU_ON_FINISH,
+};
+
struct rpal_critical_section {
unsigned long ret_begin;
unsigned long ret_end;
@@ -245,7 +251,7 @@ struct rpal_service {

#ifdef CONFIG_RPAL_PKU
/* pkey */
- bool pku_on;
+ int pku_on;
int pkey;
#endif

@@ -599,6 +605,7 @@ __rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev);
int do_rpal_mprotect_pkey(unsigned long start, size_t len, int pkey);
void rpal_set_pku_schedule_tail(struct task_struct *prev);
+void rpal_pkru_write_default(void);
int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
unsigned int mode, int wake_flags,
void *key);
diff --git a/mm/mmap.c b/mm/mmap.c
index d36ea4ea2bd0..85a4a33491ab 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -404,7 +404,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
do {
struct rpal_service *cur = rpal_current_service();

- if (cur && cur->pku_on)
+ if (cur && cur->pku_on == PKU_ON_FINISH)
pkey = cur->pkey;
} while (0);
#endif
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e9ae828e377d..ac162180553e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -938,6 +938,7 @@ int do_rpal_mprotect_pkey(unsigned long start, size_t len, int pkey)
}
tlb_finish_mmu(&tlb);

+ rpal_current_service()->pku_on = PKU_ON_FINISH;
out:
mmap_write_unlock(current->mm);
return error;
diff --git a/mm/vma.c b/mm/vma.c
index fa9d8f694e6e..57ec99a5969d 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2632,7 +2632,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct rpal_service *cur = rpal_current_service();
unsigned long vma_pkey_mask;

- if (cur && cur->pku_on) {
+ if (cur && cur->pku_on == PKU_ON_FINISH) {
vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 |
VM_PKEY_BIT3;
flags &= ~vma_pkey_mask;
--
2.20.1


Return-Path: <linux-kernel+bounces-667902-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from ny.mirrors.kernel.org (ny.mirrors.kernel.org [147.75.199.223])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 65A5F41E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:41:29 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by ny.mirrors.kernel.org (Postfix) with ESMTPS id A49E5174F06
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:41:30 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 909EF22E3E9;
Fri, 30 May 2025 09:36:09 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="P5v3RIM5"
Received: from mail-pg1-f180.google.com (mail-pg1-f180.google.com [209.85.215.180])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 69CE021E082
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:36:07 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.215.180
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597768; cv=none; b=uc6oECp6WOHwwj7CA5ewTGgZ43y+EINAo5Wd+Bx8fAlS6C0ekC01CIfFlAY37yl/dw3pyhvlPkyiKXux6gnDyU6BLSshVYhJkVFseHtIkfUczcJLMhpK8/l5y6Y9zO3/o9zly5fSGdeDjGR7oVZtu4eRTqCuew5jpbszEnxur5Q=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597768; c=relaxed/simple;
bh=M3ZKzt1i9qkVfxIhwI+P7qiPBgdAVQp1iyJ0A0C1oKs=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=aEfE2jU18D7LTlbzt9A49xBcwfB3N29xVDczvK6c16LHxdogkJBr8/X/SjSPFNbjcE9WUADM6yES1oCUgiPmZ0g1bwCNickbqxl4vPKp1SWQIiFm8/nN4269qVmRfYcb8r8EaftfqhLqRRzIuxSVuQ7ULARHgWh5HRAwTjsdV5c=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=P5v3RIM5; arc=none smtp.client-ip=209.85.215.180
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pg1-f180.google.com with SMTP id 41be03b00d2f7-b26df8f44e6so1846791a12.2
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:36:07 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597766; x=1749202566; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=JvVWemooqfdC/iIY99ZRbIRy8Qdtq9pGwXsG45AWib0=;
b=P5v3RIM52BM+aAtQi9Pl9OO1IkeENABVGz92pzmQn1FFiG8PwrF+QXRI+FsIIMdqjg
NM4Z4iKAQ/ldpEJyDjLft52PTL2oc7Ulf9GhtnXTqzORYRRPRUM3cH2GqBbaCY4sAsG9
7Pdz47ziN8ZPYO7H9pbab0qor34/Ezih4X90itfhU5OPGYefwDsAl/Kb6tArsMO8aSee
aWoHdVuZS7HiYAhPoZ1C/03n4OGIMKLPj/uwG+KQO7WL5CIUl2T+HIclb6/Gv7q8Sdgh
PD6dN7tLf82F2MEC3RVQeO6sWDT7qK5eokyZ7EQnsPqp2y7WWZDlck/HaerjotN8pXXH
5E9A==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597766; x=1749202566;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=JvVWemooqfdC/iIY99ZRbIRy8Qdtq9pGwXsG45AWib0=;
b=IUDLpjgpM0UeoJ4lameWzQPSyhfB3tHz0FcDMjAjKgZy+1r4I9prTFXGhMLxXSl/o/
mW96nH3krdH/XpIlzA+TX1Xh/QJXx23P/4DuWbzohke2D7xvw4Dg4kSkvv5HH9qljQIe
3uxBSZLOx8RX5Y1M5EB0OBwK8cZIa5e9ux7vPVgnLRZtXp6Ki1JZ1mzOID6xu3qdi25R
aHemc8hyI4aYjY9V2XQ8cbBNtpveFFcs1AIBL+QQTuKhVEeXBmbYF2X7VmxblwUdFJ9i
nFVMtXPhWIwOGX8hj0FQLUJPFOYsfiM6fBM2t1YKDusPLJbB4zcZsyWE4tJ7CXrGu/ro
+H3w==
X-Forwarded-Encrypted: i=1; AJvYcCVzIRB62E4iucC2yu1vNMIx4VxQNI95oUJtS4einimzXDUz9juMAnQTszu29zZg6Prw8hxkGYoseAlK+P8=@vger.kernel.org
X-Gm-Message-State: AOJu0YzbD3K19pJb+eK8GOmNDqhkGxOGVAvzHNLlffxAL1zHbr3MKHTP
NJoPDY95ARWmf32zI4mkTu52WciF39wNUkTBChdUFO+DLQ8wBqqn8rgOtMQWIVWlTNY=
X-Gm-Gg: ASbGncsTqCGPdT9/oW1mDnCSt3SCYkVkGb1AUbDNudAGaz/Yl4q3PXH4QG0xxSX2x/A
A/GLfvUSPTnoTXfQOeaCWLfCL/hwY/3Qbf7ob42zl4KY/wklWC0fo8ptdrNDKV6P0xvmhkkEgkc
JeVW5MxnDmka7SpCaeABTFhXdd/TqaIXME0bOHc2Tcw5P4n4xSbd5WI1TeIx2RhoHM3Oh3VQRMv
XkyP/+/F44tWiukP0D/rBEBqy95thpjQ2LBclP9KH5pIhZEGyv17w8z5JEcHTZqAUohY8RZGnst
U+AKrS4hH0ZZ2RhArzgdjDkbtoRbrzdFm/IHpq9fmE6SD+5E2iDYY/B5rwUUO20UcMILhrwnN6I
wk/cDaUOKFDkrc0BcJahV
X-Google-Smtp-Source: AGHT+IFWmfD7z+IdT9GMiEVeQTlh62r7ag47hPnuBnpxyTEiSzaUyDKA69VQl5NNKF6zBMdalqyfiA==
X-Received: by 2002:a17:90b:5387:b0:311:e605:f60e with SMTP id 98e67ed59e1d1-31241637ee5mr4240780a91.20.1748597766419;
Fri, 30 May 2025 02:36:06 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.35.51
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:36:06 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 30/35] RPAL: fix pkru setup when fork
Date: Fri, 30 May 2025 17:27:58 +0800
Message-Id: <af787730bd27fa506c1e6963bce3da38b23e6358.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

When a task performs a fork operation, the PKRU value of the newly forked
task is set to the value read from hardware. At this point, if the service
is executing rpal_pkey_setup(), the newly forked task has not yet been
added to the task list, so PKRU settings cannot be synchronized to the new
task. This results in the new task's PKRU not being set to the correct
value when it is woken up.

This patch addresses this issue by:

- After the newly forked task is added to the task list, further updating
its PKRU value.
- Acquiring a mutex lock to ensure that the PKRU update occurs either
before or after the invocation of rpal_pkey_setup(). This avoids race
conditions with rpal_pkey_setup() and guarantees that the re-updated PKRU
value is always correct.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
kernel/fork.c | 13 +++++++++++++
1 file changed, 13 insertions(+)

diff --git a/kernel/fork.c b/kernel/fork.c
index 01cd48eadf68..11cba74d07c8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2683,6 +2683,19 @@ __latent_entropy struct task_struct *copy_process(
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);

+#ifdef CONFIG_RPAL_PKU
+ do {
+ struct rpal_service *cur = rpal_current_service();
+
+ if (cur) {
+ /* ensure we are not in rpal_enable_service() */
+ mutex_lock(&cur->mutex);
+ p->thread.pkru = rdpkru();
+ mutex_unlock(&cur->mutex);
+ }
+ } while (0);
+#endif
+
if (pidfile)
fd_install(pidfd, pidfile);

--
2.20.1


Return-Path: <linux-kernel+bounces-667903-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from sv.mirrors.kernel.org (sv.mirrors.kernel.org [139.178.88.99])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id A747441E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:42:05 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by sv.mirrors.kernel.org (Postfix) with ESMTPS id 6AB413A695E
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:41:23 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id E4E05229B39;
Fri, 30 May 2025 09:36:24 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="YUmSbtk+"
Received: from mail-pj1-f50.google.com (mail-pj1-f50.google.com [209.85.216.50])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 31CD21E8323
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:36:22 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.216.50
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597783; cv=none; b=EfWZZ7MKKJeqMvtTS4YUrdIjkNR//uZ53sYjxBarcbDK0qIWt7Ivd2Ej8tBxasi5Y8aX7xIoIN/23qHbh+NNcBf68h1ihsfyHY2k8xJTdqjv4Hf9gerNe9eZ1AQTL2us8LT8/EkGVwtQ3QAp5e4sxP86SDtOFwjxcbR2SKy3l8M=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597783; c=relaxed/simple;
bh=ALYzXLguj2KAo7eUQpHRTBkt+ax6Jk2hscWpGu5UIaI=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=eR1DquaDnRe6ENmcbYXIInZTthekhk5cezpVY1xFh3rZ2OczhkWIoSEzYrm3FeR86rWQ7tUePvEeXXaY0Tu3gNIZyQACALh9PboY6SF3+KOntK0XQgewxyfV2nLscmls2fPX31jxuBrqaT1BGPOXGfAjWjfX9HmjZriamIXLq+g=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=YUmSbtk+; arc=none smtp.client-ip=209.85.216.50
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pj1-f50.google.com with SMTP id 98e67ed59e1d1-3124f18c214so298158a91.2
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:36:22 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597781; x=1749202581; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=TcEpV4r+zMsMwJgdzkjWqZN5teRn9qhnk6f0JAPvEA0=;
b=YUmSbtk+S1jrOLRy9s9/X4wpHMB+n75vTpe5rrwsUM9VdT2Q9FmLk3Ww7r6U6iE+ZZ
m6NNM+8p4Zb88ynkQTfns8oWZ9g6Nh2VbA6poFf0frPIumbN5ZnjTAMln5Q5730uDC5n
s/gPN2nkG4m5mmijf7oslKxgnz+aF93fN8MHtlqq2pG+LA4ycmC2rKybxXj7YM73zo6x
FlevlE2ChxhvQyq0ze6ce1YwOK7rZHevQ8AoE4D8YmS2PWzOcmpKVDMh5yZCJD7I8lDQ
evgcwWlw8aKXWinQAsjEUsXeRvvhSuq1uZr5PWRhRFDZd4NGIIdcfbJBbAfNJWB0e1Kk
2n8w==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597781; x=1749202581;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=TcEpV4r+zMsMwJgdzkjWqZN5teRn9qhnk6f0JAPvEA0=;
b=KlrH71e1IM7zWcwDaJ7LrWcGNj6bsK03WpIJ6dKv6UFR4ps5GI/0uncIHmLHPsRGra
wpPjde+WiEpP90C/CFNMa0GKYiGNMd5o8AYbTICNqg5RJnLvDnKdEfdaigx0hvoKplG+
z8B+vUFQe7r1qBjwu7GpwvM0UloUDFYNZiCvgKRvzfpATdGR8exdUVXvdlvdoaum3+rA
ebU6M3HVya5dXGpwd1ybMG6lCSjC/AV6x8saxuLcuCDIGt2mTqoamnQmby6XqjjlGOW+
ac4h+3toQeBsrslYbWbVsXlFXrwWhjWy9lP352AHo7WeeLC9mxURj6McDjK12S9wltXh
RC5Q==
X-Forwarded-Encrypted: i=1; AJvYcCVy/BGsKC1tP0ITA/q3scpG78Pd3fJQjIw5k/CkRdg71ZjmKDPmsjKEZByUPDnTYaBm1EndSZw3UbFyStc=@vger.kernel.org
X-Gm-Message-State: AOJu0YxLa09VTEXi90bHcKRkrdmVaOZvvEeFcmAFiqBnxz2dz40yPqkO
989taXkGR1++5lRHLg3cDDJinHbSIXJAKnIjc/GarL1MK9izENMnMSNRXnybL+1/Ixc=
X-Gm-Gg: ASbGncuMZWCSEnqNLeuGG51tlySrvXLSUf34lSpqStrP+RvzOeU5VXLd19NoHTzH8OI
RyjcFxK2S4rCGAaJaAT/RL1B5KT9s2Nf0GApa8RQCGH7BEUpPMVUZwAsnDsKVDSuawUYLU2Rcqx
mnZS0HuvIsADHMrK3NeOWQqfUew381vZF4nxZTDp1GobXJ4+X930bWbJcgsyTvHA9gJCngM76Ri
ElnjsNO68bfJMneY9eiqbpjIuh/CvBzTFBIHveZ82FWLPCwzHKjTexItI+/FYFyHCXvEUJFMJGu
GFXvUkJH/tee6wIvy2lssAvDF3R1sapeEReszcqwj02qvsApy6+DK2JqAU5crLsyjtPpWy7ILSC
5T9RdLgXK3I60EnBmWL2f
X-Google-Smtp-Source: AGHT+IG0dsH4ruc0ZVKtNevc7/fyUjoj6Kve3iAa9r38GSH3v9kXrPKHP05YVx3+W/n5pyeS7YOYMA==
X-Received: by 2002:a17:90b:55c6:b0:311:df4b:4b82 with SMTP id 98e67ed59e1d1-3124150e360mr4147264a91.4.1748597781411;
Fri, 30 May 2025 02:36:21 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.36.06
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:36:21 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 31/35] RPAL: add receiver waker
Date: Fri, 30 May 2025 17:27:59 +0800
Message-Id: <198278a03d91ab7e0e17d782c657da85cff741bb.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

In an RPAL call, the receiver thread is in the TASK_INTERRUPTIBLE state
and cannot be awakened, which may lead to missed wakeups. For example, if
no kernel event occurs during the entire RPAL call, the receiver thread
will remain in the TASK_INTERRUPTIBLE state after the RPAL call completes.

To address this issue, RPAL sets a flag on the receiver whenever a wakeup
cannot be delivered, and introduces a "waker" work. The waker
work runs automatically on every tick to check for receiver threads that
have missed wakeups. If any are found, it wakes them up. For epoll, the
waker also checks for pending user mode events and wakes the receiver
thread if such events exist.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/rpal/internal.h | 4 ++
arch/x86/rpal/service.c | 98 ++++++++++++++++++++++++++++++++++++++++
arch/x86/rpal/thread.c | 3 ++
include/linux/rpal.h | 11 +++++
kernel/sched/core.c | 3 ++
5 files changed, 119 insertions(+)

diff --git a/arch/x86/rpal/internal.h b/arch/x86/rpal/internal.h
index e03f8a90619d..117357dabdec 100644
--- a/arch/x86/rpal/internal.h
+++ b/arch/x86/rpal/internal.h
@@ -22,6 +22,10 @@ int rpal_enable_service(unsigned long arg);
int rpal_disable_service(void);
int rpal_request_service(unsigned long arg);
int rpal_release_service(u64 key);
+void rpal_insert_wake_list(struct rpal_service *rs,
+ struct rpal_receiver_data *rrd);
+void rpal_remove_wake_list(struct rpal_service *rs,
+ struct rpal_receiver_data *rrd);

/* mm.c */
static inline struct rpal_shared_page *
diff --git a/arch/x86/rpal/service.c b/arch/x86/rpal/service.c
index 9fd568fa9a29..6fefb7a7729c 100644
--- a/arch/x86/rpal/service.c
+++ b/arch/x86/rpal/service.c
@@ -143,6 +143,99 @@ static void delete_service(struct rpal_service *rs)
spin_unlock_irqrestore(&hash_table_lock, flags);
}

+void rpal_insert_wake_list(struct rpal_service *rs,
+ struct rpal_receiver_data *rrd)
+{
+ unsigned long flags;
+ struct rpal_waker_struct *waker = &rs->waker;
+
+ spin_lock_irqsave(&waker->lock, flags);
+ list_add_tail(&rrd->wake_list, &waker->wake_head);
+ spin_unlock_irqrestore(&waker->lock, flags);
+ pr_debug("rpal debug: [%d] insert wake list\n", current->pid);
+}
+
+void rpal_remove_wake_list(struct rpal_service *rs,
+ struct rpal_receiver_data *rrd)
+{
+ unsigned long flags;
+ struct rpal_waker_struct *waker = &rs->waker;
+
+ spin_lock_irqsave(&waker->lock, flags);
+ list_del(&rrd->wake_list);
+ spin_unlock_irqrestore(&waker->lock, flags);
+ pr_debug("rpal debug: [%d] remove wake list\n", current->pid);
+}
+
+/* acquires waker->lock internally; must be called without holding it */
+static inline void rpal_wake_all(struct rpal_waker_struct *waker)
+{
+ struct task_struct *wake_list[RPAL_MAX_RECEIVER_NUM];
+ struct list_head *list;
+ unsigned long flags;
+ int i, cnt = 0;
+
+ spin_lock_irqsave(&waker->lock, flags);
+ list_for_each(list, &waker->wake_head) {
+ struct task_struct *task;
+ struct rpal_receiver_call_context *rcc;
+ struct rpal_receiver_data *rrd;
+ int pending;
+
+ rrd = list_entry(list, struct rpal_receiver_data, wake_list);
+ task = rrd->rcd.bp_task;
+ rcc = rrd->rcc;
+
+ pending = atomic_read(&rcc->ep_pending) & RPAL_USER_PENDING;
+
+ if (rpal_test_task_thread_flag(task, RPAL_WAKE_BIT) ||
+ (pending && atomic_cmpxchg(&rcc->receiver_state,
+ RPAL_RECEIVER_STATE_WAIT,
+ RPAL_RECEIVER_STATE_RUNNING) ==
+ RPAL_RECEIVER_STATE_WAIT)) {
+ wake_list[cnt] = task;
+ cnt++;
+ }
+ }
+ spin_unlock_irqrestore(&waker->lock, flags);
+
+ for (i = 0; i < cnt; i++)
+ wake_up_process(wake_list[i]);
+}
+
+static void rpal_wake_callback(struct work_struct *work)
+{
+ struct rpal_waker_struct *waker =
+ container_of(work, struct rpal_waker_struct, waker_work.work);
+
+ rpal_wake_all(waker);
+ /* Re-check for missed wakeups on every tick */
+ schedule_delayed_work(&waker->waker_work, 1);
+}
+
+static void rpal_enable_waker(struct rpal_waker_struct *waker)
+{
+ INIT_DELAYED_WORK(&waker->waker_work, rpal_wake_callback);
+ schedule_delayed_work(&waker->waker_work, 1);
+ pr_debug("rpal debug: [%d] enable waker\n", current->pid);
+}
+
+static void rpal_disable_waker(struct rpal_waker_struct *waker)
+{
+ unsigned long flags;
+ struct list_head *p, *n;
+
+ cancel_delayed_work_sync(&waker->waker_work);
+ rpal_wake_all(waker);
+ spin_lock_irqsave(&waker->lock, flags);
+ list_for_each_safe(p, n, &waker->wake_head) {
+ list_del_init(p);
+ }
+ INIT_LIST_HEAD(&waker->wake_head);
+ spin_unlock_irqrestore(&waker->lock, flags);
+ pr_debug("rpal debug: [%d] disable waker\n", current->pid);
+}
+
static inline unsigned long calculate_base_address(int id)
{
return RPAL_ADDRESS_SPACE_LOW + RPAL_ADDR_SPACE_SIZE * id;
@@ -213,6 +306,10 @@ struct rpal_service *rpal_register_service(void)
rs->pku_on = PKU_ON_FALSE;
rpal_service_pku_init();
#endif
+ spin_lock_init(&rs->waker.lock);
+ INIT_LIST_HEAD(&rs->waker.wake_head);
+ /* a receiver may miss a wakeup during a lazy switch; retry it from the waker later */
+ rpal_enable_waker(&rs->waker);

rs->bad_service = false;
rs->base = calculate_base_address(rs->id);
@@ -257,6 +354,7 @@ void rpal_unregister_service(struct rpal_service *rs)
schedule();

delete_service(rs);
+ rpal_disable_waker(&rs->waker);

pr_debug("rpal: unregister service, id: %d, tgid: %d\n", rs->id,
rs->group_leader->tgid);
diff --git a/arch/x86/rpal/thread.c b/arch/x86/rpal/thread.c
index fcc592baaac0..51c9eec639cb 100644
--- a/arch/x86/rpal/thread.c
+++ b/arch/x86/rpal/thread.c
@@ -186,6 +186,8 @@ int rpal_register_receiver(unsigned long addr)
current->rpal_rd = rrd;
rpal_set_current_thread_flag(RPAL_RECEIVER_BIT);

+ rpal_insert_wake_list(cur, rrd);
+
atomic_inc(&cur->thread_cnt);

return 0;
@@ -214,6 +216,7 @@ int rpal_unregister_receiver(void)
clear_fs_tsk_map();

rpal_put_shared_page(rrd->rsp);
+ rpal_remove_wake_list(cur, rrd);
rpal_clear_current_thread_flag(RPAL_RECEIVER_BIT);
rpal_free_thread_pending(&rrd->rcd);
kfree(rrd);
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 16a3c80383f7..1d8c1bdc90f2 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -116,6 +116,7 @@ enum rpal_task_flag_bits {
RPAL_RECEIVER_BIT,
RPAL_CPU_LOCKED_BIT,
RPAL_LAZY_SWITCHED_BIT,
+ RPAL_WAKE_BIT,
};

enum rpal_receiver_state {
@@ -189,6 +190,12 @@ struct rpal_fsbase_tsk_map {
struct task_struct *tsk;
};

+struct rpal_waker_struct {
+ spinlock_t lock;
+ struct list_head wake_head;
+ struct delayed_work waker_work;
+};
+
/*
* Each RPAL process (a.k.a RPAL service) should have a pointer to
* struct rpal_service in all its tasks' task_struct.
@@ -255,6 +262,9 @@ struct rpal_service {
int pkey;
#endif

+ /* receiver thread waker */
+ struct rpal_waker_struct waker;
+
/* delayed service put work */
struct delayed_work delayed_put_work;

@@ -347,6 +357,7 @@ struct rpal_receiver_data {
struct fd f;
struct hrtimer_sleeper ep_sleeper;
wait_queue_entry_t ep_wait;
+ struct list_head wake_list;
};

struct rpal_sender_data {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 486d59bdd3fc..c219ada29d34 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3943,6 +3943,7 @@ static bool rpal_check_state(struct task_struct *p)
struct rpal_receiver_call_context *rcc = p->rpal_rd->rcc;
int state;

+ rpal_clear_task_thread_flag(p, RPAL_WAKE_BIT);
retry:
state = atomic_read(&rcc->receiver_state) & RPAL_RECEIVER_STATE_MASK;
switch (state) {
@@ -3957,6 +3958,7 @@ static bool rpal_check_state(struct task_struct *p)
case RPAL_RECEIVER_STATE_RUNNING:
break;
case RPAL_RECEIVER_STATE_CALL:
+ rpal_set_task_thread_flag(p, RPAL_WAKE_BIT);
ret = false;
break;
default:
@@ -4522,6 +4524,7 @@ int rpal_try_to_wake_up(struct task_struct *p)

BUG_ON(READ_ONCE(p->__state) == TASK_RUNNING);

+ rpal_clear_task_thread_flag(p, RPAL_WAKE_BIT);
scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
smp_mb__after_spinlock();
if (!ttwu_state_match(p, TASK_NORMAL, &success))
--
2.20.1


Return-Path: <linux-kernel+bounces-667904-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 7534B41E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:42:23 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id 25B391890ACF
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:42:14 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 3DD4F22F16E;
Fri, 30 May 2025 09:36:40 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="Wwt4uAd8"
Received: from mail-pj1-f49.google.com (mail-pj1-f49.google.com [209.85.216.49])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 31DA6220F30
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:36:37 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.216.49
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597799; cv=none; b=DXbQVX78YVyek/s3tqBPq+HgPJKO3l4lJd06hOLEFAyR3uOa1t3ocomBTQIR4KwCf9ugyVAyoCQbBsCtLDFoWFcQmEu8i3F9l08Am9xiML5/NNWj9LQc6vthDco3gNw2NK9c2t8Jasow8N1AR9pI+Q/Zf3BkPsWWWH7FBWrC3sE=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597799; c=relaxed/simple;
bh=Dtz1Aet7apWTXhLd7VJWKFnNYwGISZ8LO5wYY32y+Hg=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=PNiiZ49Z5SYThRjWy0JqIcYKnVGPru+aJwySUZKq/E7/KGNfmuCrO1Si0xctyX2zLfoLQ0v7JKioPJ/GLGFBeZXQXMLMfqgF9rYRyeTWmVAX6+Bfbf+7k4Vp/FMrEs+lb5S12XpONL2uQijrGSJzopHgdw7889bTCgryXzTuEfw=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=Wwt4uAd8; arc=none smtp.client-ip=209.85.216.49
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pj1-f49.google.com with SMTP id 98e67ed59e1d1-311d5fdf1f0so1705323a91.1
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:36:37 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597796; x=1749202596; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=PQ/D1sjhfYHEwDTh2b+Qyte458RPTLSFlphr3fZumJQ=;
b=Wwt4uAd8KbjRRSsRvich6T35PFHgOUS+Fouykni2GzUkaA8t/k3UTaCOLNa7A6hdWF
x1RPblBdLzzWC1AE4v87JHrM61KF9DVGbUK8DEpp1cUP/DUuDeLidudvWGuW07fFBQvo
XsLAUvimfdP6y7CO6hHio+NKe/+P42z+xBNiOHIBXkeee2PwtArHHOplLnjsJ9wNosUg
E01vyFZb748GdrTlxKUnbJTyKS8Zn6JhcvT8DxNNlYF0w5Z0r+UmrF7gj63i7iDyBOhz
BIAKWOhEjVr7wQQv+6F9Rr1vj4Y9ZuEuEEYS6k3Lpgm8CNT6pPZaykrXHWBUNyJ1Bb5e
7sXA==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597796; x=1749202596;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=PQ/D1sjhfYHEwDTh2b+Qyte458RPTLSFlphr3fZumJQ=;
b=aAlzonhj2hRf+6SXryDi8t4NsLqj6evbTjGyY4/5UP7sXgxTI9Qu9rxhHCJrGIdrj+
SKNcq5qiSt2+H7iSZ2pbcAKsm3vTHmkTYlzdkipzMGFg8GnAlywDCRBHJddKp3Rz/GtL
fc/YO0vVdf8MBesW3a85SlRW4DG0nXywhjQ7u4+pW9Sffk7jP/cQ7p/qfLkUqgnltCZy
PysNX8k/7mDDmBgQ7RO7iGHKSNCnHdKwEzbvCuTKG5H0fkk0HqmmlIrbrWBukLT//GWv
etYNPwJ4LZFbu3UQIVpvoh7NNNR364fDHpE72RiTsAaQkw0CK2aN6vRnFEcRXxe1JLED
PJ2Q==
X-Forwarded-Encrypted: i=1; AJvYcCXOY210lrmGfWj5ysAvoaDOZ7MjC6xHx2aEwcMZi9uIKDNp4dSXqeWptqQnBVgbfh1YjL7+b5rf7uK/s6A=@vger.kernel.org
X-Gm-Message-State: AOJu0Yz9iiGXS+SnmCVxdpMoQ7oQSr+WBJq02oJwkRBX6W9IiZdyRdCQ
xrtrL2su8jH1KG4twNHKYmsYchGTGksF4eII0wwaf73IiWcS21UKqV7PyVqCU9TSOVc=
X-Gm-Gg: ASbGnctg+lhn2ZODT0SZvEu4gv+8uBV8nCVlmTK6k472thUyHN3aduFsYTFBt06f4dC
RWkGtXrC1O8kZRsUMbzgUIQHWhmS6naLLuRp9TtzwolOSikIzwP1rYPcXk08bY3gb7Tx+2a3ZCx
/gpdpm7ymKLvLfZpOJQvz247Ljf5uWwZpfmigyQeU8cMnabzGeQgy/Tkh6FAV2D44psXg/mNp7v
GfF1dkf2BjfwTbYEHj8XYh0vXjsZoNkS+Wa4/wXlrMG6nASeBevUKMTH3rZkBm23giVKa7ExYyE
2YiPB9jHDqEJlkfwnRxlZVyzMcRmUuBxgXtSYu9hSBGhHEYBKYe8eewkamaV1Rk6TUniHvsRxWg
AQrFDxj8vD1doQZNPsDML
X-Google-Smtp-Source: AGHT+IE1J4YJ6wHA6b9CUB/dnKICEd+OlMDbXuOs/bJjQ/NWJ3rv4Cxh7kWsPUyPIoUfIb7mzNdLsw==
X-Received: by 2002:a17:90b:4f4d:b0:312:1cd7:b337 with SMTP id 98e67ed59e1d1-3125034a47amr1876977a91.5.1748597796457;
Fri, 30 May 2025 02:36:36 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.36.21
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:36:36 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 32/35] RPAL: fix unknown nmi on AMD CPU
Date: Fri, 30 May 2025 17:28:00 +0800
Message-Id: <fc9a95163b055235b1a5007753a131a7250a409b.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

During a lazy switch, the function event_sched_out() will be called. This
function deletes the perf event of the task being scheduled out, causing
the active_mask in cpu_hw_events to be cleared. In AMD's NMI handler, if
the bit corresponding to active_mask is not set, the CPU will not handle
the NMI event, ultimately triggering an unknown NMI error. Additionally,
event_sched_out() may call amd_pmu_wait_on_overflow(), leading to a busy
wait of up to 50us during lazy switch.

This patch adds two per_cpu variables. rpal_nmi_handle is set when a lazy
switch is performed from NMI context; if the NMI would otherwise be
reported as unknown, it is skipped instead. rpal_nmi is set before a lazy
switch and cleared after it, preventing the busy wait caused by
amd_pmu_wait_on_overflow().

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/events/amd/core.c | 14 ++++++++++++++
arch/x86/kernel/nmi.c | 20 ++++++++++++++++++++
arch/x86/rpal/core.c | 17 ++++++++++++++++-
3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index b20661b8621d..633a9ac4e77c 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -719,6 +719,10 @@ static void amd_pmu_wait_on_overflow(int idx)
}
}

+#ifdef CONFIG_RPAL
+DEFINE_PER_CPU(bool, rpal_nmi);
+#endif
+
static void amd_pmu_check_overflow(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -732,6 +736,11 @@ static void amd_pmu_check_overflow(void)
if (in_nmi())
return;

+#ifdef CONFIG_RPAL
+ if (this_cpu_read(rpal_nmi))
+ return;
+#endif
+
/*
* Check each counter for overflow and wait for it to be reset by the
* NMI if it has overflowed. This relies on the fact that all active
@@ -807,6 +816,11 @@ static void amd_pmu_disable_event(struct perf_event *event)
if (in_nmi())
return;

+#ifdef CONFIG_RPAL
+ if (this_cpu_read(rpal_nmi))
+ return;
+#endif
+
amd_pmu_wait_on_overflow(event->hw.idx);
}

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index be93ec7255bf..dd72b6d1c7f9 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -351,12 +351,23 @@ NOKPROBE_SYMBOL(unknown_nmi_error);

static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
+#ifdef CONFIG_RPAL
+DEFINE_PER_CPU(bool, rpal_nmi_handle);
+#endif

static noinstr void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;
bool b2b = false;
+#ifdef CONFIG_RPAL
+ bool rpal_handle = false;
+
+ if (__this_cpu_read(rpal_nmi_handle)) {
+ __this_cpu_write(rpal_nmi_handle, false);
+ rpal_handle = true;
+ }
+#endif

/*
* Back-to-back NMIs are detected by comparing the RIP of the
@@ -471,6 +482,15 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
*/
if (b2b && __this_cpu_read(swallow_nmi))
__this_cpu_add(nmi_stats.swallow, 1);
+#ifdef CONFIG_RPAL
+ /*
+ * A lazy switch may clear the bit in active_mask, causing
+ * the NMI event to go unhandled. This would lead to an
+ * unknown NMI error; skip the NMI here to avoid that.
+ */
+ else if (rpal_handle)
+ goto out;
+#endif
else
unknown_nmi_error(reason, regs);

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 6a22b9faa100..92281b557a6c 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -376,11 +376,26 @@ rpal_exception_context_switch(struct pt_regs *regs)
return next;
}

+DECLARE_PER_CPU(bool, rpal_nmi_handle);
+DECLARE_PER_CPU(bool, rpal_nmi);
__visible struct task_struct *rpal_nmi_context_switch(struct pt_regs *regs)
{
struct task_struct *next;

- next = rpal_kernel_context_switch(regs);
+ if (rpal_test_current_thread_flag(RPAL_LAZY_SWITCHED_BIT))
+ rpal_update_fsbase(regs);
+
+ next = rpal_misidentify();
+ if (unlikely(next != NULL)) {
+ next = rpal_fix_critical_section(next, regs);
+ if (next) {
+ __this_cpu_write(rpal_nmi_handle, true);
+ /* avoid wait in amd_pmu_check_overflow */
+ __this_cpu_write(rpal_nmi, true);
+ next = rpal_do_kernel_context_switch(next, regs);
+ __this_cpu_write(rpal_nmi, false);
+ }
+ }

return next;
}
--
2.20.1


Return-Path: <linux-kernel+bounces-667905-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 3585241E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:42:41 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id 402211886345
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:42:29 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 05D6B22E41D;
Fri, 30 May 2025 09:36:55 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="FU4Xbg5J"
Received: from mail-pg1-f170.google.com (mail-pg1-f170.google.com [209.85.215.170])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5BC7922A1EF
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:36:52 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.215.170
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597814; cv=none; b=Fv/hMe8PKkMhNyTI5O00KTZ3zvXyRhR7jLBjpvT5CcyZPmwbZ/H5O6SGKdM0dPOYQsG0TCNQ4ohUlf5G1L6mFbONTUIfRdjoVsuk/XCIkgHztbP7xBMgyeM1PfAKeNdYwkVza8TTA+sxgT6SwTAbPkEOwXZLLEubc5j+2cLJgjg=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597814; c=relaxed/simple;
bh=vd2C1pogQV46JqBd4R9eC8WzEgBwL6DtuQBJbThvcY0=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=Eqhh6pGm2FuiK8BDS+ZNGS8WnbibuWQWMzQrENS458oiLS1kkbqqP6qDGCZjl+PqjIeNqtIE30po1Jitgq8Q7k9i1koVRcr5q3lGeGTrC8HqmcMDxudPGzo1vIcLyh3SizmqltGvdh1Py+HFFupSG0skC/GE/MjCi+6REwvV0zY=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=FU4Xbg5J; arc=none smtp.client-ip=209.85.215.170
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pg1-f170.google.com with SMTP id 41be03b00d2f7-b2c4476d381so1642540a12.0
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:36:52 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597811; x=1749202611; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=9GWxjqZwkmZx5+2pv+QLtRl8XC71PyQccoyY/dIwUoU=;
b=FU4Xbg5JkXTsWoP2NVMq4T/VCUnkpyQejsiTEHcl2yure2y1mHWmP11exSYEnJdlhr
/FdE7deI2fUaaELz2MqqcsRNahTIaO9I5G1zsgunOCTERvqf58se7Ho9IIgJDjnYAxi2
T0TsibJyUU0LBKhdDjzJXKG7kkB+JQFLDHuReD4ILaEK0GDJSFiA+fa2tB3r8vTZUR/o
AGu+W8mrucLYHMnK6b1vbLdNWCa7l3DxnM50Tpq1jlH5jog99k7Vxn3tu36uVfDCA+sa
HnQw/Oz4L9eF55h0FXhU1zh/9gZRxIYBxVCFyNGqJZmBCq4jRmW/K+4vgzNbw8kuEo5X
Jm5g==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597811; x=1749202611;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=9GWxjqZwkmZx5+2pv+QLtRl8XC71PyQccoyY/dIwUoU=;
b=hNxHvdUNAbghhdVrTAy21aOXHUeQ1xOwluJ92P43wB3Q6BpZc0qmUJ7E4dXdpHPfvy
ObguWz9jv0z9rZbEAtYHRPhsfTHNkbY47ViYJTIhxsIw989o2nAXkNMv7/69tilQs478
mB6NUqY8xSF5mJ8NfoGdBNPTyeefWutFc/GMca0iLMqZdyOPXyCllEedfroi6VoVFiek
lzSj8hcLOEGIDcgAnxlYMMmkwCcfftuXmRD/YT4wW/KatJsAIKNPI4He0ZwCjtJb1e3U
ReXmxSeuaQxast8SQZTJGDPFqS1M4s0zx8O532UUArgRYZvbLhzJwpd0PfQ2SzgJdaqZ
6vfg==
X-Forwarded-Encrypted: i=1; AJvYcCUPCK7HgxMc1HXLcUPE0UGAgtJhz7wz05KLB4CcbwJcgA5Me59Xn+k5F/eKcYHAYk7mMUPPG0imn7OhhTQ=@vger.kernel.org
X-Gm-Message-State: AOJu0Yw6yn7WZJWf4XNtGpsfVs0zEk+bCdyFVY0Mdh5hvI2erzr1EFIT
FC7kwdzZaJx5zydAHnP+O6bBtppTNPAZUQ246lDr8hZBnxtDpslRAT5Q6W6SuyyVtJ0=
X-Gm-Gg: ASbGncukCnINC8LyxP+MZ8LnwfwyIO+5s7olYaKLEN4QyPdtowSnfQNlOQrB/Zgr3DN
8GU02p7mVo6RP2d4Qb+SIRWopH+VsMlBE4a4HF4zEfcsPgtf6mN7J5nVbNC4widDBIINBc1ldnf
JpR4WjUlMTtCqCXaZXgwxC+orsMCqJadIngsrcqAgbWLg/9HY1wKM1Z1tZxT8aySMSj+cEjkmYe
ppnYTYuIqekMJ5Xkh/iTAW/Uv4k+lo/2bx2u3fQ9Tl91HGjv/abBvJdaW0RXSmQSYcBppJ8+MKE
NpYcZEr2wx8mEauvOQljWGH/MyzyCjQ4UEwk3hn1GRGdtdAWx9/xm+DhQIJp9D20Vx2WIoZs0tu
EOcgTqZSVJA==
X-Google-Smtp-Source: AGHT+IEd6eqw3bYsn2JQr9oo7VNa3bHRe5ESaa02VpYg0eRHeS5gGe9cNS3vRZsm3BzNT8I7hOudjA==
X-Received: by 2002:a17:90b:5104:b0:302:fc48:4f0a with SMTP id 98e67ed59e1d1-3124446ce79mr4391987a91.0.1748597811484;
Fri, 30 May 2025 02:36:51 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.36.36
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:36:51 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 33/35] RPAL: enable time slice correction
Date: Fri, 30 May 2025 17:28:01 +0800
Message-Id: <8941a17e12edce00c1cc1c78f4dd3e1bf28e47c0.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

After an RPAL call, the receiver's user mode code executes. However, the
kernel incorrectly attributes this CPU time to the sender due to the
unchanged kernel context. This results in incorrect runtime statistics.

This patch adds a new member total_time to both rpal_sender_call_context
and rpal_receiver_call_context. This member tracks how much runtime
(measured in CPU cycles via rdtsc()) has been incorrectly accounted for.
The kernel measures total_time at the entry of __schedule() and corrects
the delta in the update_rq_clock_task() function.

Additionally, since RPAL calls occur in user space, runtime statistics are
typically calculated by user space. However, when a lazy switch happens,
the kernel takes over. To address this, the patch introduces a start_time
member to record when an RPAL call is initiated, enabling the kernel to
accurately calculate the runtime that needs correction.

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/rpal/core.c | 8 ++++++++
arch/x86/rpal/thread.c | 6 ++++++
include/linux/rpal.h | 3 +++
include/linux/sched.h | 1 +
init/init_task.c | 1 +
kernel/fork.c | 1 +
kernel/sched/core.c | 42 ++++++++++++++++++++++++++++++++++++++++++
7 files changed, 62 insertions(+)

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 92281b557a6c..2ac5d932f69c 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -144,6 +144,13 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
struct task_struct *prev = current;

if (rpal_test_task_thread_flag(next, RPAL_LAZY_SWITCHED_BIT)) {
+ struct rpal_receiver_call_context *rcc = next->rpal_rd->rcc;
+ struct rpal_sender_call_context *scc = current->rpal_sd->scc;
+ u64 slice = rdtsc_ordered() - scc->start_time;
+
+ rcc->total_time += slice;
+ scc->total_time += slice;
+
rpal_resume_ep(next);
current->rpal_sd->receiver = next;
rpal_lock_cpu(current);
@@ -169,6 +176,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
rpal_schedule(next);
rpal_clear_task_thread_flag(prev, RPAL_LAZY_SWITCHED_BIT);
prev->rpal_rd->sender = NULL;
+ next->rpal_sd->scc->start_time = rdtsc_ordered();
}
if (unlikely(!irqs_disabled())) {
local_irq_disable();
diff --git a/arch/x86/rpal/thread.c b/arch/x86/rpal/thread.c
index 51c9eec639cb..5cd0be631521 100644
--- a/arch/x86/rpal/thread.c
+++ b/arch/x86/rpal/thread.c
@@ -99,6 +99,8 @@ int rpal_register_sender(unsigned long addr)
rsd->scc = (struct rpal_sender_call_context *)(addr - rsp->user_start +
rsp->kernel_start);
rsd->receiver = NULL;
+ rsd->scc->start_time = 0;
+ rsd->scc->total_time = 0;

current->rpal_sd = rsd;
rpal_set_current_thread_flag(RPAL_SENDER_BIT);
@@ -182,6 +184,7 @@ int rpal_register_receiver(unsigned long addr)
(struct rpal_receiver_call_context *)(addr - rsp->user_start +
rsp->kernel_start);
rrd->sender = NULL;
+ rrd->rcc->total_time = 0;

current->rpal_rd = rrd;
rpal_set_current_thread_flag(RPAL_RECEIVER_BIT);
@@ -289,6 +292,9 @@ int rpal_rebuild_sender_context_on_fault(struct pt_regs *regs,
rpal_pkey_to_pkru(rpal_current_service()->pkey),
RPAL_PKRU_SET);
#endif
+ if (!rpal_is_correct_address(rpal_current_service(), regs->ip))
+ /* receiver has crashed */
+ scc->total_time += rdtsc_ordered() - scc->start_time;
return 0;
}
}
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 1d8c1bdc90f2..f5f4da63f28c 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -310,6 +310,7 @@ struct rpal_receiver_call_context {
void __user *events;
int maxevents;
int timeout;
+ int64_t total_time;
};

/* recovery point for sender */
@@ -325,6 +326,8 @@ struct rpal_sender_call_context {
struct rpal_task_context rtc;
struct rpal_error_context ec;
int sender_id;
+ s64 start_time;
+ s64 total_time;
};

/* End */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5f25cc09fb71..a03113fecdc5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1663,6 +1663,7 @@ struct task_struct {
struct rpal_sender_data *rpal_sd;
struct rpal_receiver_data *rpal_rd;
};
+ s64 rpal_steal_time;
#endif

/* CPU-specific state of this task: */
diff --git a/init/init_task.c b/init/init_task.c
index 2eb08b96e66b..3606cf701dfe 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -224,6 +224,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.rpal_rs = NULL,
.rpal_flag = 0,
.rpal_cd = NULL,
+ .rpal_steal_time = 0,
#endif
};
EXPORT_SYMBOL(init_task);
diff --git a/kernel/fork.c b/kernel/fork.c
index 11cba74d07c8..ff6331a28987 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1222,6 +1222,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->rpal_rs = NULL;
tsk->rpal_flag = 0;
tsk->rpal_cd = NULL;
+ tsk->rpal_steal_time = 0;
#endif
return tsk;

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c219ada29d34..d6f8e0d76fc0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -789,6 +789,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
delta -= steal;
}
#endif
+#ifdef CONFIG_RPAL
+ if (unlikely(current->rpal_steal_time != 0)) {
+ delta += current->rpal_steal_time;
+ if (unlikely(delta < 0))
+ delta = 0;
+ current->rpal_steal_time = 0;
+ }
+#endif

rq->clock_task += delta;

@@ -6872,6 +6880,36 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
return true;
}

+#ifdef CONFIG_RPAL
+static void rpal_acct_runtime(void)
+{
+ if (rpal_current_service()) {
+ if (rpal_test_task_thread_flag(current, RPAL_SENDER_BIT) &&
+ current->rpal_sd->scc->total_time != 0) {
+ struct rpal_sender_call_context *scc =
+ current->rpal_sd->scc;
+
+ u64 slice =
+ native_sched_clock_from_tsc(scc->total_time) -
+ native_sched_clock_from_tsc(0);
+ current->rpal_steal_time -= slice;
+ scc->total_time = 0;
+ } else if (rpal_test_task_thread_flag(current,
+ RPAL_RECEIVER_BIT) &&
+ current->rpal_rd->rcc->total_time != 0) {
+ struct rpal_receiver_call_context *rcc =
+ current->rpal_rd->rcc;
+
+ u64 slice =
+ native_sched_clock_from_tsc(rcc->total_time) -
+ native_sched_clock_from_tsc(0);
+ current->rpal_steal_time += slice;
+ rcc->total_time = 0;
+ }
+ }
+}
+#endif
+
/*
* __schedule() is the main scheduler function.
*
@@ -6926,6 +6964,10 @@ static void __sched notrace __schedule(int sched_mode)
struct rq *rq;
int cpu;

+#ifdef CONFIG_RPAL
+ rpal_acct_runtime();
+#endif
+
trace_sched_entry_tp(preempt, CALLER_ADDR0);

cpu = smp_processor_id();
--
2.20.1


Return-Path: <linux-kernel+bounces-667906-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from sv.mirrors.kernel.org (sv.mirrors.kernel.org [139.178.88.99])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id D1C7441E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:42:51 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by sv.mirrors.kernel.org (Postfix) with ESMTPS id 45AEC3B4DBB
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:42:05 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 4BB5F22A1EF;
Fri, 30 May 2025 09:37:10 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="bQw0UddR"
Received: from mail-pj1-f52.google.com (mail-pj1-f52.google.com [209.85.216.52])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 89982220F5F
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:37:07 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.216.52
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597829; cv=none; b=hB67eKZgTkweJjyjtonfxgytjRD7s9oVPGr1Ffz2vEpjZgsM3c7crCl0aFwDbYX14PzN/xde4VOXOTT7ScuGJuILAPVr/Z8xdsFegVXw1pSTbLjGGfNK4CBJH+lUqdCiWN6p3etY1eVkKJxphNs1VGUlvRPqKvV1GuA2cVPJY/E=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597829; c=relaxed/simple;
bh=OO8xU47yO/bdoL2deRtc+JSUSQ1CGyj22+/NieuKsMM=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=bVPZQ/nQpif/uKs1qu8pexlQ8+1hEIX8bz8pBqWROsoq291OWPc80giT6NrENawpsBTdfrN8Pxcdy0bOVOhT6nNkynp5h6CaS+HBmROL18uuOsc3tcAIQ0ny98B946MMpawIAFlJBzcaVU5eUqyRm9uJn0b6cq7n2ivPYvaoA2Y=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=bQw0UddR; arc=none smtp.client-ip=209.85.216.52
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pj1-f52.google.com with SMTP id 98e67ed59e1d1-311d5fdf1f0so1705684a91.1
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:37:07 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597827; x=1749202627; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=Ff5bIYNqY4yUnollIgoWVZucxS5phnK+GSArqqp8njA=;
b=bQw0UddRSd0zodJFoDpQPUYtbH1/vf+TrFM/oYUAFXfqB280B0OrDjvwoGgX/58GYZ
j4hvdO+XitJo8fkmEr2ugoXnozUApYaq5qz1miA5ra+AYe749tpVZer6npWBXRkke9rl
4J9pA24qUUSFt3yjLUYeKiqaXphcZbGmGr4E2mI/FIppeSvb1doRHrPpaGvzlfwxVuko
CH3ZTO94pN8EXwoy2vai6ATQnh+0Ijh8Vsj3fLcmjpHubwMPj+kp8l1Xp8olIg2p8bxo
JmRg7UYZYAOyDt7wqRwOKkr9jUx6FV0s/w2/PQAYB/BKmaJbzssmmI2i9aOBaStyjc9s
tz5w==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597827; x=1749202627;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=Ff5bIYNqY4yUnollIgoWVZucxS5phnK+GSArqqp8njA=;
b=h4jikp4J+Mij4bzN0xuFGnQBI1U3pLg1U8Sfjpmxd4qBF2wcnqdEyKEUX3ZqttCJ+r
qJqWoHe/fPo/bhfDnjVVw093ZJ78Y3hePh7y3BRQGRWcT6+fJfjd4WHH6cAieDOY2qhs
kI+M1BFudiEyH9GAzFAuX0AOfLZbt4GmSjvZxyCRd/nfqb73mNd9hsFnfMC2sfDd+11u
WP7yz/aLPkKv1/6cWkFGAryNLeMzpy43mxk1WULr0SVGYhKr94TwmNHo838jCsmF8/Lv
3Ih6DCkp285VeMIm5QxDNiNZsWdVJK5vc0oC2/rbcDB2EFLwpGEi0kmSmH7ywE88kWz9
UKiQ==
X-Forwarded-Encrypted: i=1; AJvYcCWoB6SejjtbtoPXZCSHBlQ537lofXUeg6w82B+kKjpSlp1eechuyNXrt/5yJhzf7JqFxbNNZo+khKL1eSI=@vger.kernel.org
X-Gm-Message-State: AOJu0Yx5nSTY+jFtSKtJRkix7lkU31BgdlTaSDk8WtyZW4rDhXOQsMho
Y1iwL/qN3U2p+oc/FUUFOsh5pkHT7+QjOF1UwkjJITwcWKkmF/MzdANyu+w7V6qGLOA=
X-Gm-Gg: ASbGnctaQ16oO33ikc/wT2yAzcCs7OvKNbKZlWba0kxsa5BqK3QGvLQo3v69DjKWUfI
Nlw0P406OY2RFqzQhOrxA1ZocYrvqCQaJzY7Jilw0dMit9Gv/HpDyFT184wNxIzQbqBNmhW2nOw
HRm5cOSkopIkKR27iIWBgAOZ4FHDi72n9ojxu4WPIKXQ+R6G0U81R3kh/z60ZmUb11BnJVIF6u1
rP02ddueL/HhJ69V4jb+Nj8bXlOa/b+Szf6H//BSiOd6gw54jT3bahFo1/k0Bh2A/72VkHGFCUp
vV9ezFB2vQ6Jsqq2zrYQeuYksmitvnpeHip1fpEwug6oOKoBYWs9kE3dczlu0tcKUS1wNeRJsGy
hItd2K9VuSQ==
X-Google-Smtp-Source: AGHT+IGPAReH+gj/mAHjw8Fc+v5rLzKHU0DKOTk3FhYthS79DtKAel685Vp30oKSxbQ9599VbyBn4A==
X-Received: by 2002:a17:90b:4d:b0:311:be51:bdec with SMTP id 98e67ed59e1d1-3125036326fmr2501710a91.11.1748597826688;
Fri, 30 May 2025 02:37:06 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.36.51
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:37:06 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 34/35] RPAL: enable fast epoll wait
Date: Fri, 30 May 2025 17:28:02 +0800
Message-Id: <b13520ef51366f6c25c50f05de7210d37fcd9489.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

When a kernel event occurs during an RPAL call and triggers a lazy switch,
the kernel context switches from the sender to the receiver. When the
receiver later returns from user space to the sender, a second lazy switch
is required to switch the kernel context back to the sender. In the current
implementation, after the second lazy switch, the receiver returns to user
space via rpal_kernel_ret() and then calls epoll_wait() from user space to
re-enter the kernel. This causes the receiver to be unable to process epoll
events for a long period, degrading performance.

This patch introduces a fast epoll wait feature. During the second lazy
switch, the kernel configures epoll-related data structures so that the
receiver can directly enter the epoll wait state without first returning
to user space and then calling epoll_wait(). The patch adds a new state
RPAL_RECEIVER_STATE_READY_LS, which is used to mark that the receiver can
transition to RPAL_RECEIVER_STATE_WAIT during the second lazy switch. The
kernel then performs this state transition in rpal_lazy_switch_tail().

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
arch/x86/rpal/core.c | 29 ++++++++++++-
fs/eventpoll.c | 101 +++++++++++++++++++++++++++++++++++++++++++
include/linux/rpal.h | 3 ++
kernel/sched/core.c | 13 +++++-
4 files changed, 143 insertions(+), 3 deletions(-)

diff --git a/arch/x86/rpal/core.c b/arch/x86/rpal/core.c
index 2ac5d932f69c..7b6efde23e48 100644
--- a/arch/x86/rpal/core.c
+++ b/arch/x86/rpal/core.c
@@ -51,7 +51,25 @@ void rpal_lazy_switch_tail(struct task_struct *tsk)
atomic_cmpxchg(&rcc->receiver_state, rpal_build_call_state(tsk->rpal_sd),
RPAL_RECEIVER_STATE_LAZY_SWITCH);
} else {
+ /* tsk is receiver */
+ int state;
+
+ rcc = tsk->rpal_rd->rcc;
+ state = atomic_read(&rcc->receiver_state);
+ /* receiver may be scheduled on another cpu after unlock. */
rpal_unlock_cpu(tsk);
+ /*
+ * We must not use RPAL_RECEIVER_STATE_READY instead of
+ * RPAL_RECEIVER_STATE_READY_LS. As the receiver may be in
+ * the TASK_RUNNING state and then call epoll_wait() again,
+ * the state may become RPAL_RECEIVER_STATE_READY; we must
+ * not change that state to RPAL_RECEIVER_STATE_WAIT, since
+ * it was set by another RPAL call.
+ */
+ if (state == RPAL_RECEIVER_STATE_READY_LS)
+ atomic_cmpxchg(&rcc->receiver_state,
+ RPAL_RECEIVER_STATE_READY_LS,
+ RPAL_RECEIVER_STATE_WAIT);
rpal_unlock_cpu(current);
}
}
@@ -63,8 +81,14 @@ void rpal_kernel_ret(struct pt_regs *regs)
int state;

if (rpal_test_current_thread_flag(RPAL_RECEIVER_BIT)) {
- rcc = current->rpal_rd->rcc;
- regs->ax = rpal_try_send_events(current->rpal_rd->ep, rcc);
+ struct rpal_receiver_data *rrd = current->rpal_rd;
+
+ rcc = rrd->rcc;
+ if (rcc->timeout > 0)
+ hrtimer_cancel(&rrd->ep_sleeper.timer);
+ rpal_remove_ep_wait_list(rrd);
+ regs->ax = rpal_try_send_events(rrd->ep, rcc);
+ fdput(rrd->f);
atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_KERNEL_RET);
} else {
tsk = current->rpal_sd->receiver;
@@ -173,6 +197,7 @@ rpal_do_kernel_context_switch(struct task_struct *next, struct pt_regs *regs)
* Otherwise, sender's user context will be corrupted.
*/
rebuild_receiver_stack(current->rpal_rd, regs);
+ rpal_fast_ep_poll(current->rpal_rd, regs);
rpal_schedule(next);
rpal_clear_task_thread_flag(prev, RPAL_LAZY_SWITCHED_BIT);
prev->rpal_rd->sender = NULL;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 791321639561..b70c1cd82335 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2143,6 +2143,107 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
}

#ifdef CONFIG_RPAL
+static void *rpal_get_eventpoll(struct rpal_receiver_data *rrd, struct pt_regs *regs)
+{
+ struct rpal_receiver_call_context *rcc = rrd->rcc;
+ int epfd = rcc->epfd;
+ struct epoll_event __user *events = rcc->events;
+ int maxevents = rcc->maxevents;
+ struct file *file;
+
+ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) {
+ regs->ax = -EINVAL;
+ return NULL;
+ }
+
+ if (!access_ok(events, maxevents * sizeof(struct epoll_event))) {
+ regs->ax = -EFAULT;
+ return NULL;
+ }
+
+ rrd->f = fdget(epfd);
+ file = fd_file(rrd->f);
+ if (!file) {
+ regs->ax = -EBADF;
+ return NULL;
+ }
+
+ if (!is_file_epoll(file)) {
+ regs->ax = -EINVAL;
+ fdput(rrd->f);
+ return NULL;
+ }
+
+ rrd->ep = file->private_data;
+ return rrd->ep;
+}
+
+void rpal_fast_ep_poll(struct rpal_receiver_data *rrd, struct pt_regs *regs)
+{
+ struct eventpoll *ep;
+ struct rpal_receiver_call_context *rcc = rrd->rcc;
+ ktime_t ts = 0;
+ struct hrtimer *ht = &rrd->ep_sleeper.timer;
+ int state;
+ int avail;
+
+ regs->orig_ax = __NR_epoll_wait;
+ ep = rpal_get_eventpoll(rrd, regs);
+
+ if (!ep || signal_pending(current) ||
+ unlikely(ep_events_available(ep)) ||
+ atomic_read(&rcc->ep_pending) || unlikely(rcc->timeout == 0)) {
+ INIT_LIST_HEAD(&rrd->ep_wait.entry);
+ } else {
+ /*
+ * Here we use RPAL_RECEIVER_STATE_READY_LS to avoid a conflict
+ * with RPAL_RECEIVER_STATE_READY. As RPAL_RECEIVER_STATE_READY_LS
+ * is converted to RPAL_RECEIVER_STATE_WAIT in rpal_lazy_switch_tail(),
+ * it is possible that the receiver is woken at that time. Thus,
+ * rpal_lazy_switch_tail() must figure out whether the receiver
+ * state was set by the lazy switch or not. See rpal_lazy_switch_tail()
+ * for details.
+ */
+ state = atomic_xchg(&rcc->receiver_state, RPAL_RECEIVER_STATE_READY_LS);
+ if (unlikely(state != RPAL_RECEIVER_STATE_LAZY_SWITCH))
+ rpal_err("%s: unexpected state: %d\n", __func__, state);
+ init_waitqueue_func_entry(&rrd->ep_wait, rpal_ep_autoremove_wake_function);
+ rrd->ep_wait.private = rrd;
+ INIT_LIST_HEAD(&rrd->ep_wait.entry);
+ write_lock(&ep->lock);
+ set_current_state(TASK_INTERRUPTIBLE);
+ avail = ep_events_available(ep);
+ if (!avail)
+ __add_wait_queue_exclusive(&ep->wq, &rrd->ep_wait);
+ write_unlock(&ep->lock);
+ if (avail) {
+ /* keep state consistent when we enter rpal_kernel_ret() */
+ atomic_set(&rcc->receiver_state,
+ RPAL_RECEIVER_STATE_LAZY_SWITCH);
+ set_current_state(TASK_RUNNING);
+ return;
+ }
+
+ if (rcc->timeout > 0) {
+ rrd->ep_sleeper.task = rrd->rcd.bp_task;
+ ts = ms_to_ktime(rcc->timeout);
+ hrtimer_start(ht, ts, HRTIMER_MODE_REL);
+ }
+ }
+}
+
+void rpal_remove_ep_wait_list(struct rpal_receiver_data *rrd)
+{
+ struct eventpoll *ep = (struct eventpoll *)rrd->ep;
+ wait_queue_entry_t *wait = &rrd->ep_wait;
+
+ if (!list_empty_careful(&wait->entry)) {
+ write_lock_irq(&ep->lock);
+ __remove_wait_queue(&ep->wq, wait);
+ write_unlock_irq(&ep->lock);
+ }
+}
+
void *rpal_get_epitemep(wait_queue_entry_t *wait)
{
struct epitem *epi = ep_item_from_wait(wait);
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index f5f4da63f28c..676113f0ba1f 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -126,6 +126,7 @@ enum rpal_receiver_state {
RPAL_RECEIVER_STATE_WAIT,
RPAL_RECEIVER_STATE_CALL,
RPAL_RECEIVER_STATE_LAZY_SWITCH,
+ RPAL_RECEIVER_STATE_READY_LS,
RPAL_RECEIVER_STATE_MAX,
};

@@ -627,4 +628,6 @@ void rpal_resume_ep(struct task_struct *tsk);
void *rpal_get_epitemep(wait_queue_entry_t *wait);
int rpal_get_epitemfd(wait_queue_entry_t *wait);
int rpal_try_send_events(void *ep, struct rpal_receiver_call_context *rcc);
+void rpal_remove_ep_wait_list(struct rpal_receiver_data *rrd);
+void rpal_fast_ep_poll(struct rpal_receiver_data *rrd, struct pt_regs *regs);
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d6f8e0d76fc0..1728b04d1387 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3965,6 +3965,11 @@ static bool rpal_check_state(struct task_struct *p)
case RPAL_RECEIVER_STATE_LAZY_SWITCH:
case RPAL_RECEIVER_STATE_RUNNING:
break;
+ /*
+ * Allowing a task in RPAL_RECEIVER_STATE_READY_LS to be woken
+ * would cause irqs to be enabled in rpal_unlock_cpu().
+ */
+ case RPAL_RECEIVER_STATE_READY_LS:
case RPAL_RECEIVER_STATE_CALL:
rpal_set_task_thread_flag(p, RPAL_WAKE_BIT);
ret = false;
@@ -11403,7 +11408,13 @@ void __sched notrace rpal_schedule(struct task_struct *next)

prev_state = READ_ONCE(prev->__state);
if (prev_state) {
- try_to_block_task(rq, prev, &prev_state);
+ if (!try_to_block_task(rq, prev, &prev_state)) {
+ /*
+ * As the task enters the TASK_RUNNING state, we should clean up
+ * the RPAL_RECEIVER_STATE_READY_LS status.
+ */
+ rpal_check_ready_state(prev, RPAL_RECEIVER_STATE_READY_LS);
+ }
switch_count = &prev->nvcsw;
}

--
2.20.1


Return-Path: <linux-kernel+bounces-667908-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from ny.mirrors.kernel.org (ny.mirrors.kernel.org [147.75.199.223])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 786E241E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:43:12 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by ny.mirrors.kernel.org (Postfix) with ESMTPS id 03B81177DC6
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:42:58 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 9948F22F76C;
Fri, 30 May 2025 09:37:31 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="F3eRGqSf"
Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2779B22B598
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:37:27 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.133.124
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597850; cv=none; b=RwEil6jpU3BIgroSv188AcECm//l4RZp375RDnjJ926TjtOI+Ou820kx713f1EZMiSaF4mCiSUEroxpMPbIh/IlKXziWq3fCOkiVRGyKngPjXwcZwCh27nRXiry7W34RPmUdimS/aNkEh1pkgudXlXQRS47lydud38ty8u4Wc+0=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597850; c=relaxed/simple;
bh=Vo15I+t4ldDz+udo0/J6K2wEPwd7uOm20jAwOh5675U=;
h=Message-ID:Date:MIME-Version:Subject:To:Cc:References:From:
In-Reply-To:Content-Type; b=FIXjLdBZ2VdfnnkxRFjo6SQ/Xl0HSwMmGrY0uAO2prcakFren3JGCBywCt0WceS4OBMLc0/xZnyL5ROlFMqqyhDD6MOY3zz/zUYv0HYAd5LZTihzfZUikA1li0dSRH3+aTfwJ70FhtcA4gpGUUVFd20FxCXSi/DnUMObQYbKXfI=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=F3eRGqSf; arc=none smtp.client-ip=170.10.133.124
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=redhat.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com;
s=mimecast20190719; t=1748597847;
h=from:from:reply-to:subject:subject:date:date:message-id:message-id:
to:to:cc:cc:mime-version:mime-version:content-type:content-type:
content-transfer-encoding:content-transfer-encoding:
in-reply-to:in-reply-to:references:references:autocrypt:autocrypt;
bh=yhyTHW9/iNSmJ3o1/+Kc2KlH+vX1ja7mHJ+mDxpkv9E=;
b=F3eRGqSf/naq58IBMHrYtrb93TuXz2tHgKs5FOHBLNuv8r89hCK1Km5ApOAl0yWYRGmMKz
JVQClIFiiHtw1veKrCx/07QzVPbyblJiLbS3KEbubfRtG1nRnjkOe7keuNTfgwZqYpycNU
RXsk0x37kJcCr856Vm21+g1AtfR9I3o=
Received: from mail-wm1-f69.google.com (mail-wm1-f69.google.com
[209.85.128.69]) by relay.mimecast.com with ESMTP with STARTTLS
(version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id
us-mta-502-aix-I5LcPzmrRjWTfEQF4A-1; Fri, 30 May 2025 05:37:25 -0400
X-MC-Unique: aix-I5LcPzmrRjWTfEQF4A-1
X-Mimecast-MFC-AGG-ID: aix-I5LcPzmrRjWTfEQF4A_1748597845
Received: by mail-wm1-f69.google.com with SMTP id 5b1f17b1804b1-450cb8ff0c6so9515095e9.3
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:37:25 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597844; x=1749202644;
h=content-transfer-encoding:in-reply-to:organization:autocrypt
:content-language:from:references:cc:to:subject:user-agent
:mime-version:date:message-id:x-gm-message-state:from:to:cc:subject
:date:message-id:reply-to;
bh=yhyTHW9/iNSmJ3o1/+Kc2KlH+vX1ja7mHJ+mDxpkv9E=;
b=nAewBizHyan6xd3jJOlnZKiZ3NnfZmU3mag/duvQN7voggA4YpOuhB5eG1eBurlImm
WPXKqpPq47LDUFDRoP5pdOMfukzRvEfV3ikFXyryHzNA28nlPXO7Y7KPloA2xrbYen59
CnwPiUaI06yZk/lOQaJNZg7qpTM/deQdYldmxcZuI63WxWt/DXOEz+2vmZ0JBJYlDZyq
R//+nLBbRbAF70iByiQM0dC+n0Pn6tUrocyDE0Vi8qr7KFGiI2GAWLz9xnFn7UK31XO+
Nnd7oTXPsgDMeQxh/SyEHuiJd+K9qu1N81IoT7Nw4yA7kiz4qXNACgpBgPqqZWWkXjoe
L9rw==
X-Forwarded-Encrypted: i=1; AJvYcCUJ78I9/QdXBlyIOj79OrWa/VaPC/sq/goDAQPIYgz8VPlII5odGalVJscc0ufWmK7Ebi9SlCYEh+1Cd1w=@vger.kernel.org
X-Gm-Message-State: AOJu0Yx6oZXhUFRrFwVn6922m7/m8HnzJFiPU808TuamKryUaMktZtvg
85euguSryvZVsZjxW86vbBySn4sLp3MBz247NKIeH3AIUxcp5toFqXbSbk9hRTIJQh4AN1K879G
RKReleJHhtM6QTO79U7PudwctkEhu+qYGqIwQTZdfenfkd4EmX7EbpmaX2GNokxPYTA==
X-Gm-Gg: ASbGncsAAiDiXMdL70BzdQ3KuWdoHyaR7ibAgdoaewge+gy07X68C1JoVAkTwZbUcSz
WLUTfqt6Rc7bJLOCeeQeDEiz9nfpjO3/QTGiQVChlJbaZYwVPhnC7jS2pQf9fB1DimQ2B3tY4IT
t0dZGtwHXlAnSKTxjE92gVOmJIMogM/nTT8Y33OxWakqIR/abc0tcU4fh+OMw2syY2tFfvjbfK9
fYfOGHx7l5UqMHGi2fFqZVwCIdPEH61kXrpY8Luft8drekpKf6h4H0deRlZOStvxkCpXK7k9+xN
WlxWMofh8ldgfCSMuHbl+m0zPT+lC2jr0tinyYikSpwrOTL0gbTXFYa/F8EI9gC5Ls1+cNkHR+E
ScMgMhAd2foPK8kG8zS7RyX/j7PVNYSV+yargbqk=
X-Received: by 2002:a05:600c:6207:b0:43c:ea36:9840 with SMTP id 5b1f17b1804b1-450d885e38amr12675375e9.22.1748597844509;
Fri, 30 May 2025 02:37:24 -0700 (PDT)
X-Google-Smtp-Source: AGHT+IGccFKRFTCgkkT3ae/QFUuhHOQzucrbv9tCboH5VNdb/K0MlBO6YnvTUAz5zFzNoWT7UAvZNw==
X-Received: by 2002:a05:600c:6207:b0:43c:ea36:9840 with SMTP id 5b1f17b1804b1-450d885e38amr12675135e9.22.1748597844047;
Fri, 30 May 2025 02:37:24 -0700 (PDT)
Received: from ?IPV6:2003:d8:2f03:5b00:f549:a879:b2d3:73ee? (p200300d82f035b00f549a879b2d373ee.dip0.t-ipconnect.de. [2003:d8:2f03:5b00:f549:a879:b2d3:73ee])
by smtp.gmail.com with ESMTPSA id 5b1f17b1804b1-450d800671csm12953125e9.30.2025.05.30.02.37.21
(version=TLS1_3 cipher=TLS_AES_128_GCM_SHA256 bits=128/128);
Fri, 30 May 2025 02:37:22 -0700 (PDT)
Message-ID: <371b8fdd-129d-4fe3-bbc7-f0a1bc433b30@xxxxxxxxxx>
Date: Fri, 30 May 2025 11:37:21 +0200
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
User-Agent: Mozilla Thunderbird
Subject: Re: [PATCH 02/12] mm: Convert pXd_devmap checks to vma_is_dax
To: Alistair Popple <apopple@xxxxxxxxxx>, linux-mm@xxxxxxxxx
Cc: gerald.schaefer@xxxxxxxxxxxxx, dan.j.williams@xxxxxxxxx, jgg@xxxxxxxx,
willy@xxxxxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx, nvdimm@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx, linux-ext4@xxxxxxxxxxxxxxx,
linux-xfs@xxxxxxxxxxxxxxx, jhubbard@xxxxxxxxxx, hch@xxxxxx,
zhang.lyra@xxxxxxxxx, debug@xxxxxxxxxxxx, bjorn@xxxxxxxxxx,
balbirs@xxxxxxxxxx, lorenzo.stoakes@xxxxxxxxxx,
linux-arm-kernel@xxxxxxxxxxxxxxxxxxx, loongarch@xxxxxxxxxxxxxxx,
linuxppc-dev@xxxxxxxxxxxxxxxx, linux-riscv@xxxxxxxxxxxxxxxxxxx,
linux-cxl@xxxxxxxxxxxxxxx, dri-devel@xxxxxxxxxxxxxxxxxxxxx, John@xxxxxxxxxx
References: <cover.541c2702181b7461b84f1a6967a3f0e823023fcc.1748500293.git-series.apopple@xxxxxxxxxx>
<224f0265027a9578534586fa1f6ed80270aa24d5.1748500293.git-series.apopple@xxxxxxxxxx>
From: David Hildenbrand <david@xxxxxxxxxx>
Content-Language: en-US
Autocrypt: addr=david@xxxxxxxxxx; keydata=
xsFNBFXLn5EBEAC+zYvAFJxCBY9Tr1xZgcESmxVNI/0ffzE/ZQOiHJl6mGkmA1R7/uUpiCjJ
dBrn+lhhOYjjNefFQou6478faXE6o2AhmebqT4KiQoUQFV4R7y1KMEKoSyy8hQaK1umALTdL
QZLQMzNE74ap+GDK0wnacPQFpcG1AE9RMq3aeErY5tujekBS32jfC/7AnH7I0v1v1TbbK3Gp
XNeiN4QroO+5qaSr0ID2sz5jtBLRb15RMre27E1ImpaIv2Jw8NJgW0k/D1RyKCwaTsgRdwuK
Kx/Y91XuSBdz0uOyU/S8kM1+ag0wvsGlpBVxRR/xw/E8M7TEwuCZQArqqTCmkG6HGcXFT0V9
PXFNNgV5jXMQRwU0O/ztJIQqsE5LsUomE//bLwzj9IVsaQpKDqW6TAPjcdBDPLHvriq7kGjt
WhVhdl0qEYB8lkBEU7V2Yb+SYhmhpDrti9Fq1EsmhiHSkxJcGREoMK/63r9WLZYI3+4W2rAc
UucZa4OT27U5ZISjNg3Ev0rxU5UH2/pT4wJCfxwocmqaRr6UYmrtZmND89X0KigoFD/XSeVv
jwBRNjPAubK9/k5NoRrYqztM9W6sJqrH8+UWZ1Idd/DdmogJh0gNC0+N42Za9yBRURfIdKSb
B3JfpUqcWwE7vUaYrHG1nw54pLUoPG6sAA7Mehl3nd4pZUALHwARAQABzSREYXZpZCBIaWxk
ZW5icmFuZCA8ZGF2aWRAcmVkaGF0LmNvbT7CwZgEEwEIAEICGwMGCwkIBwMCBhUIAgkKCwQW
AgMBAh4BAheAAhkBFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAl8Ox4kFCRKpKXgACgkQTd4Q
9wD/g1oHcA//a6Tj7SBNjFNM1iNhWUo1lxAja0lpSodSnB2g4FCZ4R61SBR4l/psBL73xktp
rDHrx4aSpwkRP6Epu6mLvhlfjmkRG4OynJ5HG1gfv7RJJfnUdUM1z5kdS8JBrOhMJS2c/gPf
wv1TGRq2XdMPnfY2o0CxRqpcLkx4vBODvJGl2mQyJF/gPepdDfcT8/PY9BJ7FL6Hrq1gnAo4
3Iv9qV0JiT2wmZciNyYQhmA1V6dyTRiQ4YAc31zOo2IM+xisPzeSHgw3ONY/XhYvfZ9r7W1l
pNQdc2G+o4Di9NPFHQQhDw3YTRR1opJaTlRDzxYxzU6ZnUUBghxt9cwUWTpfCktkMZiPSDGd
KgQBjnweV2jw9UOTxjb4LXqDjmSNkjDdQUOU69jGMUXgihvo4zhYcMX8F5gWdRtMR7DzW/YE
BgVcyxNkMIXoY1aYj6npHYiNQesQlqjU6azjbH70/SXKM5tNRplgW8TNprMDuntdvV9wNkFs
9TyM02V5aWxFfI42+aivc4KEw69SE9KXwC7FSf5wXzuTot97N9Phj/Z3+jx443jo2NR34XgF
89cct7wJMjOF7bBefo0fPPZQuIma0Zym71cP61OP/i11ahNye6HGKfxGCOcs5wW9kRQEk8P9
M/k2wt3mt/fCQnuP/mWutNPt95w9wSsUyATLmtNrwccz63XOwU0EVcufkQEQAOfX3n0g0fZz
Bgm/S2zF/kxQKCEKP8ID+Vz8sy2GpDvveBq4H2Y34XWsT1zLJdvqPI4af4ZSMxuerWjXbVWb
T6d4odQIG0fKx4F8NccDqbgHeZRNajXeeJ3R7gAzvWvQNLz4piHrO/B4tf8svmRBL0ZB5P5A
2uhdwLU3NZuK22zpNn4is87BPWF8HhY0L5fafgDMOqnf4guJVJPYNPhUFzXUbPqOKOkL8ojk
CXxkOFHAbjstSK5Ca3fKquY3rdX3DNo+EL7FvAiw1mUtS+5GeYE+RMnDCsVFm/C7kY8c2d0G
NWkB9pJM5+mnIoFNxy7YBcldYATVeOHoY4LyaUWNnAvFYWp08dHWfZo9WCiJMuTfgtH9tc75
7QanMVdPt6fDK8UUXIBLQ2TWr/sQKE9xtFuEmoQGlE1l6bGaDnnMLcYu+Asp3kDT0w4zYGsx
5r6XQVRH4+5N6eHZiaeYtFOujp5n+pjBaQK7wUUjDilPQ5QMzIuCL4YjVoylWiBNknvQWBXS
lQCWmavOT9sttGQXdPCC5ynI+1ymZC1ORZKANLnRAb0NH/UCzcsstw2TAkFnMEbo9Zu9w7Kv
AxBQXWeXhJI9XQssfrf4Gusdqx8nPEpfOqCtbbwJMATbHyqLt7/oz/5deGuwxgb65pWIzufa
N7eop7uh+6bezi+rugUI+w6DABEBAAHCwXwEGAEIACYCGwwWIQQb2cqtc1xMOkYN/MpN3hD3
AP+DWgUCXw7HsgUJEqkpoQAKCRBN3hD3AP+DWrrpD/4qS3dyVRxDcDHIlmguXjC1Q5tZTwNB
boaBTPHSy/Nksu0eY7x6HfQJ3xajVH32Ms6t1trDQmPx2iP5+7iDsb7OKAb5eOS8h+BEBDeq
3ecsQDv0fFJOA9ag5O3LLNk+3x3q7e0uo06XMaY7UHS341ozXUUI7wC7iKfoUTv03iO9El5f
XpNMx/YrIMduZ2+nd9Di7o5+KIwlb2mAB9sTNHdMrXesX8eBL6T9b+MZJk+mZuPxKNVfEQMQ
a5SxUEADIPQTPNvBewdeI80yeOCrN+Zzwy/Mrx9EPeu59Y5vSJOx/z6OUImD/GhX7Xvkt3kq
Er5KTrJz3++B6SH9pum9PuoE/k+nntJkNMmQpR4MCBaV/J9gIOPGodDKnjdng+mXliF3Ptu6
3oxc2RCyGzTlxyMwuc2U5Q7KtUNTdDe8T0uE+9b8BLMVQDDfJjqY0VVqSUwImzTDLX9S4g/8
kC4HRcclk8hpyhY2jKGluZO0awwTIMgVEzmTyBphDg/Gx7dZU1Xf8HFuE+UZ5UDHDTnwgv7E
th6RC9+WrhDNspZ9fJjKWRbveQgUFCpe1sa77LAw+XFrKmBHXp9ZVIe90RMe2tRL06BGiRZr
jPrnvUsUUsjRoRNJjKKA/REq+sAnhkNPPZ/NNMjaZ5b8Tovi8C0tmxiCHaQYqj7G2rgnT0kt
WNyWQQ==
Organization: Red Hat
In-Reply-To: <224f0265027a9578534586fa1f6ed80270aa24d5.1748500293.git-series.apopple@xxxxxxxxxx>
Content-Type: text/plain; charset=UTF-8; format=flowed
Content-Transfer-Encoding: 7bit
X-Spam-Status: No, score=-6.3 required=5.0 tests=DKIMWL_WL_HIGH,DKIM_SIGNED,
DKIM_VALID,DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,
MAILING_LIST_MULTI,RCVD_IN_DNSWL_MED,
RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,RCVD_IN_VALIDITY_RPBL_BLOCKED,
SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

On 29.05.25 08:32, Alistair Popple wrote:
> Currently dax is the only user of pmd and pud mapped ZONE_DEVICE
> pages. Therefore page walkers that want to exclude DAX pages can check
> pmd_devmap or pud_devmap. However soon dax will no longer set PFN_DEV,
> meaning dax pages are mapped as normal pages.
>
> Ensure page walkers that currently use pXd_devmap to skip DAX pages
> continue to do so by adding explicit checks of the VMA instead.
>
> Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
> ---
> fs/userfaultfd.c | 2 +-
> mm/hmm.c | 2 +-
> mm/userfaultfd.c | 2 +-
> 3 files changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 22f4bf9..de671d3 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -304,7 +304,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
> goto out;
>
> ret = false;
> - if (!pmd_present(_pmd) || pmd_devmap(_pmd))
> + if (!pmd_present(_pmd) || vma_is_dax(vmf->vma))
> goto out;
>
> if (pmd_trans_huge(_pmd)) {
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 082f7b7..db12c0a 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -429,7 +429,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
> return hmm_vma_walk_hole(start, end, -1, walk);
> }
>
> - if (pud_leaf(pud) && pud_devmap(pud)) {
> + if (pud_leaf(pud) && vma_is_dax(walk->vma)) {
> unsigned long i, npages, pfn;
> unsigned int required_fault;
> unsigned long *hmm_pfns;
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index e0db855..133f750 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -1791,7 +1791,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
>
> ptl = pmd_trans_huge_lock(src_pmd, src_vma);
> if (ptl) {
> - if (pmd_devmap(*src_pmd)) {
> + if (vma_is_dax(src_vma)) {
> spin_unlock(ptl);
> err = -ENOENT;
> break;

I assume we could also just refuse dax folios, right?

If we decide to check VMAs, we should probably check earlier.

But I wonder, what about anonymous non-dax pages in COW mappings? Is it
possible? Not supported?

If supported, checking the actual folio would be the right thing to do.

--
Cheers,

David / dhildenb


Return-Path: <linux-kernel+bounces-667907-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id E936A41E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:43:37 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id A71871BC4BBB
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:43:22 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 4918B22B598;
Fri, 30 May 2025 09:37:35 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b="QfBx34tM"
Received: from mail-pj1-f47.google.com (mail-pj1-f47.google.com [209.85.216.47])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id C5365220F4B
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:37:23 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=209.85.216.47
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597849; cv=none; b=fNXJ7QspwZY66GmbIzHMbB88k7Ynfx+SNO15iMSp5kmQfJS0rEVf7F8AB9lymossyuRAYosj8DsMAa3wKHIY3bbj1wVmII/qpPMcp0otw/fK4/f4LlJOew0Ex02EGWcz7gXHri7Po2Yv7vebbqMSr0cEEJ4Sq3uTJhVwhP1Cvnw=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597849; c=relaxed/simple;
bh=bCYuosSzNO3MtUqiZdCPFyuVcpDUwXDrs2jBQRh3ImE=;
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
MIME-Version; b=Jcl/VSeKFD7aq52v8zBg+iUCOIMCRFA7iLW8tQ2mSywrzykPP1nu0wBo4BQU60pULPq/Y0z0M9VQvtU3kMxgqvGaRKl2dWTjvqodQ3ZSB6+JvL/bv/6gI/zqBJsDTHFqQXKO5u+O41xx3X6xrxAzOe0Ye6pddE3lt1ZL2w/K+ew=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com; spf=pass smtp.mailfrom=bytedance.com; dkim=pass (2048-bit key) header.d=bytedance.com header.i=@bytedance.com header.b=QfBx34tM; arc=none smtp.client-ip=209.85.216.47
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=bytedance.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=bytedance.com
Received: by mail-pj1-f47.google.com with SMTP id 98e67ed59e1d1-311c95ddfb5so1365990a91.2
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:37:23 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=bytedance.com; s=google; t=1748597843; x=1749202643; darn=vger.kernel.org;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:from:to:cc:subject:date
:message-id:reply-to;
bh=tjVGQKkOlZOy+4fU/xUd4g+vTcZV8exC595abljBAKM=;
b=QfBx34tMTEyJ7KNaVGJ43dQOCPJp6eA0NTc0pbqRF3s/mKeIcZaN/apy1L30o2Y7zM
CUgILPf/uH0YNSQ6B3gGb1zCVH/zMU8kS1gNPoGq79ZZGrAzCjWFfvm10HpLjbTipwqo
Vlo4LwkCu6urf1i8rmcjtHO+Cf51zFcw7OlU349SqrcN9ZqEpEYdSGPNgFSK5R58MtND
qsTKgmkqC4VEuJlsuRSMS65ot5LsjEdOkBZ8j4pmjn1nk/+4+QRB4hrMPa79h2d4iYNO
0/vK5UAzXvjOuzOWlwNAB0mx6uRfy+lyyMX16LmL/jbloTW33YsTKfHhSNAMURO+BLsw
sWFA==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597843; x=1749202643;
h=content-transfer-encoding:mime-version:references:in-reply-to
:message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
:subject:date:message-id:reply-to;
bh=tjVGQKkOlZOy+4fU/xUd4g+vTcZV8exC595abljBAKM=;
b=Wr5DSTTX/Md5zSDSUbxYYplGk9XGh0VXe0w6p/IesP+ymuSiqMAptbtgVFhaoJTS99
YzNKPAIpRCVEBeC/oADmXLyfsIvK38S6Cgcs/5D5C5vuJf58Whe0EBJoOSR6qCoJY/mO
zhbuobQHNGj56vLqOf9aA/Svv8oilolFRg+9ZFgFsEMW2VZiQ8QjIjhkJzGuTvuj6KoN
6XxTwTo5Ntng/e0tq8m4v+CnAjWfZRNgCofavbkNcoV/6jtTm9o3h1aHQgZyYLLRxG1F
C1S5ZX4FHN/bvuIF2T6yQCTGd/SBiOseLgxz8Wps5eo+/SVUSoLidUIr/vClW8XYXFvX
dQlA==
X-Forwarded-Encrypted: i=1; AJvYcCXs5dYRD/iKXL0sf3cNyUmBVTjyw8QglbINHBenWFHcEy7ePlbS6THYlCNSISWX755YQpbjEpc7tPCUF60=@vger.kernel.org
X-Gm-Message-State: AOJu0Yx+02aaLhajr3cr0vlOOwyLOd7G9YrXz92V/id+7wIN+yh3VRyl
O6eagwLzHBZRkjbgSp0mOi0keSaWe7uPkNugw8HolxwiwDNnJI+m+5NeD8gDo7wiT+E=
X-Gm-Gg: ASbGncvsr1D0cfz22lYy5bSR+rkipNQZn1hIpB9Hw5PxUHByOIzY/qruLjO9+nuY5V/
E0wdZI9axAXWMQCJIroRcqKmPUyx5FXjtrA7ey1HZMfPAn/lmVeRSVb+TRIP9Y+gxp/iMgM8ENC
M10asUb/pamkv3m4T68vdLeANPwZX5NFQOKfdgOx6kgLY0NREb2GqeK7PYJdkyZ8UzHuY319GiX
w0Hz7NHv2XBmVfzj33T9IOY0yhKC5Mv5S3kiDq4vYRLa7SyWqw/I7cJmLyE50H0lN1ir7RMfQW2
7eyG7kvapbMqext1xVkGMEr3e7mK6DUmqoa5bbRIWjSnboPw1uZJ8gw74ULKJp/p2STQLzvksc7
tzcntkRIREnT+jrmhiViG
X-Google-Smtp-Source: AGHT+IFSHaG5y+mkmzSneS5RbvI+Y1/bhXtbvD9WhGteUuouTIh+462k1YHpGCKGocTvjur7aEf1Ag==
X-Received: by 2002:a17:90b:1d51:b0:311:a314:c2dc with SMTP id 98e67ed59e1d1-3125036bafdmr2653995a91.14.1748597842300;
Fri, 30 May 2025 02:37:22 -0700 (PDT)
Received: from FQ627FTG20.bytedance.net ([63.216.146.178])
by smtp.gmail.com with ESMTPSA id 98e67ed59e1d1-3124e29f7b8sm838724a91.2.2025.05.30.02.37.07
(version=TLS1_3 cipher=TLS_CHACHA20_POLY1305_SHA256 bits=256/256);
Fri, 30 May 2025 02:37:21 -0700 (PDT)
From: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
To: tglx@xxxxxxxxxxxxx,
mingo@xxxxxxxxxx,
bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx,
x86@xxxxxxxxxx,
luto@xxxxxxxxxx,
kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx,
david@xxxxxxxxxx,
juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx,
peterz@xxxxxxxxxxxxx
Cc: dietmar.eggemann@xxxxxxx,
hpa@xxxxxxxxx,
acme@xxxxxxxxxx,
namhyung@xxxxxxxxxx,
mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx,
jolsa@xxxxxxxxxx,
irogers@xxxxxxxxxx,
adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx,
viro@xxxxxxxxxxxxxxxxxx,
brauner@xxxxxxxxxx,
jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx,
Liam.Howlett@xxxxxxxxxx,
vbabka@xxxxxxx,
rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx,
mhocko@xxxxxxxx,
rostedt@xxxxxxxxxxx,
bsegall@xxxxxxxxxx,
mgorman@xxxxxxx,
vschneid@xxxxxxxxxx,
jannh@xxxxxxxxxx,
pfalcato@xxxxxxx,
riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx,
linux-kernel@xxxxxxxxxxxxxxx,
linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx,
linux-mm@xxxxxxxxx,
duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx,
dengliang.1214@xxxxxxxxxxxxx,
xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx,
songmuchun@xxxxxxxxxxxxx,
yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx,
sunjiadong.lff@xxxxxxxxxxxxx,
Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Subject: [RFC v2 35/35] samples/rpal: add RPAL samples
Date: Fri, 30 May 2025 17:28:03 +0800
Message-Id: <b8a8d44e5b81c93598caee82254320507142d4be.1748594841.git.libo.gcs85@xxxxxxxxxxxxx>
X-Mailer: git-send-email 2.39.5 (Apple Git-154)
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

Added test samples for RPAL (with librpal included). Compile via:

cd samples/rpal && make

And run it using the following command:

./server & ./client

Example output:

EPOLL: Message length: 32 bytes, Total TSC cycles: 16439927066,
Message count: 1000000, Average latency: 16439 cycles
RPAL: Message length: 32 bytes, Total TSC cycles: 2197479484,
Message count: 1000000, Average latency: 2197 cycles

Signed-off-by: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
---
samples/rpal/Makefile | 17 +
samples/rpal/asm_define.c | 14 +
samples/rpal/client.c | 178 ++
samples/rpal/librpal/asm_define.h | 6 +
samples/rpal/librpal/asm_x86_64_rpal_call.S | 57 +
samples/rpal/librpal/debug.h | 12 +
samples/rpal/librpal/fiber.c | 119 +
samples/rpal/librpal/fiber.h | 64 +
.../rpal/librpal/jump_x86_64_sysv_elf_gas.S | 81 +
.../rpal/librpal/make_x86_64_sysv_elf_gas.S | 82 +
.../rpal/librpal/ontop_x86_64_sysv_elf_gas.S | 84 +
samples/rpal/librpal/private.h | 341 +++
samples/rpal/librpal/rpal.c | 2351 +++++++++++++++++
samples/rpal/librpal/rpal.h | 149 ++
samples/rpal/librpal/rpal_pkru.h | 78 +
samples/rpal/librpal/rpal_queue.c | 239 ++
samples/rpal/librpal/rpal_queue.h | 55 +
samples/rpal/librpal/rpal_x86_64_call_ret.S | 45 +
samples/rpal/offset.sh | 5 +
samples/rpal/server.c | 249 ++
20 files changed, 4226 insertions(+)
create mode 100644 samples/rpal/Makefile
create mode 100644 samples/rpal/asm_define.c
create mode 100644 samples/rpal/client.c
create mode 100644 samples/rpal/librpal/asm_define.h
create mode 100644 samples/rpal/librpal/asm_x86_64_rpal_call.S
create mode 100644 samples/rpal/librpal/debug.h
create mode 100644 samples/rpal/librpal/fiber.c
create mode 100644 samples/rpal/librpal/fiber.h
create mode 100644 samples/rpal/librpal/jump_x86_64_sysv_elf_gas.S
create mode 100644 samples/rpal/librpal/make_x86_64_sysv_elf_gas.S
create mode 100644 samples/rpal/librpal/ontop_x86_64_sysv_elf_gas.S
create mode 100644 samples/rpal/librpal/private.h
create mode 100644 samples/rpal/librpal/rpal.c
create mode 100644 samples/rpal/librpal/rpal.h
create mode 100644 samples/rpal/librpal/rpal_pkru.h
create mode 100644 samples/rpal/librpal/rpal_queue.c
create mode 100644 samples/rpal/librpal/rpal_queue.h
create mode 100644 samples/rpal/librpal/rpal_x86_64_call_ret.S
create mode 100755 samples/rpal/offset.sh
create mode 100644 samples/rpal/server.c

diff --git a/samples/rpal/Makefile b/samples/rpal/Makefile
new file mode 100644
index 000000000000..25627a970028
--- /dev/null
+++ b/samples/rpal/Makefile
@@ -0,0 +1,17 @@
+.PHONY: rpal
+
+all: server client offset
+
+offset: asm_define.c
+ $(shell ./offset.sh)
+
+server: server.c librpal/*.c librpal/*.S
+ $(CC) $^ -lpthread -g -o $@
+ @printf "RPAL" | dd of=./server bs=1 count=4 conv=notrunc seek=12
+
+client: client.c librpal/*.c librpal/*.S
+ $(CC) $^ -lpthread -g -o $@
+ @printf "RPAL" | dd of=./client bs=1 count=4 conv=notrunc seek=12
+
+clean:
+ rm server client
diff --git a/samples/rpal/asm_define.c b/samples/rpal/asm_define.c
new file mode 100644
index 000000000000..6f7731ebc870
--- /dev/null
+++ b/samples/rpal/asm_define.c
@@ -0,0 +1,14 @@
+#include <stddef.h>
+#include "librpal/private.h"
+
+#define DEFINE(sym, val) asm volatile("\n-> " #sym " %0 " #val "\n" : : "i" (val))
+
+static void common(void)
+{
+ DEFINE(RCI_SENDER_TLS_BASE, offsetof(rpal_call_info_t, sender_tls_base));
+ DEFINE(RCI_SENDER_FCTX, offsetof(rpal_call_info_t, sender_fctx));
+ DEFINE(RCI_PKRU, offsetof(rpal_call_info_t, pkru));
+ DEFINE(RC_SENDER_STATE, offsetof(receiver_context_t, sender_state));
+ DEFINE(RET_BEGIN, offsetof(critical_section_t, ret_begin));
+ DEFINE(RET_END, offsetof(critical_section_t, ret_end));
+}
diff --git a/samples/rpal/client.c b/samples/rpal/client.c
new file mode 100644
index 000000000000..2c4a9eb6115e
--- /dev/null
+++ b/samples/rpal/client.c
@@ -0,0 +1,178 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <x86intrin.h>
+#include "librpal/rpal.h"
+
+#define SOCKET_PATH "/tmp/rpal_socket"
+#define BUFFER_SIZE 1025
+#define MSG_NUM 1000000
+#define MSG_LEN 32
+
+char hello[BUFFER_SIZE];
+char buffer[BUFFER_SIZE] = { 0 };
+
+int remote_id;
+uint64_t remote_sidfd;
+
+#define INIT_MSG "INIT"
+#define SUCC_MSG "SUCC"
+#define FAIL_MSG "FAIL"
+
+#define handle_error(s) \
+ do { \
+ perror(s); \
+ exit(EXIT_FAILURE); \
+ } while (0)
+
+int rpal_epoll_add(int epfd, int fd)
+{
+ struct epoll_event ev;
+
+ ev.events = EPOLLRPALIN | EPOLLIN | EPOLLRDHUP | EPOLLET;
+ ev.data.fd = fd;
+
+ return rpal_epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
+}
+
+void rpal_client_init(int fd)
+{
+ struct epoll_event ev;
+ char buffer[BUFFER_SIZE];
+ rpal_error_code_t err;
+ uint64_t remote_key, service_key;
+ int epoll_fd;
+ int proc_fd;
+ int ret;
+
+ proc_fd = rpal_init(1, 0, &err);
+ if (proc_fd < 0)
+ handle_error("rpal init fail");
+ rpal_get_service_key(&service_key);
+
+ strcpy(buffer, INIT_MSG);
+ *(uint64_t *)(buffer + strlen(INIT_MSG)) = service_key;
+ ret = write(fd, buffer, strlen(INIT_MSG) + sizeof(uint64_t));
+ if (ret < 0)
+ handle_error("write key");
+
+ ret = read(fd, buffer, BUFFER_SIZE);
+ if (ret < 0)
+ handle_error("read key");
+
+ memcpy(&remote_key, buffer, sizeof(remote_key));
+ if (remote_key == 0)
+ handle_error("remote down");
+
+ ret = rpal_request_service(remote_key);
+ if (ret) {
+ write(fd, FAIL_MSG, strlen(FAIL_MSG));
+ handle_error("request");
+ }
+
+ ret = write(fd, SUCC_MSG, strlen(SUCC_MSG));
+ if (ret < 0)
+ handle_error("handshake");
+
+ remote_id = rpal_get_request_service_id(remote_key);
+ rpal_sender_init(&err);
+
+ epoll_fd = epoll_create(1024);
+ if (epoll_fd == -1) {
+ perror("epoll_create");
+ exit(EXIT_FAILURE);
+ }
+ rpal_epoll_add(epoll_fd, fd);
+
+ sleep(3); //wait for epoll wait
+ ret = rpal_uds_fdmap(((unsigned long)remote_id << 32) | fd,
+ &remote_sidfd);
+ if (ret < 0)
+ handle_error("uds fdmap fail");
+}
+
+int run_rpal_client(int msg_len)
+{
+ ssize_t valread;
+ int sock = 0;
+ struct sockaddr_un serv_addr;
+ int count = MSG_NUM;
+ int ret;
+
+ if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+ perror("socket creation error");
+ return -1;
+ }
+
+ memset(&serv_addr, 0, sizeof(serv_addr));
+ serv_addr.sun_family = AF_UNIX;
+ strncpy(serv_addr.sun_path, SOCKET_PATH, sizeof(SOCKET_PATH));
+
+ if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) <
+ 0) {
+ perror("Connection Failed");
+ return -1;
+ }
+ rpal_client_init(sock);
+
+ while (count) {
+ for (int i = 18; i < msg_len; i++)
+ hello[i] = 'a' + i % 26;
+ sprintf(hello, "0x%016lx", __rdtsc());
+ ret = rpal_write_ptrs(remote_id, remote_sidfd, (int64_t *)hello,
+ msg_len / sizeof(int64_t *));
+ valread = read(sock, buffer, BUFFER_SIZE);
+ if (memcmp(hello, buffer, msg_len) != 0)
+ perror("data error");
+ count--;
+ }
+
+ close(sock);
+}
+
+int run_client(int msg_len)
+{
+ ssize_t valread;
+ int sock = 0;
+ struct sockaddr_un serv_addr;
+ int count = MSG_NUM;
+
+ if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+ perror("socket creation error");
+ return -1;
+ }
+
+ memset(&serv_addr, 0, sizeof(serv_addr));
+ serv_addr.sun_family = AF_UNIX;
+ strncpy(serv_addr.sun_path, SOCKET_PATH, sizeof(SOCKET_PATH));
+
+ if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) <
+ 0) {
+ perror("Connection Failed");
+ return -1;
+ }
+
+ while (count) {
+ for (int i = 18; i < msg_len; i++)
+ hello[i] = 'a' + i % 26;
+ sprintf(hello, "0x%016lx", __rdtsc());
+ send(sock, hello, msg_len, 0);
+ valread = read(sock, buffer, BUFFER_SIZE);
+ if (memcmp(hello, buffer, msg_len) != 0)
+ perror("data error");
+ count--;
+ }
+
+ close(sock);
+}
+
+int main()
+{
+ run_client(MSG_LEN);
+ run_rpal_client(MSG_LEN);
+
+ return 0;
+}
diff --git a/samples/rpal/librpal/asm_define.h b/samples/rpal/librpal/asm_define.h
new file mode 100644
index 000000000000..bc57586cda58
--- /dev/null
+++ b/samples/rpal/librpal/asm_define.h
@@ -0,0 +1,6 @@
+#define RCI_SENDER_TLS_BASE 0
+#define RCI_SENDER_FCTX 16
+#define RCI_PKRU 8
+#define RC_SENDER_STATE 72
+#define RET_BEGIN 0
+#define RET_END 8
diff --git a/samples/rpal/librpal/asm_x86_64_rpal_call.S b/samples/rpal/librpal/asm_x86_64_rpal_call.S
new file mode 100644
index 000000000000..538e8ac5f09b
--- /dev/null
+++ b/samples/rpal/librpal/asm_x86_64_rpal_call.S
@@ -0,0 +1,57 @@
+#ifdef __x86_64__
+#define __ASSEMBLY__
+#include "asm_define.h"
+
+.text
+.globl rpal_access_warpper
+.type rpal_access_warpper,@function
+.align 16
+
+rpal_access_warpper:
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+ pushq %rbp
+
+ leaq -0x8(%rsp), %rsp
+ stmxcsr (%rsp)
+ fnstcw 0x4(%rsp)
+
+ pushq %rsp // Save rsp which may be unaligned.
+ pushq (%rsp) // Save the original value again
+ andq $-16, %rsp // Align stack to 16bytes - SysV AMD64 ABI.
+
+ movq %rsp, (%rdi)
+ call rpal_access@plt
+retip:
+ movq 8(%rsp), %rsp // Restore the potentially unaligned stack
+ ldmxcsr (%rsp)
+ fldcw 0x4(%rsp)
+ leaq 0x8(%rsp), %rsp
+
+ popq %rbp
+ popq %rbx
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ ret
+
+.size rpal_access_warpper,.-rpal_access_warpper
+
+
+
+.globl rpal_get_ret_rip
+.type rpal_get_ret_rip, @function
+.align 16
+rpal_get_ret_rip:
+ leaq retip(%rip), %rax
+ ret
+
+.size rpal_get_ret_rip,.-rpal_get_ret_rip
+
+/* Mark that we don't need executable stack. */
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/samples/rpal/librpal/debug.h b/samples/rpal/librpal/debug.h
new file mode 100644
index 000000000000..10d2fef8d69a
--- /dev/null
+++ b/samples/rpal/librpal/debug.h
@@ -0,0 +1,12 @@
+#ifndef RPAL_DEBUG_H
+#define RPAL_DEBUG_H
+
/*
 * Debug-output categories for dbprint(); values are bit flags and may
 * be OR-ed together.  __RPAL_DEBUG_ALL selects every category.
 */
typedef enum {
	RPAL_DEBUG_MANAGEMENT = (1 << 0),
	RPAL_DEBUG_SENDER = (1 << 1),
	RPAL_DEBUG_RECVER = (1 << 2),
	RPAL_DEBUG_FIBER = (1 << 3),

	__RPAL_DEBUG_ALL = ~(0ULL),
} rpal_debug_flag_t;
+#endif
diff --git a/samples/rpal/librpal/fiber.c b/samples/rpal/librpal/fiber.c
new file mode 100644
index 000000000000..2141ad9ab770
--- /dev/null
+++ b/samples/rpal/librpal/fiber.c
@@ -0,0 +1,119 @@
+#ifdef __x86_64__
+#include "debug.h"
+#include "fiber.h"
+#include "private.h"
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/mman.h>
+
+#define RPAL_CHECK_FAIL -1
+#define STACK_DEBUG 1
+
/*
 * Create the initial fcontext for @fc at its stack top (fc->sp).
 *
 * NOTE(review): the context-function argument passed to make_fcontext()
 * is NULL here, so no entry point is installed by this path —
 * presumably the caller sets it up before the first jump; confirm
 * against the context-switch code.
 */
static task_t *make_fiber_ctx(task_t *fc)
{
	fc->fctx = make_fcontext(fc->sp, 0, NULL);
	return fc;
}
+
+static task_t *fiber_ctx_create(void (*fn)(void *ud), void *ud, void *stack,
+ size_t size)
+{
+ task_t *fc;
+ int i;
+
+ if (stack == NULL)
+ return NULL;
+
+ fc = (task_t *)stack;
+ fc->fn = fn;
+ fc->ud = ud;
+ fc->size = size;
+ fc->sp = stack + size;
+ for (i = 0; i < NR_PADDING; ++i) {
+ fc->padding[i] = 0xdeadbeef;
+ }
+
+ return make_fiber_ctx(fc);
+}
+
+task_t *fiber_ctx_alloc(void (*fn)(void *ud), void *ud, size_t size)
+{
+ void *stack;
+ size_t stack_size;
+ size_t total_size;
+ void *lower_guard;
+ void *upper_guard;
+
+ if (PAGE_SIZE == 4096 || STACK_DEBUG) {
+ stack_size = (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+
+ dbprint(RPAL_DEBUG_FIBER,
+ "fiber_ctx_alloc: stack size adjusted from %lu to %lu\n",
+ size, stack_size);
+
+ // Allocate a stack using mmap with 2 extra pages, 1 at each end
+ // which will be PROT_NONE to act as guard pages to catch overflow
+ // and underflow. This will result in a SIGSEGV but should make it
+ // easier to catch a stack that is too small (or underflows).
+ //
+ // Notes:
+ //
+ // 1. On ARM64 with 64K pages this would be quite wasteful of memory
+ // so it is behind a DEBUG flag to enable/disable on that platform.
+ //
+ // 2. If the requested stack size is not a multiple of a page size
+ // then stack underflow wont always be caught as there is some
+ // extra space up until the next page boundary with the guard page.
+ //
+ // 3. The task_t is placed at the top of the stack so can be overwritten
+ // just before the stack overflows and hits the guard page.
+ //
+
+ total_size = stack_size + (PAGE_SIZE * 2);
+ lower_guard = mmap(NULL, total_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (lower_guard == MAP_FAILED) {
+ errprint("mmap of %lu bytes failed: %s\n", total_size,
+ strerror(errno));
+ return NULL;
+ }
+
+ stack = lower_guard + PAGE_SIZE;
+ upper_guard = stack + stack_size;
+ mprotect(lower_guard, PAGE_SIZE, PROT_NONE);
+ mprotect(upper_guard, PAGE_SIZE, PROT_NONE);
+
+ dbprint(RPAL_DEBUG_FIBER,
+ "Total stack of size %lu bytes allocated @ %p\n",
+ total_size, stack);
+ dbprint(RPAL_DEBUG_FIBER,
+ "Underflow guard page %p - %p overflow guard page %p - %p\n",
+ lower_guard, lower_guard + PAGE_SIZE - 1, upper_guard,
+ upper_guard + PAGE_SIZE - 1);
+ } else {
+ stack = malloc(size);
+ }
+ return fiber_ctx_create(fn, ud, stack, size);
+}
+
+void fiber_ctx_free(task_t *fc)
+{
+ size_t stack_size;
+ size_t total_size;
+ void *addr;
+
+ if (STACK_DEBUG) {
+ stack_size = (fc->size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+ total_size = stack_size + (PAGE_SIZE * 2);
+ addr = fc;
+ addr -= PAGE_SIZE;
+ if (munmap(addr, total_size) != 0) {
+ errprint("munmap of %lu bytes @ %p failed: %s\n",
+ total_size, addr, strerror(errno));
+ }
+ } else {
+ free(fc);
+ }
+}
+#endif
diff --git a/samples/rpal/librpal/fiber.h b/samples/rpal/librpal/fiber.h
new file mode 100644
index 000000000000..b46485ba740f
--- /dev/null
+++ b/samples/rpal/librpal/fiber.h
@@ -0,0 +1,64 @@
+#ifndef FIBER_H
+#define FIBER_H
+
+#include <stdlib.h>
+
+typedef void *fcontext_t;
+typedef struct {
+ fcontext_t fctx;
+ void *ud;
+} transfer_t;
+
+typedef struct fiber_stack {
+ unsigned long padding;
+ unsigned long r12;
+ unsigned long r13;
+ unsigned long r14;
+ unsigned long r15;
+ unsigned long rbx;
+ unsigned long rbp;
+ unsigned long rip;
+} fiber_stack_t;
+
+#define NR_PADDING 8
+typedef struct fiber_ctx {
+ void *sp;
+ size_t size;
+ void (*fn)(void *fc);
+ void *ud;
+ fcontext_t fctx;
+ int padding[NR_PADDING];
+} task_t;
+
+task_t *fiber_ctx_alloc(void (*fn)(void *ud), void *ud, size_t size);
+void fiber_ctx_free(task_t *fc);
+
+/**
+ * @brief Make a context for jump_fcontext.
+ *
+ * @param sp The stack top pointer of context.
+ * @param size The size of the stack. This value is unused, but the second parameter is necessary to keep the expected signature.
+ * @param fn The function pointer of the context function.
+ *
+ * @return The pointer of the newly made context.
+ */
+extern fcontext_t make_fcontext(void *sp, size_t size, void (*fn)(transfer_t));
+
+/**
+ * @brief jump to target context and execute fn with argument ud
+ *
+ * @param to The pointer of target context.
+ * @param ud The data part of the argument of fn.
+ *
+ * @return the pointer of the prev transfer_t struct, where RAX store
+ * previous context, RDX store ud passed by previous caller.
+ */
+extern transfer_t jump_fcontext(fcontext_t const to, void *ud);
+
+/**
+ * @brief To be written.
+ */
+extern transfer_t ontop_fcontext(fcontext_t const to, void *ud,
+ transfer_t (*fn)(transfer_t));
+
+#endif
diff --git a/samples/rpal/librpal/jump_x86_64_sysv_elf_gas.S b/samples/rpal/librpal/jump_x86_64_sysv_elf_gas.S
new file mode 100644
index 000000000000..43d3a8149c58
--- /dev/null
+++ b/samples/rpal/librpal/jump_x86_64_sysv_elf_gas.S
@@ -0,0 +1,81 @@
+/*
+ Copyright Oliver Kowalke 2009.
+ Distributed under the Boost Software License, Version 1.0.
+ (See accompanying file LICENSE_1_0.txt or copy at
+ http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+/****************************************************************************************
+ * *
+ * ---------------------------------------------------------------------------------- *
+ * | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | *
+ * ---------------------------------------------------------------------------------- *
+ * | 0x0 | 0x4 | 0x8 | 0xc | 0x10 | 0x14 | 0x18 | 0x1c | *
+ * ---------------------------------------------------------------------------------- *
+ * | fc_mxcsr|fc_x87_cw| R12 | R13 | R14 | *
+ * ---------------------------------------------------------------------------------- *
+ * ---------------------------------------------------------------------------------- *
+ * | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | *
+ * ---------------------------------------------------------------------------------- *
+ * | 0x20 | 0x24 | 0x28 | 0x2c | 0x30 | 0x34 | 0x38 | 0x3c | *
+ * ---------------------------------------------------------------------------------- *
+ * | R15 | RBX | RBP | RIP | *
+ * ---------------------------------------------------------------------------------- *
+ * *
+ ****************************************************************************************/
+#ifdef __x86_64__
+.text
/*
 * transfer_t jump_fcontext(fcontext_t to, void *data)
 *
 * Suspend the current context — saving the callee-saved GPRs and
 * (unless BOOST_USE_TSX) the MXCSR/x87 control words on the current
 * stack — switch RSP to the context-data in RDI, restore that
 * context's registers, and resume it.  The suspended context and
 * @data are handed to the resumed side in RAX/RDX (transfer_t).
 */
.globl jump_fcontext
.type jump_fcontext,@function
.align 16
jump_fcontext:
	leaq -0x38(%rsp), %rsp /* prepare stack */

#if !defined(BOOST_USE_TSX)
	stmxcsr (%rsp) /* save MMX control- and status-word */
	fnstcw 0x4(%rsp) /* save x87 control-word */
#endif

	movq %r12, 0x8(%rsp) /* save R12 */
	movq %r13, 0x10(%rsp) /* save R13 */
	movq %r14, 0x18(%rsp) /* save R14 */
	movq %r15, 0x20(%rsp) /* save R15 */
	movq %rbx, 0x28(%rsp) /* save RBX */
	movq %rbp, 0x30(%rsp) /* save RBP */

	/* store RSP (pointing to context-data) in RAX */
	movq %rsp, %rax

	/* restore RSP (pointing to context-data) from RDI */
	movq %rdi, %rsp

	movq 0x38(%rsp), %r8 /* restore return-address */

#if !defined(BOOST_USE_TSX)
	ldmxcsr (%rsp) /* restore MMX control- and status-word */
	fldcw 0x4(%rsp) /* restore x87 control-word */
#endif

	movq 0x8(%rsp), %r12 /* restore R12 */
	movq 0x10(%rsp), %r13 /* restore R13 */
	movq 0x18(%rsp), %r14 /* restore R14 */
	movq 0x20(%rsp), %r15 /* restore R15 */
	movq 0x28(%rsp), %rbx /* restore RBX */
	movq 0x30(%rsp), %rbp /* restore RBP */

	leaq 0x40(%rsp), %rsp /* prepare stack */

	/* return transfer_t from jump */
	/* RAX == fctx, RDX == data */
	movq %rsi, %rdx
	/* pass transfer_t as first arg in context function */
	/* RDI == fctx, RSI == data */
	movq %rax, %rdi

	/* indirect jump to context */
	jmp *%r8
.size jump_fcontext,.-jump_fcontext
+
+/* Mark that we don't need executable stack. */
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/samples/rpal/librpal/make_x86_64_sysv_elf_gas.S b/samples/rpal/librpal/make_x86_64_sysv_elf_gas.S
new file mode 100644
index 000000000000..4f3af9247110
--- /dev/null
+++ b/samples/rpal/librpal/make_x86_64_sysv_elf_gas.S
@@ -0,0 +1,82 @@
+/*
+ Copyright Oliver Kowalke 2009.
+ Distributed under the Boost Software License, Version 1.0.
+ (See accompanying file LICENSE_1_0.txt or copy at
+ http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+/****************************************************************************************
+ * *
+ * ---------------------------------------------------------------------------------- *
+ * | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | *
+ * ---------------------------------------------------------------------------------- *
+ * | 0x0 | 0x4 | 0x8 | 0xc | 0x10 | 0x14 | 0x18 | 0x1c | *
+ * ---------------------------------------------------------------------------------- *
+ * | fc_mxcsr|fc_x87_cw| R12 | R13 | R14 | *
+ * ---------------------------------------------------------------------------------- *
+ * ---------------------------------------------------------------------------------- *
+ * | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | *
+ * ---------------------------------------------------------------------------------- *
+ * | 0x20 | 0x24 | 0x28 | 0x2c | 0x30 | 0x34 | 0x38 | 0x3c | *
+ * ---------------------------------------------------------------------------------- *
+ * | R15 | RBX | RBP | RIP | *
+ * ---------------------------------------------------------------------------------- *
+ * *
+ ****************************************************************************************/
+#ifdef __x86_64__
+.text
/*
 * fcontext_t make_fcontext(void *sp, size_t size, void (*fn)(transfer_t))
 *
 * Carve a context-data record out of the top of the supplied stack
 * (16-byte aligned) and seed it so the first jump_fcontext() into it
 * enters @fn via the trampoline.  If the context-function ever
 * returns, `finish` terminates the process with _exit(0).  The @size
 * argument is unused here.
 */
.globl make_fcontext
.type make_fcontext,@function
.align 16
make_fcontext:
	/* first arg of make_fcontext() == top of context-stack */
	movq %rdi, %rax

	/* shift address in RAX to lower 16 byte boundary */
	andq $-16, %rax

	/* reserve space for context-data on context-stack */
	/* on context-function entry: (RSP -0x8) % 16 == 0 */
	leaq -0x40(%rax), %rax

	/* third arg of make_fcontext() == address of context-function */
	/* stored in RBX */
	movq %rdx, 0x28(%rax)

	/* save MMX control- and status-word */
	stmxcsr (%rax)
	/* save x87 control-word */
	fnstcw 0x4(%rax)

	/* compute abs address of label trampoline */
	leaq trampoline(%rip), %rcx
	/* save address of trampoline as return-address for context-function */
	/* will be entered after calling jump_fcontext() first time */
	movq %rcx, 0x38(%rax)

	/* compute abs address of label finish */
	leaq finish(%rip), %rcx
	/* save address of finish as return-address for context-function */
	/* will be entered after context-function returns */
	movq %rcx, 0x30(%rax)

	ret /* return pointer to context-data */

trampoline:
	/* store return address on stack */
	/* fix stack alignment */
	push %rbp
	/* jump to context-function */
	jmp *%rbx

finish:
	/* exit code is zero */
	xorq %rdi, %rdi
	/* exit application */
	call _exit@PLT
	hlt
.size make_fcontext,.-make_fcontext
+
+/* Mark that we don't need executable stack. */
+.section .note.GNU-stack,"",%progbits
+#endif
\ No newline at end of file
diff --git a/samples/rpal/librpal/ontop_x86_64_sysv_elf_gas.S b/samples/rpal/librpal/ontop_x86_64_sysv_elf_gas.S
new file mode 100644
index 000000000000..9dce797c2541
--- /dev/null
+++ b/samples/rpal/librpal/ontop_x86_64_sysv_elf_gas.S
@@ -0,0 +1,84 @@
+/*
+ Copyright Oliver Kowalke 2009.
+ Distributed under the Boost Software License, Version 1.0.
+ (See accompanying file LICENSE_1_0.txt or copy at
+ http://www.boost.org/LICENSE_1_0.txt)
+*/
+
+/****************************************************************************************
+ * *
+ * ---------------------------------------------------------------------------------- *
+ * | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | *
+ * ---------------------------------------------------------------------------------- *
+ * | 0x0 | 0x4 | 0x8 | 0xc | 0x10 | 0x14 | 0x18 | 0x1c | *
+ * ---------------------------------------------------------------------------------- *
+ * | fc_mxcsr|fc_x87_cw| R12 | R13 | R14 | *
+ * ---------------------------------------------------------------------------------- *
+ * ---------------------------------------------------------------------------------- *
+ * | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | *
+ * ---------------------------------------------------------------------------------- *
+ * | 0x20 | 0x24 | 0x28 | 0x2c | 0x30 | 0x34 | 0x38 | 0x3c | *
+ * ---------------------------------------------------------------------------------- *
+ * | R15 | RBX | RBP | RIP | *
+ * ---------------------------------------------------------------------------------- *
+ * *
+ ****************************************************************************************/
+#ifdef __x86_64__
+.text
/*
 * transfer_t ontop_fcontext(fcontext_t to, void *data,
 *                           transfer_t (*fn)(transfer_t))
 *
 * Like jump_fcontext(), but instead of resuming the target at its
 * saved RIP it runs @fn "on top" of the target context: the target's
 * registers are restored and control jumps straight to @fn with the
 * suspended context and @data as the transfer_t argument, leaving the
 * target's original return address on its stack.
 */
.globl ontop_fcontext
.type ontop_fcontext,@function
.align 16
ontop_fcontext:
	/* preserve ontop-function in R8 */
	movq %rdx, %r8

	leaq -0x38(%rsp), %rsp /* prepare stack */

#if !defined(BOOST_USE_TSX)
	stmxcsr (%rsp) /* save MMX control- and status-word */
	fnstcw 0x4(%rsp) /* save x87 control-word */
#endif

	movq %r12, 0x8(%rsp) /* save R12 */
	movq %r13, 0x10(%rsp) /* save R13 */
	movq %r14, 0x18(%rsp) /* save R14 */
	movq %r15, 0x20(%rsp) /* save R15 */
	movq %rbx, 0x28(%rsp) /* save RBX */
	movq %rbp, 0x30(%rsp) /* save RBP */

	/* store RSP (pointing to context-data) in RAX */
	movq %rsp, %rax

	/* restore RSP (pointing to context-data) from RDI */
	movq %rdi, %rsp

#if !defined(BOOST_USE_TSX)
	ldmxcsr (%rsp) /* restore MMX control- and status-word */
	fldcw 0x4(%rsp) /* restore x87 control-word */
#endif

	movq 0x8(%rsp), %r12 /* restore R12 */
	movq 0x10(%rsp), %r13 /* restore R13 */
	movq 0x18(%rsp), %r14 /* restore R14 */
	movq 0x20(%rsp), %r15 /* restore R15 */
	movq 0x28(%rsp), %rbx /* restore RBX */
	movq 0x30(%rsp), %rbp /* restore RBP */

	leaq 0x38(%rsp), %rsp /* prepare stack */

	/* return transfer_t from jump */
	/* RAX == fctx, RDX == data */
	movq %rsi, %rdx
	/* pass transfer_t as first arg in context function */
	/* RDI == fctx, RSI == data */
	movq %rax, %rdi

	/* keep return-address on stack */

	/* indirect jump to context */
	jmp *%r8
.size ontop_fcontext,.-ontop_fcontext
+
+/* Mark that we don't need executable stack. */
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/samples/rpal/librpal/private.h b/samples/rpal/librpal/private.h
new file mode 100644
index 000000000000..9dc78f449f0f
--- /dev/null
+++ b/samples/rpal/librpal/private.h
@@ -0,0 +1,341 @@
+#ifndef PRIVATE_H
+#define PRIVATE_H
+
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/syscall.h>
+#include <sys/uio.h>
+#ifdef __x86_64__
+#include <immintrin.h>
+#endif
+#include <pthread.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <sys/ioctl.h>
+
+#include "debug.h"
+#include "rpal_queue.h"
+#include "fiber.h"
+#include "rpal.h"
+
+#ifdef __x86_64__
/*
 * Write @tls_base into the FS base register with WRFSBASE.
 * NOTE(review): requires the CPU/kernel to expose FSGSBASE to user
 * space — confirm the deployment targets support it.
 */
static inline void write_tls_base(unsigned long tls_base)
{
	asm volatile("wrfsbase %0" ::"r"(tls_base) : "memory");
}
+
/* Read the current FS base (the TLS pointer) with RDFSBASE. */
static inline unsigned long read_tls_base(void)
{
	unsigned long fsbase;
	asm volatile("rdfsbase %0" : "=r"(fsbase)::"memory");
	return fsbase;
}
+#endif
+
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+// | fd_timestamp | pad | rthread_id | server_fd |
+// | 16 | 8 | 8 | 32 |
+#define LOW32_MASK ((1UL << 32) - 1)
+#define MIDL8_MASK ((unsigned long)(((1UL << 8) - 1)) << 32)
+
+#define HIGH16_OFFSET 48
+#define HIGH32_OFFSET 32
+
+#define get_high16(val) ({ (val) >> HIGH16_OFFSET; })
+
+#define get_high32(val) ({ (val) >> HIGH32_OFFSET; })
+
+#define get_midl8(val) ({ ((val) & MIDL8_MASK) >> HIGH32_OFFSET; })
+#define get_low32(val) ({ (val) & LOW32_MASK; })
+
+#define get_fdtimestamp(rpalfd) get_high16(rpalfd)
+#define get_rid(rpalfd) get_midl8(rpalfd)
+#define get_sfd(rpalfd) get_low32(rpalfd)
+
+#define PAGE_SIZE 4096
+#define DEFUALT_STACK_SIZE (PAGE_SIZE * 4)
+#define TRAMPOLINE_SIZE (PAGE_SIZE * 1)
+
+#define BITS_PER_LONG 64
+#define BITS_TO_LONGS(x) \
+ (((x) + 8 * sizeof(unsigned long) - 1) / (8 * sizeof(unsigned long)))
+
+#define KEY_SIZE 16
+
+enum rpal_sender_state {
+ RPAL_SENDER_STATE_RUNNING,
+ RPAL_SENDER_STATE_CALL,
+ RPAL_SENDER_STATE_KERNEL_RET,
+};
+
+enum rpal_epoll_event {
+ RPAL_KERNEL_PENDING = 0x1,
+ RPAL_USER_PENDING = 0x2,
+};
+
+enum rpal_receiver_state {
+ RPAL_RECEIVER_STATE_RUNNING,
+ RPAL_RECEIVER_STATE_KERNEL_RET,
+ RPAL_RECEIVER_STATE_READY,
+ RPAL_RECEIVER_STATE_WAIT,
+ RPAL_RECEIVER_STATE_CALL,
+ RPAL_RECEIVER_STATE_LAZY_SWITCH,
+ RPAL_RECEIVER_STATE_MAX,
+};
+
+enum rpal_command_type {
+ RPAL_CMD_GET_API_VERSION_AND_CAP,
+ RPAL_CMD_GET_SERVICE_KEY,
+ RPAL_CMD_GET_SERVICE_ID,
+ RPAL_CMD_REGISTER_SENDER,
+ RPAL_CMD_UNREGISTER_SENDER,
+ RPAL_CMD_REGISTER_RECEIVER,
+ RPAL_CMD_UNREGISTER_RECEIVER,
+ RPAL_CMD_ENABLE_SERVICE,
+ RPAL_CMD_DISABLE_SERVICE,
+ RPAL_CMD_REQUEST_SERVICE,
+ RPAL_CMD_RELEASE_SERVICE,
+ RPAL_CMD_GET_SERVICE_PKEY,
+ RPAL_CMD_UDS_FDMAP,
+ RPAL_NR_CMD,
+};
+
+/* RPAL ioctl macro */
+#define RPAL_IOCTL_MAGIC 0x33
+#define RPAL_IOCTL_GET_API_VERSION_AND_CAP \
+ _IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_GET_API_VERSION_AND_CAP, \
+ struct rpal_version_info *)
+#define RPAL_IOCTL_GET_SERVICE_KEY \
+ _IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_GET_SERVICE_KEY, unsigned long)
+#define RPAL_IOCTL_GET_SERVICE_ID \
+ _IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_GET_SERVICE_ID, int *)
+#define RPAL_IOCTL_REGISTER_SENDER \
+ _IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_REGISTER_SENDER, unsigned long)
+#define RPAL_IOCTL_UNREGISTER_SENDER \
+ _IO(RPAL_IOCTL_MAGIC, RPAL_CMD_UNREGISTER_SENDER)
+#define RPAL_IOCTL_REGISTER_RECEIVER \
+ _IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_REGISTER_RECEIVER, unsigned long)
+#define RPAL_IOCTL_UNREGISTER_RECEIVER \
+ _IO(RPAL_IOCTL_MAGIC, RPAL_CMD_UNREGISTER_RECEIVER)
+#define RPAL_IOCTL_ENABLE_SERVICE \
+ _IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_ENABLE_SERVICE, unsigned long)
+#define RPAL_IOCTL_DISABLE_SERVICE \
+ _IO(RPAL_IOCTL_MAGIC, RPAL_CMD_DISABLE_SERVICE)
+#define RPAL_IOCTL_REQUEST_SERVICE \
+ _IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_REQUEST_SERVICE, unsigned long)
+#define RPAL_IOCTL_RELEASE_SERVICE \
+ _IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_RELEASE_SERVICE, unsigned long)
+#define RPAL_IOCTL_GET_SERVICE_PKEY \
+ _IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_GET_SERVICE_PKEY, int *)
+#define RPAL_IOCTL_UDS_FDMAP \
+ _IOWR(RPAL_IOCTL_MAGIC, RPAL_CMD_UDS_FDMAP, void *)
+
+typedef enum rpal_receiver_status {
+ RPAL_RECEIVER_UNINITIALIZED,
+ RPAL_RECEIVER_INITIALIZED,
+ RPAL_RECEIVER_AVAILABLE,
+} rpal_receiver_status_t;
+
+enum RPAL_CAPABILITIES {
+ RPAL_CAP_PKU,
+};
+
+#define RPAL_SID_SHIFT 24
+#define RPAL_ID_SHIFT 8
+#define RPAL_RECEIVER_STATE_MASK ((1 << RPAL_ID_SHIFT) - 1)
+#define RPAL_SID_MASK (~((1 << RPAL_SID_SHIFT) - 1))
+#define RPAL_ID_MASK (~(0 | RPAL_RECEIVER_STATE_MASK | RPAL_SID_MASK))
+#define RPAL_MAX_ID ((1 << (RPAL_SID_SHIFT - RPAL_ID_SHIFT)) - 1)
+#define RPAL_BUILD_CALL_STATE(id, sid) \
+ ((sid << RPAL_SID_SHIFT) | (id << RPAL_ID_SHIFT) | RPAL_RECEIVER_STATE_CALL)
+
+typedef struct rpal_capability {
+ int compat_version;
+ int api_version;
+ unsigned long cap;
+} rpal_capability_t;
+
+typedef struct task_context {
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long rbx;
+ unsigned long rbp;
+ unsigned long rip;
+ unsigned long rsp;
+} task_context_t;
+
+typedef struct receiver_context {
+ task_context_t task_context;
+ int receiver_id;
+ int receiver_state;
+ int sender_state;
+ int ep_pending;
+ int rpal_ep_poll_magic;
+ int epfd;
+ void *ep_events;
+ int maxevents;
+ int timeout;
+ int64_t total_time;
+} receiver_context_t;
+
+typedef struct rpal_call_info {
+ unsigned long sender_tls_base;
+ uint32_t pkru;
+ fcontext_t sender_fctx;
+} rpal_call_info_t;
+
+enum thread_type {
+ RPAL_RECEIVER = 0x1,
+ RPAL_SENDER = 0x2,
+};
+typedef struct rpal_receiver_info {
+ long tid;
+ unsigned long tls_base;
+
+ int epfd;
+ rpal_receiver_status_t status;
+ epoll_uevent_queue_t ueventq;
+ volatile uint64_t uqlock;
+
+ fcontext_t main_ctx;
+ task_t *ep_stack;
+ task_t *trampoline;
+
+ rpal_call_info_t rci;
+
+ volatile receiver_context_t *rc;
+ struct rpal_thread_pool *rtp;
+} rpal_receiver_info_t;
+
+typedef struct fd_table fd_table_t;
+/* Keep it the same as kernel */
+struct rpal_thread_pool {
+ rpal_receiver_info_t *rris;
+ fd_table_t *fdt;
+ uint64_t service_key;
+ int nr_threads;
+ int service_id;
+ int pkey;
+};
+
+struct rpal_request_arg {
+ unsigned long version;
+ uint64_t key;
+ struct rpal_thread_pool **rtp;
+ int *id;
+ int *pkey;
+};
+
+struct rpal_uds_fdmap_arg {
+ int service_id;
+ int cfd;
+ unsigned long *res;
+};
+
+#define RPAL_ERROR_MAGIC 0x98CC98CC
+
+typedef struct rpal_error_context {
+ unsigned long tls_base;
+ uint64_t erip;
+ uint64_t ersp;
+ int state;
+ int magic;
+} rpal_error_context_t;
+
+typedef struct sender_context {
+ task_context_t task_context;
+ rpal_error_context_t ec;
+ int sender_id;
+ int64_t start_time;
+ int64_t total_time;
+} sender_context_t;
+
+#define RPAL_EP_POLL_MAGIC 0xCC98CC98
+
+typedef struct rpal_sender_info {
+ int idx;
+ int tid;
+ int pkey;
+ int inited;
+ sender_context_t sc;
+} rpal_sender_info_t;
+
+typedef struct fdt_node fdt_node_t;
+
+typedef struct fd_event {
+ int epfd;
+ int fd;
+ struct epoll_event epev;
+ uint32_t events;
+ int wait;
+
+ rpal_queue_t q;
+ int pkey; // unused
+ fdt_node_t *node;
+ struct fd_event *next;
+ uint16_t timestamp;
+ uint16_t outdated;
+ uint64_t service_key;
+} fd_event_t;
+
+struct fdt_node {
+ fd_event_t **events;
+ fdt_node_t *next;
+ int *ref_count;
+ uint16_t *timestamps;
+};
+
+// when sender calls fd_event_get, we must check this number to avoid
+// accessing outdated fdt_node definitions
+
+#define FDTAB_MAG1 0x4D414731UL // add fde lazyswitch
+#define FDTAB_MAG2 0x14D414731UL // add fde timestamp
+#define FDTAB_MAG3 0x34D414731UL // add fde outdated
+#define FDTAB_MAG4 0x74D414731UL // add automatic identification rpal mode
+
+enum fde_ref_status {
+ FDE_FREEING = -100,
+ FDE_FREED = -1,
+ FDE_AVAILABLE = 0,
+};
+
+#define DEFAULT_NODE_SHIFT 14 // 2^14 elements per node
+typedef struct fd_table {
+ fdt_node_t *head;
+ fdt_node_t *tail;
+ int max_fd;
+ unsigned int node_shift;
+ unsigned int node_mask;
+ pthread_mutex_t lock;
+ unsigned long magic;
+ fd_event_t *freelist;
+ pthread_mutex_t list_lock;
+} fd_table_t;
+
+typedef struct critical_section {
+ unsigned long ret_begin;
+ unsigned long ret_end;
+} critical_section_t;
+
+struct rpal_service_metadata {
+ unsigned long version;
+ struct rpal_thread_pool *rtp;
+ critical_section_t rcs;
+ int pkey;
+};
+
+#ifndef RPAL_DEBUG
+#define dbprint(category, format, args...) ((void)0)
+#else
+void dbprint(rpal_debug_flag_t category, char *format, ...)
+ __attribute__((format(printf, 2, 3)));
+#endif
+void errprint(const char *format, ...) __attribute__((format(printf, 1, 2)));
+void warnprint(const char *format, ...) __attribute__((format(printf, 1, 2)));
+
+#endif
diff --git a/samples/rpal/librpal/rpal.c b/samples/rpal/librpal/rpal.c
new file mode 100644
index 000000000000..64bd2b93bd67
--- /dev/null
+++ b/samples/rpal/librpal/rpal.c
@@ -0,0 +1,2351 @@
+#include "private.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/socket.h>
+#include <sys/mman.h>
+#include <sys/eventfd.h>
+#include <linux/futex.h>
+#include <signal.h>
+#include <stdarg.h>
+
+#include "rpal_pkru.h"
+
+/* prints an error message to stderr */
/* Emit a printf-style message to stderr, prefixed with "[RPAL_ERROR] ". */
void errprint(const char *format, ...)
{
	va_list ap;

	va_start(ap, format);
	fprintf(stderr, "[RPAL_ERROR] ");
	vfprintf(stderr, format, ap);
	va_end(ap);
}
+
+/* prints a warning message to stderr */
/* Emit a printf-style message to stderr, prefixed with "[RPAL_WARNING] ". */
void warnprint(const char *format, ...)
{
	va_list ap;

	va_start(ap, format);
	fprintf(stderr, "[RPAL_WARNING] ");
	vfprintf(stderr, format, ap);
	va_end(ap);
}
+
#ifdef RPAL_DEBUG
/* Emit a printf-style debug message to stderr, prefixed with
 * "[RPAL_DEBUG] ", but only when @category is selected by the
 * compile-time RPAL_DEBUG category mask. */
void dbprint(rpal_debug_flag_t category, char *format, ...)
{
	va_list ap;

	if (!(category & RPAL_DEBUG))
		return;

	fprintf(stderr, "[RPAL_DEBUG] ");
	va_start(ap, format);
	vfprintf(stderr, format, ap);
	va_end(ap);
}
#endif
+
+#define SAVE_FPU(mxcsr, fpucw) \
+ __asm__ __volatile__("stmxcsr %0;" \
+ "fnstcw %1;" \
+ : "=m"(mxcsr), "=m"(fpucw) \
+ :)
+#define RESTORE_FPU(mxcsr, fpucw) \
+ __asm__ __volatile__("ldmxcsr %0;" \
+ "fldcw %1;" \
+ : \
+ : "m"(mxcsr), "m"(fpucw))
+
+#define ERRREPORT(EPTR, ECODE, ...) \
+ if (EPTR) { \
+ *EPTR = ECODE; \
+ } \
+ errprint(__VA_ARGS__);
+
+#define RPAL_MGT_FILE "/proc/rpal"
+#define MAX_SUPPROTED_CPUS 192
+
/*
 * Return the index of the least-significant set bit of @word.
 * The result is undefined when @word is zero (matching BSF/TZCNT);
 * callers check for a non-zero word first.
 */
static inline unsigned long __ffs(unsigned long word)
{
	return (unsigned long)__builtin_ctzl(word);
}
+
/*
 * Set bit @idx in @bitmap, an array of 64-bit words.
 *
 * Fix: the previous code divided by 8 (bits per byte) instead of 64
 * (bits per uint64_t word).  With MAX_SENDERS == 256 the bitmap holds
 * BITS_TO_LONGS(256) == 4 words, so any idx >= 32 wrote past the end
 * of the array (out-of-bounds), and bits 8..63 of every word were
 * never used — disagreeing with clear_first_set_bit(), which scans
 * BITS_PER_LONG (64) bits per word.
 */
static void __set_bit(uint64_t *bitmap, int idx)
{
	int bit, i;

	i = idx / 64;	/* 64 == BITS_PER_LONG: bits per uint64_t word */
	bit = idx % 64;
	bitmap[i] |= (1UL << bit);
}
+
/*
 * Find the lowest set bit in @bitmap (limited to @size bits), clear it,
 * and return its index.  Returns -1 when no bit below @size is set;
 * in that case the bitmap is left untouched.
 */
static int clear_first_set_bit(uint64_t *bitmap, int size)
{
	int word;

	for (word = 0; word * 64 < size; word++) {
		uint64_t val = bitmap[word];
		int bit, idx;

		if (!val)
			continue;
		bit = __builtin_ctzll(val);	/* lowest set bit of this word */
		idx = word * 64 + bit;
		if (idx >= size)
			return -1;
		bitmap[word] &= ~(1ULL << bit);
		return idx;
	}
	return -1;
}
+
+extern void rpal_get_critical_addr(critical_section_t *rcs);
+static critical_section_t rcs = { 0 };
+
+#define MAX_SERVICEID 254 // Intel MPK Limit
+#define MIN_RPAL_KERNEL_API_VERSION 1
+#define TARGET_RPAL_KERNEL_API_VERSION \
+ 1 // RPAL will disable when KERNEL_API < TARGET_RPAL_KERNEL_API_VERSION
+
+enum {
+ RCALL_IN = 0x1 << 0,
+ RCALL_OUT = 0x1 << 1,
+};
+
+enum {
+ FDE_NO_TRIGGER,
+ FDE_TRIGGER_OUT,
+};
+
+#define EPOLLRPALINOUT_BITS (EPOLLRPALIN | EPOLLRPALOUT)
+
+#define DEFAULT_QUEUE_SIZE 32U
+
+typedef struct rpal_requested_service {
+ struct rpal_thread_pool *service;
+ int pkey;
+ uint64_t key;
+} rpal_requeseted_service_t;
+
+static int rpal_mgtfd = -1;
+static int inited;
+int pkru_enabled = 0;
+
+static rpal_capability_t version;
+static pthread_key_t rpal_key;
+static rpal_requeseted_service_t requested_services[MAX_SERVICEID];
+static pthread_mutex_t release_lock;
+
+typedef struct rpal_local {
+ unsigned int tflag;
+ rpal_receiver_info_t *rri;
+ rpal_sender_info_t *rsi;
+} rpal_local_t;
+
+#define SENDERS_PAGE_ORDER 3
+#define RPALTHREAD_PAGE_ORDER 0
+
+typedef struct rpal_thread_metadata {
+ int rpal_receiver_idx;
+ int service_id;
+ const int epcpage_order;
+ uint64_t service_key;
+ struct rpal_thread_pool *rtp;
+ receiver_context_t *rc;
+ pid_t pid;
+ int *eventfds;
+} rpal_thread_metadata_t;
+
+static rpal_thread_metadata_t threads_md = {
+ .service_id = -1,
+ .epcpage_order = RPALTHREAD_PAGE_ORDER,
+};
+
+static inline rpal_sender_info_t *current_rpal_sender(void)
+{
+ rpal_local_t *local;
+
+ local = pthread_getspecific(rpal_key);
+ if (local && (local->tflag & RPAL_SENDER)) {
+ return local->rsi;
+ } else {
+ return NULL;
+ }
+}
+
+static inline rpal_receiver_info_t *current_rpal_thread(void)
+{
+ rpal_local_t *local;
+
+ local = pthread_getspecific(rpal_key);
+ if (local && (local->tflag & RPAL_RECEIVER)) {
+ return local->rri;
+ } else {
+ return NULL;
+ }
+}
+
/*
 * Bind @sender to the calling thread's TLS slot, allocating the
 * per-thread rpal_local_t on first use.  Fails if allocation fails or
 * this thread is already registered as a sender.
 */
static status_t rpal_register_sender_local(rpal_sender_info_t *sender)
{
	rpal_local_t *local;
	local = pthread_getspecific(rpal_key);
	if (!local) {
		local = malloc(sizeof(rpal_local_t));
		if (!local)
			return RPAL_FAILURE;
		memset(local, 0, sizeof(rpal_local_t));
		pthread_setspecific(rpal_key, local);
	}
	if (local->tflag & RPAL_SENDER) {
		return RPAL_FAILURE;
	}
	local->rsi = sender;
	local->tflag |= RPAL_SENDER;
	return RPAL_SUCCESS;
}

/*
 * Drop the calling thread's sender binding.  The rpal_local_t is freed
 * once neither the sender nor the receiver role remains.
 */
static status_t rpal_unregister_sender_local(void)
{
	rpal_local_t *local;
	local = pthread_getspecific(rpal_key);
	if (!local || !(local->tflag & RPAL_SENDER))
		return RPAL_FAILURE;

	local->rsi = NULL;
	local->tflag &= ~RPAL_SENDER;
	if (!local->tflag) {
		pthread_setspecific(rpal_key, NULL);
		free(local);
	}
	return RPAL_SUCCESS;
}

/*
 * Bind @thread to the calling thread's TLS slot, allocating the
 * per-thread rpal_local_t on first use.  Fails if allocation fails or
 * this thread is already registered as a receiver.
 */
static status_t rpal_register_receiver_local(rpal_receiver_info_t *thread)
{
	rpal_local_t *local;
	local = pthread_getspecific(rpal_key);
	if (!local) {
		local = malloc(sizeof(rpal_local_t));
		if (!local)
			return RPAL_FAILURE;
		memset(local, 0, sizeof(rpal_local_t));
		pthread_setspecific(rpal_key, local);
	}
	if (local->tflag & RPAL_RECEIVER) {
		return RPAL_FAILURE;
	}
	local->rri = thread;
	local->tflag |= RPAL_RECEIVER;
	return RPAL_SUCCESS;
}

/*
 * Drop the calling thread's receiver binding.  The rpal_local_t is
 * freed once no role (sender or receiver) remains.
 */
static status_t rpal_unregister_receiver_local(void)
{
	rpal_local_t *local;
	local = pthread_getspecific(rpal_key);
	if (!local || !(local->tflag & RPAL_RECEIVER))
		return RPAL_FAILURE;

	local->rri = NULL;
	local->tflag &= ~RPAL_RECEIVER;
	if (!local->tflag) {
		pthread_setspecific(rpal_key, NULL);
		free(local);
	}
	return RPAL_SUCCESS;
}
+
+#define MAX_SENDERS 256
+typedef struct rpal_senders_metadata {
+ uint64_t bitmap[BITS_TO_LONGS(MAX_SENDERS)];
+ pthread_mutex_t lock;
+ int sdpage_order;
+ rpal_sender_info_t *senders;
+} rpal_senders_metadata_t;
+
+static rpal_senders_metadata_t *senders_md;
+
+static long rpal_ioctl(unsigned long cmd, unsigned long arg)
+{
+ struct {
+ unsigned long *ret;
+ unsigned long cmd;
+ unsigned long arg0;
+ unsigned long arg1;
+ } args;
+ const int args_size = sizeof(args);
+ int ret;
+
+ if (rpal_mgtfd == -1) {
+ errprint("rpal_mgtfd is not opened\n");
+ return -1;
+ }
+
+ ret = ioctl(rpal_mgtfd, cmd, arg);
+
+ return ret;
+}
+
/*
 * Register @sender locally (TLS) and then with the kernel.  The local
 * registration is rolled back if the ioctl fails.
 */
static inline long rpal_register_sender(rpal_sender_info_t *sender)
{
	long ret;

	if (rpal_register_sender_local(sender) == RPAL_FAILURE)
		return RPAL_FAILURE;

	ret = rpal_ioctl(RPAL_IOCTL_REGISTER_SENDER,
			 (unsigned long)&sender->sc);
	if (ret < 0) {
		rpal_unregister_sender_local();
	}
	return ret;
}

/*
 * Register @rri locally (TLS) and then with the kernel, passing the
 * shared receiver_context.  The local registration is rolled back if
 * the ioctl fails.
 */
static inline long rpal_register_receiver(rpal_receiver_info_t *rri)
{
	long ret;

	if (rpal_register_receiver_local(rri) == RPAL_FAILURE)
		return RPAL_FAILURE;
	ret = rpal_ioctl(RPAL_IOCTL_REGISTER_RECEIVER,
			 (unsigned long)rri->rc);
	if (ret < 0) {
		rpal_unregister_receiver_local();
	}
	return ret;
}

/* Undo sender registration, locally first and then in the kernel. */
static inline long rpal_unregister_sender(void)
{
	if (rpal_unregister_sender_local() == RPAL_FAILURE)
		return RPAL_FAILURE;
	return rpal_ioctl(RPAL_IOCTL_UNREGISTER_SENDER, 0);
}

/* Undo receiver registration, locally first and then in the kernel. */
static inline long rpal_unregister_receiver(void)
{
	if (rpal_unregister_receiver_local() == RPAL_FAILURE)
		return RPAL_FAILURE;
	return rpal_ioctl(RPAL_IOCTL_UNREGISTER_RECEIVER, 0);
}
+
+static int rpal_get_service_pkey(void)
+{
+ int pkey, ret;
+
+ ret = rpal_ioctl(RPAL_IOCTL_GET_SERVICE_PKEY, (unsigned long)&pkey);
+ if (ret < 0 || pkey == -1) {
+ warnprint("MPK not supported on this host, disabling PKRU\n");
+ return -1;
+ }
+ return pkey;
+}
+
+static int __rpal_get_service_id(void)
+{
+ int id, ret;
+
+ ret = rpal_ioctl(RPAL_IOCTL_GET_SERVICE_ID, (unsigned long)&id);
+
+ if (ret < 0)
+ return ret;
+ else
+ return id;
+}
+
+static uint64_t __rpal_get_service_key(void)
+{
+ int ret;
+ uint64_t key;
+
+ ret = rpal_ioctl(RPAL_IOCTL_GET_SERVICE_KEY, (unsigned long)&key);
+ if (ret < 0)
+ return 0;
+ else
+ return key;
+}
+
+/*
+ * Map 2^order pages of kernel-shared memory from the RPAL management
+ * fd.  Returns NULL on failure.
+ *
+ * Fix: mmap() signals failure with MAP_FAILED ((void *)-1), not NULL.
+ * Callers (e.g. rpal_thread_pool_create) test the result with `!p`,
+ * so a failed mapping used to be treated as success and subsequently
+ * dereferenced.  Translate MAP_FAILED to NULL here.
+ */
+static void *rpal_get_shared_page(int order)
+{
+	void *p;
+	int size;
+	int flags = MAP_SHARED;
+
+	if (rpal_mgtfd == -1)
+		return NULL;
+
+	size = PAGE_SIZE * (1 << order);
+	p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, rpal_mgtfd, 0);
+	if (p == MAP_FAILED)
+		return NULL;
+
+	return p;
+}
+
+/*
+ * Unmap a region obtained from rpal_get_shared_page().  Returns the
+ * munmap() result and logs on failure.
+ */
+static int rpal_free_shared_page(void *page, int order)
+{
+	int ret;
+
+	ret = munmap(page, PAGE_SIZE * (1 << order));
+	if (ret)
+		errprint("munmap fail: %d\n", ret);
+	return ret;
+}
+
+/* Non-zero once global RPAL initialization has completed. */
+static inline int rpal_inited(void)
+{
+	return inited == 1;
+}
+
+/* Bounds check for a sender slot index: 1 when out of range, else 0. */
+static inline int sender_idx_is_invalid(int idx)
+{
+	return (idx < 0 || idx >= MAX_SENDERS) ? 1 : 0;
+}
+
+/*
+ * Reserve a free sender slot from the shared metadata bitmap under
+ * senders_md->lock.  On success returns the slot index and stores the
+ * slot through *sender; on failure returns a negative value and leaves
+ * *sender untouched.
+ */
+static int rpal_sender_info_alloc(rpal_sender_info_t **sender)
+{
+ int idx;
+
+ if (!senders_md)
+ return RPAL_FAILURE;
+ pthread_mutex_lock(&senders_md->lock);
+ /* a cleared bit marks a slot in use; idx < 0 means the pool is full */
+ idx = clear_first_set_bit(senders_md->bitmap, MAX_SENDERS);
+ if (idx < 0) {
+ errprint("sender data alloc failed: %d, bitmap: %lx\n", idx,
+ senders_md->bitmap[0]);
+ goto unlock;
+ }
+ *sender = senders_md->senders + idx;
+
+unlock:
+ pthread_mutex_unlock(&senders_md->lock);
+ return idx;
+}
+
+/*
+ * Return a sender slot to the pool by re-setting its bitmap bit
+ * (allocation clears the bit).  Invalid indices are ignored.
+ *
+ * Fix: also bail out when senders_md has not been set up yet —
+ * rpal_sender_info_alloc() guards against this, but the free path
+ * previously dereferenced senders_md unconditionally.
+ */
+static void rpal_sender_info_free(int idx)
+{
+	if (!senders_md || sender_idx_is_invalid(idx))
+		return;
+
+	pthread_mutex_lock(&senders_md->lock);
+	__set_bit(senders_md->bitmap, idx);
+	pthread_mutex_unlock(&senders_md->lock);
+}
+
+extern unsigned long rpal_get_ret_rip(void);
+
+/* Non-zero once rpal_sender_init() has completed for this slot. */
+static int rpal_sender_inited(rpal_sender_info_t *sender)
+{
+	return sender->inited == 1;
+}
+
+/*
+ * Set up the calling thread as an RPAL sender: allocate a shared slot,
+ * record tid / pkey / return rip, and register with the kernel.  Fails
+ * when RPAL is not initialized or this thread already owns a sender
+ * slot.  The slot is returned to the pool if kernel registration fails.
+ */
+status_t rpal_sender_init(rpal_error_code_t *error)
+{
+ int idx;
+ int ret = RPAL_FAILURE;
+ rpal_sender_info_t *sender;
+
+ if (!rpal_inited()) {
+ ERRREPORT(error, RPAL_DONT_INITED, "%s: rpal do not init\n",
+ __FUNCTION__);
+ goto error_out;
+ }
+ sender = current_rpal_sender();
+ if (sender) {
+ /* already initialized for this thread; report failure */
+ goto error_out;
+ }
+ idx = rpal_sender_info_alloc(&sender);
+ if (idx < 0) {
+ if (error) {
+ *error = RPAL_ERR_SENDER_INIT;
+ }
+ goto error_out;
+ }
+ sender->idx = idx;
+ sender->sc.sender_id = idx;
+ sender->tid = syscall(SYS_gettid);
+ sender->pkey = rpal_get_service_pkey();
+ /* NOTE(review): erip appears to be the user-mode resume address for
+ * fault recovery — confirm against rpal_get_ret_rip(). */
+ sender->sc.ec.erip = rpal_get_ret_rip();
+ ret = rpal_register_sender(sender);
+ if (ret) {
+ ERRREPORT(error, RPAL_ERR_SENDER_REG,
+ "rpal_register_sender error: %d\n", ret);
+ goto sender_register_failed;
+ }
+ sender->inited = 1;
+ return RPAL_SUCCESS;
+
+sender_register_failed:
+ rpal_sender_info_free(idx);
+error_out:
+ return RPAL_FAILURE;
+}
+
+/*
+ * Tear down this thread's sender state: clear the slot fields, tell
+ * the kernel (result deliberately ignored), and return the slot to the
+ * pool.  Always reports RPAL_SUCCESS, including when no sender exists.
+ */
+status_t rpal_sender_exit(void)
+{
+ int idx;
+ rpal_sender_info_t *sender;
+
+ sender = current_rpal_sender();
+
+ if (sender) {
+ idx = sender->idx;
+ sender->idx = 0;
+ sender->tid = 0;
+ rpal_unregister_sender();
+ rpal_sender_info_free(idx);
+ sender->pkey = 0;
+ }
+ return RPAL_SUCCESS;
+}
+
+/*
+ * Publish this process as an RPAL service to the kernel (thread pool
+ * plus receiver contexts), then fetch the pkey the kernel assigned and
+ * cache it in the thread pool.
+ */
+static status_t rpal_enable_service(rpal_error_code_t *error)
+{
+ struct rpal_service_metadata rsm;
+ long ret = 0;
+
+ rsm.version = 0;
+ rsm.rtp = threads_md.rtp;
+ rsm.rcs = rcs;
+ /* -1: no pkey chosen here; queried below via GET_SERVICE_PKEY */
+ rsm.pkey = -1;
+ ret = rpal_ioctl(RPAL_IOCTL_ENABLE_SERVICE, (unsigned long)&rsm);
+ if (ret) {
+ ERRREPORT(error, RPAL_ERR_ENABLE_SERVICE,
+ "rpal enable service failed: %ld\n", ret)
+ return RPAL_FAILURE;
+ }
+ threads_md.rtp->pkey = rpal_get_service_pkey();
+ return RPAL_SUCCESS;
+}
+
+/* Ask the kernel to stop exposing this process as an RPAL service. */
+static status_t rpal_disable_service(void)
+{
+	long ret;
+
+	ret = rpal_ioctl(RPAL_IOCTL_DISABLE_SERVICE, 0);
+	if (ret) {
+		errprint("rpal disable service failed: %ld\n", ret);
+		return RPAL_FAILURE;
+	}
+	return RPAL_SUCCESS;
+}
+
+/*
+ * Record a newly requested service in global slot `id`.  The service
+ * pointer is claimed with a CAS so the same slot cannot be installed
+ * twice.
+ *
+ * NOTE(review): key/pkey are written only after the service pointer
+ * becomes visible, so a concurrent reader (e.g. get_service_from_key)
+ * could briefly observe the pointer with a stale key — confirm whether
+ * callers are serialized.
+ */
+static status_t add_requested_service(struct rpal_thread_pool *rtp, uint64_t key, int id, int pkey)
+{
+ struct rpal_thread_pool *expected = NULL;
+
+ if (!rtp) {
+ errprint("add requested service null\n");
+ return RPAL_FAILURE;
+ }
+
+ if (!__atomic_compare_exchange_n(&requested_services[id].service,
+ &expected, rtp, 1, __ATOMIC_SEQ_CST,
+ __ATOMIC_SEQ_CST)) {
+ /* CAS failure loads the current (non-NULL) owner into expected */
+ errprint("rpal service %d already add, expected: %ld\n", id,
+ expected->service_key);
+ return RPAL_FAILURE;
+ }
+ requested_services[id].key = key;
+ requested_services[id].pkey = pkey;
+ return RPAL_SUCCESS;
+}
+
+/*
+ * Linear scan of the requested-service table for a matching key.
+ * Returns the slot id, or -1 when the key is unknown.
+ */
+int rpal_get_request_service_id(uint64_t key)
+{
+	int id;
+
+	for (id = 0; id < MAX_SERVICEID; id++) {
+		if (requested_services[id].key == key)
+			return id;
+	}
+	return -1;
+}
+
+/*
+ * Find the thread pool of a requested service by key, or NULL when the
+ * key is not in the table.  (Removed the unused local `rtp`.)
+ */
+static struct rpal_thread_pool *get_service_from_key(uint64_t key)
+{
+	int i;
+
+	for (i = 0; i < MAX_SERVICEID; i++) {
+		if (requested_services[i].key == key)
+			return requested_services[i].service;
+	}
+	return NULL;
+}
+
+/* Thread pool registered in requested-service slot `id` (may be NULL). */
+static inline struct rpal_thread_pool *get_service_from_id(int id)
+{
+ return requested_services[id].service;
+}
+
+/* Protection key recorded for requested-service slot `id`. */
+static inline int get_service_pkey_from_id(int id)
+{
+ return requested_services[id].pkey;
+}
+
+/*
+ * Remove a requested service by key, atomically swapping its slot
+ * pointer to NULL.  Returns the old thread pool, or NULL when the key
+ * is not present.
+ */
+static struct rpal_thread_pool *del_requested_service(uint64_t key)
+{
+	int id;
+
+	id = rpal_get_request_service_id(key);
+	if (id == -1)
+		return NULL;
+
+	return __atomic_exchange_n(&requested_services[id].service, NULL,
+				   __ATOMIC_RELAXED);
+}
+
+/*
+ * Ask the kernel for access to the service identified by `key` and
+ * record the returned thread pool / id / pkey in the local table.
+ * The kernel-side grant is released again if the local insert fails.
+ * Returns RPAL_SUCCESS, or a non-zero error value.
+ */
+int rpal_request_service(uint64_t key)
+{
+ struct rpal_request_arg rra;
+ long ret = RPAL_FAILURE;
+ struct rpal_thread_pool *rtp;
+ int id, pkey;
+
+ if (!rpal_inited()) {
+ errprint("%s: rpal do not init\n", __FUNCTION__);
+ goto error_out;
+ }
+
+ /* out-parameters filled by the kernel on success */
+ rra.version = 0;
+ rra.key = key;
+ rra.rtp = &rtp;
+ rra.id = &id;
+ rra.pkey = &pkey;
+ ret = rpal_ioctl(RPAL_IOCTL_REQUEST_SERVICE, (unsigned long)&rra);
+ if (ret) {
+ goto error_out;
+ }
+
+ ret = add_requested_service(rtp, key, id, pkey);
+ if (ret == RPAL_FAILURE) {
+ goto add_requested_failed;
+ }
+
+ return RPAL_SUCCESS;
+
+add_requested_failed:
+ rpal_ioctl(RPAL_IOCTL_RELEASE_SERVICE, key);
+error_out:
+ return (int)ret;
+}
+
+static void fdt_freelist_forcefree(fd_table_t *fdt, uint64_t service_key);
+
+/*
+ * Drop our reference to a remote service: remove it from the local
+ * table (so no new calls target it), tell the kernel, then force-free
+ * any fd events still tied to the service's key.
+ * (Removed the unused local `rtp` — the return value of
+ * del_requested_service() was never used.)
+ */
+status_t rpal_release_service(uint64_t key)
+{
+	long ret;
+
+	if (!rpal_inited()) {
+		errprint("%s: rpal do not init\n", __FUNCTION__);
+		return RPAL_FAILURE;
+	}
+
+	del_requested_service(key);
+	ret = rpal_ioctl(RPAL_IOCTL_RELEASE_SERVICE, key);
+	if (ret) {
+		errprint("rpal release service failed: %ld\n", ret);
+		return RPAL_FAILURE;
+	}
+	fdt_freelist_forcefree(threads_md.rtp->fdt, key);
+	return RPAL_SUCCESS;
+}
+
+/*
+ * If the exited service identified by `key` still holds this
+ * receiver's uevent-queue lock (lock word == key | bit 63), repair the
+ * queue and force the lock word back to 0 so the queue stays usable.
+ */
+static void try_clean_lock(rpal_receiver_info_t *rri, uint64_t key)
+{
+ uint64_t lock_state = key | 1UL << 63;
+
+ if (__atomic_load_n(&rri->uqlock, __ATOMIC_RELAXED) == lock_state)
+ uevent_queue_fix(&rri->ueventq);
+
+ /* CAS only succeeds if the dead service is still the owner */
+ if (__atomic_compare_exchange_n(&rri->uqlock, &lock_state, (uint64_t)0,
+ 1, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+ dbprint(RPAL_DEBUG_MANAGEMENT,
+ "Serivce (key: %lu) does exit with holding lock\n",
+ key);
+}
+
+/* Snapshot of exited-service keys read from rpal_mgtfd. */
+struct release_info {
+ uint64_t keys[KEY_SIZE];
+ int size; /* number of valid entries in keys[] */
+};
+
+/*
+ * First phase of service cleanup.  Reads the keys of exited services
+ * from the management fd, repairs any uevent-queue locks those
+ * services may still hold, and passes the snapshot back through *ptr
+ * for rpal_clean_service_end() to release.  Caller must later invoke
+ * rpal_clean_service_end(ptr) to free the allocation.
+ */
+status_t rpal_clean_service_start(int64_t *ptr)
+{
+ rpal_receiver_info_t *rri;
+ struct release_info *info;
+ int i, j;
+ int size;
+
+ if (!ptr) {
+ goto error_out;
+ }
+
+ info = malloc(sizeof(struct release_info));
+ if (info == NULL) {
+ errprint("alloc release_info fail\n");
+ goto error_out;
+ }
+
+ pthread_mutex_lock(&release_lock);
+ size = read(rpal_mgtfd, info->keys, KEY_SIZE * sizeof(uint64_t));
+ if (size <= 0) {
+ errprint("Read keys on rpal_mgtfd failed\n");
+ goto error_unlock;
+ }
+
+ /* read() returned a byte count; convert it to a key count */
+ size /= sizeof(uint64_t);
+ info->size = size;
+
+ /* break any uevent-queue locks the dead services still hold */
+ for (i = 0; i < size; i++) {
+ for (j = 0; j < threads_md.rtp->nr_threads; j++) {
+ rri = threads_md.rtp->rris + j;
+ try_clean_lock(rri, info->keys[i]);
+ }
+ }
+ pthread_mutex_unlock(&release_lock);
+ *ptr = (int64_t)info;
+ return RPAL_SUCCESS;
+
+error_unlock:
+ pthread_mutex_unlock(&release_lock);
+ free(info);
+error_out:
+ return RPAL_FAILURE;
+}
+
+/*
+ * Second phase of service cleanup: release every key captured by
+ * rpal_clean_service_start() and free the snapshot it allocated.
+ */
+void rpal_clean_service_end(int64_t *ptr)
+{
+	struct release_info *info;
+	int i;
+
+	if (!ptr)
+		return;
+
+	info = (struct release_info *)(*ptr);
+	if (!info)
+		return;
+
+	for (i = 0; i < info->size; i++) {
+		dbprint(RPAL_DEBUG_MANAGEMENT, "release service: 0x%lx\n",
+			info->keys[i]);
+		rpal_release_service(info->keys[i]);
+	}
+	free(info);
+}
+/* Public accessor for this service's id; fails before initialization. */
+int rpal_get_service_id(void)
+{
+	return rpal_inited() ? threads_md.service_id : RPAL_FAILURE;
+}
+
+/*
+ * Copy this service's key to *service_key.  Fails before
+ * initialization or when service_key is NULL.
+ */
+status_t rpal_get_service_key(uint64_t *service_key)
+{
+	if (!rpal_inited() || !service_key)
+		return RPAL_FAILURE;
+
+	*service_key = threads_md.service_key;
+	return RPAL_SUCCESS;
+}
+
+/*
+ * Append one node (1 << node_shift slots) to the fd table.
+ *
+ * events[] starts NULL and timestamps[] zeroed (calloc); ref_count[]
+ * is filled with 0xff bytes so every slot begins at -1 (the freed
+ * state).  On success the node is linked at the tail and fdt->max_fd
+ * grows; on allocation failure everything is rolled back and NULL is
+ * returned.  Also fixes the element-size expression: the events array
+ * holds fd_event_t* elements, so sizeof(fd_event_t *) is the correct
+ * unit (the original sizeof(fd_event_t **) happened to match only
+ * because all object pointers are the same size here).
+ */
+static fdt_node_t *fdt_node_alloc(fd_table_t *fdt)
+{
+	fdt_node_t *node;
+	fd_event_t **ev;
+	int *ref_count;
+	uint16_t *timestamps;
+	size_t nslots = (size_t)1 << fdt->node_shift;
+
+	node = malloc(sizeof *node);
+	if (!node)
+		goto node_alloc_failed;
+
+	ev = calloc(nslots, sizeof *ev);
+	if (!ev)
+		goto events_alloc_failed;
+
+	ref_count = malloc(nslots * sizeof *ref_count);
+	if (!ref_count)
+		goto used_alloc_failed;
+	/* 0xff bytes == -1 per int: every slot starts out freed */
+	memset(ref_count, 0xff, nslots * sizeof *ref_count);
+
+	timestamps = calloc(nslots, sizeof *timestamps);
+	if (!timestamps)
+		goto ts_alloc_failed;
+
+	node->events = ev;
+	node->ref_count = ref_count;
+	node->next = NULL;
+	node->timestamps = timestamps;
+	if (!fdt->head) {
+		fdt->head = node;
+		fdt->tail = node;
+	} else {
+		fdt->tail->next = node;
+		fdt->tail = node;
+	}
+	fdt->max_fd += (1 << fdt->node_shift);
+	return node;
+
+ts_alloc_failed:
+	free(ref_count);
+used_alloc_failed:
+	free(ev);
+events_alloc_failed:
+	free(node);
+node_alloc_failed:
+	errprint("%s Error!!! max_fd: %d\n", __FUNCTION__, fdt->max_fd);
+	return NULL;
+}
+
+/* Free every node of an fd table along with its per-node arrays. */
+static void fdt_node_free_all(fd_table_t *fdt)
+{
+	fdt_node_t *node = fdt->head;
+
+	while (node) {
+		fdt_node_t *next = node->next;
+
+		free(node->timestamps);
+		free(node->ref_count);
+		free(node->events);
+		free(node);
+		node = next;
+	}
+}
+
+/*
+ * Grow the table until `fd` fits.  Returns the node that covers fd, or
+ * NULL if allocation failed (or if fd was already in range, matching
+ * the original behavior — callers only invoke this when it is not).
+ */
+static fdt_node_t *fdt_node_expand(fd_table_t *fdt, int fd)
+{
+	fdt_node_t *node = NULL;
+
+	while (fd >= fdt->max_fd) {
+		node = fdt_node_alloc(fdt);
+		if (!node)
+			return NULL;
+	}
+	return node;
+}
+
+/*
+ * Walk the node list to the node covering `fd`.  Returns NULL when fd
+ * is beyond max_fd or the list is shorter than max_fd implies (which
+ * is logged as an internal inconsistency).
+ */
+static fdt_node_t *fdt_node_search(fd_table_t *fdt, int fd)
+{
+	fdt_node_t *node;
+	int hops;
+
+	if (fd >= fdt->max_fd)
+		return NULL;
+
+	node = fdt->head;
+	for (hops = fd >> fdt->node_shift; hops; hops--) {
+		if (!node) {
+			errprint(
+				"fdt node search ERROR! fd: %d, pos: %d, fdt->max_fd: %d\n",
+				fd, hops, fdt->max_fd);
+			return NULL;
+		}
+		node = node->next;
+	}
+	return node;
+}
+
+/*
+ * Allocate an empty fd table.  fdt->lock is created process-shared so
+ * other processes mapping this memory can take it; list_lock guards
+ * the freelist and stays process-private.
+ *
+ * Fix: destroy the pthread_mutexattr_t once the mutex is initialized —
+ * POSIX requires a matching pthread_mutexattr_destroy() and the
+ * original leaked the attribute object.
+ */
+static fd_table_t *fd_table_alloc(unsigned int node_shift)
+{
+	fd_table_t *fdt;
+	pthread_mutexattr_t mattr;
+
+	fdt = malloc(sizeof *fdt);
+	if (!fdt)
+		return NULL;
+
+	fdt->head = NULL;
+	fdt->tail = NULL;
+	fdt->max_fd = 0;
+	fdt->node_shift = node_shift;
+	fdt->node_mask = (1 << node_shift) - 1;
+	fdt->freelist = NULL;
+	pthread_mutex_init(&fdt->list_lock, NULL);
+
+	pthread_mutexattr_init(&mattr);
+	pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
+	pthread_mutex_init(&fdt->lock, &mattr);
+	pthread_mutexattr_destroy(&mattr);
+	return fdt;
+}
+
+/* Destroy an fd table and all its nodes; NULL is a safe no-op. */
+static void fd_table_free(fd_table_t *fdt)
+{
+	if (fdt) {
+		fdt_node_free_all(fdt);
+		free(fdt);
+	}
+}
+
+/*
+ * Create an event record for (fd, epfd) carrying a copy of the epoll
+ * registration and a fresh user-event queue of DEFAULT_QUEUE_SIZE
+ * entries.  Returns NULL on allocation or queue-init failure with
+ * everything rolled back.
+ */
+static inline fd_event_t *fd_event_alloc(int fd, int epfd,
+ struct epoll_event *event)
+{
+ fd_event_t *fde;
+ uint64_t *qdata;
+
+ fde = (fd_event_t *)malloc(sizeof(fd_event_t));
+ if (!fde)
+ return NULL;
+
+ fde->fd = fd;
+ fde->epfd = epfd;
+ fde->epev = *event;
+ fde->events = 0;
+ fde->node = NULL;
+ fde->next = NULL;
+ fde->timestamp = 0;
+ fde->service_key = 0;
+ __atomic_store_n(&fde->outdated, (uint16_t)0, __ATOMIC_RELEASE);
+
+ qdata = malloc(DEFAULT_QUEUE_SIZE * sizeof(uint64_t));
+ if (!qdata) {
+ errprint("malloc queue data failed\n");
+ goto malloc_error;
+ }
+ if (rpal_queue_init(&fde->q, qdata, DEFAULT_QUEUE_SIZE)) {
+ errprint("fde queue alloc failed, fd: %d\n", fd);
+ goto init_error;
+ }
+ return fde;
+
+init_error:
+ free(qdata);
+malloc_error:
+ free(fde);
+ return NULL;
+}
+
+/*
+ * Release an fd event and the queue buffer it owns.  NULL is a no-op;
+ * rpal_queue_destroy() hands back the buffer passed to rpal_queue_init.
+ */
+static inline void fd_event_free(fd_event_t *fde)
+{
+	if (!fde)
+		return;
+
+	free(rpal_queue_destroy(&fde->q));
+	free(fde);
+}
+
+/*
+ * Push an event onto the lazy-free list (head insertion).
+ *
+ * The original special-cased an empty list but then did not reset
+ * fde->next, relying on it still being NULL from allocation; the
+ * unified push below assigns fde->next unconditionally, which both
+ * removes the redundant branch and makes the empty-list case robust
+ * against a stale next pointer.
+ */
+static void fdt_freelist_insert(fd_table_t *fdt, fd_event_t *fde)
+{
+	if (!fde)
+		return;
+
+	pthread_mutex_lock(&fdt->list_lock);
+	fde->next = fdt->freelist;
+	fdt->freelist = fde;
+	pthread_mutex_unlock(&fdt->list_lock);
+}
+
+/*
+ * Unconditionally free every freelist entry belonging to
+ * `service_key` (used when that service exits): the slot refcount is
+ * forced to FDE_FREEING regardless of outstanding references, the
+ * entry is unlinked, and the slot is marked freed (-1).
+ */
+static void fdt_freelist_forcefree(fd_table_t *fdt, uint64_t service_key)
+{
+ fd_event_t *prev, *pos, *f_fde;
+ fdt_node_t *node;
+ int idx;
+
+ pthread_mutex_lock(&fdt->list_lock);
+ prev = NULL;
+ pos = fdt->freelist;
+ while (pos) {
+ idx = pos->fd & fdt->node_mask;
+ node = pos->node;
+ if (pos->service_key == service_key) {
+ __atomic_exchange_n(&node->ref_count[idx], FDE_FREEING,
+ __ATOMIC_RELAXED);
+ if (!prev) {
+ fdt->freelist = pos->next;
+ } else {
+ prev->next = pos->next;
+ }
+ f_fde = pos;
+ pos = pos->next;
+ node->events[idx] = NULL;
+ __atomic_store_n(&node->ref_count[idx], -1,
+ __ATOMIC_RELEASE);
+ fd_event_free(f_fde);
+ } else {
+ prev = pos;
+ pos = pos->next;
+ }
+ }
+ pthread_mutex_unlock(&fdt->list_lock);
+ return;
+}
+
+/*
+ * Free those freelist entries whose slot refcount has dropped back to
+ * FDE_AVAILABLE (no readers left); the CAS to FDE_FREEING also blocks
+ * new fd_event_get() references.  Entries still referenced stay on the
+ * list for a later pass.
+ */
+static void fdt_freelist_lazyfree(fd_table_t *fdt)
+{
+ fd_event_t *prev, *pos, *f_fde;
+ fdt_node_t *node;
+ int idx;
+ int expected;
+
+ pthread_mutex_lock(&fdt->list_lock);
+ prev = NULL;
+ pos = fdt->freelist;
+
+ while (pos) {
+ idx = pos->fd & fdt->node_mask;
+ // do lazyfree when ref_count less than 0
+ expected = FDE_AVAILABLE;
+ node = pos->node;
+ if (__atomic_compare_exchange_n(
+ &node->ref_count[idx], &expected, FDE_FREEING, 1,
+ __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+ if (!prev) {
+ fdt->freelist = pos->next;
+ } else {
+ prev->next = pos->next;
+ }
+ f_fde = pos;
+ pos = pos->next;
+ node->events[idx] = NULL;
+ __atomic_store_n(&node->ref_count[idx], -1,
+ __ATOMIC_RELEASE);
+ fd_event_free(f_fde);
+ } else {
+ /* expected now holds the observed count; negative here
+ * would mean another freer raced us */
+ if (expected < 0) {
+ errprint("error ref: %d, fd: %d\n", expected,
+ pos->fd);
+ }
+ prev = pos;
+ pos = pos->next;
+ }
+ }
+ pthread_mutex_unlock(&fdt->list_lock);
+ return;
+}
+
+/*
+ * Current install timestamp of fd's slot (bumped on every install),
+ * or 0 when fd is not covered by the table.
+ */
+static uint16_t fde_timestamp_get(fd_table_t *fdt, int fd)
+{
+	fdt_node_t *node;
+
+	node = fdt_node_search(fdt, fd);
+	if (!node)
+		return 0;
+
+	return node->timestamps[fd & fdt->node_mask];
+}
+
+static void fd_event_put(fd_table_t *fdt, fd_event_t *fde);
+
+/*
+ * Take a reference on the event registered for fd.
+ *
+ * The per-slot ref_count doubles as a state word: negative values mean
+ * freed/being-freed, so the CAS loop only pins live slots.  Returns
+ * NULL — with no reference held — when the slot is empty, being torn
+ * down, or marked outdated.
+ *
+ * Fix: the original leaked a reference when the slot's event pointer
+ * turned out to be NULL after a successful increment, permanently
+ * preventing the slot from ever reaching FDE_AVAILABLE again; the
+ * error path now drops the count it took.
+ */
+static fd_event_t *fd_event_get(fd_table_t *fdt, int fd)
+{
+	fd_event_t *fde = NULL;
+	fdt_node_t *node;
+	int idx;
+	int val = -1;
+	int expected;
+
+	node = fdt_node_search(fdt, fd);
+	if (!node) {
+		return NULL;
+	}
+	idx = fd & fdt->node_mask;
+
+retry:
+	val = __atomic_load_n(&node->ref_count[idx], __ATOMIC_ACQUIRE);
+	if (val < 0)
+		return NULL;
+	expected = val;
+	val++;
+	if (!__atomic_compare_exchange_n(&node->ref_count[idx], &expected, val,
+					 1, __ATOMIC_SEQ_CST,
+					 __ATOMIC_SEQ_CST)) {
+		if (expected >= 0) {
+			goto retry;
+		} else {
+			return NULL;
+		}
+	}
+	fde = node->events[idx];
+	if (!fde) {
+		errprint("error get: %d, fd: %d\n", val, fd);
+		/* drop the reference taken above */
+		__atomic_sub_fetch(&node->ref_count[idx], 1,
+				   __ATOMIC_RELEASE);
+	} else {
+		if (__atomic_load_n(&fde->outdated, __ATOMIC_ACQUIRE)) {
+			fd_event_put(fdt, fde);
+			fde = NULL;
+		}
+	}
+	return fde;
+}
+
+/*
+ * Drop a reference taken by fd_event_get().  A negative result means
+ * an unbalanced put and is logged; NULL is a no-op.
+ */
+static void fd_event_put(fd_table_t *fdt, fd_event_t *fde)
+{
+	int idx, remaining;
+
+	if (!fde)
+		return;
+
+	idx = fde->fd & fdt->node_mask;
+	remaining = __atomic_sub_fetch(&fde->node->ref_count[idx], 1,
+				       __ATOMIC_RELEASE);
+	if (remaining < 0)
+		errprint("error put: %d, fd: %d\n", remaining, fde->fd);
+}
+
+int rpal_access(void *addr, access_fn do_access, int *ret, va_list va);
+
+/*
+ * Plain access path: invoke the callback directly and report its
+ * result through *ret when the caller asked for it.  `addr` is unused
+ * here; it exists to match the fault-protected variant's signature.
+ */
+int rpal_access(void *addr, access_fn do_access, int *ret, va_list va)
+{
+	int rc = do_access(va);
+
+	if (ret)
+		*ret = rc;
+	return RPAL_SUCCESS;
+}
+
+extern status_t rpal_access_warpper(void *addr, access_fn do_access, int *ret,
+ va_list va);
+
+/*
+ * Like rpal_read_access_safety() but snapshots and restores the
+ * caller's PKRU afterwards, so the callback may also be granted write
+ * access to remote memory for its duration.
+ */
+#define rpal_write_access_safety(ACCESS_FUNC, FUNC_RET, ...) \
+ ({ \
+ status_t __access = RPAL_FAILURE; \
+ uint32_t old_pkru = 0; \
+ old_pkru = rdpkru(); \
+ __access = rpal_read_access_safety(ACCESS_FUNC, FUNC_RET, \
+ ##__VA_ARGS__); \
+ wrpkru(old_pkru); \
+ __access; \
+ })
+
+/*
+ * Run `do_access` with RPAL fault recovery armed: while the magic word
+ * is set in the sender context, a faulting remote access can be
+ * unwound via the recorded erip/ersp instead of crashing.  Lazily
+ * initializes this thread's sender state on first use.  Returns the
+ * wrapper's status; the callback's own result goes through *ret.
+ */
+status_t rpal_read_access_safety(access_fn do_access, int *ret, ...)
+{
+ rpal_sender_info_t *sender;
+ sender_context_t *sc;
+ rpal_error_code_t error;
+ status_t access = RPAL_FAILURE;
+ va_list args;
+
+ sender = current_rpal_sender();
+ if (!sender || !rpal_sender_inited(sender)) {
+ dbprint(RPAL_DEBUG_SENDER, "%s: sender(%d) do not init\n",
+ __FUNCTION__, getpid());
+ if (RPAL_FAILURE == rpal_sender_init(&error)) {
+ return RPAL_FAILURE;
+ }
+ sender = current_rpal_sender();
+ }
+ sc = &sender->sc;
+ /* arm recovery for the duration of the access */
+ sc->ec.magic = RPAL_ERROR_MAGIC;
+ va_start(args, ret);
+ access = rpal_access_warpper(&(sc->ec.ersp), do_access, ret, args);
+ va_end(args);
+ sc->ec.magic = 0;
+
+ return access;
+}
+
+/*
+ * Kernel round trip: translate (service_id, connfd) into the peer's
+ * rpal fd word.  Returns RPAL_FAILURE when the ioctl fails.
+ */
+static int64_t __do_rpal_uds_fdmap(int service_id, int connfd)
+{
+	struct rpal_uds_fdmap_arg arg;
+	int64_t res;
+
+	arg.cfd = connfd;
+	arg.service_id = service_id;
+	arg.res = &res;
+	if (rpal_ioctl(RPAL_IOCTL_UDS_FDMAP, (unsigned long)&arg) < 0)
+		return RPAL_FAILURE;
+
+	return res;
+}
+
+/*
+ * va_list worker for rpal_uds_fdmap(): map (service id, connfd) to the
+ * peer's rpal fd word, stamp it with the peer slot's current
+ * timestamp, and remember which service the local fd now belongs to.
+ * Args via va: uint64_t sid_fd (high32 = service id, low32 = connfd),
+ * uint64_t *rpalfd (result out-pointer).
+ */
+static status_t do_rpal_uds_fdmap(va_list va)
+{
+ int64_t ret;
+ int sfd, cfd, sid;
+ struct rpal_thread_pool *srtp;
+ uint64_t stamp = 0;
+ uint64_t sid_fd;
+ uint64_t *rpalfd;
+ fd_event_t *fde;
+
+ sid_fd = va_arg(va, uint64_t);
+ rpalfd = va_arg(va, uint64_t *);
+
+ if (!rpalfd) {
+ return RPAL_FAILURE;
+ }
+ sid = get_high32(sid_fd);
+ cfd = get_low32(sid_fd);
+
+ ret = __do_rpal_uds_fdmap(sid, cfd);
+ if (ret < 0) {
+ errprint("%s failed %ld, cfd: %d\n", __FUNCTION__, ret, cfd);
+ return RPAL_FAILURE;
+ }
+
+ srtp = get_service_from_id(sid);
+ if (!srtp) {
+ errprint("%s INVALID service_id: %d\n", __FUNCTION__, sid);
+ return RPAL_FAILURE;
+ }
+ /* embed the peer slot's timestamp so stale mappings can be detected */
+ sfd = get_sfd(ret);
+ stamp = fde_timestamp_get(srtp->fdt, sfd);
+ ret |= (stamp << HIGH16_OFFSET);
+
+ /* tag the local fd with the owning service; a lookup failure is
+ * logged but the mapping itself is still returned */
+ fde = fd_event_get(threads_md.rtp->fdt, cfd);
+ if (!fde) {
+ errprint("%s get self fde error, fd: %d\n", __FUNCTION__, cfd);
+ goto out;
+ }
+ fde->service_key = srtp->service_key;
+ fd_event_put(threads_md.rtp->fdt, fde);
+out:
+ *rpalfd = ret;
+ return RPAL_SUCCESS;
+}
+
+/*
+ * Resolve the receiver id of the peer behind (service id, connfd)
+ * packed into sid_fd.  Returns RPAL_FAILURE when the kernel mapping
+ * fails.
+ */
+int rpal_get_peer_rid(uint64_t sid_fd)
+{
+	int sid = get_high32(sid_fd);
+	int cfd = get_low32(sid_fd);
+	int64_t mapped;
+
+	mapped = __do_rpal_uds_fdmap(sid, cfd);
+	if (mapped < 0) {
+		errprint("%s failed %ld, cfd: %d\n", __FUNCTION__, mapped,
+			 cfd);
+		return RPAL_FAILURE;
+	}
+	return get_rid(mapped);
+}
+
+/*
+ * Public entry for UDS fd mapping: temporarily widens PKRU so the
+ * peer's metadata can be read, then runs do_rpal_uds_fdmap() under
+ * fault protection.  The original PKRU value is restored either way.
+ */
+status_t rpal_uds_fdmap(uint64_t sid_fd, uint64_t *rpalfd)
+{
+ status_t ret = RPAL_FAILURE;
+ status_t access;
+ uint32_t old_pkru;
+
+ old_pkru = rdpkru();
+ wrpkru(old_pkru & RPAL_PKRU_BASE_CODE_READ);
+ access = rpal_read_access_safety(do_rpal_uds_fdmap, &ret, sid_fd,
+ rpalfd);
+ wrpkru(old_pkru);
+ if (access == RPAL_FAILURE) {
+ return RPAL_FAILURE;
+ }
+ return ret;
+}
+
+/*
+ * Install a new event record for fd, growing the node list on demand.
+ * The slot must currently be FDE_FREED; publication happens via the
+ * ref_count CAS from FDE_FREED to FDE_AVAILABLE, which makes the entry
+ * visible to fd_event_get().  RPAL_FAILURE on any conflict, with the
+ * freshly allocated record released.
+ */
+static status_t fd_event_install(fd_table_t *fdt, int fd, int epfd,
+ struct epoll_event *event)
+{
+ fdt_node_t *node;
+ fd_event_t *fde;
+ int idx;
+ int expected;
+
+ fde = fd_event_alloc(fd, epfd, event);
+ if (!fde) {
+ goto fde_error;
+ }
+ pthread_mutex_lock(&fdt->lock);
+ if (fd >= fdt->max_fd) {
+ node = fdt_node_expand(fdt, fd);
+ } else {
+ node = fdt_node_search(fdt, fd);
+ }
+ pthread_mutex_unlock(&fdt->lock);
+
+ if (!node) {
+ errprint("fd node search failed, fd: %d\n", fd);
+ goto node_error;
+ }
+ idx = fd & fdt->node_mask;
+ /* give parked entries a chance to release this slot first */
+ fdt_freelist_lazyfree(fdt);
+ expected = __atomic_load_n(&node->ref_count[idx], __ATOMIC_ACQUIRE);
+ if (expected != FDE_FREED) {
+ goto node_error;
+ }
+ fde->timestamp =
+ __atomic_add_fetch(&node->timestamps[idx], 1, __ATOMIC_RELEASE);
+ fde->node = node;
+ node->events[idx] = fde;
+ if (!__atomic_compare_exchange_n(&node->ref_count[idx], &expected,
+ FDE_AVAILABLE, 1, __ATOMIC_SEQ_CST,
+ __ATOMIC_SEQ_CST)) {
+ errprint("may override fd: %d, val: %d\n", fd, expected);
+ node->events[idx] = NULL;
+ goto node_error;
+ }
+ return RPAL_SUCCESS;
+
+node_error:
+ fd_event_free(fde);
+fde_error:
+ return RPAL_FAILURE;
+}
+
+/*
+ * Remove fd's event.  If no reader currently holds a reference
+ * (ref_count == FDE_AVAILABLE) the record is freed immediately;
+ * otherwise it is marked outdated and parked on the freelist so
+ * fdt_freelist_lazyfree() can reclaim it once the last reference
+ * drops.
+ *
+ * NOTE(review): node->events[idx] is read here without taking a
+ * reference — confirm concurrent uninstall/get on the same fd is
+ * excluded by callers.
+ */
+static status_t fd_event_uninstall(fd_table_t *fdt, int fd)
+{
+ fd_event_t *fde;
+ fdt_node_t *node;
+ int idx;
+ int ret = RPAL_SUCCESS;
+ int expected;
+
+ node = fdt_node_search(fdt, fd);
+ if (!node) {
+ ret = RPAL_FAILURE;
+ goto out;
+ }
+ idx = fd & fdt->node_mask;
+ fde = node->events[idx];
+ if (!fde) {
+ ret = RPAL_FAILURE;
+ goto out;
+ }
+ expected = FDE_AVAILABLE;
+ __atomic_store_n(&fde->outdated, (uint16_t)1, __ATOMIC_RELEASE);
+ if (__atomic_compare_exchange_n(&node->ref_count[idx], &expected,
+ FDE_FREEING, 1, __ATOMIC_SEQ_CST,
+ __ATOMIC_SEQ_CST)) {
+ node->events[idx] = NULL;
+ __atomic_store_n(&node->ref_count[idx], -1, __ATOMIC_RELEASE);
+ fd_event_free(fde);
+ } else {
+ if (expected < FDE_AVAILABLE) {
+ errprint("error cnt: %d, fd: %d\n", expected, fde->fd);
+ }
+ // link this fde for free_head
+ fdt_freelist_insert(fdt, fde);
+ }
+
+out:
+ fdt_freelist_lazyfree(fdt);
+ return ret;
+}
+
+/*
+ * Replace the stored epoll registration for fd and clear its pending
+ * user-event bits.  Fails when fd has no installed (live) event.
+ */
+static status_t fd_event_modify(fd_table_t *fdt, int fd,
+				struct epoll_event *event)
+{
+	fd_event_t *fde = fd_event_get(fdt, fd);
+
+	if (!fde) {
+		errprint("fde MOD fd(%d) ERROR!\n", fd);
+		return RPAL_FAILURE;
+	}
+
+	fde->fd = fd;
+	fde->epev = *event;
+	fde->events = 0;
+	fd_event_put(fdt, fde);
+	return RPAL_SUCCESS;
+}
+
+/*
+ * Initialize receiver slot `id` of the pool: allocate its epoll fiber
+ * stack and trampoline stack, and wire up its shared receiver context.
+ * Returns 0 on success, -1 on allocation failure (with the stack that
+ * did allocate freed again).
+ */
+static int rpal_receiver_info_create(struct rpal_thread_pool *rtp, int id)
+{
+ rpal_receiver_info_t *rri = &rtp->rris[id];
+
+ rri->ep_stack = fiber_ctx_alloc(NULL, NULL, DEFUALT_STACK_SIZE);
+ if (!rri->ep_stack)
+ return -1;
+
+ rri->trampoline = fiber_ctx_alloc(NULL, NULL, TRAMPOLINE_SIZE);
+ if (!rri->trampoline) {
+ fiber_ctx_free(rri->ep_stack);
+ return -1;
+ }
+
+ /* receiver contexts live in the kernel-shared page array */
+ rri->rc = threads_md.rc + id;
+ rri->rc->receiver_id = id;
+ rri->rtp = rtp;
+
+ return 0;
+}
+
+/* Free the two fiber stacks owned by a receiver slot. */
+static void rpal_receiver_info_destroy(rpal_receiver_info_t *rri)
+{
+	fiber_ctx_free(rri->ep_stack);
+	fiber_ctx_free(rri->trampoline);
+}
+
+/*
+ * Build the service's receiver thread pool: eventfd table, per-receiver
+ * info array, fd table, the kernel-shared context page(s), and one
+ * initialized receiver slot per thread.  Returns NULL on any failure,
+ * unwinding everything allocated so far in reverse order, and refuses
+ * to run twice (rpal_inited() guard).
+ */
+static struct rpal_thread_pool *rpal_thread_pool_create(int nr_threads,
+ rpal_thread_metadata_t *rtm)
+{
+ void *p;
+ int i, j;
+ struct rpal_thread_pool *rtp;
+
+ if (rpal_inited())
+ goto out;
+ rtp = malloc(sizeof(struct rpal_thread_pool));
+ if (rtp == NULL) {
+ goto out;
+ }
+ threads_md.eventfds = malloc(nr_threads * sizeof(int));
+ if (threads_md.eventfds == NULL) {
+ goto eventfds_alloc_fail;
+ }
+ rtp->nr_threads = nr_threads;
+ rtp->pkey = -1;
+ p = malloc(nr_threads * sizeof(rpal_receiver_info_t));
+ if (p == NULL) {
+ goto rri_alloc_fail;
+ }
+ rtp->rris = p;
+ memset(p, 0, nr_threads * sizeof(rpal_receiver_info_t));
+
+ rtp->fdt = fd_table_alloc(DEFAULT_NODE_SHIFT);
+ if (!rtp->fdt) {
+ goto fdt_alloc_fail;
+ }
+
+ /* receiver contexts shared with the kernel */
+ p = rpal_get_shared_page(rtm->epcpage_order);
+
+ if (!p)
+ goto page_alloc_fail;
+ rtm->rc = p;
+
+ /* partial success must destroy the slots created so far */
+ for (i = 0; i < nr_threads; i++) {
+ if (rpal_receiver_info_create(rtp, i)) {
+ for (j = 0; j < i; j++) {
+ rpal_receiver_info_destroy(&rtp->rris[j]);
+ }
+ goto rri_create_fail;
+ }
+ }
+ return rtp;
+
+rri_create_fail:
+ rpal_free_shared_page(rtm->rc, rtm->epcpage_order);
+page_alloc_fail:
+ fd_table_free(rtp->fdt);
+fdt_alloc_fail:
+ free(rtp->rris);
+rri_alloc_fail:
+ free(threads_md.eventfds);
+eventfds_alloc_fail:
+ free(rtp);
+out:
+ return NULL;
+}
+
+/*
+ * Tear down everything rpal_thread_pool_create() built, plus the
+ * release_lock mutex.  (Function name keeps the original spelling, as
+ * it may be referenced elsewhere.)
+ */
+static void rpal_thread_pool_destory(rpal_thread_metadata_t *rtm)
+{
+ int i;
+ struct rpal_thread_pool *rtp;
+
+ if (!rpal_inited()) {
+ errprint("thread pool is not created.\n");
+ return;
+ }
+ pthread_mutex_destroy(&release_lock);
+ rtp = threads_md.rtp;
+ fd_table_free(rtp->fdt);
+ for (i = 0; i < rtp->nr_threads; ++i) {
+ rpal_receiver_info_destroy(&rtp->rris[i]);
+ }
+ rpal_free_shared_page(threads_md.rc, threads_md.epcpage_order);
+ free(rtp->rris);
+ free(threads_md.eventfds);
+ free(rtp);
+}
+
+/* A receiver counts as initialized once its status leaves UNINITIALIZED. */
+static inline int rpal_receiver_inited(rpal_receiver_info_t *rri)
+{
+	return rri ? (rri->status != RPAL_RECEIVER_UNINITIALIZED) : 0;
+}
+
+/* True once the receiver has been marked AVAILABLE (set on the first
+ * pass through rpal_epoll_wait). */
+static inline int rpal_receiver_available(rpal_receiver_info_t *rri)
+{
+	return rri->status == RPAL_RECEIVER_AVAILABLE;
+}
+
+/* Hand out receiver slot indices in registration order (atomic counter). */
+static int rpal_receiver_idx_get(void)
+{
+	return __atomic_fetch_add(&threads_md.rpal_receiver_idx, 1,
+				  __ATOMIC_RELAXED);
+}
+
+/*
+ * Claim the next receiver slot for the calling thread, initialize its
+ * shared context and uevent queue, register with the kernel, and
+ * create the per-receiver eventfd.  Returns the eventfd (>= 0) on
+ * success, RPAL_FAILURE otherwise (kernel registration is unwound if
+ * eventfd creation fails).
+ */
+int rpal_receiver_init(void)
+{
+ int ret = 0;
+ int receiver_idx;
+ rpal_receiver_info_t *rri;
+
+ if (!rpal_inited()) {
+ errprint("thread pool is not created.\n");
+ goto error_out;
+ }
+
+ receiver_idx = rpal_receiver_idx_get();
+ if (receiver_idx >= threads_md.rtp->nr_threads) {
+ errprint(
+ "rpal thread pool size exceeded. thread_idx: %d, thread pool capacity: %d\n",
+ receiver_idx, threads_md.rtp->nr_threads);
+ goto error_out;
+ }
+
+ rri = threads_md.rtp->rris + receiver_idx;
+ rri->status = RPAL_RECEIVER_UNINITIALIZED;
+ rri->tid = syscall(SYS_gettid);
+ rri->tls_base = read_tls_base();
+
+ rpal_uevent_queue_init(&rri->ueventq, &rri->uqlock);
+
+ /* shared context starts with both sides in the RUNNING state */
+ rri->rc->rpal_ep_poll_magic = 0;
+ rri->rc->receiver_state = RPAL_RECEIVER_STATE_RUNNING;
+ rri->rc->ep_pending = 0;
+ __atomic_store_n(&rri->rc->sender_state, RPAL_SENDER_STATE_RUNNING,
+ __ATOMIC_RELAXED);
+ ret = rpal_register_receiver(rri);
+ if (ret < 0) {
+ errprint("rpal thread %ld register failed %d\n", rri->tid, ret);
+ goto error_out;
+ }
+ ret = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+ if (ret < 0) {
+ errprint("rpal thread %ld eventfd failed %d\n", rri->tid,
+ errno);
+ goto eventfd_failed;
+ }
+ threads_md.eventfds[receiver_idx] = ret;
+ rri->status = RPAL_RECEIVER_INITIALIZED;
+ return ret;
+
+eventfd_failed:
+ rpal_unregister_receiver();
+error_out:
+ return RPAL_FAILURE;
+}
+
+/*
+ * Undo rpal_receiver_init() for the calling thread: close and clear
+ * its eventfd, mark the slot uninitialized, and unregister from the
+ * kernel.  Safe to call on a thread that never initialized.
+ */
+void rpal_receiver_exit(void)
+{
+ rpal_receiver_info_t *rri = current_rpal_thread();
+ int id, fd;
+
+ if (!rpal_receiver_inited(rri))
+ return;
+ rri->status = RPAL_RECEIVER_UNINITIALIZED;
+ id = rri->rc->receiver_id;
+ fd = threads_md.eventfds[id];
+ close(fd);
+ threads_md.eventfds[id] = 0;
+ rpal_unregister_receiver();
+ return;
+}
+
+/*
+ * Copy the callee-saved registers from a fiber stack frame (`src`,
+ * laid out as fiber_stack_t) into the shared task context; rsp is set
+ * just past the 0x40-byte saved-register area.
+ * NOTE(review): assumes the fcontext frame stores exactly eight
+ * quadwords before the resume point — confirm against the fcontext
+ * assembly.
+ */
+static inline void set_task_context(volatile task_context_t *tc, void *src)
+{
+ fiber_stack_t *fstack = src;
+ tc->r15 = fstack->r15;
+ tc->r14 = fstack->r14;
+ tc->r13 = fstack->r13;
+ tc->r12 = fstack->r12;
+ tc->rbx = fstack->rbx;
+ tc->rbp = fstack->rbp;
+ tc->rip = fstack->rip;
+ tc->rsp = (unsigned long)(src + 0x40);
+}
+
+/*
+ * Runs on the dedicated epoll fiber stack: performs the blocking
+ * epoll_wait with rc->rpal_ep_poll_magic set (marking the receiver as
+ * parked in epoll), then jumps back to the main context carrying the
+ * epoll result in the transfer payload.
+ */
+static transfer_t _syscall_epoll_wait(transfer_t t)
+{
+ rpal_receiver_info_t *rri = t.ud;
+ volatile receiver_context_t *rc = rri->rc;
+ long ret;
+
+ rc->rpal_ep_poll_magic = RPAL_EP_POLL_MAGIC;
+ ret = epoll_wait(rc->epfd, rc->ep_events, rc->maxevents,
+ rc->timeout);
+ t = jump_fcontext(rri->main_ctx, (void *)ret);
+ return t;
+}
+
+extern void rpal_ret_critical(volatile receiver_context_t *rc,
+ rpal_call_info_t *rci);
+
+/*
+ * Trampoline entry: snapshot the main context's saved registers into
+ * the shared task context (so a sender can jump into this receiver),
+ * run the return-critical section, then hop onto the epoll fiber
+ * stack to issue the blocking epoll_wait.
+ */
+static transfer_t syscall_epoll_wait(transfer_t t)
+{
+ rpal_receiver_info_t *rri = t.ud;
+ volatile receiver_context_t *rc = rri->rc;
+ rpal_call_info_t *rci = &rri->rci;
+ task_t *estk = rri->ep_stack;
+
+ set_task_context(&rri->rc->task_context, t.fctx);
+ rri->main_ctx = t.fctx;
+
+ rpal_ret_critical(rc, rci);
+
+ /* fresh context: the epoll stack is re-armed on every entry */
+ estk->fctx = make_fcontext(estk->sp, 0, NULL);
+ t = ontop_fcontext(rri->ep_stack->fctx, rri, _syscall_epoll_wait);
+ return t;
+}
+
+/* Kernel-side epoll events pending? (RPAL_KERNEL_PENDING bit set) */
+static inline int ep_kernel_events_available(volatile int *ep_pending)
+{
+	return __atomic_load_n(ep_pending, __ATOMIC_ACQUIRE) &
+	       RPAL_KERNEL_PENDING;
+}
+
+/* User-queue events pending? (RPAL_USER_PENDING bit set) */
+static inline int ep_user_events_available(volatile int *ep_pending)
+{
+	return __atomic_load_n(ep_pending, __ATOMIC_ACQUIRE) &
+	       RPAL_USER_PENDING;
+}
+
+/*
+ * Drain the user event queue into `events` (up to maxevents), masking
+ * each fd's accumulated bits with its registered interest set.
+ * RPAL_USER_PENDING is cleared up front and re-armed if entries remain
+ * when the output array fills.  Returns the number of events written.
+ */
+static inline int rpal_ep_send_events(epoll_uevent_queue_t *uq, fd_table_t *fdt,
+ volatile receiver_context_t *rc,
+ struct epoll_event *events, int maxevents)
+{
+ int fd = -1;
+ int ret = 0;
+ int res = 0;
+ fd_event_t *fde = NULL;
+
+ __atomic_and_fetch(&rc->ep_pending, ~RPAL_USER_PENDING,
+ __ATOMIC_ACQUIRE);
+ while (uevent_queue_len(uq) && ret < maxevents) {
+ fd = uevent_queue_del(uq);
+ if (fd == -1) {
+ errprint("uevent get failed\n");
+ continue;
+ }
+ fde = fd_event_get(fdt, fd);
+ if (!fde)
+ continue;
+ /* consume pending bits atomically, then filter by interest */
+ res = __atomic_exchange_n(&fde->events, 0, __ATOMIC_RELAXED);
+ res &= fde->epev.events;
+ if (res) {
+ events[ret].data = fde->epev.data;
+ events[ret].events = res;
+ ret++;
+ }
+ fd_event_put(fdt, fde);
+ }
+ if (uevent_queue_len(uq) || ret == maxevents) {
+ dbprint(RPAL_DEBUG_RECVER,
+ "uevent queue still have events, len: %d, ret: %d, maxevents: %d\n",
+ uevent_queue_len(uq), ret, maxevents);
+ __atomic_fetch_or(&rc->ep_pending, RPAL_USER_PENDING,
+ __ATOMIC_RELAXED);
+ }
+ return ret;
+}
+
+extern void rpal_call_critical(volatile receiver_context_t *rc,
+ rpal_receiver_info_t *rri);
+
+/*
+ * RPAL-aware replacement for epoll_wait().
+ *
+ * The first call marks the receiver AVAILABLE and enters the fiber
+ * trampoline; later calls fast-path pending kernel events (via a
+ * non-blocking epoll_wait) or pending user events, and otherwise park
+ * in the trampoline again.  When control resumes after the trampoline
+ * the thread may actually be a *sender* that user-level switched into
+ * this receiver (receiver_state == CALL), so nothing TLS-dependent may
+ * run before the state switch below restores/records the TLS base.
+ * Returns the event count, or -1 on an unexpected receiver state.
+ */
+int rpal_epoll_wait(int epfd, struct epoll_event *events, int maxevents,
+ int timeout)
+{
+ transfer_t t;
+ rpal_call_info_t *rci;
+ task_t *estk, *trampoline;
+ volatile receiver_context_t *rc;
+ epoll_uevent_queue_t *ueventq;
+ rpal_receiver_info_t *rri = current_rpal_thread();
+ long ret = 0;
+ unsigned int mxcsr = 0, fpucw = 0;
+
+ /* non-RPAL threads fall through to the plain syscall */
+ if (!rpal_receiver_inited(rri))
+ return epoll_wait(epfd, events, maxevents, timeout);
+
+ rc = rri->rc;
+ estk = rri->ep_stack;
+ trampoline = rri->trampoline;
+ rci = &rri->rci;
+ ueventq = &rri->ueventq;
+
+ rc->epfd = epfd;
+ rc->ep_events = events;
+ rc->maxevents = maxevents;
+ rc->timeout = timeout;
+
+ if (!rpal_receiver_available(rri)) {
+ rri->status = RPAL_RECEIVER_AVAILABLE;
+ estk->fctx = make_fcontext(estk->sp, 0, NULL);
+ SAVE_FPU(mxcsr, fpucw);
+ trampoline->fctx = make_fcontext(trampoline->sp, 0, NULL);
+ t = ontop_fcontext(trampoline->fctx, rri, syscall_epoll_wait);
+ } else {
+ // kernel pending events
+ if (ep_kernel_events_available(&rc->ep_pending)) {
+ rc->rpal_ep_poll_magic =
+ RPAL_EP_POLL_MAGIC; // clear KERNEL_PENDING
+ ret = epoll_wait(epfd, events, maxevents, 0);
+ rc->rpal_ep_poll_magic = 0;
+ goto send_user_events;
+ }
+ // user pending events
+ if (ep_user_events_available(&rc->ep_pending)) {
+ goto send_user_events;
+ }
+ SAVE_FPU(mxcsr, fpucw);
+ trampoline->fctx = make_fcontext(trampoline->sp, 0, NULL);
+ t = ontop_fcontext(trampoline->fctx, rri, syscall_epoll_wait);
+ }
+ rc->rpal_ep_poll_magic = 0;
+
+ /*
+ * Here is where sender starts after user context switch.
+ * The TLS may still be sender's. We should not do anything
+ * that may use TLS, otherwise the result cannot be controlled.
+ */
+
+ switch (rc->receiver_state & RPAL_RECEIVER_STATE_MASK) {
+ case RPAL_RECEIVER_STATE_RUNNING: // syscall kernel ret
+ ret = (long)t.ud;
+ break;
+ case RPAL_RECEIVER_STATE_KERNEL_RET: // receiver kernel ret
+ RESTORE_FPU(mxcsr, fpucw);
+ ret = (long)t.fctx;
+ break;
+ case RPAL_RECEIVER_STATE_CALL: // rpalcall user jmp
+ /* arriving on a sender's TLS/pkey: record them, then switch
+ * to this receiver's own TLS base and pkey */
+ rci->sender_tls_base = read_tls_base();
+ rci->pkru = rdpkru();
+ write_tls_base(rri->tls_base);
+ wrpkru(rpal_pkey_to_pkru(rri->rtp->pkey));
+ rci->sender_fctx = t.fctx;
+ break;
+ default:
+ errprint("Error ep_status: %ld\n",
+ rc->receiver_state & RPAL_RECEIVER_STATE_MASK);
+ return -1;
+ }
+
+send_user_events:
+ if (ret < maxevents && ret >= 0)
+ ret += rpal_ep_send_events(ueventq, rri->rtp->fdt, rc,
+ events + ret, maxevents - ret);
+ return ret;
+}
+
+/*
+ * Poll-only variant: deliver pending user-queue events without ever
+ * blocking or entering the kernel.  Returns 0 when nothing is pending
+ * or the receiver is not fully set up.
+ */
+int rpal_epoll_wait_user(int epfd, struct epoll_event *events, int maxevents,
+ int timeout)
+{
+ volatile receiver_context_t *rc;
+ epoll_uevent_queue_t *ueventq;
+ rpal_receiver_info_t *rri = current_rpal_thread();
+
+ if (!rpal_receiver_inited(rri))
+ return 0;
+
+ if (!rpal_receiver_available(rri))
+ return 0;
+
+ rc = rri->rc;
+ ueventq = &rri->ueventq;
+ if (ep_user_events_available(&rc->ep_pending)) {
+ return rpal_ep_send_events(ueventq, rri->rtp->fdt, rc, events,
+ maxevents);
+ }
+ return 0;
+}
+
+/*
+ * epoll_ctl wrapper that mirrors RPAL-interesting registrations into
+ * the local fd table.  If mirroring an ADD fails, the kernel-side
+ * registration is rolled back so both views stay consistent; MOD and
+ * DEL failures are not propagated.
+ */
+int rpal_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event)
+{
+ fd_table_t *fdt;
+ int ret;
+
+ ret = epoll_ctl(epfd, op, fd, event);
+ if (ret || !rpal_inited()) {
+ return ret;
+ }
+ fdt = threads_md.rtp->fdt;
+ switch (op) {
+ case EPOLL_CTL_ADD:
+ /* only mirror fds that asked for RPAL in/out events */
+ if (event->events & EPOLLRPALINOUT_BITS) {
+ ret = fd_event_install(fdt, fd, epfd, event);
+ if (ret == RPAL_FAILURE)
+ goto install_error;
+ }
+ break;
+ case EPOLL_CTL_MOD:
+ fd_event_modify(fdt, fd, event);
+ break;
+ case EPOLL_CTL_DEL:
+ fd_event_uninstall(fdt, fd);
+ break;
+ }
+ return ret;
+install_error:
+ epoll_ctl(epfd, EPOLL_CTL_DEL, fd, event);
+ return RPAL_FAILURE;
+}
+
+/*
+ * ontop_fcontext callback run on the receiver side of an RPAL call:
+ * capture the sender's just-suspended register frame into the sender
+ * context so it can be resumed later.
+ */
+static transfer_t set_fcontext(transfer_t t)
+{
+ sender_context_t *sc = t.ud;
+
+ set_task_context(&sc->task_context, t.fctx);
+ return t;
+}
+
+/*
+ * Spin until the user-space event-queue lock is acquired.  The lock
+ * word encodes bit 63 as "locked" plus the owner's @key in the low
+ * bits; it is free only when the whole word is 0, so the CAS expected
+ * value must be reset to 0 after every failed attempt.
+ */
+static void uq_lock(volatile uint64_t *uqlock, uint64_t key)
+{
+	uint64_t init = 0;
+
+	while (1) {
+		if (__atomic_compare_exchange_n(
+			    uqlock, &init, (1UL << 63 | key), 1,
+			    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+			return;
+		/* "rep; nop" == PAUSE: spin-wait hint to the CPU. */
+		asm volatile("rep; nop");
+		init = 0;
+	}
+}
+
+/* Release the user-space queue lock by clearing the whole lock word. */
+static void uq_unlock(volatile uint64_t *uqlock)
+{
+	uint64_t unlocked = 0;
+
+	__atomic_store_n(uqlock, unlocked, __ATOMIC_RELAXED);
+}
+
+/*
+ * Claim the receiver and jump onto its context (the RPAL "call").
+ * A CAS moves receiver_state WAIT -> CALL(sender_id, service_id); if it
+ * fails the receiver is busy and the call is silently skipped.  After
+ * the receiver jumps back, call time is accounted, the receiver is
+ * returned to WAIT, and the call repeats while user events are pending.
+ */
+static status_t do_rpal_call_jump(rpal_sender_info_t *rsi,
+				  rpal_receiver_info_t *rri,
+				  volatile receiver_context_t *rc)
+{
+	int desired, expected;
+	int64_t diff;
+
+WAKE_AGAIN:
+	desired = RPAL_BUILD_CALL_STATE(rsi->sc.sender_id,
+					threads_md.service_id);
+	expected = RPAL_RECEIVER_STATE_WAIT;
+	if (__atomic_compare_exchange_n(&rc->receiver_state, &expected, desired, 1,
+					__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+		__atomic_store_n(&rc->sender_state, RPAL_SENDER_STATE_CALL,
+				 __ATOMIC_RELAXED);
+		rsi->sc.start_time = _rdtsc();
+		/* Switch to the receiver; returns when it switches back. */
+		ontop_fcontext(rri->main_ctx, &rsi->sc, set_fcontext);
+
+		if (__atomic_load_n(&rc->sender_state, __ATOMIC_RELAXED) ==
+		    RPAL_SENDER_STATE_RUNNING) {
+			/* NOTE(review): read(-1, ...) looks like a deliberate
+			 * dummy syscall to enter the kernel and complete a
+			 * pending lazy switch -- confirm against the kernel
+			 * side. */
+			if (rc->receiver_state == RPAL_RECEIVER_STATE_LAZY_SWITCH)
+				read(-1, NULL, 0);
+			diff = _rdtsc() - rsi->sc.start_time;
+			rsi->sc.total_time += diff;
+			rri->rc->total_time += diff;
+			/* Hand the receiver back to WAIT (only if it is still
+			 * in our CALL state). */
+			expected = desired;
+			desired = RPAL_RECEIVER_STATE_WAIT;
+			__atomic_compare_exchange_n(&rc->receiver_state, &expected,
+						    desired, 1,
+						    __ATOMIC_SEQ_CST,
+						    __ATOMIC_SEQ_CST);
+
+			if (ep_user_events_available(&rc->ep_pending)) {
+				goto WAKE_AGAIN;
+			}
+		}
+		/* NOTE(review): `sfd` is not declared in this function; this
+		 * only builds if dbprint compiles out or sfd is global --
+		 * verify. */
+		dbprint(RPAL_DEBUG_SENDER, "app return: 0x%x, %d, %d\n",
+			rc->receiver_state, rc->sender_state, sfd);
+	}
+	return RPAL_SUCCESS;
+}
+
+/* Publish an "output trigger wanted" marker on @fde (release order). */
+static inline void set_fde_trigger(fd_event_t *fde)
+{
+	__atomic_store_n(&fde->wait, FDE_TRIGGER_OUT, __ATOMIC_RELEASE);
+}
+
+/*
+ * Atomically consume a pending trigger.  Returns nonzero only for the
+ * caller that actually flipped FDE_TRIGGER_OUT back to FDE_NO_TRIGGER.
+ */
+static inline int clear_fde_trigger(fd_event_t *fde)
+{
+	int want = FDE_TRIGGER_OUT;
+
+	return __atomic_compare_exchange_n(&fde->wait, &want, FDE_NO_TRIGGER,
+					   1, __ATOMIC_SEQ_CST,
+					   __ATOMIC_SEQ_CST);
+}
+
+/*
+ * Core sender path, invoked through rpal_write_access_safety().
+ * Varargs: (int service_id, uint64_t rpalfd, int64_t *ptrs, int len,
+ * int flags).  Enqueues @ptrs on the peer fd's queue (RCALL_IN) or just
+ * raises EPOLLRPALOUT (RCALL_OUT), adds the fd to the receiver's ready
+ * list, then performs the RPAL call.  Returns @len / 0 on success or a
+ * negative rpal_error_code_t on failure.
+ */
+static int do_rpal_call(va_list va)
+{
+	rpal_sender_info_t *rsi;
+	rpal_receiver_info_t *rri;
+	fd_event_t *fde;
+	volatile receiver_context_t *rc;
+	struct rpal_thread_pool *srtp;
+	uint16_t stamp;
+	uint8_t rid;
+	int sfd;
+	int ret = 0;
+	int fall = 0;
+	int pkey;
+
+	int service_id = va_arg(va, int);
+	uint64_t rpalfd = va_arg(va, uint64_t);
+	int64_t *ptrs = va_arg(va, int64_t *);
+	int len = va_arg(va, int);
+	int flags = va_arg(va, int);
+
+	rsi = current_rpal_sender();
+	if (!rsi) {
+		ret = RPAL_INVAL_THREAD;
+		goto ERROR;
+	}
+	srtp = get_service_from_id(service_id);
+	if (!srtp) {
+		ret = RPAL_INVAL_SERVICE;
+		goto ERROR;
+	}
+	pkey = get_service_pkey_from_id(service_id);
+
+	rid = get_rid(rpalfd);
+	sfd = get_sfd(rpalfd);
+	/* Open the peer service's protection key in addition to ours so
+	 * its memory becomes accessible. */
+	wrpkru(rpal_pkru_union(rdpkru(), rpal_pkey_to_pkru(pkey)));
+	rri = srtp->rris + rid;
+	/* NOTE(review): pointer arithmetic on srtp->rris cannot yield
+	 * NULL for rid > 0; this check is largely ineffective. */
+	if (!rri) {
+		errprint("INVALID rid: %u, rri is NULL\n", rid);
+		ret = RPAL_INVALID_ARG;
+		goto ERROR;
+	}
+	rc = rri->rc;
+	rsi->sc.ec.tls_base = rri->tls_base;
+
+	fde = fd_event_get(srtp->fdt, sfd);
+	if (!fde) {
+		ret = RPAL_INVALID_ARG;
+		goto ERROR;
+	}
+	/* Reject fds recycled since the rpalfd handle was created. */
+	stamp = get_fdtimestamp(rpalfd);
+	if (fde->timestamp != stamp) {
+		ret = RPAL_FDE_OUTDATED;
+		goto FDE_PUT;
+	}
+
+	uq_lock(&rri->uqlock, threads_md.service_key);
+	if (uevent_queue_len(&rri->ueventq) == MAX_RDY) {
+		errprint("rdylist is full: [%u, %u]\n", rri->ueventq.l_beg,
+			 rri->ueventq.l_end);
+		ret = RPAL_CACHE_FULL;
+		goto UNLOCK;
+	}
+	if (likely(flags & RCALL_IN)) {
+		/* Queue looks full: arm the trigger so the reader pokes us
+		 * back (see rpal_read_ptrs_trigger_out), then still try. */
+		if (unlikely(rpal_queue_unused(&fde->q) < (uint32_t)len)) {
+			set_fde_trigger(fde);
+			fall = 1;
+			/* fall through: try to put data to queue */
+		}
+		ret = rpal_queue_put(&fde->q, ptrs, len);
+		if (ret != len) {
+			errprint("fde queue put error: %d, data: %lx\n", ret,
+				 (unsigned long)fde->q.data);
+			ret = RPAL_QUEUE_PUT_FAILED;
+			goto UNLOCK;
+		}
+		if (unlikely(fall)) {
+			/* Put succeeded after all; disarm the trigger. */
+			clear_fde_trigger(fde);
+		}
+		fde->events |= EPOLLRPALIN;
+	} else if (unlikely(flags & RCALL_OUT)) {
+		ret = 0;
+		fde->events |= EPOLLRPALOUT;
+	} else {
+		errprint("rpal call failed, ptrs: %lx, len: %d",
+			 (unsigned long)ptrs, len);
+		ret = RPAL_INVALID_ARG;
+		goto UNLOCK;
+	}
+
+	uevent_queue_add(&rri->ueventq, sfd);
+	uq_unlock(&rri->uqlock);
+	fd_event_put(srtp->fdt, fde);
+
+	/* Publish "user events pending" before attempting the switch. */
+	__atomic_fetch_or(&rc->ep_pending, RPAL_USER_PENDING,
+			  __ATOMIC_RELEASE);
+	do_rpal_call_jump(rsi, rri, rc);
+	return ret;
+
+UNLOCK:
+	uq_unlock(&rri->uqlock);
+FDE_PUT:
+	fd_event_put(srtp->fdt, fde);
+ERROR:
+	return -ret;
+}
+
+/*
+ * Validate arguments, then run do_rpal_call() under the fault-safe
+ * wrapper rpal_write_access_safety() (protects against the peer's
+ * memory disappearing mid-access).  Returns do_rpal_call()'s result or
+ * a negative rpal_error_code_t.
+ */
+static int __rpal_write_ptrs_common(int service_id, uint64_t rpalfd,
+				    int64_t *ptrs, int len, int flags)
+{
+	int ret = RPAL_FAILURE;
+	status_t access = RPAL_FAILURE;
+
+	if (unlikely(NULL == ptrs)) {
+		dbprint(RPAL_DEBUG_SENDER, "%s: ptrs is NULL\n", __FUNCTION__);
+		return -RPAL_INVALID_ARG;
+	}
+	if (unlikely(len <= 0 || ((uint32_t)len) > DEFAULT_QUEUE_SIZE)) {
+		dbprint(RPAL_DEBUG_SENDER,
+			"%s: data len less than or equal to zero\n",
+			__FUNCTION__);
+		return -RPAL_INVALID_ARG;
+	}
+
+	access = rpal_write_access_safety(do_rpal_call, &ret, service_id,
+					  rpalfd, ptrs, len, flags);
+	if (access == RPAL_FAILURE) {
+		return -RPAL_ERR_PEER_MEM;
+	}
+	return ret;
+}
+
+/* Public entry: send @len pointer entries to @rpalfd (RCALL_IN). */
+int rpal_write_ptrs(int service_id, uint64_t rpalfd, int64_t *ptrs, int len)
+{
+	const int flags = RCALL_IN;
+
+	return __rpal_write_ptrs_common(service_id, rpalfd, ptrs, len, flags);
+}
+
+/*
+ * Drain up to @len entries from the local fd's RPAL queue into @dptrs.
+ * Returns the count read, or -1 when RPAL is not initialized or @fd
+ * has no fd_event.
+ */
+int rpal_read_ptrs(int fd, int64_t *dptrs, int len)
+{
+	fd_table_t *fdt = threads_md.rtp->fdt;
+	fd_event_t *fde;
+	int nread;
+
+	if (!rpal_inited())
+		return -1;
+
+	fde = fd_event_get(fdt, fd);
+	if (!fde)
+		return -1;
+
+	nread = rpal_queue_get(&fde->q, dptrs, len);
+	fd_event_put(fdt, fde);
+	return nread;
+}
+
+/*
+ * Like rpal_read_ptrs(), but if the sender had armed the "queue full"
+ * trigger (see do_rpal_call), perform an RCALL_OUT call back to the
+ * peer so it learns space is now available.  Returns the number of
+ * entries read, or -1 on lookup failure.
+ */
+int rpal_read_ptrs_trigger_out(int fd, int64_t *dptrs, int len, int service_id,
+			       uint64_t rpalfd)
+{
+	fd_event_t *fde;
+	fd_table_t *fdt = threads_md.rtp->fdt;
+	int access, ret = -1;
+	int nread;
+
+	if (!rpal_inited())
+		return -1;
+
+	fde = fd_event_get(fdt, fd);
+	if (!fde)
+		return -1;
+
+	nread = rpal_queue_get(&fde->q, dptrs, len);
+	/* Only the thread that wins clear_fde_trigger() notifies. */
+	if (nread > 0 && clear_fde_trigger(fde)) {
+		access =
+			rpal_write_access_safety(do_rpal_call, &ret, service_id,
+						 rpalfd, NULL, 0, RCALL_OUT);
+		if (access == RPAL_FAILURE || ret < 0) {
+			/* Notification failed: re-arm so a later reader
+			 * retries. */
+			set_fde_trigger(fde);
+			errprint(
+				"trigger out failed! access: %d, ret: %d, id: %d, rpalfd: %lx\n",
+				access, ret, service_id, rpalfd);
+		}
+	}
+	fd_event_put(fdt, fde);
+
+	return nread;
+}
+
+/* x86 protection keys occupy the range [0, 15]. */
+static inline int pkey_is_invalid(const int pkey)
+{
+	return !(pkey >= 0 && pkey <= 15);
+}
+
+/*
+ * Create this service's thread pool and register it with the kernel.
+ * Fills threads_md (service key/id, thread pool, pid) and records the
+ * lazy-switch critical section bounds via rpal_get_critical_addr().
+ * Returns RPAL_SUCCESS or RPAL_FAILURE (details via *error).
+ */
+static status_t rpal_thread_metadata_init(int nr_rpalthread,
+					  rpal_error_code_t *error)
+{
+	uint64_t key;
+	struct rpal_thread_pool *rtp;
+
+	key = __rpal_get_service_key();
+	/* Bit 63 of the uqlock word is the lock flag (see uq_lock()),
+	 * so the key itself must fit in 63 bits. */
+	if (key >= 1UL << 63) {
+		ERRREPORT(
+			error, RPAL_ERR_SERVICE_KEY,
+			"rpal service key error. Service key: 0x%lx, overflow, should be less than 2^63\n",
+			key);
+		goto error_out;
+	}
+	threads_md.service_key = key;
+	threads_md.service_id = __rpal_get_service_id();
+	pthread_mutex_init(&release_lock, NULL);
+	rpal_get_critical_addr(&rcs);
+	rtp = rpal_thread_pool_create(nr_rpalthread, &threads_md);
+	if (rtp == NULL) {
+		goto error_out;
+	}
+	rtp->service_key = threads_md.service_key;
+	rtp->service_id = threads_md.service_id;
+	threads_md.rtp = rtp;
+	if (rpal_enable_service(error) == RPAL_FAILURE)
+		goto destroy_thread_pool;
+	threads_md.pid = getpid();
+	return RPAL_SUCCESS;
+
+destroy_thread_pool:
+	rpal_thread_pool_destory(&threads_md);
+error_out:
+	return RPAL_FAILURE;
+}
+
+static void rpal_thread_metadata_exit(void)
+{
+ rpal_disable_service();
+ rpal_thread_pool_destory(&threads_md);
+}
+
+/*
+ * Allocate the senders metadata block and its kernel-shared page.  The
+ * sender-id bitmap starts all-ones (every id free).  Returns
+ * RPAL_SUCCESS or RPAL_FAILURE (details via *error).
+ */
+static status_t rpal_senders_metadata_init(rpal_error_code_t *error)
+{
+	if (senders_md) {
+		ERRREPORT(error, RPAL_ERR_SENDERS_METADATA,
+			  "senders metadata is already initialized.\n");
+		return RPAL_FAILURE;
+	}
+
+	senders_md = malloc(sizeof(struct rpal_senders_metadata));
+	if (!senders_md) {
+		ERRREPORT(error, RPAL_ERR_NOMEM,
+			  "senders metadata alloc failed.\n");
+		goto senders_alloc_failed;
+	}
+	senders_md->sdpage_order = SENDERS_PAGE_ORDER;
+	memset(senders_md->bitmap, 0xFF,
+	       sizeof(unsigned long) * BITS_TO_LONGS(MAX_SENDERS));
+	pthread_mutex_init(&senders_md->lock, NULL);
+	senders_md->senders = rpal_get_shared_page(senders_md->sdpage_order);
+	if (!senders_md->senders) {
+		ERRREPORT(error, RPAL_ERR_SENDER_PAGES,
+			  "get senders share page error.\n");
+		goto pages_alloc_failed;
+	}
+	dbprint(RPAL_DEBUG_MANAGEMENT, "senders pages addr: 0x%016lx\n",
+		(unsigned long)senders_md->senders);
+	return RPAL_SUCCESS;
+
+pages_alloc_failed:
+	free(senders_md);
+	/* Reset so the "already initialized" check above does not see a
+	 * dangling pointer on a later retry. */
+	senders_md = NULL;
+senders_alloc_failed:
+	return RPAL_FAILURE;
+}
+
+static void rpal_senders_metadata_exit(void)
+{
+ if (!senders_md)
+ return;
+
+ rpal_free_shared_page((void *)senders_md->senders,
+ senders_md->sdpage_order);
+ pthread_mutex_destroy(&senders_md->lock);
+ free(senders_md);
+}
+
+static int rpal_get_version_cap(rpal_capability_t *version)
+{
+ return rpal_ioctl(RPAL_IOCTL_GET_API_VERSION_AND_CAP,
+ (unsigned long)version);
+}
+
+/*
+ * The kernel must speak exactly our compat version and offer at least
+ * the API level this library targets.
+ */
+static status_t rpal_version_check(rpal_capability_t *ver)
+{
+	if (ver->compat_version == MIN_RPAL_KERNEL_API_VERSION &&
+	    ver->api_version >= TARGET_RPAL_KERNEL_API_VERSION)
+		return RPAL_SUCCESS;
+	return RPAL_FAILURE;
+}
+
+/* RPAL is unusable without protection-key (PKU) support. */
+static status_t rpal_capability_check(rpal_capability_t *ver)
+{
+	return (ver->cap & (1 << RPAL_CAP_PKU)) ? RPAL_SUCCESS : RPAL_FAILURE;
+}
+
+static status_t rpal_check_version_cap(rpal_error_code_t *error)
+{
+ int ret;
+
+ ret = rpal_get_version_cap(&version);
+ if (ret < 0) {
+ ERRREPORT(error, RPAL_ERR_GET_CAP_VERSION,
+ "rpal get version failed: %d\n", ret);
+ ret = RPAL_FAILURE;
+ goto out;
+ }
+ ret = rpal_version_check(&version);
+ if (ret == RPAL_FAILURE) {
+ ERRREPORT(
+ error, RPAL_KERNEL_API_NOTSUPPORT,
+ "kernel rpal(version: %d-%d) API is not compatible with librpal(version: %d-%d)\n",
+ version.compat_version, version.api_version,
+ MIN_RPAL_KERNEL_API_VERSION,
+ TARGET_RPAL_KERNEL_API_VERSION);
+ goto out;
+ }
+ ret = rpal_capability_check(&version);
+ if (ret == RPAL_FAILURE) {
+ ERRREPORT(error, RPAL_HARDWARE_NOTSUPPORT,
+ "hardware do not support RPAL\n");
+ goto out;
+ }
+out:
+ return ret;
+}
+
+static status_t rpal_mgtfd_init(rpal_error_code_t *error)
+{
+ int err, n;
+ int mgtfd;
+ char name[1024];
+
+ mgtfd = open(RPAL_MGT_FILE, O_RDWR);
+ if (mgtfd == -1) {
+ err = errno;
+ switch (err) {
+ case EPERM:
+ n = readlink("/proc/self/exe", name, sizeof(name) - 1);
+ if (n < 0) {
+ n = 0;
+ }
+ name[n] = 0;
+ errprint("%s is not a RPAL binary\n", name);
+ break;
+ case ENOENT:
+ errprint("Not in RPAL Environment\n");
+ break;
+ default:
+ errprint("open %s fail, %d, %s\n", RPAL_MGT_FILE, err,
+ strerror(err));
+ }
+ if (error) {
+ *error = RPAL_ERR_RPALFILE_OPS;
+ }
+ return RPAL_FAILURE;
+ }
+ rpal_mgtfd = mgtfd;
+ return RPAL_SUCCESS;
+}
+
+static void rpal_mgtfd_destroy(void)
+{
+ if (rpal_mgtfd != -1) {
+ close(rpal_mgtfd);
+ }
+ return;
+}
+
+#define RPAL_SECTION_SIZE (512 * 1024 * 1024 * 1024UL)
+
+/* RPAL_SUCCESS when @check lies inside the half-open range [start, end). */
+static inline status_t rpal_check_address(uint64_t start, uint64_t end,
+					  uint64_t check)
+{
+	return (check >= start && check < end) ? RPAL_SUCCESS : RPAL_FAILURE;
+}
+
+static status_t rpal_managment_init(rpal_error_code_t *error)
+{
+ int i = 0;
+
+ if (rpal_mgtfd_init(error) == RPAL_FAILURE) {
+ goto mgtfd_init_failed;
+ }
+ if (pthread_key_create(&rpal_key, NULL))
+ goto rpal_key_failed;
+
+ for (i = 0; i < MAX_SERVICEID; i++) {
+ requested_services[i].key = 0;
+ requested_services[i].service = NULL;
+ requested_services[i].pkey = -1;
+ }
+ if (rpal_check_version_cap(error) == RPAL_FAILURE) {
+ goto rpal_check_failed;
+ }
+ return RPAL_SUCCESS;
+
+rpal_check_failed:
+ pthread_key_delete(rpal_key);
+rpal_key_failed:
+ rpal_mgtfd_destroy();
+mgtfd_init_failed:
+ return RPAL_FAILURE;
+}
+
+static void rpal_managment_exit(void)
+{
+ pthread_key_delete(rpal_key);
+ rpal_mgtfd_destroy();
+ return;
+}
+
+int rpal_init(int nr_rpalthread, int flags, rpal_error_code_t *error)
+{
+ if (nr_rpalthread <= 0) {
+ dbprint(RPAL_DEBUG_MANAGEMENT,
+ "%s: nr_rpalthread(%d) less than or equal to 0\n",
+ __FUNCTION__, nr_rpalthread);
+ return RPAL_FAILURE;
+ }
+ if (rpal_managment_init(error) == RPAL_FAILURE) {
+ goto error_out;
+ }
+ if (rpal_thread_metadata_init(nr_rpalthread, error) == RPAL_FAILURE)
+ goto managment_exit;
+
+ if (rpal_senders_metadata_init(error) == RPAL_FAILURE)
+ goto thread_md_exit;
+
+ inited = 1;
+ dbprint(RPAL_DEBUG_MANAGEMENT,
+ "rpal init success, service key: 0x%lx, service id: %d, "
+ "critical_start: 0x%016lx, critical_end: 0x%016lx\n",
+ threads_md.service_key, threads_md.service_id, rcs.ret_begin,
+ rcs.ret_end);
+ return rpal_mgtfd;
+
+thread_md_exit:
+ rpal_thread_metadata_exit();
+managment_exit:
+ rpal_managment_exit();
+error_out:
+ return RPAL_FAILURE;
+}
+
+/* Tear down RPAL state in the reverse order of rpal_init(). */
+void rpal_exit(void)
+{
+	if (!rpal_inited())
+		return;
+
+	dbprint(RPAL_DEBUG_MANAGEMENT,
+		"rpal exit, service key: 0x%lx, service id: %d\n",
+		threads_md.service_key, threads_md.service_id);
+	rpal_senders_metadata_exit();
+	rpal_thread_metadata_exit();
+	rpal_managment_exit();
+}
diff --git a/samples/rpal/librpal/rpal.h b/samples/rpal/librpal/rpal.h
new file mode 100644
index 000000000000..e91a206b8370
--- /dev/null
+++ b/samples/rpal/librpal/rpal.h
@@ -0,0 +1,149 @@
+#ifndef RPAL_H_INCLUDED
+#define RPAL_H_INCLUDED
+
+#ifdef __cplusplus
+#if __cplusplus
+extern "C" {
+#endif
+#endif /* __cplusplus */
+
+#include <stdint.h>
+#include <stdarg.h>
+#include <sys/epoll.h>
+
+typedef enum rpal_error_code {
+ RPAL_ERR_NONE = 0,
+ RPAL_ERR_BAD_ARG = 1,
+ RPAL_ERR_NO_SERVICE = 2,
+ RPAL_ERR_MAPPED = 3,
+ RPAL_ERR_RETRY = 4,
+ RPAL_ERR_BAD_SERVICE_STATUS = 5,
+ RPAL_ERR_BAD_THREAD_STATUS = 6,
+ RPAL_ERR_REACH_LIMIT = 7,
+ RPAL_ERR_NOMEM = 8,
+ RPAL_ERR_NOMAPPING = 9,
+ RPAL_ERR_INVAL = 10,
+
+ RPAL_ERR_KERNEL_MAX_CODE = 100,
+
+ RPAL_ERR_RPALFILE_OPS, /**< Failed to open /proc/self/rpal */
+ RPAL_ERR_RPAL_DISABLED,
+ RPAL_ERR_GET_CAP_VERSION,
+ RPAL_KERNEL_API_NOTSUPPORT,
+ RPAL_HARDWARE_NOTSUPPORT,
+ RPAL_ERR_SERVICE_KEY, /**< Failed to get service key */
+ RPAL_ERR_SENDERS_METADATA,
+ RPAL_ERR_ENABLE_SERVICE,
+ RPAL_ERR_SENDER_PAGES,
+ RPAL_DONT_INITED,
+ RPAL_ERR_SENDER_INIT,
+ RPAL_ERR_SENDER_REG,
+ RPAL_INVALID_ARG,
+ RPAL_CACHE_FULL,
+ RPAL_FDE_OUTDATED,
+ RPAL_QUEUE_PUT_FAILED,
+ RPAL_ERR_PEER_MEM,
+ RPAL_ERR_NOTIFY_RECVER,
+ RPAL_INVAL_THREAD,
+ RPAL_INVAL_SERVICE,
+} rpal_error_code_t;
+
+#define EPOLLRPALIN 0x00020000
+#define EPOLLRPALOUT 0x00040000
+
+typedef enum rpal_features {
+ RPAL_SENDER_RECEIVER = 0x1 << 0,
+} rpal_features_t;
+
+typedef enum status {
+ RPAL_FAILURE = -1, /**< return value indicating failure */
+ RPAL_SUCCESS /**< return value indicating success */
+} status_t;
+
+#define RPAL_PUBLIC __attribute__((visibility("default")))
+
+RPAL_PUBLIC
+int rpal_init(int nr_rpalthread, int flags, rpal_error_code_t *error);
+
+RPAL_PUBLIC
+void rpal_exit(void);
+
+RPAL_PUBLIC
+int rpal_receiver_init(void);
+
+RPAL_PUBLIC
+void rpal_receiver_exit(void);
+
+RPAL_PUBLIC
+int rpal_request_service(uint64_t key);
+
+RPAL_PUBLIC
+status_t rpal_release_service(uint64_t key);
+
+RPAL_PUBLIC
+status_t rpal_clean_service_start(int64_t *ptr);
+
+RPAL_PUBLIC
+void rpal_clean_service_end(int64_t *ptr);
+
+RPAL_PUBLIC
+int rpal_get_service_id(void);
+
+RPAL_PUBLIC
+status_t rpal_get_service_key(uint64_t *service_key);
+
+RPAL_PUBLIC
+int rpal_get_request_service_id(uint64_t key);
+
+RPAL_PUBLIC
+status_t rpal_uds_fdmap(uint64_t sid_fd, uint64_t *rpalfd);
+
+RPAL_PUBLIC
+int rpal_get_peer_rid(uint64_t sid_fd);
+
+RPAL_PUBLIC
+status_t rpal_sender_init(rpal_error_code_t *error);
+
+RPAL_PUBLIC
+status_t rpal_sender_exit(void);
+
+/* Hook epoll syscall */
+RPAL_PUBLIC
+int rpal_epoll_wait(int epfd, struct epoll_event *events, int maxevents,
+ int timeout);
+
+RPAL_PUBLIC
+int rpal_epoll_wait_user(int epfd, struct epoll_event *events, int maxevents,
+ int timeout);
+
+RPAL_PUBLIC
+int rpal_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
+
+RPAL_PUBLIC
+status_t rpal_copy_prepare(int service_id);
+
+RPAL_PUBLIC
+status_t rpal_copy_finish(void);
+
+RPAL_PUBLIC
+int rpal_write_ptrs(int service_id, uint64_t rpalfd, int64_t *ptrs, int len);
+
+RPAL_PUBLIC
+int rpal_read_ptrs(int fd, int64_t *ptrs, int len);
+
+typedef int (*access_fn)(va_list args);
+RPAL_PUBLIC
+status_t rpal_read_access_safety(access_fn do_access, int *do_access_ret, ...);
+
+RPAL_PUBLIC
+void rpal_recver_count_print(void);
+
+RPAL_PUBLIC
+void rpal_sender_count_print(void);
+
+#ifdef __cplusplus
+#if __cplusplus
+}
+#endif
+#endif
+#endif //!_RPAL_H_INCLUDED
diff --git a/samples/rpal/librpal/rpal_pkru.h b/samples/rpal/librpal/rpal_pkru.h
new file mode 100644
index 000000000000..9590aa7203bb
--- /dev/null
+++ b/samples/rpal/librpal/rpal_pkru.h
@@ -0,0 +1,78 @@
+#include <x86intrin.h>
+#include "private.h"
+
+#define RPAL_PKRU_BASE_CODE_READ 0xAAAAAAAA
+#define RPAL_PKRU_BASE_CODE 0xFFFFFFFF
+#define RPAL_NO_PKEY -1
+
+typedef uint32_t u32;
+/*
+ * extern __inline unsigned int
+ * __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ * _rdpkru_u32 (void)
+ * {
+ * return __builtin_ia32_rdpkru ();
+ * }
+ *
+ * extern __inline void
+ * __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ * _wrpkru (unsigned int __key)
+ * {
+ * __builtin_ia32_wrpkru (__key);
+ * }
+ */
+// #define rdpkru _rdpkru_u32
+// #define wrpkru _wrpkru
+static inline uint32_t rdpkru(void)
+{
+ uint32_t ecx = 0;
+ uint32_t edx, pkru;
+
+ /*
+ * "rdpkru" instruction. Places PKRU contents in to EAX,
+ * clears EDX and requires that ecx=0.
+ */
+ asm volatile(".byte 0x0f,0x01,0xee\n\t"
+ : "=a"(pkru), "=d"(edx)
+ : "c"(ecx));
+ return pkru;
+}
+
+static inline void wrpkru(uint32_t pkru)
+{
+ uint32_t ecx = 0, edx = 0;
+
+ /*
+ * "wrpkru" instruction. Loads contents in EAX to PKRU,
+ * requires that ecx = edx = 0.
+ */
+ asm volatile(".byte 0x0f,0x01,0xef\n\t"
+ :
+ : "a"(pkru), "c"(ecx), "d"(edx));
+}
+
+/*
+ * PKRU value granting full access to @pkey only (all other keys
+ * denied).  The mask constant is unsigned: for pkey 15 the shift is by
+ * 30 bits and "0x3 << 30" on a signed int would overflow (UB).
+ */
+static inline u32 rpal_pkey_to_pkru(int pkey)
+{
+	int offset = pkey * 2;
+	u32 mask = 0x3u << offset;
+
+	return RPAL_PKRU_BASE_CODE & ~mask;
+}
+
+/*
+ * PKRU value granting access to @pkey and read-only access to all
+ * other keys.  Unsigned mask for the same shift-overflow reason as
+ * rpal_pkey_to_pkru().
+ */
+static inline u32 rpal_pkey_to_pkru_read(int pkey)
+{
+	int offset = pkey * 2;
+	u32 mask = 0x3u << offset;
+
+	return RPAL_PKRU_BASE_CODE_READ & ~mask;
+}
+
+static inline u32 rpal_pkru_union(u32 pkru0, u32 pkru1)
+{
+ return pkru0 & pkru1;
+}
+
+static inline u32 rpal_pkru_intersect(u32 pkru0, u32 pkru1)
+{
+ return pkru0 | pkru1;
+}
diff --git a/samples/rpal/librpal/rpal_queue.c b/samples/rpal/librpal/rpal_queue.c
new file mode 100644
index 000000000000..07a90122aa16
--- /dev/null
+++ b/samples/rpal/librpal/rpal_queue.c
@@ -0,0 +1,239 @@
+#include "rpal_queue.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+/* min() evaluating each argument exactly once (the plain ternary form
+ * would expand side-effecting arguments twice). */
+#define min(X, Y)					\
+	({						\
+		__typeof__(X) _min_x = (X);		\
+		__typeof__(Y) _min_y = (Y);		\
+		(_min_x > _min_y) ? _min_y : _min_x;	\
+	})
+
+/*
+ * Round @data up to the nearest power of two.  Values <= 1 round to 1;
+ * exact powers of two are returned unchanged.  Inputs above 2^31
+ * cannot be represented and trip the assert.  The shift uses an
+ * unsigned constant: "1 << 31" on a plain int is signed overflow (UB).
+ */
+static unsigned int roundup_pow_of_two(unsigned int data)
+{
+	unsigned int msb_position;
+
+	if (data <= 1)
+		return 1;
+	if (!(data & (data - 1)))
+		return data;
+
+	msb_position = 31 - __builtin_clz(data);
+	assert(msb_position < 31);
+	return 1u << (msb_position + 1);
+}
+
+/* Free slots remaining: capacity (mask + 1) minus the used count. */
+QUEUE_UINT rpal_queue_unused(rpal_queue_t *q)
+{
+	QUEUE_UINT capacity = q->mask + 1;
+
+	return capacity - (q->tail - q->head);
+}
+
+/* Queued element count; unsigned wraparound keeps this correct. */
+QUEUE_UINT rpal_queue_len(rpal_queue_t *q)
+{
+	return q->tail - q->head;
+}
+
+/*
+ * Initialize @q over a caller-provided buffer of @usize int64_t slots.
+ * @usize must already be an exact power of two no larger than
+ * QUEUE_UINT_MAX (roundup_pow_of_two() is only used to verify this).
+ * The buffer is zeroed; the caller retains ownership of @data.
+ * Returns 0 on success, -1 on invalid size or NULL buffer.
+ */
+int rpal_queue_init(rpal_queue_t *q, void *data, QUEUE_UINT_INC usize)
+{
+	QUEUE_UINT_INC size;
+	if (usize > QUEUE_UINT_MAX || !data) {
+		return -1;
+	}
+	size = roundup_pow_of_two(usize);
+	if (usize != size) {
+		/* not an exact power of two */
+		return -1;
+	}
+	q->data = data;
+	memset(q->data, 0, size * sizeof(int64_t));
+	q->head = 0;
+	q->tail = 0;
+	q->mask = size - 1;
+	return 0;
+}
+
+/*
+ * Detach and return the backing buffer (ownership goes back to the
+ * caller, matching rpal_queue_init()), and reset @q.  The original's
+ * "if (q->data)" guard around the NULL assignment was redundant.
+ */
+void *rpal_queue_destroy(rpal_queue_t *q)
+{
+	void *data = q->data;
+
+	q->data = NULL;
+	q->mask = 0;
+	q->head = 0;
+	q->tail = 0;
+	return data;
+}
+
+int rpal_queue_alloc(rpal_queue_t *q, QUEUE_UINT_INC size)
+{
+ assert(q && size);
+ if (size > QUEUE_UINT_MAX) {
+ return -1;
+ }
+ size = roundup_pow_of_two(size);
+ q->data = malloc(size * sizeof(int64_t));
+ if (!q->data)
+ return -1;
+ memset(q->data, 0, size * sizeof(int64_t));
+ q->head = 0;
+ q->tail = 0;
+ q->mask = size - 1;
+ return 0;
+}
+
+/* Release a buffer allocated by rpal_queue_alloc() and reset @q. */
+void rpal_queue_free(rpal_queue_t *q)
+{
+	free(q->data);		/* free(NULL) is a no-op */
+	q->data = NULL;
+	q->mask = 0;
+	q->head = 0;
+	q->tail = 0;
+}
+
+/*
+ * Copy @len int64_t elements into the ring at offset @off, splitting
+ * into two memcpy()s at the wrap point.  "<< 3" converts element
+ * counts to bytes.  The empty asm is a compiler barrier so the data is
+ * fully written before the caller publishes the new tail.
+ */
+static void rpal_queue_copy_in(rpal_queue_t *q, const int64_t *buf,
+			       QUEUE_UINT_INC len, QUEUE_UINT off)
+{
+	QUEUE_UINT_INC l;
+	QUEUE_UINT_INC size = q->mask + 1;
+
+	off &= q->mask;
+	l = min(len, size - off);
+
+	memcpy(q->data + off, buf, l << 3);
+	memcpy(q->data, buf + l, (len - l) << 3);
+	asm volatile("" : : : "memory");
+}
+
+/*
+ * Append @len elements, all-or-nothing.  Returns @len on success, 0
+ * when the queue has no buffer or not enough space.  The tail update
+ * is not atomic: producers are serialized externally (callers hold
+ * uq_lock -- see do_rpal_call()).
+ */
+QUEUE_UINT_INC rpal_queue_put(rpal_queue_t *q, const int64_t *buf,
+			      QUEUE_UINT_INC len)
+{
+	QUEUE_UINT_INC l;
+
+	if (!q->data) {
+		return 0;
+	}
+	l = rpal_queue_unused(q);
+	if (len > l) {
+		return 0;
+	}
+	l = len;
+	rpal_queue_copy_in(q, buf, l, q->tail);
+	q->tail += l;
+	return l;
+}
+
+/*
+ * Copy up to @len elements starting at the head snapshot @head into
+ * @buf without consuming them.  Tail is re-read atomically to bound
+ * the copy; returns the number copied (0 when empty).  A concurrent
+ * consumer may invalidate the copy -- callers validate afterwards by
+ * CASing head forward (see rpal_queue_get()).
+ */
+static QUEUE_UINT_INC rpal_queue_copy_out(rpal_queue_t *q, int64_t *buf,
+					  QUEUE_UINT_INC len, QUEUE_UINT head)
+{
+	unsigned int l;
+	QUEUE_UINT tail;
+	QUEUE_UINT off;
+	QUEUE_UINT_INC size = q->mask + 1;
+
+	tail = __atomic_load_n(&q->tail, __ATOMIC_RELAXED);
+	len = min((QUEUE_UINT)(tail - head), len);
+	if (head == tail)
+		return 0;
+	off = head & q->mask;
+	l = min(len, size - off);
+
+	memcpy(buf, q->data + off, l << 3);
+	memcpy(buf + l, q->data, (len - l) << 3);
+
+	return len;
+}
+
+QUEUE_UINT_INC rpal_queue_peek(rpal_queue_t *q, int64_t *buf,
+ QUEUE_UINT_INC len, QUEUE_UINT *phead)
+{
+ QUEUE_UINT_INC copied;
+ QUEUE_UINT head;
+
+ head = __atomic_load_n(&q->head, __ATOMIC_RELAXED);
+ copied = rpal_queue_copy_out(q, buf, len, head);
+ if (phead) {
+ *phead = head;
+ }
+ return copied;
+}
+
+QUEUE_UINT_INC rpal_queue_skip(rpal_queue_t *q, QUEUE_UINT head,
+ QUEUE_UINT_INC skip)
+{
+ if (skip > rpal_queue_len(q)) {
+ return 0;
+ }
+ if (__atomic_compare_exchange_n(&q->head, &head, head + skip, 1,
+ __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
+ return skip;
+ }
+ return 0;
+}
+
+/*
+ * Consume up to @len elements.  Lock-free consumer loop: snapshot
+ * head, copy out, then CAS head forward; if another consumer advanced
+ * head meanwhile the CAS fails and the whole copy is retried.
+ * Returns the number of elements consumed.
+ */
+QUEUE_UINT_INC rpal_queue_get(rpal_queue_t *q, int64_t *buf, QUEUE_UINT_INC len)
+{
+	QUEUE_UINT_INC copied;
+	QUEUE_UINT head;
+
+	while (1) {
+		head = __atomic_load_n(&q->head, __ATOMIC_RELAXED);
+		copied = rpal_queue_copy_out(q, buf, len, head);
+		if (__atomic_compare_exchange_n(&q->head, &head, head + copied,
+						1, __ATOMIC_RELAXED,
+						__ATOMIC_RELAXED)) {
+			return copied;
+		}
+	}
+}
+
+void rpal_uevent_queue_init(epoll_uevent_queue_t *ueventq,
+ volatile uint64_t *uqlock)
+{
+ int i;
+ __atomic_store_n(uqlock, (uint64_t)0, __ATOMIC_RELAXED);
+ ueventq->l_beg = 0;
+ ueventq->l_end = 0;
+ ueventq->l_end_cache = 0;
+ for (i = 0; i < MAX_RDY; ++i) {
+ ueventq->fds[i] = -1;
+ }
+ return;
+}
+
+QUEUE_UINT uevent_queue_len(epoll_uevent_queue_t *ueventq)
+{
+ return (ueventq->l_end - ueventq->l_beg);
+}
+
+/*
+ * Append @fd to the ready list.  l_end_cache reserves a slot; the
+ * compiler barrier orders the fd store before l_end makes the slot
+ * visible to the consumer.  Returns MAX_RDY when full, else the slot
+ * index.  NOTE(review): the full-check is not atomic with the slot
+ * reservation; producers appear serialized by uq_lock -- confirm.
+ */
+QUEUE_UINT uevent_queue_add(epoll_uevent_queue_t *ueventq, int fd)
+{
+	unsigned int pos;
+	if (uevent_queue_len(ueventq) == MAX_RDY)
+		return MAX_RDY;
+	pos = __sync_fetch_and_add(&ueventq->l_end_cache, 1);
+	pos %= MAX_RDY;
+	ueventq->fds[pos] = fd;
+	asm volatile("" : : : "memory");
+	__sync_fetch_and_add(&ueventq->l_end, 1);
+	return (pos);
+}
+
+/*
+ * Pop the oldest fd from the ready list, or -1 when empty.  The
+ * compiler barrier keeps the fd load before the l_beg increment that
+ * frees the slot for producers.  Assumes a single consumer -- TODO
+ * confirm against the receiver-side callers.
+ */
+int uevent_queue_del(epoll_uevent_queue_t *ueventq)
+{
+	int fd = -1;
+	int pos;
+	if (uevent_queue_len(ueventq) == 0) {
+		return -1;
+	}
+	pos = ueventq->l_beg % MAX_RDY;
+	fd = ueventq->fds[pos];
+	asm volatile("" : : : "memory");
+	__sync_fetch_and_add(&ueventq->l_beg, 1);
+	return fd;
+}
+
+int uevent_queue_fix(epoll_uevent_queue_t *ueventq)
+{
+ __atomic_store_n(&ueventq->l_end_cache, ueventq->l_end,
+ __ATOMIC_SEQ_CST);
+ return 0;
+}
diff --git a/samples/rpal/librpal/rpal_queue.h b/samples/rpal/librpal/rpal_queue.h
new file mode 100644
index 000000000000..224e7b449d50
--- /dev/null
+++ b/samples/rpal/librpal/rpal_queue.h
@@ -0,0 +1,55 @@
+#ifndef RPAL_QUEUE_H
+#define RPAL_QUEUE_H
+
+#include <stdint.h>
+
+// typedef uint8_t QUEUE_UINT;
+// typedef uint16_t QUEUE_UINT_INC;
+// #define QUEUE_UINT_MAX UINT8_MAX
+
+// typedef uint16_t QUEUE_UINT;
+// typedef uint32_t QUEUE_UINT_INC;
+// #define QUEUE_UINT_MAX UINT16_MAX
+
+typedef uint32_t QUEUE_UINT;
+typedef uint64_t QUEUE_UINT_INC;
+#define QUEUE_UINT_MAX UINT32_MAX
+
+typedef struct rpal_queue {
+ QUEUE_UINT head;
+ QUEUE_UINT tail;
+ QUEUE_UINT mask;
+ uint64_t *data;
+} rpal_queue_t;
+
+QUEUE_UINT rpal_queue_len(rpal_queue_t *q);
+QUEUE_UINT rpal_queue_unused(rpal_queue_t *q);
+int rpal_queue_init(rpal_queue_t *q, void *data, QUEUE_UINT_INC usize);
+void *rpal_queue_destroy(rpal_queue_t *q);
+int rpal_queue_alloc(rpal_queue_t *q, QUEUE_UINT_INC size);
+void rpal_queue_free(rpal_queue_t *q);
+QUEUE_UINT_INC rpal_queue_put(rpal_queue_t *q, const int64_t *buf,
+ QUEUE_UINT_INC len);
+QUEUE_UINT_INC rpal_queue_get(rpal_queue_t *q, int64_t *buf,
+ QUEUE_UINT_INC len);
+QUEUE_UINT_INC rpal_queue_peek(rpal_queue_t *q, int64_t *buf,
+ QUEUE_UINT_INC len, QUEUE_UINT *phead);
+QUEUE_UINT_INC rpal_queue_skip(rpal_queue_t *q, QUEUE_UINT head,
+ QUEUE_UINT_INC skip);
+
+#define MAX_RDY 4096
+typedef struct epoll_uevent_queue {
+ int fds[MAX_RDY];
+ volatile QUEUE_UINT l_beg;
+ volatile QUEUE_UINT l_end;
+ volatile QUEUE_UINT l_end_cache;
+} epoll_uevent_queue_t;
+
+void rpal_uevent_queue_init(epoll_uevent_queue_t *ueventq,
+ volatile uint64_t *uqlock);
+QUEUE_UINT uevent_queue_len(epoll_uevent_queue_t *ueventq);
+QUEUE_UINT uevent_queue_add(epoll_uevent_queue_t *ueventq, int fd);
+int uevent_queue_del(epoll_uevent_queue_t *ueventq);
+int uevent_queue_fix(epoll_uevent_queue_t *ueventq);
+
+#endif
diff --git a/samples/rpal/librpal/rpal_x86_64_call_ret.S b/samples/rpal/librpal/rpal_x86_64_call_ret.S
new file mode 100644
index 000000000000..a7c09a1b033d
--- /dev/null
+++ b/samples/rpal/librpal/rpal_x86_64_call_ret.S
@@ -0,0 +1,45 @@
+#ifdef __x86_64__
+#define __ASSEMBLY__
+#include "asm_define.h"
+#define RPAL_SENDER_STATE_RUNNING $0x0
+#define RPAL_SENDER_STATE_CALL $0x1
+
+.text
+.globl rpal_ret_critical
+.type rpal_ret_critical,@function
+.align 16
+
+//void rpal_ret_critical(receiver_context_t *rc, rpal_call_info_t *rci)
+
+rpal_ret_critical:
+	/* CAS sender_state CALL -> RUNNING; eax = expected, ecx = new. */
+	mov RPAL_SENDER_STATE_CALL, %eax
+	mov RPAL_SENDER_STATE_RUNNING, %ecx
+	lock cmpxchg %ecx, RC_SENDER_STATE(%rdi)
+ret_begin:
+	/* ZF clear => sender was not in CALL state: nothing to return to. */
+	jne 2f
+	/* wrpkru: eax = pkru, ecx and edx must be 0.  ecx already holds
+	   RPAL_SENDER_STATE_RUNNING (0) from the CAS setup above. */
+	movq RCI_PKRU(%rsi), %rax
+	xor %edx, %edx
+	.byte 0x0f,0x01,0xef
+	/* Restore the sender's TLS base. */
+	movq RCI_SENDER_TLS_BASE(%rsi), %rax
+	wrfsbase %rax
+ret_end:
+	/* Jump back onto the saved sender fcontext. */
+	movq RCI_SENDER_FCTX(%rsi), %rdi
+	call jump_fcontext@plt
+2:
+	ret
+
+.globl rpal_get_critical_addr
+.type rpal_get_critical_addr,@function
+.align 16
+rpal_get_critical_addr:
+ leaq ret_begin(%rip), %rax
+ movq %rax, RET_BEGIN(%rdi)
+ leaq ret_end(%rip), %rax
+ movq %rax, RET_END(%rdi)
+ ret
+
+.size rpal_ret_critical,.-rpal_ret_critical
+
+/* Mark that we don't need executable stack. */
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/samples/rpal/offset.sh b/samples/rpal/offset.sh
new file mode 100755
index 000000000000..f5ae77b893e8
--- /dev/null
+++ b/samples/rpal/offset.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+set -e
+CUR_DIR=$(dirname $(realpath -s "$0"))
+gcc -masm=intel -S $CUR_DIR/asm_define.c -o - | awk '($1 == "->") { print "#define " $2 " " $3 }' > $CUR_DIR/librpal/asm_define.h
\ No newline at end of file
diff --git a/samples/rpal/server.c b/samples/rpal/server.c
new file mode 100644
index 000000000000..82c5c9dec922
--- /dev/null
+++ b/samples/rpal/server.c
@@ -0,0 +1,249 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/epoll.h>
+#include <x86intrin.h>
+#include "librpal/rpal.h"
+
+#define SOCKET_PATH "/tmp/rpal_socket"
+#define MAX_EVENTS 10
+#define BUFFER_SIZE 1025
+#define MSG_LEN 32
+
+#define INIT_MSG "INIT"
+#define SUCC_MSG "SUCC"
+#define FAIL_MSG "FAIL"
+
+#define handle_error(s) \
+ do { \
+ perror(s); \
+ exit(EXIT_FAILURE); \
+ } while (0)
+
+uint64_t service_key;
+int server_fd;
+int epoll_fd;
+
+int rpal_epoll_add(int epfd, int fd)
+{
+ struct epoll_event ev;
+
+ ev.events = EPOLLRPALIN | EPOLLIN | EPOLLRDHUP | EPOLLET;
+ ev.data.fd = fd;
+
+ return rpal_epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
+}
+
+/*
+ * Perform the RPAL handshake with a newly accepted client on @fd:
+ * exchange service keys over the socket, request the peer service,
+ * and bring up the receiver side.  exit()s via handle_error() on any
+ * protocol failure (sample code, not production error handling).
+ */
+void rpal_server_init(int fd, int epoll_fd)
+{
+	char buffer[BUFFER_SIZE];
+	rpal_error_code_t err;
+	uint64_t remote_key, service_key;
+	int remote_id;
+	int proc_fd;
+	int ret;
+
+	proc_fd = rpal_init(1, 0, &err);
+	if (proc_fd < 0)
+		handle_error("rpal init fail");
+	rpal_get_service_key(&service_key);
+
+	rpal_epoll_add(epoll_fd, fd);
+
+	ret = read(fd, buffer, BUFFER_SIZE);
+	if (ret < 0)
+		handle_error("rpal init: read");
+
+	/* The INIT message must carry the 8-byte remote key right after
+	 * the "INIT" tag; anything shorter would make the original code
+	 * read uninitialized stack memory below. */
+	if ((size_t)ret < strlen(INIT_MSG) + sizeof(uint64_t) ||
+	    strncmp(buffer, INIT_MSG, strlen(INIT_MSG)) != 0) {
+		handle_error("Invalid msg\n");
+		return;
+	}
+
+	/* memcpy avoids a potentially misaligned uint64_t load. */
+	memcpy(&remote_key, buffer + strlen(INIT_MSG), sizeof(remote_key));
+	ret = rpal_request_service(remote_key);
+	if (ret) {
+		uint64_t service_key = 0;
+
+		ret = write(fd, (char *)&service_key, sizeof(uint64_t));
+		handle_error("request service fail");
+		return;
+	}
+	ret = write(fd, (char *)&service_key, sizeof(uint64_t));
+	if (ret < 0)
+		handle_error("write error");
+
+	ret = read(fd, buffer, BUFFER_SIZE);
+	if (ret < 0)
+		handle_error("handshake read");
+
+	if ((size_t)ret < strlen(SUCC_MSG) ||
+	    strncmp(SUCC_MSG, buffer, strlen(SUCC_MSG)) != 0)
+		handle_error("handshake");
+
+	remote_id = rpal_get_request_service_id(remote_key);
+	if (remote_id < 0)
+		handle_error("remote id get fail");
+	rpal_receiver_init();
+}
+
+/*
+ * RPAL-accelerated echo loop: accept clients, read timestamped
+ * messages via rpal_read_ptrs(), echo them back, and report average
+ * round-trip latency in TSC cycles when a client disconnects.
+ */
+void run_rpal_server(int msg_len)
+{
+	struct epoll_event events[MAX_EVENTS];
+	int new_socket;
+	int nfds;
+	uint64_t tsc, total_tsc = 0;
+	int count = 0;
+
+	while (1) {
+		nfds = rpal_epoll_wait(epoll_fd, events, MAX_EVENTS, -1);
+		if (nfds == -1) {
+			perror("epoll_wait");
+			exit(EXIT_FAILURE);
+		}
+
+		for (int n = 0; n < nfds; ++n) {
+			if (events[n].data.fd == server_fd) {
+				new_socket = accept(server_fd, NULL, NULL);
+				if (new_socket == -1) {
+					perror("accept");
+					continue;
+				}
+
+				rpal_server_init(new_socket, epoll_fd);
+			} else if (events[n].events & EPOLLRDHUP) {
+				close(events[n].data.fd);
+				goto finish;
+			} else if (events[n].events & EPOLLRPALIN) {
+				char buffer[BUFFER_SIZE] = { 0 };
+
+				/* NOTE(review): divides by sizeof a pointer,
+				 * not sizeof(int64_t) -- identical on x86-64
+				 * but looks unintended; confirm. */
+				ssize_t valread = rpal_read_ptrs(
+					events[n].data.fd, (int64_t *)buffer,
+					MSG_LEN / sizeof(int64_t *));
+				if (valread <= 0) {
+					close(events[n].data.fd);
+					epoll_ctl(epoll_fd, EPOLL_CTL_DEL,
+						  events[n].data.fd, NULL);
+					goto finish;
+				} else {
+					count++;
+					sscanf(buffer, "0x%016lx", &tsc);
+					total_tsc += __rdtsc() - tsc;
+					send(events[n].data.fd, buffer, msg_len,
+					     0);
+				}
+			} else {
+				perror("bad request\n");
+			}
+		}
+	}
+finish:
+	/* Guard the average against division by zero. */
+	if (count)
+		printf("RPAL: Message length: %d bytes, Total TSC cycles: %lu, "
+		       "Message count: %d, Average latency: %lu cycles\n",
+		       MSG_LEN, total_tsc, count, total_tsc / count);
+	else
+		printf("RPAL: no messages received\n");
+}
+
+/*
+ * Baseline (plain epoll) echo loop with the same accounting as
+ * run_rpal_server(), used for latency comparison.
+ */
+void run_server(int msg_len)
+{
+	struct epoll_event ev, events[MAX_EVENTS];
+	int new_socket;
+	int nfds;
+	uint64_t tsc, total_tsc = 0;
+	int count = 0;
+
+	while (1) {
+		nfds = epoll_wait(epoll_fd, events, MAX_EVENTS, -1);
+		if (nfds == -1) {
+			perror("epoll_wait");
+			exit(EXIT_FAILURE);
+		}
+
+		for (int n = 0; n < nfds; ++n) {
+			if (events[n].data.fd == server_fd) {
+				new_socket = accept(server_fd, NULL, NULL);
+				if (new_socket == -1) {
+					perror("accept");
+					continue;
+				}
+
+				ev.events = EPOLLIN;
+				ev.data.fd = new_socket;
+				if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD,
+					      new_socket, &ev) == -1) {
+					close(new_socket);
+					perror("epoll_ctl: add new socket");
+				}
+			} else if (events[n].events & EPOLLRDHUP) {
+				close(events[n].data.fd);
+				goto finish;
+			} else {
+				char buffer[BUFFER_SIZE] = { 0 };
+
+				ssize_t valread = read(events[n].data.fd,
+						       buffer, BUFFER_SIZE);
+				if (valread <= 0) {
+					close(events[n].data.fd);
+					epoll_ctl(epoll_fd, EPOLL_CTL_DEL,
+						  events[n].data.fd, NULL);
+					goto finish;
+				} else {
+					count++;
+					sscanf(buffer, "0x%016lx", &tsc);
+					total_tsc += __rdtsc() - tsc;
+					send(events[n].data.fd, buffer, msg_len,
+					     0);
+				}
+			}
+		}
+	}
+finish:
+	/* Guard the average against division by zero. */
+	if (count)
+		printf("EPOLL: Message length: %d bytes, Total TSC cycles: %lu, "
+		       "Message count: %d, Average latency: %lu cycles\n",
+		       MSG_LEN, total_tsc, count, total_tsc / count);
+	else
+		printf("EPOLL: no messages received\n");
+}
+
+int main()
+{
+ struct sockaddr_un address;
+ struct epoll_event ev;
+
+ if ((server_fd = socket(AF_UNIX, SOCK_STREAM, 0)) == 0) {
+ perror("socket failed");
+ exit(EXIT_FAILURE);
+ }
+
+ memset(&address, 0, sizeof(address));
+ address.sun_family = AF_UNIX;
+ strncpy(address.sun_path, SOCKET_PATH, sizeof(SOCKET_PATH));
+
+ if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
+ perror("bind failed");
+ exit(EXIT_FAILURE);
+ }
+
+ if (listen(server_fd, 3) < 0) {
+ perror("listen");
+ exit(EXIT_FAILURE);
+ }
+
+ epoll_fd = epoll_create(1024);
+ if (epoll_fd == -1) {
+ perror("epoll_create");
+ exit(EXIT_FAILURE);
+ }
+
+ ev.events = EPOLLIN;
+ ev.data.fd = server_fd;
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, server_fd, &ev) == -1) {
+ perror("epoll_ctl: listen_sock");
+ exit(EXIT_FAILURE);
+ }
+
+ run_server(MSG_LEN);
+ run_rpal_server(MSG_LEN);
+
+ close(server_fd);
+ unlink(SOCKET_PATH);
+ return 0;
+}
--
2.20.1


Return-Path: <linux-kernel+bounces-667909-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 94BCF41E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:43:50 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id 47DB61C0199D
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:43:39 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 7BE8E221F04;
Fri, 30 May 2025 09:38:09 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=qualcomm.com header.i=@qualcomm.com header.b="m+mie/MQ"
Received: from mx0a-0031df01.pphosted.com (mx0a-0031df01.pphosted.com [205.220.168.131])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id E22AE220F4B
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:38:06 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=205.220.168.131
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597888; cv=none; b=O7YgQG3ov5PTHE3iow33xi/wn0lUkg787G3c2eY6Rv+91qCooj4yZf1UXHz63dnKmDgU0RCs4VOCrps/h/E7ype8Qj7mOhjTt9OIb54LFCewKiCTP10asvReyLRjr3AjkIG9sCg8rwJexpywlCQ3vYsGeF7Ek/XF9zNVkCHKM8g=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597888; c=relaxed/simple;
bh=FiXibMwqe7Bzxv2bD86eTxQ7GxvzILN7JJ8MXnunTTg=;
h=Message-ID:Date:MIME-Version:Subject:To:Cc:References:From:
In-Reply-To:Content-Type; b=ClFqM4Ri1hT/YmyE2rBAx9jwxOpHQdQDBtMOcwSIJTqQpZMJJQVcJ4JN96e+HfvbpW31nj8E0NRjHNxNdrvRH9oFiH1I6ePOgSKDU60U9maO/FmY2jr7bBq5W/EHoH1KTybfL4/33weIUdtDiG+hKz4iloxGgC959VKlgxX4e30=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=oss.qualcomm.com; spf=pass smtp.mailfrom=oss.qualcomm.com; dkim=pass (2048-bit key) header.d=qualcomm.com header.i=@qualcomm.com header.b=m+mie/MQ; arc=none smtp.client-ip=205.220.168.131
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=reject dis=none) header.from=oss.qualcomm.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=oss.qualcomm.com
Received: from pps.filterd (m0279865.ppops.net [127.0.0.1])
by mx0a-0031df01.pphosted.com (8.18.1.2/8.18.1.2) with ESMTP id 54U0aeQx007943
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:38:06 GMT
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=qualcomm.com; h=
cc:content-transfer-encoding:content-type:date:from:in-reply-to
:message-id:mime-version:references:subject:to; s=qcppdkim1; bh=
zfYXyl7VrRhi9hOpPAB1dRuMBvVrvMKjYSFM2OAmP3g=; b=m+mie/MQgL01XB4R
H0b1nhJl21Axh657OCSabkJDPAY37Ky5czOBYYlHWVWZQLNfHH2ctOW0l8mO76gY
zpxEVBxa9YmA0En+bv7wHx4iiD+zrs3o4H7e//46uXASpHEsPzpA+5/gkMX5DcA5
omhQ9Gmjhh7pl741ah6g1O3wWjwZwbWRzv8exE1IhLeUqKOc75lWff2VrK9VACgI
0bdeMGWUMlYxxOusCwU69NkFWeaSRF3bMnVdbm5j0TJExJlZlRDHja5PsHVuLBHX
WcVnLMi2LfQLBjEuYC98un7KXlJuHoY234MsqoB3ZPC7pHf1dyGpHs2ABSrW9vRl
qPkyIg==
Received: from mail-pf1-f200.google.com (mail-pf1-f200.google.com [209.85.210.200])
by mx0a-0031df01.pphosted.com (PPS) with ESMTPS id 46w992tpaq-1
(version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128 verify=NOT)
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:38:05 +0000 (GMT)
Received: by mail-pf1-f200.google.com with SMTP id d2e1a72fcca58-74620e98ec8so1707451b3a.1
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 02:38:05 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
d=1e100.net; s=20230601; t=1748597885; x=1749202685;
h=content-transfer-encoding:in-reply-to:from:content-language
:references:cc:to:subject:user-agent:mime-version:date:message-id
:x-gm-message-state:from:to:cc:subject:date:message-id:reply-to;
bh=zfYXyl7VrRhi9hOpPAB1dRuMBvVrvMKjYSFM2OAmP3g=;
b=ilscGh9xQN/HXspAz1zpEXlB2EeM94wBCtV2jJft0FqiRwSj540W4hJ+a2inzVXXZw
vjHcKF3MUruJb2mSGCFIQmyk36uRgg57FLkFYAme7tOwqL+fZ2rQVhMIf0qN63q9ZIw2
xx8mX0Pli9JOuoRfEgSEwOVGhXFhEo72cocWEG8W8oaSBsdfrBE84UbMEscxLFWMKx5W
p9dOi61KKdUpCwHIeqGztxvELGA/TbKPx9FG5Hf5vZUFTPne5AqJwiigJtnsal1cOByK
uMNe0DyyKVVJa1GduDzaZErrZDZRThvDUXooaXZ+DdOaTahEhC1eT7ElqwVPj8bxu4LZ
hE1w==
X-Forwarded-Encrypted: i=1; AJvYcCX2AxZHCjC+NkHNz+yteI6gE6XYOix8GVX8BTkCyCM9kTbGUmJqkZV7jJh805dAg1e/3wkhTUGSuE8U0Wk=@vger.kernel.org
X-Gm-Message-State: AOJu0Yxm6B43CY97DC7rRTGojXXRgIIq61kpvRm3G6slV7lgN4qwnQdI
mVOu0Gc/CnW4XwZ7xOWAcXxaTiEtbqcrDQBDEZjIeJ8F36gou+FBDmWgMepmzV/Kko/mPD0Kf18
FQWcRpPCFyaFAoCmdiDGr+1AtXYfG8Tt4s4+fqnhjn0zWus4D3jcvYFjs74T5kIcT+QI=
X-Gm-Gg: ASbGncuoeju9G2KrADFK4SnDOOGUn8sl5atB/33vVnmOGgTyDWI1A39AOehXlxAmVjK
fMEnlAVz8/qbSB8hlZ99Qx4zGvyYRma600KmiMA+oW9hzKrhSRBj1tSLw4A11p8DnjPrwoKrNMF
aDkhYFEohj3XFuG0cqzIHZ7x+JrZL8awGohfzsPK/iJKjVMLMuMbIXLPMQmbwY2PwIyiDJOSie8
PszBBQ6HtDkQ/vf4aBQ3joYOYwHcAnHtQ3RKavmX9LUy7eWokHtHTsWCFTIuWrElx9+sURgUXsJ
RuwK/G7tL2wTJvriNshgpM1UrUojemEQbr4TBeTZq58vaSZO3FKoBHFofsS8U0vlqnGyxYG7+HI
/05EoFV8xitk=
X-Received: by 2002:a05:6a00:a87:b0:742:ae7e:7da1 with SMTP id d2e1a72fcca58-747bdbe8035mr3778277b3a.0.1748597885058;
Fri, 30 May 2025 02:38:05 -0700 (PDT)
X-Google-Smtp-Source: AGHT+IH7NO4N0v3Y1dOO+Iz5dJNKpo+fSGcvNyKt4RkjS35cwzJzfWOV/qn3qrS95ChBrq+phaPJRQ==
X-Received: by 2002:a05:6a00:a87:b0:742:ae7e:7da1 with SMTP id d2e1a72fcca58-747bdbe8035mr3778244b3a.0.1748597884623;
Fri, 30 May 2025 02:38:04 -0700 (PDT)
Received: from [10.133.33.104] (tpe-colo-wan-fw-bordernet.qualcomm.com. [103.229.16.4])
by smtp.gmail.com with ESMTPSA id d2e1a72fcca58-747affd437asm2661533b3a.150.2025.05.30.02.38.00
(version=TLS1_3 cipher=TLS_AES_128_GCM_SHA256 bits=128/128);
Fri, 30 May 2025 02:38:03 -0700 (PDT)
Message-ID: <3df56548-49ea-498c-9ee3-b7e1d2d85d2e@xxxxxxxxxxxxxxxx>
Date: Fri, 30 May 2025 17:37:58 +0800
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
User-Agent: Mozilla Thunderbird
Subject: Re: [PATCH v2 5/8] power: supply: qcom_battmgr: Add charge control
support
To: Bryan O'Donoghue <bryan.odonoghue@xxxxxxxxxx>,
Sebastian Reichel <sre@xxxxxxxxxx>,
Bjorn Andersson <andersson@xxxxxxxxxx>,
Konrad Dybcio <konradybcio@xxxxxxxxxx>, Rob Herring <robh@xxxxxxxxxx>,
Krzysztof Kozlowski <krzk+dt@xxxxxxxxxx>,
Conor Dooley
<conor+dt@xxxxxxxxxx>,
Heikki Krogerus <heikki.krogerus@xxxxxxxxxxxxxxx>,
Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
Cc: Subbaraman Narayanamurthy <subbaraman.narayanamurthy@xxxxxxxxxxxxxxxx>,
David Collins <david.collins@xxxxxxxxxxxxxxxx>,
linux-pm@xxxxxxxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx,
linux-arm-msm@xxxxxxxxxxxxxxx, kernel@xxxxxxxxxxxxxxxx,
devicetree@xxxxxxxxxxxxxxx, linux-usb@xxxxxxxxxxxxxxx
References: <20250530-qcom_battmgr_update-v2-0-9e377193a656@xxxxxxxxxxxxxxxx>
<497BF3hThnrmYe-YHKmdOyZwdjP3ivm1hFYDDy3-HkSOvkCOMVSkokyhb859mcTarGb55Go5nJLfgsc553u7ZA==@protonmail.internalid>
<20250530-qcom_battmgr_update-v2-5-9e377193a656@xxxxxxxxxxxxxxxx>
<8b396edf-e344-47e9-b497-3f7fb35783ed@xxxxxxxxxx>
Content-Language: en-US
From: Fenglin Wu <fenglin.wu@xxxxxxxxxxxxxxxx>
In-Reply-To: <8b396edf-e344-47e9-b497-3f7fb35783ed@xxxxxxxxxx>
Content-Type: text/plain; charset=UTF-8; format=flowed
Content-Transfer-Encoding: 8bit
X-Proofpoint-Spam-Details-Enc: AW1haW4tMjUwNTMwMDA4MSBTYWx0ZWRfX5w8o4jrwtVLC
xgJg+pIVRbagNIJVF0BoFJ+DZS4OkedWJFl27QXvSBNIEKAABga3LLCoY0OwXBbSVv6Nmo10afj
nWnQX+CcpildZaeAQdZvwfcdlR/2uWm5841u1/40ddKMB4qtxNzovjUFnpMbF6Qmmh2O5RtXVdZ
Yr4P21KyGCvNO1uxPwhrPESjQiyztOS5XseCxWSvKF1Cex/mKnrtuGDcWql6zRFdb3QrTNF2xgw
nkqLsBNx5oN5rFDHYqtEDdTsXL/GdZCGcj3ahAY0lJqLMSSM8YUVFJSIqS9/OEAn8B/roguwDDK
t/TswvWjKpYyMDssDLklNkdwJrKLvFomWPSwZyF9gxuJ59ZRAQUkFvMTsHA/bFjDf0ASAto1l30
bUmxAVqOQ6RmyIIGbZzoaV5UUxlm0yCslXeKL0hqXnidCsLag9+Wp0QpJw2UYUwHAzr08LEB
X-Authority-Analysis: v=2.4 cv=Fes3xI+6 c=1 sm=1 tr=0 ts=68397c7e cx=c_pps
a=mDZGXZTwRPZaeRUbqKGCBw==:117 a=nuhDOHQX5FNHPW3J6Bj6AA==:17
a=IkcTkHD0fZMA:10 a=dt9VzEwgFbYA:10 a=EUspDBNiAAAA:8 a=fnrE3p8kPbNp4-9vzRIA:9
a=3ZKOabzyN94A:10 a=QEXdDO2ut3YA:10 a=zc0IvFSfCIW2DFIPzwfm:22
X-Proofpoint-GUID: jdbC0pmyt3_-Og3snHbYM8MaFz_kQn3t
X-Proofpoint-ORIG-GUID: jdbC0pmyt3_-Og3snHbYM8MaFz_kQn3t
X-Proofpoint-Virus-Version: vendor=baseguard
engine=ICAP:2.0.293,Aquarius:18.0.1099,Hydra:6.0.736,FMLib:17.12.80.40
definitions=2025-05-30_04,2025-05-29_01,2025-03-28_01
X-Proofpoint-Spam-Details: rule=outbound_notspam policy=outbound score=0
mlxscore=0 malwarescore=0 impostorscore=0 phishscore=0 clxscore=1015
lowpriorityscore=0 bulkscore=0 priorityscore=1501 mlxlogscore=999 spamscore=0
adultscore=0 suspectscore=0 classifier=spam authscore=0 authtc=n/a authcc=
route=outbound adjust=0 reason=mlx scancount=1 engine=8.19.0-2505160000
definitions=main-2505300081
X-Spam-Status: No, score=-3.3 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,RCVD_IN_DNSWL_MED,
RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,RCVD_IN_VALIDITY_RPBL_BLOCKED,
SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

Thanks for reviewing the change!

On 5/30/2025 4:48 PM, Bryan O'Donoghue wrote:
> On 30/05/2025 08:35, Fenglin Wu via B4 Relay wrote:
>> From: Fenglin Wu <fenglin.wu@xxxxxxxxxxxxxxxx>
>>
>> Add charge control support for SM8550 and X1E80100. It's supported
>> with below two power supply properties:
>>
>> charge_control_end_threshold: SOC threshold at which the charging
>> should be terminated.
>>
>> charge_control_start_threshold: SOC threshold at which the charging
>> should be resumed.
>
> Maybe this is very obvious to battery charger experts but what does
> SOC mean here ?
>
> Reading your patch you pass a "int soc" and compare it to a threshold
> value, without 'soc' having an obvious meaning.
>
> Its a threshold right ? Why not just call it threshold ?
>
"SOC" stands for battery State of Charge, I will rephrase the commit
text for better explanation.
>>
>> Signed-off-by: Fenglin Wu <fenglin.wu@xxxxxxxxxxxxxxxx>
>> ---
>>   drivers/power/supply/qcom_battmgr.c | 256
>> ++++++++++++++++++++++++++++++++++--
>>   1 file changed, 248 insertions(+), 8 deletions(-)
>>
>> -    if (battmgr->variant == QCOM_BATTMGR_SC8280XP)
>> +    if (battmgr->variant == QCOM_BATTMGR_SC8280XP ||
>> +            battmgr->variant == QCOM_BATTMGR_X1E80100)
>
> Please run your series through checkpatch
>
I actually did that before sending the patches out. I run checkpatch
with below two commands and I saw no issues:

git format-patch -1 xxxx --stdout | ./scripts/checkpatch.pl -

b4 prep --check

Can you let me know what specific command that you ran with it?

> 0004-power-supply-qcom_battmgr-Add-state_of_health-proper.patch has no
> obvious style problems and is ready for submission.
> CHECK: Alignment should match open parenthesis
> #95: FILE: drivers/power/supply/qcom_battmgr.c:521:
> +    if (battmgr->variant == QCOM_BATTMGR_SC8280XP ||
> +            battmgr->variant == QCOM_BATTMGR_X1E80100)
>
>>
>> +static int qcom_battmgr_set_charge_start_threshold(struct
>> qcom_battmgr *battmgr, int soc)
>> +{
>> +    u32 target_soc, delta_soc;
>> +    int ret;
>> +
>> +    if (soc < CHARGE_CTRL_START_THR_MIN ||
>> +            soc > CHARGE_CTRL_START_THR_MAX) {
>> +        dev_err(battmgr->dev, "charge control start threshold exceed
>> range: [%u - %u]\n",
>> +                CHARGE_CTRL_START_THR_MIN, CHARGE_CTRL_START_THR_MAX);
>> +        return -EINVAL;
>> +    }
>
> 'soc' is what - a threshold as far as I can tell.

I will update it with a more meaningful name

>>
>>       if (opcode == BATTMGR_NOTIFICATION)
>>           qcom_battmgr_notification(battmgr, data, len);
>> -    else if (battmgr->variant == QCOM_BATTMGR_SC8280XP)
>> +    else if (battmgr->variant == QCOM_BATTMGR_SC8280XP ||
>> +            battmgr->variant == QCOM_BATTMGR_X1E80100)
>>           qcom_battmgr_sc8280xp_callback(battmgr, data, len);
>>       else
>>           qcom_battmgr_sm8350_callback(battmgr, data, len);
>> @@ -1333,7 +1560,8 @@ static void qcom_battmgr_pdr_notify(void *priv,
>> int state)
>>   static const struct of_device_id qcom_battmgr_of_variants[] = {
>>       { .compatible = "qcom,sc8180x-pmic-glink", .data = (void
>> *)QCOM_BATTMGR_SC8280XP },
>>       { .compatible = "qcom,sc8280xp-pmic-glink", .data = (void
>> *)QCOM_BATTMGR_SC8280XP },
>> -    { .compatible = "qcom,x1e80100-pmic-glink", .data = (void
>> *)QCOM_BATTMGR_SC8280XP },
>> +    { .compatible = "qcom,x1e80100-pmic-glink", .data = (void
>> *)QCOM_BATTMGR_X1E80100 },
>> +    { .compatible = "qcom,sm8550-pmic-glink", .data = (void
>> *)QCOM_BATTMGR_SM8550 },
>
> Please separate compat string addition from functional changes.
>
The compatible string "qcom,sm8550-pmic-glink" has been present in the
binding for a while and it was added as a fallback of "qcom,pmic-glink".
The battmgr function has been also supported well on SM8550 for a while.
The change here is only specifying a different match data for SM8550 so
the driver can handle some new features differently. Does it also need
to add it in a separate change? If so, this change would be split into
following 3 patches I think:

1) add QCOM_BATTMGR_SM8550/X1E80100 variants definition in
qcom_battmgr_variant.

2) add compatible string with corresponding match data for SM8550.

3) add the charge control function support.

>>       /* Unmatched devices falls back to QCOM_BATTMGR_SM8350 */
>>       {}
>>   };
>>
>>
>> --
>> 2.34.1
>>
>>
>>
>

Return-Path: <linux-kernel+bounces-667910-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from ny.mirrors.kernel.org (ny.mirrors.kernel.org [147.75.199.223])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 9A76F41E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:44:02 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by ny.mirrors.kernel.org (Postfix) with ESMTPS id D0DBC1690A4
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:43:44 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 8795B220F2A;
Fri, 30 May 2025 09:38:33 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="inxk4Yp8"
Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id BA27021D5BE;
Fri, 30 May 2025 09:38:32 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748597912; cv=none; b=tVWu17GHlKlmpDj7En4aEekW76bYWAN6pooFhDE2ZvDoil94KSsUYPhw+e1KQ9wM9NACHylMJ82eFctnz4gX7AVecSeVvgY8mAe22mCGnKrUS2kcfwyNOuwOe2qPBXnRRKSrAkiUBV+AQ1MVaUpYLGXtQ8ukwLM/kBbEOpQFejU=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748597912; c=relaxed/simple;
bh=ICSGQ0Zax6RfiU1eNFI15hxlnpkjaqK5BGkuN+umx5U=;
h=Date:From:To:Cc:Subject:Message-ID:References:MIME-Version:
Content-Type:Content-Disposition:In-Reply-To; b=gRsH+xDKixXLZ0tlL7p36OM/vsDJUfB7StBh36e/qOgamJPeYOeFFT7JFOm9GeS/XuHOWOQZhmjaIihaeVSlo+HVpv9DsgkCl4Uowzw1BAoB8ngbItmQjE4hKwmG2zzRrl++KyEWFbWYEXZj7zo/sZhfWsldz1LUklwSBxOiNQw=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=inxk4Yp8; arc=none smtp.client-ip=10.30.226.201
Received: by smtp.kernel.org (Postfix) with ESMTPSA id C537CC4CEE9;
Fri, 30 May 2025 09:38:31 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org;
s=k20201202; t=1748597912;
bh=ICSGQ0Zax6RfiU1eNFI15hxlnpkjaqK5BGkuN+umx5U=;
h=Date:From:To:Cc:Subject:References:In-Reply-To:From;
b=inxk4Yp8xtNFcmPalhY09ck1qo1uYRCOcIWDRzIjZrLR7sPqhhXwCHPFDqMHLJd9g
sHZFVXK6FD1UizbfB6DLmzqePp11GrU+bAvzTM2yyDUq4QFT+Mmn1w1ZCzuMd3Z++r
2rfN38whXYJrJfbDDPu65/kN4bHjMqPADKnsMQtO+wyQwtLzyKI5dwaU3yVgl+CY6c
mwSjpaGlZA383yCa/07eaFq5rD2szRfYhwQl84F4vK2iGD8hYyW2MfktL8fRWru7Gh
0Cd2BNMrPJjHKZ+QZivVRi85ri4A4dqVmlF2ll/8UBkqvHhrhjueT0WLFbZE49Ec2S
It9sSic/aD+Tg==
Date: Fri, 30 May 2025 11:38:29 +0200
From: Uwe =?utf-8?Q?Kleine-K=C3=B6nig?= <ukleinek@xxxxxxxxxx>
To: Chris Packham <Chris.Packham@xxxxxxxxxxxxxxxxxxx>
Cc: "jdelvare@xxxxxxxx" <jdelvare@xxxxxxxx>,
"linux@xxxxxxxxxxxx" <linux@xxxxxxxxxxxx>, "robh@xxxxxxxxxx" <robh@xxxxxxxxxx>,
"krzk+dt@xxxxxxxxxx" <krzk+dt@xxxxxxxxxx>, "conor+dt@xxxxxxxxxx" <conor+dt@xxxxxxxxxx>,
"linux-hwmon@xxxxxxxxxxxxxxx" <linux-hwmon@xxxxxxxxxxxxxxx>, "devicetree@xxxxxxxxxxxxxxx" <devicetree@xxxxxxxxxxxxxxx>,
"linux-kernel@xxxxxxxxxxxxxxx" <linux-kernel@xxxxxxxxxxxxxxx>, "linux-pwm@xxxxxxxxxxxxxxx" <linux-pwm@xxxxxxxxxxxxxxx>
Subject: Re: [PATCH v7 1/3] dt-bindings: hwmon: Add adt7475 fan/pwm properties
Message-ID: <dirkbdd5oeofjhy5pk6jiaixbuhmuq7axewhrd7bdghc3dp5x6@ok2uhywwz5ls>
References: <20240722221737.3407958-1-chris.packham@xxxxxxxxxxxxxxxxxxx>
<20240722221737.3407958-2-chris.packham@xxxxxxxxxxxxxxxxxxx>
<jzxu6mcbxf5zwyirnb2jjpm2i7sln3v5mz3gyhc5xhpqexicvb@atrcjvh7wuh5>
<bc99a27e-74ec-45a0-b77c-48f993269586@xxxxxxxxxxxxxxxxxxx>
<jmxmxzzfyobuheqe75lj7qcq5rlt625wddb3rlhiernunjdodu@tgxghvfef4tl>
<4858ce06-2081-4335-af09-f118872317ea@xxxxxxxxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Type: multipart/signed; micalg=pgp-sha512;
protocol="application/pgp-signature"; boundary="iwycug4hm3mtqr5n"
Content-Disposition: inline
In-Reply-To: <4858ce06-2081-4335-af09-f118872317ea@xxxxxxxxxxxxxxxxxxx>
X-Spam-Status: No, score=-6.4 required=5.0 tests=DKIMWL_WL_HIGH,DKIM_SIGNED,
DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu


--iwycug4hm3mtqr5n
Content-Type: text/plain; protected-headers=v1; charset=iso-8859-1
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable
Subject: Re: [PATCH v7 1/3] dt-bindings: hwmon: Add adt7475 fan/pwm properties
MIME-Version: 1.0

Hello Chris,

On Wed, May 28, 2025 at 09:18:37PM +0000, Chris Packham wrote:
> On 28/05/2025 18:10, Uwe Kleine-K=F6nig wrote:
> > If I understand correctly you need the default value for duty to
> > statically setup (or only initialize?) a fan, right?
>=20
> Correct.
>=20
> > I'm not sure I like
> > extending #pwm-cells for a default duty value. Thinking about that a
> > while I'd prefer a binding that looks more like the clock configuration
> > stuff because actually having the period and flags as part of the
> > reference to the PWM to be used is also a bit strange. So I imagine
> > something like:
> >
> > mypwm: pwm {
> > compatible =3D "...."
> > #pwm-cells =3D <1>;
> > };
> >
> > fan {
> > compatible =3D "pwm-fan";
> > pwms =3D <&mypwm 1>;
> > assigned-pwms =3D <&mypwm>;
> > assigned-pwm-default-period-lengths-ns =3D <40000>;
> > assigned-pwm-default-flags =3D <PWM_POLARITY_INVERTED>;
> > };
> >
> > Then specifying a period (or later a duty cycle length) would be
> > optional and could be provided iff the device needs that for operation.
>=20
> The frequency and flags were already part of the standard #pwm-cells=20
> which I think is why I was encouraged to use them.

Yeah, that part is fine. This might not be the long-term future, but
today that's the norm.

> I was also trying to get something that would work as an ACPI overlay
> which turned out to be really hard.

I don't know enough about ACPI to be helpful with this quest.

> > My mail was just me being frustrated about another special case that I'd
> > have to handle if I go into that direction. I should have been more
> > attentive to that development before it entered the mainline.
>=20
> I'd be happy to deprecate the 4 cell thing and replace it with 3 cell +=
=20
> vendor property for the default period if that helps.

I wonder how other similar devices determine the default duty cycle.
Isn't the norm to make the fan rotate at max speed and then, when
userspace takes over, it's slowed down?

Best regards
Uwe

--iwycug4hm3mtqr5n
Content-Type: application/pgp-signature; name="signature.asc"

-----BEGIN PGP SIGNATURE-----

iQEzBAABCgAdFiEEP4GsaTp6HlmJrf7Tj4D7WH0S/k4FAmg5fJIACgkQj4D7WH0S
/k7Q/QgAhpRbbtxTmdd1TU+JKciJM1ubiQ6suwb+RqEXC/4zfLvLc7QwkWAm16v3
MMCqJxvwSwXVWPxPoaaFEU9k4S9YHi5ggLfT4/1Bde79ynsdCFHbL6zfaH3Fq3gH
m15Q2/Z9yPQ2z3tWe0b2PskubMtRGXpzWsEk3M2SwTb09J421hWW8qFxV//OqMf+
PM8qkChq3fe9ZZgkHzNepPYfmJEl6uhs1mEN7FinZi6ZHqxRSF2L92celgIcmYWK
6VqNq8381esfPA9OeA2oLFEuz2sQv5DtDE2PVsSea8iGggRFrYbGMC/oYTBLwjBp
1WTJLq7hrYJSTtyNu1HKGvIDfj+Gew==
=y7cT
-----END PGP SIGNATURE-----

--iwycug4hm3mtqr5n--

Return-Path: <linux-kernel+bounces-667911-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from am.mirrors.kernel.org (am.mirrors.kernel.org [147.75.80.249])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id 4BEF341E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:44:25 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by am.mirrors.kernel.org (Postfix) with ESMTPS id 07A381885716
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:44:18 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 59F1B2222C1;
Fri, 30 May 2025 09:40:59 +0000 (UTC)
Received: from metis.whiteo.stw.pengutronix.de (metis.whiteo.stw.pengutronix.de [185.203.201.7])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3014021B9FD
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:40:55 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=185.203.201.7
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748598058; cv=none; b=AQk6INVCoKmML7R6nazt1CrgC+B+mK/A0J5C3s9pXj89npPQ3nu+Pu/cdHMS08QPlXXY6fjpZTehFIei1LMv6VnWY5oiUo2doj7YxSSePg9/RVpVkkCjM1smaXsDoC7VMaStH3Tsi3wfAiBwICc7HHBYVZFvS9vfydxdYrHo9MU=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748598058; c=relaxed/simple;
bh=9L46h2bC/ZzRvJPhqqnbE6d9vQTHTHOjJrh/9UoHbpk=;
h=Date:From:To:Cc:Subject:Message-ID:References:MIME-Version:
Content-Type:Content-Disposition:In-Reply-To; b=SGggUStCKgnRistLmeDPkHqY1unK4mJZVdE6VmMyXKFfvOLDsjWUVzOZ10hNI38kPLwgELGCCxlLxQQ/jFsg4UXYhxfKji6BmjnVJ8z1w0xmoemOPJxNeJEqvQpzrB2q50Xwk/+vnzAz9Vkb+Tbpyc+c7Lrzj9UuoVFLQx3kTSM=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=pengutronix.de; spf=pass smtp.mailfrom=pengutronix.de; arc=none smtp.client-ip=185.203.201.7
Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=pengutronix.de
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=pengutronix.de
Received: from drehscheibe.grey.stw.pengutronix.de ([2a0a:edc0:0:c01:1d::a2])
by metis.whiteo.stw.pengutronix.de with esmtps (TLS1.3:ECDHE_RSA_AES_256_GCM_SHA384:256)
(Exim 4.92)
(envelope-from <mfe@xxxxxxxxxxxxxx>)
id 1uKwED-0000T2-LD; Fri, 30 May 2025 11:40:41 +0200
Received: from pty.whiteo.stw.pengutronix.de ([2a0a:edc0:2:b01:1d::c5])
by drehscheibe.grey.stw.pengutronix.de with esmtps (TLS1.3) tls TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
(Exim 4.96)
(envelope-from <mfe@xxxxxxxxxxxxxx>)
id 1uKwEB-000xGk-2I;
Fri, 30 May 2025 11:40:39 +0200
Received: from mfe by pty.whiteo.stw.pengutronix.de with local (Exim 4.96)
(envelope-from <mfe@xxxxxxxxxxxxxx>)
id 1uKwEB-000pOI-1s;
Fri, 30 May 2025 11:40:39 +0200
Date: Fri, 30 May 2025 11:40:39 +0200
From: Marco Felsch <m.felsch@xxxxxxxxxxxxxx>
To: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
Cc: Luis Chamberlain <mcgrof@xxxxxxxxxx>,
Russ Weight <russ.weight@xxxxxxxxx>,
"Rafael J. Wysocki" <rafael@xxxxxxxxxx>,
Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>,
Rob Herring <robh@xxxxxxxxxx>,
Krzysztof Kozlowski <krzk+dt@xxxxxxxxxx>,
Conor Dooley <conor+dt@xxxxxxxxxx>,
Dmitry Torokhov <dmitry.torokhov@xxxxxxxxx>,
Kamel Bouhara <kamel.bouhara@xxxxxxxxxxx>,
Marco Felsch <kernel@xxxxxxxxxxxxxx>,
Henrik Rydberg <rydberg@xxxxxxxxxxx>,
Danilo Krummrich <dakr@xxxxxxxxxx>, linux-kernel@xxxxxxxxxxxxxxx,
devicetree@xxxxxxxxxxxxxxx, linux-input@xxxxxxxxxxxxxxx
Subject: Re: [PATCH v2 4/4] Input: Add TouchNetix aXiom I2C Touchscreen
support
Message-ID: <20250530094039.n5236kxskha4vrhd@xxxxxxxxxxxxxx>
References: <20250529-v6-10-topic-touchscreen-axiom-v2-0-a5edb105a600@xxxxxxxxxxxxxx>
<20250529-v6-10-topic-touchscreen-axiom-v2-4-a5edb105a600@xxxxxxxxxxxxxx>
<2025052902-dizzy-baggie-15ee@gregkh>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <2025052902-dizzy-baggie-15ee@gregkh>
X-SA-Exim-Connect-IP: 2a0a:edc0:0:c01:1d::a2
X-SA-Exim-Mail-From: mfe@xxxxxxxxxxxxxx
X-SA-Exim-Scanned: No (on metis.whiteo.stw.pengutronix.de); SAEximRunCond expanded to false
X-PTX-Original-Recipient: linux-kernel@xxxxxxxxxxxxxxx
X-Spam-Status: No, score=-3.3 required=5.0 tests=HEADER_FROM_DIFFERENT_DOMAINS,
MAILING_LIST_MULTI,RCVD_IN_DNSWL_MED,
RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,RCVD_IN_VALIDITY_RPBL_BLOCKED,
SPF_HELO_NONE,SPF_PASS autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

On 25-05-29, Greg Kroah-Hartman wrote:
> On Thu, May 29, 2025 at 12:08:45AM +0200, Marco Felsch wrote:
> > + if (!entry->info)
> > + WARN(1, "Unsupported usage u%x used, driver bug!", i);
>
> You just crashed the system and caused all data to be lost if this is
> ever hit :(
>
> As you did detect this, please handle the error and recover. It's a bit
> rude for a single i2c driver to take down a whole system, right?

Good point.

> > +#define AXIOM_SIMPLE_FW_DEVICE_ATTR(attr) \
> > + static ssize_t \
> > + fw_ ## attr ## _show(struct device *dev, \
> > + struct device_attribute *_attr, char *buf) \
> > + { \
> > + struct i2c_client *i2c = to_i2c_client(dev); \
> > + struct axiom_data *ts = i2c_get_clientdata(i2c); \
> > + \
> > + return sprintf(buf, "%u\n", ts->fw_##attr); \
>
> sysfs_emit() please for all sysfs show functions.

Sure.

> > + axiom_u42_get_touchslots(ts);
> > + if (!ts->num_slots && update_in_process) {
> > + input_free_device(input);
> > + /*
> > + * Skip input device registration but don't throw an error to
> > + * not abort the update since some FW updates require a
> > + * following CFG update to re-initialize the touchslot handling.
> > + */
> > + if (update_in_process) {
> > + dev_info(dev, "No touchslots found after FW or CFG update, skip registering input device\n");
>
> Why is this info? What can a user do with this? Shouldn't this be a
> dev_warn() call at the least?

Please see below.

> > + return 0;
>
> You return success, but the device is NOT set up properly, how is that
> going to work?

As explained in the comment. If a firmware update changes the register
layout you may end up in such a situation. A subsequent CFG update is
required to provide a correct FW+CFG match.

We don't throw an error because the FW update itself was successful but
a CFG update is required, therefore I went with the dev_info() but I can
change this to dev_warn().

We don't know which combination requires a subsequent CFG update, e.g.
there is a FW version 4.8.9 which comes in a 2D and a 3D flavour. Tests
showed that updating from a 4.8.9 2D FW to a 4.8.9 3D FW doesn't require
updating the CFG. Also, minor FW updates may not require updating the
CFG.

Regards,
Marco

Return-Path: <linux-kernel+bounces-667912-lkml=lkml.rescloud.iu.edu@xxxxxxxxxxxxxxx>
X-Original-To: lkml@xxxxxxxxxxxxxxxxxxxx
Delivered-To: lkml@xxxxxxxxxxxxxxxxxxxx
Received: from ny.mirrors.kernel.org (ny.mirrors.kernel.org [147.75.199.223])
by lkml.rescloud.iu.edu (Postfix) with ESMTPS id EE7BB41E003FA
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 05:44:37 -0400 (EDT)
Received: from smtp.subspace.kernel.org (relay.kernel.org [52.25.139.140])
(using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits))
(No client certificate requested)
by ny.mirrors.kernel.org (Postfix) with ESMTPS id 1B16B4E4C1E
for <lkml@xxxxxxxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:44:21 +0000 (UTC)
Received: from localhost.localdomain (localhost.localdomain [127.0.0.1])
by smtp.subspace.kernel.org (Postfix) with ESMTP id 7E65E22489A;
Fri, 30 May 2025 09:41:41 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
dkim=pass (1024-bit key) header.d=suse.de header.i=@suse.de header.b="QQ35yy3D";
dkim=permerror (0-bit key) header.d=suse.de header.i=@suse.de header.b="T+/sF2sH";
dkim=pass (1024-bit key) header.d=suse.de header.i=@suse.de header.b="QQ35yy3D";
dkim=permerror (0-bit key) header.d=suse.de header.i=@suse.de header.b="T+/sF2sH"
Received: from smtp-out1.suse.de (smtp-out1.suse.de [195.135.223.130])
(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
(No client certificate requested)
by smtp.subspace.kernel.org (Postfix) with ESMTPS id C925221FF25
for <linux-kernel@xxxxxxxxxxxxxxx>; Fri, 30 May 2025 09:41:35 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=195.135.223.130
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
t=1748598099; cv=none; b=evGz3SIjdJ2ZALedegRV9fzhwHzVxFkj3Szwj8TZXc0NBUGJF/uYWfyH1/oHRj5Z6mH60+rlvgOfE+ThL0aPMeAYDsRtRTLAP2pt6JxXk0oxtKNF5LRUhUIzn34MO8RO2VuywHAGQrrBmJFqJma5gZ4LySdOFCi3wV7ZUD9aCh8=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
s=arc-20240116; t=1748598099; c=relaxed/simple;
bh=p8bPfn+z3z+2ASSm8ilonYZ6I3XKs306Xt2PCMPzeIA=;
h=Date:From:To:Cc:Subject:Message-ID:References:MIME-Version:
Content-Type:Content-Disposition:In-Reply-To; b=otSjFeZmJbo2XPN/hdSAtTRJBs/7PT5rcEgVLcpBP1b/1weFj0BNHOv3f3zLuZRigTz5bLkLOwmpsQmd+riu/RAeEEYfpaYMaiJmrl2HOM0FE3anP9ykJgNxaITkWVV0rYreW4C/3ebPUXBdI6UmIfQadCrhIOY954rqQSxkOUw=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=suse.de; spf=pass smtp.mailfrom=suse.de; dkim=pass (1024-bit key) header.d=suse.de header.i=@suse.de header.b=QQ35yy3D; dkim=permerror (0-bit key) header.d=suse.de header.i=@suse.de header.b=T+/sF2sH; dkim=pass (1024-bit key) header.d=suse.de header.i=@suse.de header.b=QQ35yy3D; dkim=permerror (0-bit key) header.d=suse.de header.i=@suse.de header.b=T+/sF2sH; arc=none smtp.client-ip=195.135.223.130
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=suse.de
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=suse.de
Received: from imap1.dmz-prg2.suse.org (unknown [10.150.64.97])
(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256)
(No client certificate requested)
by smtp-out1.suse.de (Postfix) with ESMTPS id D6CEB2122E;
Fri, 30 May 2025 09:41:33 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=suse.de; s=susede2_rsa;
t=1748598093; h=from:from:reply-to:date:date:message-id:message-id:to:to:cc:cc:
mime-version:mime-version:content-type:content-type:
content-transfer-encoding:content-transfer-encoding:
in-reply-to:in-reply-to:references:references;
bh=pS0O2ADLUauezUFF1y8I2YgDVhIi5NtA88euL27PlvI=;
b=QQ35yy3D14JKmdJBf+P1w1ZmF+yoUnDVNLavwiNgZ6UnZ1H1mebQN4+GsGl6Plm6Rmg00J
nrktGD1ZZ1tnqPd1ZbAA+jvBEOgQ1K4oZdA4YZb5A0eiumZbvrPOQiMNemElC/YUZTjFth
JOeX0aKwBjJvKtQInvPwq0/K+cGryHE=
DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=suse.de;
s=susede2_ed25519; t=1748598093;
h=from:from:reply-to:date:date:message-id:message-id:to:to:cc:cc:
mime-version:mime-version:content-type:content-type:
content-transfer-encoding:content-transfer-encoding:
in-reply-to:in-reply-to:references:references;
bh=pS0O2ADLUauezUFF1y8I2YgDVhIi5NtA88euL27PlvI=;
b=T+/sF2sHKR5z37gD2Ul2PFeh99HsEcAVAVXJz3KUDvrxttRmnv6xGf7urNAOXw40iDruh3
/CKHRkKall2Qw8Dg==
Authentication-Results: smtp-out1.suse.de;
none
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=suse.de; s=susede2_rsa;
t=1748598093; h=from:from:reply-to:date:date:message-id:message-id:to:to:cc:cc:
mime-version:mime-version:content-type:content-type:
content-transfer-encoding:content-transfer-encoding:
in-reply-to:in-reply-to:references:references;
bh=pS0O2ADLUauezUFF1y8I2YgDVhIi5NtA88euL27PlvI=;
b=QQ35yy3D14JKmdJBf+P1w1ZmF+yoUnDVNLavwiNgZ6UnZ1H1mebQN4+GsGl6Plm6Rmg00J
nrktGD1ZZ1tnqPd1ZbAA+jvBEOgQ1K4oZdA4YZb5A0eiumZbvrPOQiMNemElC/YUZTjFth
JOeX0aKwBjJvKtQInvPwq0/K+cGryHE=
DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=suse.de;
s=susede2_ed25519; t=1748598093;
h=from:from:reply-to:date:date:message-id:message-id:to:to:cc:cc:
mime-version:mime-version:content-type:content-type:
content-transfer-encoding:content-transfer-encoding:
in-reply-to:in-reply-to:references:references;
bh=pS0O2ADLUauezUFF1y8I2YgDVhIi5NtA88euL27PlvI=;
b=T+/sF2sHKR5z37gD2Ul2PFeh99HsEcAVAVXJz3KUDvrxttRmnv6xGf7urNAOXw40iDruh3
/CKHRkKall2Qw8Dg==
Received: from imap1.dmz-prg2.suse.org (localhost [127.0.0.1])
(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256)
(No client certificate requested)
by imap1.dmz-prg2.suse.org (Postfix) with ESMTPS id F1781132D8;
Fri, 30 May 2025 09:41:30 +0000 (UTC)
Received: from dovecot-director2.suse.de ([2a07:de40:b281:106:10:150:64:167])
by imap1.dmz-prg2.suse.org with ESMTPSA
id vTMuN0p9OWgidQAAD6G6ig
(envelope-from <pfalcato@xxxxxxx>); Fri, 30 May 2025 09:41:30 +0000
Date: Fri, 30 May 2025 10:41:25 +0100
From: Pedro Falcato <pfalcato@xxxxxxx>
To: Bo Li <libo.gcs85@xxxxxxxxxxxxx>
Cc: tglx@xxxxxxxxxxxxx, mingo@xxxxxxxxxx, bp@xxxxxxxxx,
dave.hansen@xxxxxxxxxxxxxxx, x86@xxxxxxxxxx, luto@xxxxxxxxxx, kees@xxxxxxxxxx,
akpm@xxxxxxxxxxxxxxxxxxxx, david@xxxxxxxxxx, juri.lelli@xxxxxxxxxx,
vincent.guittot@xxxxxxxxxx, peterz@xxxxxxxxxxxxx, dietmar.eggemann@xxxxxxx, hpa@xxxxxxxxx,
acme@xxxxxxxxxx, namhyung@xxxxxxxxxx, mark.rutland@xxxxxxx,
alexander.shishkin@xxxxxxxxxxxxxxx, jolsa@xxxxxxxxxx, irogers@xxxxxxxxxx, adrian.hunter@xxxxxxxxx,
kan.liang@xxxxxxxxxxxxxxx, viro@xxxxxxxxxxxxxxxxxx, brauner@xxxxxxxxxx, jack@xxxxxxx,
lorenzo.stoakes@xxxxxxxxxx, Liam.Howlett@xxxxxxxxxx, vbabka@xxxxxxx, rppt@xxxxxxxxxx,
surenb@xxxxxxxxxx, mhocko@xxxxxxxx, rostedt@xxxxxxxxxxx, bsegall@xxxxxxxxxx,
mgorman@xxxxxxx, vschneid@xxxxxxxxxx, jannh@xxxxxxxxxx, riel@xxxxxxxxxxx,
harry.yoo@xxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx, linux-perf-users@xxxxxxxxxxxxxxx,
linux-fsdevel@xxxxxxxxxxxxxxx, linux-mm@xxxxxxxxx, duanxiongchun@xxxxxxxxxxxxx,
yinhongbo@xxxxxxxxxxxxx, dengliang.1214@xxxxxxxxxxxxx, xieyongji@xxxxxxxxxxxxx,
chaiwen.cc@xxxxxxxxxxxxx, songmuchun@xxxxxxxxxxxxx, yuanzhu@xxxxxxxxxxxxx,
chengguozhu@xxxxxxxxxxxxx, sunjiadong.lff@xxxxxxxxxxxxx
Subject: Re: [RFC v2 00/35] optimize cost of inter-process communication
Message-ID: <4fh5aagswxyecc5ffqngpyvd2ojs5rx3xihi3eat2foyh232da@5vz26lupjwwr>
References: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
Precedence: bulk
X-Mailing-List: linux-kernel@xxxxxxxxxxxxxxx
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@xxxxxxxxxxxxxxx>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@xxxxxxxxxxxxxxx>
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Disposition: inline
Content-Transfer-Encoding: 8bit
In-Reply-To: <cover.1748594840.git.libo.gcs85@xxxxxxxxxxxxx>
X-Spamd-Result: default: False [-3.80 / 50.00];
BAYES_HAM(-3.00)[100.00%];
NEURAL_HAM_LONG(-1.00)[-1.000];
MID_RHS_NOT_FQDN(0.50)[];
NEURAL_HAM_SHORT(-0.20)[-1.000];
MIME_GOOD(-0.10)[text/plain];
ARC_NA(0.00)[];
RCVD_VIA_SMTP_AUTH(0.00)[];
MISSING_XM_UA(0.00)[];
TO_DN_SOME(0.00)[];
MIME_TRACE(0.00)[0:+];
RCVD_TLS_ALL(0.00)[];
DKIM_SIGNED(0.00)[suse.de:s=susede2_rsa,suse.de:s=susede2_ed25519];
FUZZY_BLOCKED(0.00)[rspamd.com];
FROM_EQ_ENVFROM(0.00)[];
FROM_HAS_DN(0.00)[];
R_RATELIMIT(0.00)[to_ip_from(RL3mhzhn45zpqpmgqn4z7synfm)];
RCVD_COUNT_TWO(0.00)[2];
RCPT_COUNT_GT_50(0.00)[52];
TO_MATCH_ENVRCPT_SOME(0.00)[];
DBL_BLOCKED_OPENRESOLVER(0.00)[imap1.dmz-prg2.suse.org:helo]
X-Spam-Level:
X-Spam-Score: -3.80
X-Spam-Status: No, score=-3.4 required=5.0 tests=DKIM_SIGNED,DKIM_VALID,
DKIM_VALID_AU,HEADER_FROM_DIFFERENT_DOMAINS,MAILING_LIST_MULTI,
RCVD_IN_DNSWL_MED,RCVD_IN_VALIDITY_CERTIFIED_BLOCKED,
RCVD_IN_VALIDITY_RPBL_BLOCKED,SPF_HELO_NONE,SPF_PASS autolearn=ham
autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on lkml.rescloud.iu.edu

On Fri, May 30, 2025 at 05:27:28PM +0800, Bo Li wrote:
> Changelog:
>
> v2:
> - Port the RPAL functions to the latest v6.15 kernel.
> - Add a supplementary introduction to the application scenarios and
> security considerations of RPAL.
>
> link to v1:
> https://lore.kernel.org/lkml/CAP2HCOmAkRVTci0ObtyW=3v6GFOrt9zCn2NwLUbZ+Di49xkBiw@xxxxxxxxxxxxxx/
>
> --------------------------------------------------------------------------
>
> # Introduction
>
> We mainly apply RPAL to the service mesh architecture widely adopted in
> modern cloud-native data centers. Before the rise of the service mesh
> architecture, network functions were usually integrated into monolithic
> applications as libraries, and the main business programs invoked them
> through function calls. However, to facilitate the independent development
> and operation and maintenance of the main business programs and network
> functions, the service mesh removed the network functions from the main
> business programs and made them independent processes (called sidecars).
> Inter-process communication (IPC) is used for interaction between the main
> business program and the sidecar, and the introduced inter-process
> communication has led to a sharp increase in resource consumption in
> cloud-native data centers, and may even occupy more than 10% of the CPU of
> the entire microservice cluster.
>
> To achieve the efficient function call mechanism of the monolithic
> architecture under the service mesh architecture, we introduced the RPAL
> (Running Process As Library) architecture, which implements the sharing of
> the virtual address space of processes and the switching threads in user
> mode. Through the analysis of the service mesh architecture, we found that
> the process memory isolation between the main business program and the
> sidecar is not particularly important because they are split from one
> application and were an integral part of the original monolithic
> application. It is more important for the two processes to be independent
> of each other because they need to be independently developed and
> maintained to ensure the architectural advantages of the service mesh.
> Therefore, RPAL breaks the isolation between processes while preserving the
> independence between them. We think that RPAL can also be applied to other
> scenarios featuring sidecar-like architectures, such as distributed file
> storage systems in LLM infra.
>
> In RPAL architecture, multiple processes share a virtual address space, so
> this architecture can be regarded as an advanced version of the Linux
> shared memory mechanism:
>
> 1. Traditional shared memory requires two processes to negotiate to ensure
> the mapping of the same piece of memory. In RPAL architecture, two RPAL
> processes still need to reach a consensus before they can successfully
> invoke the relevant system calls of RPAL to share the virtual address
> space.
> 2. Traditional shared memory only shares part of the data. However, in RPAL
> architecture, processes that have established an RPAL communication
> relationship share a virtual address space, and all user memory (such as
> data segments and code segments) of each RPAL process is shared among these
> processes. However, a process cannot access the memory of other processes
> at any time. We use the MPK mechanism to ensure that the memory of other
> processes can only be accessed when special RPAL functions are called.
> Otherwise, a page fault will be triggered.
> 3. In RPAL architecture, to ensure the consistency of the execution context
> of the shared code (such as the stack and thread local storage), we further
> implement the thread context switching in user mode based on the ability to
> share the virtual address space of different processes, enabling the
> threads of different processes to directly perform fast switching in user
> mode without falling into kernel mode for slow switching.
>
> # Background
>
> In traditional inter-process communication (IPC) scenarios, Unix domain
> sockets are commonly used in conjunction with the epoll() family for event
> multiplexing. IPC operations involve system calls on both the data and
> control planes, thereby imposing a non-trivial overhead on the interacting
> processes. Even when shared memory is employed to optimize the data plane,
> two data copies still remain. Specifically, data is initially copied from
> a process's private memory space into the shared memory area, and then it
> is copied from the shared memory into the private memory of another
> process.
>
> This poses a question: Is it possible to reduce the overhead of IPC with
> only minimal modifications at the application level? To address this, we
> observed that the functionality of IPC, which encompasses data transfer
> and invocation of the target thread, is similar to a function call, where
> arguments are passed and the callee function is invoked to process them.
> Inspired by this analogy, we introduce RPAL (Run Process As Library), a
> framework designed to enable one process to invoke another as if making
> a local function call, all without going through the kernel.
>
> # Design
>
> First, let's formalize RPAL's core objectives:
>
> 1. Data-plane efficiency: Reduce the number of data copies from two (in the
> shared memory solution) to one.
> 2. Control-plane optimization: Eliminate the overhead of system calls and
> kernel's thread switches.
> 3. Application compatibility: Minimize the modifications to existing
> applications that utilize Unix domain sockets and the epoll() family.
>
> To attain the first objective, processes that use RPAL share the same
> virtual address space. So one process can access another's data directly
> via a data pointer. This means data can be transferred from one process to
> another with just one copy operation.
>
> To meet the second goal, RPAL relies on the shared address space to do
> lightweight context switching in user space, which we call an "RPAL call".
> This allows one process to execute another process's code just like a
> local function call.
>
> To achieve the third target, RPAL stays compatible with the epoll family
> of functions, like epoll_create(), epoll_wait(), and epoll_ctl(). If an
> application uses epoll for IPC, developers can switch to RPAL with just a
> few small changes. For instance, you can just replace epoll_wait() with
> rpal_epoll_wait(). The basic epoll procedure, where a process waits for
> another to write to a monitored descriptor using an epoll file descriptor,
> still works fine with RPAL.
>
> ## Address space sharing
>
> For address space sharing, RPAL partitions the entire userspace virtual
> address space and allocates non-overlapping memory ranges to each process.
> On x86_64 architectures, RPAL uses a memory range size covered by a
> single PUD (Page Upper Directory) entry, which is 512GB. This restricts
> each process's virtual address space to 512GB on x86_64, sufficient for
> most applications in our scenario. The rationale is straightforward:
> address space sharing can be simply achieved by copying the PUD from one
> process's page table to another's. So one process can directly use the
> data pointer to access another's memory.
>
>
> |------------| <- 0
> |------------| <- 512 GB
> | Process A |
> |------------| <- 2*512 GB
> |------------| <- n*512 GB
> | Process B |
> |------------| <- (n+1)*512 GB
> |------------| <- STACK_TOP
> | Kernel |
> |------------|
>
> ## RPAL call
>
> We refer to the lightweight userspace context switching mechanism as RPAL
> call. It enables the caller (or sender) thread of one process to directly
> switch to the callee (or receiver) thread of another process.
>
> When Process A's caller thread initiates an RPAL call to Process B's
> callee thread, the CPU saves the caller's context and loads the callee's
> context. This enables direct userspace control flow transfer from the
> caller to the callee. After the callee finishes data processing, the CPU
> saves Process B's callee context and switches back to Process A's caller
> context, completing a full IPC cycle.
>
>
> |------------| |---------------------|
> | Process A | | Process B |
> | |-------| | | |-------| |
> | | caller| --- RPAL call --> | | callee| handle |
> | | thread| <------------------ | thread| -> event |
> | |-------| | | |-------| |
> |------------| |---------------------|
>
> # Security and compatibility with kernel subsystems
>
> ## Memory protection between processes
>
> Since processes using RPAL share the address space, unintended
> cross-process memory access may occur and corrupt the data of another
> process. To mitigate this, we leverage Memory Protection Keys (MPK) on x86
> architectures.
>
> MPK assigns 4 bits in each page table entry to a "protection key", which
> is paired with a userspace register (PKRU). The PKRU register defines
> access permissions for memory regions protected by specific keys (for
> detailed implementation, refer to the kernel documentation "Memory
> Protection Keys"). With MPK, even though the address space is shared
> among processes, cross-process access is restricted: a process can only
> access the memory protected by a key if its PKRU register is configured
> with the corresponding permission. This ensures that processes cannot
> access each other's memory unless an explicit PKRU configuration is set.
>
> ## Page fault handling and TLB flushing
>
> Due to the shared address space architecture, both page fault handling and
> TLB flushing require careful consideration. For instance, when Process A
> accesses Process B's memory, a page fault may occur in Process A's
> context, but the faulting address belongs to Process B. In this case, we
> must pass Process B's mm_struct to the page fault handler.
>
> TLB flushing is more complex. When a thread flushes the TLB, since the
> address space is shared, not only other threads in the current process but
> also other processes that share the address space may access the
> corresponding memory (related to the TLB flush). Therefore, the cpuset used
> for TLB flushing should be the union of the mm_cpumasks of all processes
> that share the address space.
>
> ## Lazy switch of kernel context
>
> In RPAL, a mismatch may arise between the user context and the kernel
> context. The RPAL call is designed solely to switch the user context,
> leaving the kernel context unchanged. For instance, when a RPAL call takes
> place, transitioning from caller thread to callee thread, and subsequently
> a system call is initiated within callee thread, the kernel will
> incorrectly utilize the caller's kernel context (such as the kernel stack)
> to process the system call.
>
> To resolve context mismatch issues, a kernel context switch is triggered at
> the kernel entry point when the callee initiates a syscall or an
> exception/interrupt occurs. This mechanism ensures context consistency
> before processing system calls, interrupts, or exceptions. We refer to this
> kernel context switch as a "lazy switch" because it defers the switching
> operation from the traditional thread switch point to the next kernel entry
> point.
>
> Lazy switch should be minimized as much as possible, as it significantly
> degrades performance. We currently utilize RPAL in an RPC framework, in
> which the RPC sender thread relies on the RPAL call to invoke the RPC
> receiver thread entirely in user space. In most cases, the receiver
> thread is free of system calls and the code execution time is relatively
> short. This characteristic effectively reduces the probability of a lazy
> switch occurring.
>
> ## Time slice correction
>
> After an RPAL call, the callee's user mode code executes. However, the
> kernel incorrectly attributes this CPU time to the caller due to the
> unchanged kernel context.
>
> To resolve this, we use the Time Stamp Counter (TSC) register to measure
> CPU time consumed by the callee thread in user space. The kernel then uses
> this user-reported timing data to adjust the CPU accounting for both the
> caller and callee thread, similar to how CPU steal time is implemented.
>
> ## Process recovery
>
> Since processes can access each other's memory, there is a risk that the
> target process's memory may become invalid at the access time (e.g., if
> the target process has exited unexpectedly). The kernel must handle such
> cases; otherwise, the accessing process could be terminated due to
> failures originating from another process.
>
> To address this issue, each thread of the process should pre-establish a
> recovery point when accessing the memory of other processes. When such an
> invalid access occurs, the thread traps into the kernel. Inside the page
> fault handler, the kernel restores the user context of the thread to the
> recovery point. This mechanism ensures that processes maintain mutual
> independence, preventing cascading failures caused by cross-process memory
> issues.
>
> # Performance
>
> To quantify the performance improvements driven by RPAL, we measured
> latency both before and after its deployment. Experiments were conducted on
> a server equipped with two Intel(R) Xeon(R) Platinum 8336C CPUs (2.30 GHz)
> and 1 TB of memory. Latency was defined as the duration from when the
> client thread initiates a message to when the server thread is invoked and
> receives it.
>
> During testing, the client transmitted 1 million 32-byte messages, and we
> computed the per-message average latency. The results are as follows:
>
> *****************
> Without RPAL: Message length: 32 bytes, Total TSC cycles: 19616222534,
> Message count: 1000000, Average latency: 19616 cycles
> With RPAL: Message length: 32 bytes, Total TSC cycles: 1703459326,
> Message count: 1000000, Average latency: 1703 cycles
> *****************
>
> These results confirm that RPAL delivers substantial latency improvements
> over the current epoll implementation—achieving a 17,913-cycle reduction
> (an ~91.3% improvement) for 32-byte messages.
>
> We have applied RPAL to an RPC framework that is widely used in our data
> center. With RPAL, we have successfully achieved up to 15.5% reduction in
> the CPU utilization of processes in real-world microservice scenario. The
> gains primarily stem from minimizing control plane overhead through the
> utilization of userspace context switches. Additionally, by leveraging
> address space sharing, the number of memory copies is significantly
> reduced.
>
> # Future Work
>
> Currently, RPAL requires the MPK (Memory Protection Key) hardware feature,
> which is supported by a range of Intel CPUs. For AMD architectures, MPK is
> supported only on the latest processors, specifically, 3rd Generation AMD
> EPYC™ Processors and subsequent generations. Patch sets that extend RPAL
> support to systems lacking MPK hardware will be provided later.
>
> Accompanying test programs are also provided in the samples/rpal/
> directory. And the user-mode RPAL library, which realizes user-space RPAL
> call, is in the samples/rpal/librpal directory.
>
> We hope to get some community discussions and feedback on RPAL's
> optimization approaches and architecture.
>
> Look forward to your comments.

The first time you posted, you got two NACKs (from Dave Hansen and Lorenzo).
You didn't reply and now you post this flood of patches? Please don't?