Re: Linux 6.18.37

From: Greg Kroah-Hartman

Date: Sat Jun 27 2026 - 06:23:12 EST


diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 0b86a8022fa1..8256e857e2d7 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -553,7 +553,7 @@ otherwise.
kernel flags associated with the particular virtual memory area in two letter
encoded manner. The codes are the following:

- == =======================================
+ == =============================================================
rd readable
wr writeable
ex executable
@@ -591,7 +591,8 @@ encoded manner. The codes are the following:
sl sealed
lf lock on fault pages
dp always lazily freeable mapping
- == =======================================
+ gu maybe contains guard regions (if not set, definitely doesn't)
+ == =============================================================

Note that there is no guarantee that every flag and associated mnemonic will
be present in all further kernel releases. Things get changed, the flags may
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 7c527a01d1cf..d896b9991a4f 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -229,7 +229,6 @@ Code Seq# Include File Comments
<mailto:gregkh@xxxxxxxxxxxxxxxxxxx>
'a' all linux/atm*.h, linux/sonet.h ATM on linux
<http://lrcwww.epfl.ch/>
-'a' 00-0F drivers/crypto/qat/qat_common/adf_cfg_common.h conflict! qat driver
'b' 00-FF conflict! bit3 vme host bridge
<mailto:natalia@xxxxxxxxxxxxxxxxx>
'b' 00-0F linux/dma-buf.h conflict!
diff --git a/Makefile b/Makefile
index 7f5abef39fd3..9c16001ccdcd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
VERSION = 6
PATCHLEVEL = 18
-SUBLEVEL = 36
+SUBLEVEL = 37
EXTRAVERSION =
NAME = Baby Opossum Posse

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 50f60da6cf27..16704c2a730c 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -5,6 +5,7 @@

#define pr_fmt(fmt) "ACPI: " fmt

+#include <linux/async.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
@@ -2360,46 +2361,34 @@ static int acpi_dev_get_next_consumer_dev_cb(struct acpi_dep_data *dep, void *da
return 0;
}

-struct acpi_scan_clear_dep_work {
- struct work_struct work;
- struct acpi_device *adev;
-};
-
-static void acpi_scan_clear_dep_fn(struct work_struct *work)
+static void acpi_scan_clear_dep_fn(void *dev, async_cookie_t cookie)
{
- struct acpi_scan_clear_dep_work *cdw;
-
- cdw = container_of(work, struct acpi_scan_clear_dep_work, work);
+ struct acpi_device *adev = to_acpi_device(dev);

acpi_scan_lock_acquire();
- acpi_bus_attach(cdw->adev, (void *)true);
+ acpi_bus_attach(adev, (void *)true);
acpi_scan_lock_release();

- acpi_dev_put(cdw->adev);
- kfree(cdw);
+ acpi_dev_put(adev);
}

static bool acpi_scan_clear_dep_queue(struct acpi_device *adev)
{
- struct acpi_scan_clear_dep_work *cdw;
-
if (adev->dep_unmet)
return false;

- cdw = kmalloc(sizeof(*cdw), GFP_KERNEL);
- if (!cdw)
- return false;
-
- cdw->adev = adev;
- INIT_WORK(&cdw->work, acpi_scan_clear_dep_fn);
/*
- * Since the work function may block on the lock until the entire
- * initial enumeration of devices is complete, put it into the unbound
- * workqueue.
+ * Async schedule the deferred acpi_scan_clear_dep_fn() since:
+ * - acpi_bus_attach() needs to hold acpi_scan_lock which cannot
+ * be acquired under acpi_dep_list_lock (held here)
+ * - the deferred work at boot stage is ensured to be finished
+ * before userspace init task by the async_synchronize_full()
+ * barrier
+ *
+ * Use _nocall variant since it'll return on failure instead of
+ * run the function synchronously.
*/
- queue_work(system_unbound_wq, &cdw->work);
-
- return true;
+ return async_schedule_dev_nocall(acpi_scan_clear_dep_fn, &adev->dev);
}

static void acpi_scan_delete_dep_data(struct acpi_dep_data *dep)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 0ecb43556736..fdbec49f5f5b 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -828,7 +828,6 @@ static int add_memory_block(unsigned long block_id, int nid, unsigned long state
mem->start_section_nr = block_id * sections_per_block;
mem->state = state;
mem->nid = nid;
- mem->altmap = altmap;
INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
@@ -846,6 +845,8 @@ static int add_memory_block(unsigned long block_id, int nid, unsigned long state
if (ret)
return ret;

+ mem->altmap = altmap;
+
if (group) {
mem->group = group;
list_add(&mem->group_next, &group->memory_blocks);
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 2505df1f4e69..6741270e0a98 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -546,7 +546,7 @@ static int agp_amd64_probe(struct pci_dev *pdev,
/* Fill in the mode register */
pci_read_config_dword(pdev, bridge->capndx+PCI_AGP_STATUS, &bridge->mode);

- if (cache_nbs(pdev, cap_ptr) == -1) {
+ if (cache_nbs(pdev, cap_ptr) < 0) {
agp_put_bridge(bridge);
return -ENODEV;
}
diff --git a/drivers/crypto/intel/qat/qat_common/adf_cfg.c b/drivers/crypto/intel/qat/qat_common/adf_cfg.c
index b0fc453fa3fb..97273da40b43 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_cfg.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_cfg.c
@@ -103,16 +103,6 @@ static void adf_cfg_section_del_all(struct list_head *head);
static void adf_cfg_section_del_all_except(struct list_head *head,
const char *section_name);

-void adf_cfg_del_all(struct adf_accel_dev *accel_dev)
-{
- struct adf_cfg_device_data *dev_cfg_data = accel_dev->cfg;
-
- down_write(&dev_cfg_data->lock);
- adf_cfg_section_del_all(&dev_cfg_data->sec_list);
- up_write(&dev_cfg_data->lock);
- clear_bit(ADF_STATUS_CONFIGURED, &accel_dev->status);
-}
-
void adf_cfg_del_all_except(struct adf_accel_dev *accel_dev,
const char *section_name)
{
diff --git a/drivers/crypto/intel/qat/qat_common/adf_cfg.h b/drivers/crypto/intel/qat/qat_common/adf_cfg.h
index 2afa6f0d15c5..108032b39242 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_cfg.h
+++ b/drivers/crypto/intel/qat/qat_common/adf_cfg.h
@@ -34,7 +34,6 @@ void adf_cfg_dev_remove(struct adf_accel_dev *accel_dev);
void adf_cfg_dev_dbgfs_add(struct adf_accel_dev *accel_dev);
void adf_cfg_dev_dbgfs_rm(struct adf_accel_dev *accel_dev);
int adf_cfg_section_add(struct adf_accel_dev *accel_dev, const char *name);
-void adf_cfg_del_all(struct adf_accel_dev *accel_dev);
void adf_cfg_del_all_except(struct adf_accel_dev *accel_dev,
const char *section_name);
int adf_cfg_add_key_value_param(struct adf_accel_dev *accel_dev,
diff --git a/drivers/crypto/intel/qat/qat_common/adf_cfg_common.h b/drivers/crypto/intel/qat/qat_common/adf_cfg_common.h
index 81e9e9d7eccd..d63f4dcccbb5 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_cfg_common.h
+++ b/drivers/crypto/intel/qat/qat_common/adf_cfg_common.h
@@ -4,18 +4,11 @@
#define ADF_CFG_COMMON_H_

#include <linux/types.h>
-#include <linux/ioctl.h>

#define ADF_CFG_MAX_STR_LEN 64
#define ADF_CFG_MAX_KEY_LEN_IN_BYTES ADF_CFG_MAX_STR_LEN
#define ADF_CFG_MAX_VAL_LEN_IN_BYTES ADF_CFG_MAX_STR_LEN
#define ADF_CFG_MAX_SECTION_LEN_IN_BYTES ADF_CFG_MAX_STR_LEN
-#define ADF_CFG_BASE_DEC 10
-#define ADF_CFG_BASE_HEX 16
-#define ADF_CFG_ALL_DEVICES 0xFE
-#define ADF_CFG_NO_DEVICE 0xFF
-#define ADF_CFG_AFFINITY_WHATEVER 0xFF
-#define MAX_DEVICE_NAME_SIZE 32
#define ADF_MAX_DEVICES (32 * 32)
#define ADF_DEVS_ARRAY_SIZE BITS_TO_LONGS(ADF_MAX_DEVICES)

@@ -51,29 +44,4 @@ enum adf_device_type {
DEV_420XX,
DEV_6XXX,
};
-
-struct adf_dev_status_info {
- enum adf_device_type type;
- __u32 accel_id;
- __u32 instance_id;
- __u8 num_ae;
- __u8 num_accel;
- __u8 num_logical_accel;
- __u8 banks_per_accel;
- __u8 state;
- __u8 bus;
- __u8 dev;
- __u8 fun;
- char name[MAX_DEVICE_NAME_SIZE];
-};
-
-#define ADF_CTL_IOC_MAGIC 'a'
-#define IOCTL_CONFIG_SYS_RESOURCE_PARAMETERS _IOW(ADF_CTL_IOC_MAGIC, 0, \
- struct adf_user_cfg_ctl_data)
-#define IOCTL_STOP_ACCEL_DEV _IOW(ADF_CTL_IOC_MAGIC, 1, \
- struct adf_user_cfg_ctl_data)
-#define IOCTL_START_ACCEL_DEV _IOW(ADF_CTL_IOC_MAGIC, 2, \
- struct adf_user_cfg_ctl_data)
-#define IOCTL_STATUS_ACCEL_DEV _IOW(ADF_CTL_IOC_MAGIC, 3, __u32)
-#define IOCTL_GET_NUM_DEVICES _IOW(ADF_CTL_IOC_MAGIC, 4, __s32)
#endif
diff --git a/drivers/crypto/intel/qat/qat_common/adf_cfg_user.h b/drivers/crypto/intel/qat/qat_common/adf_cfg_user.h
deleted file mode 100644
index 421f4fb8b4dd..000000000000
--- a/drivers/crypto/intel/qat/qat_common/adf_cfg_user.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only) */
-/* Copyright(c) 2014 - 2020 Intel Corporation */
-#ifndef ADF_CFG_USER_H_
-#define ADF_CFG_USER_H_
-
-#include "adf_cfg_common.h"
-#include "adf_cfg_strings.h"
-
-struct adf_user_cfg_key_val {
- char key[ADF_CFG_MAX_KEY_LEN_IN_BYTES];
- char val[ADF_CFG_MAX_VAL_LEN_IN_BYTES];
- union {
- struct adf_user_cfg_key_val *next;
- __u64 padding3;
- };
- enum adf_cfg_val_type type;
-} __packed;
-
-struct adf_user_cfg_section {
- char name[ADF_CFG_MAX_SECTION_LEN_IN_BYTES];
- union {
- struct adf_user_cfg_key_val *params;
- __u64 padding1;
- };
- union {
- struct adf_user_cfg_section *next;
- __u64 padding3;
- };
-} __packed;
-
-struct adf_user_cfg_ctl_data {
- union {
- struct adf_user_cfg_section *config_section;
- __u64 padding;
- };
- __u8 device_id;
-} __packed;
-#endif
diff --git a/drivers/crypto/intel/qat/qat_common/adf_common_drv.h b/drivers/crypto/intel/qat/qat_common/adf_common_drv.h
index 6cf3a95489e8..c625324bd47f 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_common_drv.h
+++ b/drivers/crypto/intel/qat/qat_common/adf_common_drv.h
@@ -68,10 +68,7 @@ int adf_devmgr_add_dev(struct adf_accel_dev *accel_dev,
void adf_devmgr_rm_dev(struct adf_accel_dev *accel_dev,
struct adf_accel_dev *pf);
struct list_head *adf_devmgr_get_head(void);
-struct adf_accel_dev *adf_devmgr_get_dev_by_id(u32 id);
struct adf_accel_dev *adf_devmgr_pci_to_accel_dev(struct pci_dev *pci_dev);
-int adf_devmgr_verify_id(u32 id);
-void adf_devmgr_get_num_dev(u32 *num);
int adf_devmgr_in_reset(struct adf_accel_dev *accel_dev);
int adf_dev_started(struct adf_accel_dev *accel_dev);
int adf_dev_restarting_notify(struct adf_accel_dev *accel_dev);
diff --git a/drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c b/drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c
index c2e6f0cb7480..f01f2946de6e 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c
@@ -2,410 +2,13 @@
/* Copyright(c) 2014 - 2020 Intel Corporation */

#include <crypto/algapi.h>
+#include <linux/errno.h>
#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/bitops.h>
-#include <linux/pci.h>
-#include <linux/cdev.h>
-#include <linux/uaccess.h>

-#include "adf_accel_devices.h"
#include "adf_common_drv.h"
-#include "adf_cfg.h"
-#include "adf_cfg_common.h"
-#include "adf_cfg_user.h"
-
-#define ADF_CFG_MAX_SECTION 512
-#define ADF_CFG_MAX_KEY_VAL 256
-
-#define DEVICE_NAME "qat_adf_ctl"
-
-static DEFINE_MUTEX(adf_ctl_lock);
-static long adf_ctl_ioctl(struct file *fp, unsigned int cmd, unsigned long arg);
-
-static const struct file_operations adf_ctl_ops = {
- .owner = THIS_MODULE,
- .unlocked_ioctl = adf_ctl_ioctl,
- .compat_ioctl = compat_ptr_ioctl,
-};
-
-static const struct class adf_ctl_class = {
- .name = DEVICE_NAME,
-};
-
-struct adf_ctl_drv_info {
- unsigned int major;
- struct cdev drv_cdev;
-};
-
-static struct adf_ctl_drv_info adf_ctl_drv;
-
-static void adf_chr_drv_destroy(void)
-{
- device_destroy(&adf_ctl_class, MKDEV(adf_ctl_drv.major, 0));
- cdev_del(&adf_ctl_drv.drv_cdev);
- class_unregister(&adf_ctl_class);
- unregister_chrdev_region(MKDEV(adf_ctl_drv.major, 0), 1);
-}
-
-static int adf_chr_drv_create(void)
-{
- dev_t dev_id;
- struct device *drv_device;
- int ret;
-
- if (alloc_chrdev_region(&dev_id, 0, 1, DEVICE_NAME)) {
- pr_err("QAT: unable to allocate chrdev region\n");
- return -EFAULT;
- }
-
- ret = class_register(&adf_ctl_class);
- if (ret)
- goto err_chrdev_unreg;
-
- adf_ctl_drv.major = MAJOR(dev_id);
- cdev_init(&adf_ctl_drv.drv_cdev, &adf_ctl_ops);
- if (cdev_add(&adf_ctl_drv.drv_cdev, dev_id, 1)) {
- pr_err("QAT: cdev add failed\n");
- goto err_class_destr;
- }
-
- drv_device = device_create(&adf_ctl_class, NULL,
- MKDEV(adf_ctl_drv.major, 0),
- NULL, DEVICE_NAME);
- if (IS_ERR(drv_device)) {
- pr_err("QAT: failed to create device\n");
- goto err_cdev_del;
- }
- return 0;
-err_cdev_del:
- cdev_del(&adf_ctl_drv.drv_cdev);
-err_class_destr:
- class_unregister(&adf_ctl_class);
-err_chrdev_unreg:
- unregister_chrdev_region(dev_id, 1);
- return -EFAULT;
-}
-
-static struct adf_user_cfg_ctl_data *adf_ctl_alloc_resources(unsigned long arg)
-{
- struct adf_user_cfg_ctl_data *cfg_data;
-
- cfg_data = memdup_user((void __user *)arg, sizeof(*cfg_data));
- if (IS_ERR(cfg_data))
- pr_err("QAT: failed to copy from user cfg_data.\n");
- return cfg_data;
-}
-
-static int adf_add_key_value_data(struct adf_accel_dev *accel_dev,
- const char *section,
- const struct adf_user_cfg_key_val *key_val)
-{
- if (key_val->type == ADF_HEX) {
- long *ptr = (long *)key_val->val;
- long val = *ptr;
-
- if (adf_cfg_add_key_value_param(accel_dev, section,
- key_val->key, (void *)val,
- key_val->type)) {
- dev_err(&GET_DEV(accel_dev),
- "failed to add hex keyvalue.\n");
- return -EFAULT;
- }
- } else {
- if (adf_cfg_add_key_value_param(accel_dev, section,
- key_val->key, key_val->val,
- key_val->type)) {
- dev_err(&GET_DEV(accel_dev),
- "failed to add keyvalue.\n");
- return -EFAULT;
- }
- }
- return 0;
-}
-
-static int adf_copy_key_value_data(struct adf_accel_dev *accel_dev,
- struct adf_user_cfg_ctl_data *ctl_data)
-{
- struct adf_user_cfg_key_val key_val;
- struct adf_user_cfg_key_val *params_head;
- struct adf_user_cfg_section section, *section_head;
- int i, j;
-
- section_head = ctl_data->config_section;
-
- for (i = 0; section_head && i < ADF_CFG_MAX_SECTION; i++) {
- if (copy_from_user(&section, (void __user *)section_head,
- sizeof(*section_head))) {
- dev_err(&GET_DEV(accel_dev),
- "failed to copy section info\n");
- goto out_err;
- }
-
- if (adf_cfg_section_add(accel_dev, section.name)) {
- dev_err(&GET_DEV(accel_dev),
- "failed to add section.\n");
- goto out_err;
- }
-
- params_head = section.params;
-
- for (j = 0; params_head && j < ADF_CFG_MAX_KEY_VAL; j++) {
- if (copy_from_user(&key_val, (void __user *)params_head,
- sizeof(key_val))) {
- dev_err(&GET_DEV(accel_dev),
- "Failed to copy keyvalue.\n");
- goto out_err;
- }
- if (adf_add_key_value_data(accel_dev, section.name,
- &key_val)) {
- goto out_err;
- }
- params_head = key_val.next;
- }
- section_head = section.next;
- }
- return 0;
-out_err:
- adf_cfg_del_all(accel_dev);
- return -EFAULT;
-}
-
-static int adf_ctl_ioctl_dev_config(struct file *fp, unsigned int cmd,
- unsigned long arg)
-{
- struct adf_user_cfg_ctl_data *ctl_data;
- struct adf_accel_dev *accel_dev;
- int ret = 0;
-
- ctl_data = adf_ctl_alloc_resources(arg);
- if (IS_ERR(ctl_data))
- return PTR_ERR(ctl_data);
-
- accel_dev = adf_devmgr_get_dev_by_id(ctl_data->device_id);
- if (!accel_dev) {
- ret = -EFAULT;
- goto out;
- }
-
- if (adf_dev_started(accel_dev)) {
- ret = -EFAULT;
- goto out;
- }
-
- if (adf_copy_key_value_data(accel_dev, ctl_data)) {
- ret = -EFAULT;
- goto out;
- }
- set_bit(ADF_STATUS_CONFIGURED, &accel_dev->status);
-out:
- kfree(ctl_data);
- return ret;
-}
-
-static int adf_ctl_is_device_in_use(int id)
-{
- struct adf_accel_dev *dev;
-
- list_for_each_entry(dev, adf_devmgr_get_head(), list) {
- if (id == dev->accel_id || id == ADF_CFG_ALL_DEVICES) {
- if (adf_devmgr_in_reset(dev) || adf_dev_in_use(dev)) {
- dev_info(&GET_DEV(dev),
- "device qat_dev%d is busy\n",
- dev->accel_id);
- return -EBUSY;
- }
- }
- }
- return 0;
-}
-
-static void adf_ctl_stop_devices(u32 id)
-{
- struct adf_accel_dev *accel_dev;
-
- list_for_each_entry(accel_dev, adf_devmgr_get_head(), list) {
- if (id == accel_dev->accel_id || id == ADF_CFG_ALL_DEVICES) {
- if (!adf_dev_started(accel_dev))
- continue;
-
- /* First stop all VFs */
- if (!accel_dev->is_vf)
- continue;
-
- adf_dev_down(accel_dev);
- }
- }
-
- list_for_each_entry(accel_dev, adf_devmgr_get_head(), list) {
- if (id == accel_dev->accel_id || id == ADF_CFG_ALL_DEVICES) {
- if (!adf_dev_started(accel_dev))
- continue;
-
- adf_dev_down(accel_dev);
- }
- }
-}
-
-static int adf_ctl_ioctl_dev_stop(struct file *fp, unsigned int cmd,
- unsigned long arg)
-{
- int ret;
- struct adf_user_cfg_ctl_data *ctl_data;
-
- ctl_data = adf_ctl_alloc_resources(arg);
- if (IS_ERR(ctl_data))
- return PTR_ERR(ctl_data);
-
- if (adf_devmgr_verify_id(ctl_data->device_id)) {
- pr_err("QAT: Device %d not found\n", ctl_data->device_id);
- ret = -ENODEV;
- goto out;
- }
-
- ret = adf_ctl_is_device_in_use(ctl_data->device_id);
- if (ret)
- goto out;
-
- if (ctl_data->device_id == ADF_CFG_ALL_DEVICES)
- pr_info("QAT: Stopping all acceleration devices.\n");
- else
- pr_info("QAT: Stopping acceleration device qat_dev%d.\n",
- ctl_data->device_id);
-
- adf_ctl_stop_devices(ctl_data->device_id);
-
-out:
- kfree(ctl_data);
- return ret;
-}
-
-static int adf_ctl_ioctl_dev_start(struct file *fp, unsigned int cmd,
- unsigned long arg)
-{
- int ret;
- struct adf_user_cfg_ctl_data *ctl_data;
- struct adf_accel_dev *accel_dev;
-
- ctl_data = adf_ctl_alloc_resources(arg);
- if (IS_ERR(ctl_data))
- return PTR_ERR(ctl_data);
-
- ret = -ENODEV;
- accel_dev = adf_devmgr_get_dev_by_id(ctl_data->device_id);
- if (!accel_dev)
- goto out;
-
- dev_info(&GET_DEV(accel_dev),
- "Starting acceleration device qat_dev%d.\n",
- ctl_data->device_id);
-
- ret = adf_dev_up(accel_dev, false);
-
- if (ret) {
- dev_err(&GET_DEV(accel_dev), "Failed to start qat_dev%d\n",
- ctl_data->device_id);
- adf_dev_down(accel_dev);
- }
-out:
- kfree(ctl_data);
- return ret;
-}
-
-static int adf_ctl_ioctl_get_num_devices(struct file *fp, unsigned int cmd,
- unsigned long arg)
-{
- u32 num_devices = 0;
-
- adf_devmgr_get_num_dev(&num_devices);
- if (copy_to_user((void __user *)arg, &num_devices, sizeof(num_devices)))
- return -EFAULT;
-
- return 0;
-}
-
-static int adf_ctl_ioctl_get_status(struct file *fp, unsigned int cmd,
- unsigned long arg)
-{
- struct adf_hw_device_data *hw_data;
- struct adf_dev_status_info dev_info;
- struct adf_accel_dev *accel_dev;
-
- if (copy_from_user(&dev_info, (void __user *)arg,
- sizeof(struct adf_dev_status_info))) {
- pr_err("QAT: failed to copy from user.\n");
- return -EFAULT;
- }
-
- accel_dev = adf_devmgr_get_dev_by_id(dev_info.accel_id);
- if (!accel_dev)
- return -ENODEV;
-
- hw_data = accel_dev->hw_device;
- dev_info.state = adf_dev_started(accel_dev) ? DEV_UP : DEV_DOWN;
- dev_info.num_ae = hw_data->get_num_aes(hw_data);
- dev_info.num_accel = hw_data->get_num_accels(hw_data);
- dev_info.num_logical_accel = hw_data->num_logical_accel;
- dev_info.banks_per_accel = hw_data->num_banks
- / hw_data->num_logical_accel;
- strscpy(dev_info.name, hw_data->dev_class->name, sizeof(dev_info.name));
- dev_info.instance_id = hw_data->instance_id;
- dev_info.type = hw_data->dev_class->type;
- dev_info.bus = accel_to_pci_dev(accel_dev)->bus->number;
- dev_info.dev = PCI_SLOT(accel_to_pci_dev(accel_dev)->devfn);
- dev_info.fun = PCI_FUNC(accel_to_pci_dev(accel_dev)->devfn);
-
- if (copy_to_user((void __user *)arg, &dev_info,
- sizeof(struct adf_dev_status_info))) {
- dev_err(&GET_DEV(accel_dev), "failed to copy status.\n");
- return -EFAULT;
- }
- return 0;
-}
-
-static long adf_ctl_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
-{
- int ret;
-
- if (mutex_lock_interruptible(&adf_ctl_lock))
- return -EFAULT;
-
- switch (cmd) {
- case IOCTL_CONFIG_SYS_RESOURCE_PARAMETERS:
- ret = adf_ctl_ioctl_dev_config(fp, cmd, arg);
- break;
-
- case IOCTL_STOP_ACCEL_DEV:
- ret = adf_ctl_ioctl_dev_stop(fp, cmd, arg);
- break;
-
- case IOCTL_START_ACCEL_DEV:
- ret = adf_ctl_ioctl_dev_start(fp, cmd, arg);
- break;
-
- case IOCTL_GET_NUM_DEVICES:
- ret = adf_ctl_ioctl_get_num_devices(fp, cmd, arg);
- break;
-
- case IOCTL_STATUS_ACCEL_DEV:
- ret = adf_ctl_ioctl_get_status(fp, cmd, arg);
- break;
- default:
- pr_err_ratelimited("QAT: Invalid ioctl %d\n", cmd);
- ret = -EFAULT;
- break;
- }
- mutex_unlock(&adf_ctl_lock);
- return ret;
-}

static int __init adf_register_ctl_device_driver(void)
{
- if (adf_chr_drv_create())
- goto err_chr_dev;
-
if (adf_init_misc_wq())
goto err_misc_wq;

@@ -437,15 +40,11 @@ static int __init adf_register_ctl_device_driver(void)
err_aer:
adf_exit_misc_wq();
err_misc_wq:
- adf_chr_drv_destroy();
-err_chr_dev:
- mutex_destroy(&adf_ctl_lock);
return -EFAULT;
}

static void __exit adf_unregister_ctl_device_driver(void)
{
- adf_chr_drv_destroy();
adf_exit_misc_wq();
adf_exit_aer();
adf_exit_vf_wq();
@@ -453,7 +52,6 @@ static void __exit adf_unregister_ctl_device_driver(void)
qat_crypto_unregister();
qat_compression_unregister();
adf_clean_vf_map(false);
- mutex_destroy(&adf_ctl_lock);
}

module_init(adf_register_ctl_device_driver);
diff --git a/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c b/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c
index 34b9f7731c78..81dd22d2083f 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c
@@ -45,19 +45,6 @@ static struct vf_id_map *adf_find_vf(u32 bdf)
return NULL;
}

-static int adf_get_vf_real_id(u32 fake)
-{
- struct list_head *itr;
-
- list_for_each(itr, &vfs_table) {
- struct vf_id_map *ptr =
- list_entry(itr, struct vf_id_map, list);
- if (ptr->fake_id == fake)
- return ptr->id;
- }
- return -1;
-}
-
/**
* adf_clean_vf_map() - Cleans VF id mappings
* @vf: flag indicating whether mappings is cleaned
@@ -304,63 +291,6 @@ struct adf_accel_dev *adf_devmgr_pci_to_accel_dev(struct pci_dev *pci_dev)
}
EXPORT_SYMBOL_GPL(adf_devmgr_pci_to_accel_dev);

-struct adf_accel_dev *adf_devmgr_get_dev_by_id(u32 id)
-{
- struct list_head *itr;
- int real_id;
-
- mutex_lock(&table_lock);
- real_id = adf_get_vf_real_id(id);
- if (real_id < 0)
- goto unlock;
-
- id = real_id;
-
- list_for_each(itr, &accel_table) {
- struct adf_accel_dev *ptr =
- list_entry(itr, struct adf_accel_dev, list);
- if (ptr->accel_id == id) {
- mutex_unlock(&table_lock);
- return ptr;
- }
- }
-unlock:
- mutex_unlock(&table_lock);
- return NULL;
-}
-
-int adf_devmgr_verify_id(u32 id)
-{
- if (id == ADF_CFG_ALL_DEVICES)
- return 0;
-
- if (adf_devmgr_get_dev_by_id(id))
- return 0;
-
- return -ENODEV;
-}
-
-static int adf_get_num_dettached_vfs(void)
-{
- struct list_head *itr;
- int vfs = 0;
-
- mutex_lock(&table_lock);
- list_for_each(itr, &vfs_table) {
- struct vf_id_map *ptr =
- list_entry(itr, struct vf_id_map, list);
- if (ptr->bdf != ~0 && !ptr->attached)
- vfs++;
- }
- mutex_unlock(&table_lock);
- return vfs;
-}
-
-void adf_devmgr_get_num_dev(u32 *num)
-{
- *num = num_devices - adf_get_num_dettached_vfs();
-}
-
/**
* adf_dev_in_use() - Check whether accel_dev is currently in use
* @accel_dev: Pointer to acceleration device.
diff --git a/drivers/firmware/samsung/exynos-acpm.c b/drivers/firmware/samsung/exynos-acpm.c
index d3bd4bafe281..b19f9a91a5ec 100644
--- a/drivers/firmware/samsung/exynos-acpm.c
+++ b/drivers/firmware/samsung/exynos-acpm.c
@@ -103,12 +103,12 @@ struct acpm_queue {
*
* @cmd: pointer to where the data shall be saved.
* @n_cmd: number of 32-bit commands.
- * @response: true if the client expects the RX data.
+ * @rxcnt: expected length of the response in 32-bit words.
*/
struct acpm_rx_data {
u32 *cmd;
size_t n_cmd;
- bool response;
+ size_t rxcnt;
};

#define ACPM_SEQNUM_MAX 64
@@ -196,7 +196,7 @@ static void acpm_get_saved_rx(struct acpm_chan *achan,
const struct acpm_rx_data *rx_data = &achan->rx_data[tx_seqnum - 1];
u32 rx_seqnum;

- if (!rx_data->response)
+ if (!rx_data->rxcnt)
return;

rx_seqnum = FIELD_GET(ACPM_PROTOCOL_SEQNUM, rx_data->cmd[0]);
@@ -253,7 +253,7 @@ static int acpm_get_rx(struct acpm_chan *achan, const struct acpm_xfer *xfer)
seqnum = rx_seqnum - 1;
rx_data = &achan->rx_data[seqnum];

- if (rx_data->response) {
+ if (rx_data->rxcnt) {
if (rx_seqnum == tx_seqnum) {
__ioread32_copy(xfer->rxd, addr,
xfer->rxlen / 4);
@@ -267,7 +267,7 @@ static int acpm_get_rx(struct acpm_chan *achan, const struct acpm_xfer *xfer)
* after the response is copied to the request.
*/
__ioread32_copy(rx_data->cmd, addr,
- xfer->rxlen / 4);
+ rx_data->rxcnt);
}
} else {
clear_bit(seqnum, achan->bitmap_seqnum);
@@ -379,8 +379,8 @@ static void acpm_prepare_xfer(struct acpm_chan *achan,
/* Clear data for upcoming responses */
rx_data = &achan->rx_data[achan->seqnum - 1];
memset(rx_data->cmd, 0, sizeof(*rx_data->cmd) * rx_data->n_cmd);
- if (xfer->rxd)
- rx_data->response = true;
+ /* zero means no response expected */
+ rx_data->rxcnt = xfer->rxlen / 4;

/* Flag the index based on seqnum. (seqnum: 1~63, bitmap: 0~62) */
set_bit(achan->seqnum - 1, achan->bitmap_seqnum);
diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index 62795f6cbb00..8b409a3d41f0 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -93,7 +93,7 @@ static void kvp_send_key(struct work_struct *dummy);
static void kvp_respond_to_host(struct hv_kvp_msg *msg, int error);
static void kvp_timeout_func(struct work_struct *dummy);
static void kvp_host_handshake_func(struct work_struct *dummy);
-static void kvp_register(int);
+static int kvp_register(int);

static DECLARE_DELAYED_WORK(kvp_timeout_work, kvp_timeout_func);
static DECLARE_DELAYED_WORK(kvp_host_handshake_work, kvp_host_handshake_func);
@@ -127,24 +127,26 @@ static void kvp_register_done(void)
hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper);
}

-static void
+static int
kvp_register(int reg_value)
{

struct hv_kvp_msg *kvp_msg;
char *version;
+ int ret;

kvp_msg = kzalloc(sizeof(*kvp_msg), GFP_KERNEL);
+ if (!kvp_msg)
+ return -ENOMEM;

- if (kvp_msg) {
- version = kvp_msg->body.kvp_register.version;
- kvp_msg->kvp_hdr.operation = reg_value;
- strcpy(version, HV_DRV_VERSION);
+ version = kvp_msg->body.kvp_register.version;
+ kvp_msg->kvp_hdr.operation = reg_value;
+ strcpy(version, HV_DRV_VERSION);

- hvutil_transport_send(hvt, kvp_msg, sizeof(*kvp_msg),
- kvp_register_done);
- kfree(kvp_msg);
- }
+ ret = hvutil_transport_send(hvt, kvp_msg, sizeof(*kvp_msg),
+ kvp_register_done);
+ kfree(kvp_msg);
+ return ret;
}

static void kvp_timeout_func(struct work_struct *dummy)
@@ -186,9 +188,8 @@ static int kvp_handle_handshake(struct hv_kvp_msg *msg)
*/
pr_debug("KVP: userspace daemon ver. %d connected\n",
msg->kvp_hdr.operation);
- kvp_register(dm_reg_value);

- return 0;
+ return kvp_register(dm_reg_value);
}


diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 8a090e2a28f9..137cf82e2292 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -2291,8 +2291,8 @@ static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
return AE_NO_MEMORY;

/* If this range overlaps the virtual TPM, truncate it. */
- if (end > VTPM_BASE_ADDRESS && start < VTPM_BASE_ADDRESS)
- end = VTPM_BASE_ADDRESS;
+ if (end >= VTPM_BASE_ADDRESS && start < VTPM_BASE_ADDRESS)
+ end = VTPM_BASE_ADDRESS - 1;

new_res->name = "hyperv mmio";
new_res->flags = IORESOURCE_MEM;
@@ -2359,6 +2359,7 @@ static void vmbus_mmio_remove(void)
static void __maybe_unused vmbus_reserve_fb(void)
{
resource_size_t start = 0, size;
+ resource_size_t low_mmio_base;
struct pci_dev *pdev;

if (efi_enabled(EFI_BOOT)) {
@@ -2366,6 +2367,24 @@ static void __maybe_unused vmbus_reserve_fb(void)
if (IS_ENABLED(CONFIG_SYSFB)) {
start = screen_info.lfb_base;
size = max_t(__u32, screen_info.lfb_size, 0x800000);
+
+ low_mmio_base = hyperv_mmio->start;
+ if (!low_mmio_base || upper_32_bits(low_mmio_base) ||
+ (start && start < low_mmio_base)) {
+ pr_warn("Unexpected low mmio base %pa\n", &low_mmio_base);
+ } else {
+ /*
+ * If the kdump/kexec or CVM kernel's lfb_base
+ * is 0, fall back to the low mmio base.
+ */
+ if (!start)
+ start = low_mmio_base;
+ /*
+ * Reserve half of the space below 4GB for high
+ * resolutions, but cap the reservation to 128MB.
+ */
+ size = min((SZ_4G - start) / 2, SZ_128M);
+ }
}
} else {
/* Gen1 VM: get FB base from PCI */
@@ -2386,8 +2405,10 @@ static void __maybe_unused vmbus_reserve_fb(void)
pci_dev_put(pdev);
}

- if (!start)
+ if (!start) {
+ pr_warn("Unexpected framebuffer mmio base of zero\n");
return;
+ }

/*
* Make a claim for the frame buffer in the resource tree under the
@@ -2397,6 +2418,8 @@ static void __maybe_unused vmbus_reserve_fb(void)
*/
for (; !fb_mmio && (size >= 0x100000); size >>= 1)
fb_mmio = __request_region(hyperv_mmio, start, size, fb_mmio_name, 0);
+
+ pr_info("hv_mmio=%pR,%pR fb=%pR\n", hyperv_mmio, hyperv_mmio->sibling, fb_mmio);
}

/**
diff --git a/drivers/i2c/i2c-stub.c b/drivers/i2c/i2c-stub.c
index 09e7b7bf4c5f..a6e2ce4360ca 100644
--- a/drivers/i2c/i2c-stub.c
+++ b/drivers/i2c/i2c-stub.c
@@ -214,6 +214,11 @@ static s32 stub_xfer(struct i2c_adapter *adap, u16 addr, unsigned short flags,
* We ignore banks here, because banked chips don't use I2C
* block transfers
*/
+ if (data->block[0] == 0 ||
+ data->block[0] > I2C_SMBUS_BLOCK_MAX) {
+ ret = -EINVAL;
+ break;
+ }
if (data->block[0] > 256 - command) /* Avoid overrun */
data->block[0] = 256 - command;
len = data->block[0];
diff --git a/drivers/iio/adc/ti-ads1298.c b/drivers/iio/adc/ti-ads1298.c
index ae30b47e4514..731792f06993 100644
--- a/drivers/iio/adc/ti-ads1298.c
+++ b/drivers/iio/adc/ti-ads1298.c
@@ -279,6 +279,7 @@ static const u8 ads1298_pga_settings[] = { 6, 1, 2, 3, 4, 8, 12 };
static int ads1298_get_scale(struct ads1298_private *priv,
int channel, int *val, int *val2)
{
+ unsigned int pga_idx;
int ret;
unsigned int regval;
u8 gain;
@@ -302,7 +303,11 @@ static int ads1298_get_scale(struct ads1298_private *priv,
if (ret)
return ret;

- gain = ads1298_pga_settings[FIELD_GET(ADS1298_MASK_CH_PGA, regval)];
+ pga_idx = FIELD_GET(ADS1298_MASK_CH_PGA, regval);
+ if (pga_idx >= ARRAY_SIZE(ads1298_pga_settings))
+ return -EINVAL;
+
+ gain = ads1298_pga_settings[pga_idx];
*val /= gain; /* Full scale is VREF / gain */

*val2 = ADS1298_BITS_PER_SAMPLE - 1; /* Signed, hence the -1 */
diff --git a/drivers/iio/light/veml6075.c b/drivers/iio/light/veml6075.c
index edbb43407054..f7eb159e5cb4 100644
--- a/drivers/iio/light/veml6075.c
+++ b/drivers/iio/light/veml6075.c
@@ -100,7 +100,7 @@ static const struct iio_chan_spec veml6075_channels[] = {

static int veml6075_request_measurement(struct veml6075_data *data)
{
- int ret, conf, int_time;
+ int ret, conf, int_time, int_index;

ret = regmap_read(data->regmap, VEML6075_CMD_CONF, &conf);
if (ret < 0)
@@ -117,7 +117,11 @@ static int veml6075_request_measurement(struct veml6075_data *data)
* time for all possible configurations. Using a 1.50 factor simplifies
* operations and ensures reliability under all circumstances.
*/
- int_time = veml6075_it_ms[FIELD_GET(VEML6075_CONF_IT, conf)];
+ int_index = FIELD_GET(VEML6075_CONF_IT, conf);
+ if (int_index >= ARRAY_SIZE(veml6075_it_ms))
+ return -EINVAL;
+
+ int_time = veml6075_it_ms[int_index];
msleep(int_time + (int_time / 2));

/* shutdown again, data registers are still accessible */
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index ff91511bd338..c1645de495a7 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -4350,7 +4350,7 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata)

uctx->rdev = rdev;

- uctx->shpg = (void *)__get_free_page(GFP_KERNEL);
+ uctx->shpg = (void *)get_zeroed_page(GFP_KERNEL);
if (!uctx->shpg) {
rc = -ENOMEM;
goto fail;
diff --git a/drivers/media/test-drivers/vidtv/vidtv_mux.c b/drivers/media/test-drivers/vidtv/vidtv_mux.c
index 7dad97881fdb..00ae52dbc96e 100644
--- a/drivers/media/test-drivers/vidtv/vidtv_mux.c
+++ b/drivers/media/test-drivers/vidtv/vidtv_mux.c
@@ -101,7 +101,8 @@ static int vidtv_mux_pid_ctx_init(struct vidtv_mux *m)
/* add a ctx for all PMT sections */
while (p) {
pid = vidtv_psi_get_pat_program_pid(p);
- vidtv_mux_create_pid_ctx_once(m, pid);
+ if (!vidtv_mux_create_pid_ctx_once(m, pid))
+ goto free;
p = p->next;
}

@@ -170,6 +171,9 @@ static u32 vidtv_mux_push_si(struct vidtv_mux *m)
nit_ctx = vidtv_mux_get_pid_ctx(m, VIDTV_NIT_PID);
eit_ctx = vidtv_mux_get_pid_ctx(m, VIDTV_EIT_PID);

+ if (!pat_ctx || !sdt_ctx || !nit_ctx || !eit_ctx)
+ return 0;
+
pat_args.offset = m->mux_buf_offset;
pat_args.continuity_counter = &pat_ctx->cc;

@@ -186,6 +190,8 @@ static u32 vidtv_mux_push_si(struct vidtv_mux *m)
}

pmt_ctx = vidtv_mux_get_pid_ctx(m, pmt_pid);
+ if (!pmt_ctx)
+ continue;

pmt_args.offset = m->mux_buf_offset;
pmt_args.pmt = m->si.pmt_secs[i];
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index ba8763cac9d9..3a132633c066 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -213,8 +213,8 @@ static void rmnet_dellink(struct net_device *dev, struct list_head *head)
ep = rmnet_get_endpoint(real_port, mux_id);
if (ep) {
hlist_del_init_rcu(&ep->hlnode);
- rmnet_vnd_dellink(mux_id, real_port, ep);
- kfree(ep);
+ real_port->nr_rmnet_devs--;
+ kfree_rcu(ep, rcu);
}

netdev_upper_dev_unlink(real_dev, dev);
@@ -238,9 +238,9 @@ static void rmnet_force_unassociate_device(struct net_device *real_dev)
hash_for_each_safe(port->muxed_ep, bkt_ep, tmp_ep, ep, hlnode) {
unregister_netdevice_queue(ep->egress_dev, &list);
netdev_upper_dev_unlink(real_dev, ep->egress_dev);
- rmnet_vnd_dellink(ep->mux_id, port, ep);
hlist_del_init_rcu(&ep->hlnode);
- kfree(ep);
+ port->nr_rmnet_devs--;
+ kfree_rcu(ep, rcu);
}
rmnet_unregister_real_device(real_dev);
unregister_netdevice_many(&list);
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
index ed112d51ac5a..f50fae1c6bdd 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
@@ -18,6 +18,7 @@ struct rmnet_endpoint {
u8 mux_id;
struct net_device *egress_dev;
struct hlist_node hlnode;
+ struct rcu_head rcu;
};

struct rmnet_egress_agg_params {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 41b270a48630..1ceedd74e429 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -7760,7 +7760,7 @@ int stmmac_suspend(struct device *dev)
u32 chan;

if (!ndev || !netif_running(ndev))
- return 0;
+ goto suspend_bsp;

mutex_lock(&priv->lock);

@@ -7803,6 +7803,7 @@ int stmmac_suspend(struct device *dev)
if (stmmac_fpe_supported(priv))
ethtool_mmsv_stop(&priv->fpe_cfg.mmsv);

+suspend_bsp:
if (priv->plat->suspend)
return priv->plat->suspend(dev, priv->plat->bsp_priv);

diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
index 5b50d9186f12..cd16b3dd7720 100644
--- a/drivers/net/net_failover.c
+++ b/drivers/net/net_failover.c
@@ -502,7 +502,7 @@ static int net_failover_slave_register(struct net_device *slave_dev,

/* Align MTU of slave with failover dev */
orig_mtu = slave_dev->mtu;
- err = dev_set_mtu(slave_dev, failover_dev->mtu);
+ err = netif_set_mtu(slave_dev, failover_dev->mtu);
if (err) {
netdev_err(failover_dev, "unable to change mtu of %s to %u register failed\n",
slave_dev->name, failover_dev->mtu);
@@ -512,11 +512,11 @@ static int net_failover_slave_register(struct net_device *slave_dev,
dev_hold(slave_dev);

if (netif_running(failover_dev)) {
- err = dev_open(slave_dev, NULL);
+ err = netif_open(slave_dev, NULL);
if (err && (err != -EBUSY)) {
netdev_err(failover_dev, "Opening slave %s failed err:%d\n",
slave_dev->name, err);
- goto err_dev_open;
+ goto err_netif_open;
}
}

@@ -562,10 +562,10 @@ static int net_failover_slave_register(struct net_device *slave_dev,
err_vlan_add:
dev_uc_unsync(slave_dev, failover_dev);
dev_mc_unsync(slave_dev, failover_dev);
- dev_close(slave_dev);
-err_dev_open:
+ netif_close(slave_dev);
+err_netif_open:
dev_put(slave_dev);
- dev_set_mtu(slave_dev, orig_mtu);
+ netif_set_mtu(slave_dev, orig_mtu);
done:
return err;
}
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 765bd1b5deb3..761a3f905859 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -2159,8 +2159,16 @@ static int regulator_resolve_supply(struct regulator_dev *rdev)
if (rdev->use_count) {
ret = regulator_enable(rdev->supply);
if (ret < 0) {
- _regulator_put(rdev->supply);
+ struct regulator *supply;
+
+ regulator_lock_two(rdev, rdev->supply->rdev, &ww_ctx);
+
+ supply = rdev->supply;
rdev->supply = NULL;
+
+ regulator_unlock_two(rdev, supply->rdev, &ww_ctx);
+
+ regulator_put(supply);
goto out;
}
}
diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
index 337fa3261ac7..a34817a4428c 100644
--- a/drivers/tty/serial/qcom_geni_serial.c
+++ b/drivers/tty/serial/qcom_geni_serial.c
@@ -896,12 +896,9 @@ static void qcom_geni_serial_handle_rx_dma(struct uart_port *uport, bool drop)
port->rx_dma_addr = 0;

rx_in = readl(uport->membase + SE_DMA_RX_LEN_IN);
- if (!rx_in) {
- dev_warn(uport->dev, "serial engine reports 0 RX bytes in!\n");
- return;
- }
-
- if (!drop)
+ if (!rx_in)
+ dev_warn_ratelimited(uport->dev, "serial engine reports 0 RX bytes in!\n");
+ else if (!drop)
handle_rx_uart(uport, rx_in);

ret = geni_se_rx_dma_prep(&port->se, port->rx_buf,
diff --git a/drivers/tty/vt/vc_screen.c b/drivers/tty/vt/vc_screen.c
index c814644ef4ee..60b12d40724c 100644
--- a/drivers/tty/vt/vc_screen.c
+++ b/drivers/tty/vt/vc_screen.c
@@ -686,7 +686,7 @@ vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
}
*ppos += written;
ret = written;
- if (written)
+ if (written && vc)
vcs_scr_updated(vc);

return ret;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8bd6cf17355f..527f60f7ad7b 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1037,6 +1037,10 @@ static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop
if (WARN_ON(folio_test_mlocked(oldfolio)))
goto out_fallback_unlock;

+ err = lock_request(cs->req);
+ if (err)
+ goto out_fallback_unlock;
+
replace_page_cache_folio(oldfolio, newfolio);

folio_get(newfolio);
@@ -1050,20 +1054,7 @@ static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop
*/
pipe_buf_release(cs->pipe, buf);

- err = 0;
- spin_lock(&cs->req->waitq.lock);
- if (test_bit(FR_ABORTED, &cs->req->flags))
- err = -ENOENT;
- else
- *foliop = newfolio;
- spin_unlock(&cs->req->waitq.lock);
-
- if (err) {
- folio_unlock(newfolio);
- folio_put(newfolio);
- goto out_put_old;
- }
-
+ *foliop = newfolio;
folio_unlock(oldfolio);
/* Drop ref for ap->pages[] array */
folio_put(oldfolio);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 782178124512..79d1b502e731 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -374,8 +374,14 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
* aio and closes the fd before the aio completes. Since aio takes its
* own ref to the file, the IO completion has to drop the ref, which is
* how the fuse server can end up closing its clients' files.
+ *
+ * Exception is virtio-fs, which is not affected by the above (server is
+ * on host, cannot close open files in guest). Virtio-fs needs sync
+ * release, because the num_waiting mechanism to wait for all requests
+ * before commencing with fs shutdown doesn't work if submounts are
+ * used.
*/
- fuse_file_put(ff, false);
+ fuse_file_put(ff, ff->fm->fc->auto_submounts);
}

void fuse_release_common(struct file *file, bool isdir)
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 18933ca407be..2a1499f2ad19 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -36,30 +36,19 @@
* second map contains a reference to the entry in the first map.
*/

-static struct workqueue_struct *nfsd_export_wq;
-
#define EXPKEY_HASHBITS 8
#define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS)
#define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1)

-static void expkey_release(struct work_struct *work)
+static void expkey_put(struct kref *ref)
{
- struct svc_expkey *key = container_of(to_rcu_work(work),
- struct svc_expkey, ek_rwork);
+ struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref);

if (test_bit(CACHE_VALID, &key->h.flags) &&
!test_bit(CACHE_NEGATIVE, &key->h.flags))
path_put(&key->ek_path);
auth_domain_put(key->ek_client);
- kfree(key);
-}
-
-static void expkey_put(struct kref *ref)
-{
- struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref);
-
- INIT_RCU_WORK(&key->ek_rwork, expkey_release);
- queue_rcu_work(nfsd_export_wq, &key->ek_rwork);
+ kfree_rcu(key, ek_rcu);
}

static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
@@ -364,13 +353,11 @@ static void export_stats_destroy(struct export_stats *stats)
EXP_STATS_COUNTERS_NUM);
}

-static void svc_export_release(struct work_struct *work)
+static void svc_export_release(struct rcu_head *rcu_head)
{
- struct svc_export *exp = container_of(to_rcu_work(work),
- struct svc_export, ex_rwork);
+ struct svc_export *exp = container_of(rcu_head, struct svc_export,
+ ex_rcu);

- path_put(&exp->ex_path);
- auth_domain_put(exp->ex_client);
nfsd4_fslocs_free(&exp->ex_fslocs);
export_stats_destroy(exp->ex_stats);
kfree(exp->ex_stats);
@@ -382,8 +369,9 @@ static void svc_export_put(struct kref *ref)
{
struct svc_export *exp = container_of(ref, struct svc_export, h.ref);

- INIT_RCU_WORK(&exp->ex_rwork, svc_export_release);
- queue_rcu_work(nfsd_export_wq, &exp->ex_rwork);
+ path_put(&exp->ex_path);
+ auth_domain_put(exp->ex_client);
+ call_rcu(&exp->ex_rcu, svc_export_release);
}

static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
@@ -1490,36 +1478,6 @@ const struct seq_operations nfs_exports_op = {
.show = e_show,
};

-/**
- * nfsd_export_wq_init - allocate the export release workqueue
- *
- * Called once at module load. The workqueue runs deferred svc_export and
- * svc_expkey release work scheduled by queue_rcu_work() in the cache put
- * callbacks.
- *
- * Return values:
- * %0: workqueue allocated
- * %-ENOMEM: allocation failed
- */
-int nfsd_export_wq_init(void)
-{
- nfsd_export_wq = alloc_workqueue("nfsd_export", WQ_UNBOUND, 0);
- if (!nfsd_export_wq)
- return -ENOMEM;
- return 0;
-}
-
-/**
- * nfsd_export_wq_shutdown - drain and free the export release workqueue
- *
- * Called once at module unload. Per-namespace teardown in
- * nfsd_export_shutdown() has already drained all deferred work.
- */
-void nfsd_export_wq_shutdown(void)
-{
- destroy_workqueue(nfsd_export_wq);
-}
-
/*
* Initialize the exports module.
*/
@@ -1581,9 +1539,6 @@ nfsd_export_shutdown(struct net *net)

cache_unregister_net(nn->svc_expkey_cache, net);
cache_unregister_net(nn->svc_export_cache, net);
- /* Drain deferred export and expkey release work. */
- rcu_barrier();
- flush_workqueue(nfsd_export_wq);
cache_destroy_net(nn->svc_expkey_cache, net);
cache_destroy_net(nn->svc_export_cache, net);
svcauth_unix_purge(net);
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index b05399374574..d2b09cd76145 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -7,7 +7,6 @@

#include <linux/sunrpc/cache.h>
#include <linux/percpu_counter.h>
-#include <linux/workqueue.h>
#include <uapi/linux/nfsd/export.h>
#include <linux/nfs4.h>

@@ -76,7 +75,7 @@ struct svc_export {
u32 ex_layout_types;
struct nfsd4_deviceid_map *ex_devid_map;
struct cache_detail *cd;
- struct rcu_work ex_rwork;
+ struct rcu_head ex_rcu;
unsigned long ex_xprtsec_modes;
struct export_stats *ex_stats;
};
@@ -93,7 +92,7 @@ struct svc_expkey {
u32 ek_fsid[6];

struct path ek_path;
- struct rcu_work ek_rwork;
+ struct rcu_head ek_rcu;
};

#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC))
@@ -111,8 +110,6 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp,
/*
* Function declarations
*/
-int nfsd_export_wq_init(void);
-void nfsd_export_wq_shutdown(void);
int nfsd_export_init(struct net *);
void nfsd_export_shutdown(struct net *);
void nfsd_export_flush(struct net *);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4ed19d81577b..1da5c40d6722 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -2262,12 +2262,9 @@ static int __init init_nfsd(void)
if (retval)
goto out_free_pnfs;
nfsd_lockd_init(); /* lockd->nfsd callbacks */
- retval = nfsd_export_wq_init();
- if (retval)
- goto out_free_lockd;
retval = register_pernet_subsys(&nfsd_net_ops);
if (retval < 0)
- goto out_free_export_wq;
+ goto out_free_lockd;
retval = register_cld_notifier();
if (retval)
goto out_free_subsys;
@@ -2296,8 +2293,6 @@ static int __init init_nfsd(void)
unregister_cld_notifier();
out_free_subsys:
unregister_pernet_subsys(&nfsd_net_ops);
-out_free_export_wq:
- nfsd_export_wq_shutdown();
out_free_lockd:
nfsd_lockd_shutdown();
nfsd_drc_slab_free();
@@ -2318,7 +2313,6 @@ static void __exit exit_nfsd(void)
nfsd4_destroy_laundry_wq();
unregister_cld_notifier();
unregister_pernet_subsys(&nfsd_net_ops);
- nfsd_export_wq_shutdown();
nfsd_drc_slab_free();
nfsd_lockd_shutdown();
nfsd4_free_slabs();
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b490245ff9be..4c5adfd4fc1f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1159,6 +1159,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MAYSHARE)] = "ms",
[ilog2(VM_GROWSDOWN)] = "gd",
[ilog2(VM_PFNMAP)] = "pf",
+ [ilog2(VM_MAYBE_GUARD)] = "gu",
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index c21bca5a34c4..c012c5a93cc6 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -613,6 +613,11 @@ int smb2_check_user_session(struct ksmbd_work *work)
sess_id, work->sess->id);
return -EINVAL;
}
+ if (work->sess->state != SMB2_SESSION_VALID) {
+ pr_err("compound request on a non-valid session (state %d)\n",
+ work->sess->state);
+ return -EINVAL;
+ }
return 1;
}

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d905cf528b23..92ba2489f237 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -269,6 +269,8 @@ extern struct rw_semaphore nommu_region_sem;
extern unsigned int kobjsize(const void *objp);
#endif

+#define VM_MAYBE_GUARD_BIT 11
+
/*
* vm_flags in vm_area_struct, see mm_types.h.
* When changing, update also include/trace/events/mmflags.h
@@ -294,6 +296,7 @@ extern unsigned int kobjsize(const void *objp);
#define VM_UFFD_MISSING 0
#endif /* CONFIG_MMU */
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
+#define VM_MAYBE_GUARD BIT(VM_MAYBE_GUARD_BIT) /* The VMA maybe contains guard regions. */
#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */

#define VM_LOCKED 0x00002000
@@ -498,12 +501,72 @@ extern unsigned int kobjsize(const void *objp);
/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)

+/* These flags can be updated atomically via VMA/mmap read lock. */
+#define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD
+
/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR VM_NONE
#endif
#define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)

+/*
+ * Flags which should be 'sticky' on merge - that is, flags which, when one VMA
+ * possesses it but the other does not, the merged VMA should nonetheless have
+ * applied to it:
+ *
+ * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its
+ * references cleared via /proc/$pid/clear_refs, any merged VMA
+ * should be considered soft-dirty also as it operates at a VMA
+ * granularity.
+ *
+ * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that
+ * mapped page tables may contain metadata not described by the
+ * VMA and thus any merged VMA may also contain this metadata,
+ * and thus we must make this flag sticky.
+ */
+#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD)
+
+/*
+ * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
+ * of these flags and the other not does not preclude a merge.
+ *
+ * VM_STICKY - When merging VMAs, VMA flags must match, unless they are
+ * 'sticky'. If any sticky flags exist in either VMA, we simply
+ * set all of them on the merged VMA.
+ */
+#define VM_IGNORE_MERGE VM_STICKY
+
+/*
+ * Flags which should result in page tables being copied on fork. These are
+ * flags which indicate that the VMA maps page tables which cannot be
+ * reconsistuted upon page fault, so necessitate page table copying upon fork.
+ *
+ * Note that these flags should be compared with the DESTINATION VMA not the
+ * source, as VM_UFFD_WP may not be propagated to destination, while all other
+ * flags will be.
+ *
+ * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be
+ * reasonably reconstructed on page fault.
+ *
+ * VM_UFFD_WP - Encodes metadata about an installed uffd
+ * write protect handler, which cannot be
+ * reconstructed on page fault.
+ *
+ * We always copy pgtables when dst_vma has uffd-wp
+ * enabled even if it's file-backed
+ * (e.g. shmem). Because when uffd-wp is enabled,
+ * pgtable contains uffd-wp protection information,
+ * that's something we can't retrieve from page cache,
+ * and skip copying will lose those info.
+ *
+ * VM_MAYBE_GUARD - Could contain page guard region markers which
+ * by design are a property of the page tables
+ * only and thus cannot be reconstructed on page
+ * fault.
+ */
+#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
+
/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
@@ -840,6 +903,47 @@ static inline void vm_flags_mod(struct vm_area_struct *vma,
__vm_flags_mod(vma, set, clear);
}

+static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma,
+ int bit)
+{
+ const vm_flags_t mask = BIT(bit);
+
+ /* Only specific flags are permitted */
+ if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED)))
+ return false;
+
+ return true;
+}
+
+/*
+ * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific
+ * valid flags are allowed to do this.
+ */
+static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit)
+{
+ /* mmap read lock/VMA read lock must be held. */
+ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
+ vma_assert_locked(vma);
+
+ if (__vma_flag_atomic_valid(vma, bit))
+ set_bit(bit, &ACCESS_PRIVATE(vma, __vm_flags));
+}
+
+/*
+ * Test for VMA flag atomically. Requires no locks. Only specific valid flags
+ * are allowed to do this.
+ *
+ * This is necessarily racey, so callers must ensure that serialisation is
+ * achieved through some other means, or that races are permissible.
+ */
+static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, int bit)
+{
+ if (__vma_flag_atomic_valid(vma, bit))
+ return test_bit(bit, &vma->vm_flags);
+
+ return false;
+}
+
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
vma->vm_ops = NULL;
diff --git a/include/net/rose.h b/include/net/rose.h
index 2b5491bbf39a..95d1f9c582dc 100644
--- a/include/net/rose.h
+++ b/include/net/rose.h
@@ -160,6 +160,18 @@ static inline void rose_neigh_hold(struct rose_neigh *rose_neigh)
static inline void rose_neigh_put(struct rose_neigh *rose_neigh)
{
if (refcount_dec_and_test(&rose_neigh->use)) {
+ /* We are dropping the last reference, so we are about to free the
+ * neighbour. Its timers may still be armed -- t0timer in particular
+ * re-arms itself in rose_t0timer_expiry(). rose_remove_neigh()
+ * cancels them before its own put, but callers that drop the final
+ * reference without first calling rose_remove_neigh() (the socket
+ * heartbeat reaping path) would otherwise kfree() a neighbour with a
+ * live timer -> use-after-free. timer_delete_sync() (not the async
+ * variant) is required: it waits out a concurrently running handler
+ * and loops until the self-rearming timer stays stopped.
+ */
+ timer_delete_sync(&rose_neigh->ftimer);
+ timer_delete_sync(&rose_neigh->t0timer);
if (rose_neigh->ax25)
ax25_cb_put(rose_neigh->ax25);
kfree(rose_neigh->digipeat);
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index aa441f593e9a..a6e5a44c9b42 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -213,6 +213,7 @@ IF_HAVE_PG_ARCH_3(arch_3)
{VM_UFFD_MISSING, "uffd_missing" }, \
IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \
{VM_PFNMAP, "pfnmap" }, \
+ {VM_MAYBE_GUARD, "maybe_guard" }, \
{VM_UFFD_WP, "uffd_wp" }, \
{VM_LOCKED, "locked" }, \
{VM_IO, "io" }, \
diff --git a/io_uring/net.c b/io_uring/net.c
index a46c7e817040..3ab2bfca1bd5 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1771,7 +1771,7 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags)
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
- struct io_async_msghdr *io;
+ struct sockaddr_storage *addr;

if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
@@ -1780,17 +1780,17 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
conn->addr_len = READ_ONCE(sqe->addr2);
conn->in_progress = conn->seen_econnaborted = false;

- io = io_msg_alloc_async(req);
- if (unlikely(!io))
+ addr = io_uring_alloc_async_data(NULL, req);
+ if (unlikely(!addr))
return -ENOMEM;

- return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
+ return move_addr_to_kernel(conn->addr, conn->addr_len, addr);
}

int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
- struct io_async_msghdr *io = req->async_data;
+ struct sockaddr_storage *addr = req->async_data;
unsigned file_flags;
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
@@ -1804,8 +1804,7 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)

file_flags = force_nonblock ? O_NONBLOCK : 0;

- ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
- file_flags);
+ ret = __sys_connect_file(req->file, addr, connect->addr_len, file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
&& force_nonblock) {
if (ret == -EINPROGRESS) {
@@ -1834,7 +1833,6 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
out:
if (ret < 0)
req_set_fail(req);
- io_req_msg_cleanup(req, issue_flags);
io_req_set_res(req, ret, 0);
return IOU_COMPLETE;
}
@@ -1844,15 +1842,15 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
* which in turn end up in mnt_want_write() which will grab the fs
* percpu start write sem. This can trigger a lockdep warning.
*/
-static int io_bind_file_create(const struct io_async_msghdr *io, int addr_len)
+static int io_bind_file_create(const struct sockaddr_storage *addr, int addr_len)
{
const struct sockaddr_un *sun;

- if (io->addr.ss_family != AF_UNIX)
+ if (addr->ss_family != AF_UNIX)
return 0;
if (addr_len <= offsetof(struct sockaddr_un, sun_path))
return 0;
- sun = (const struct sockaddr_un *) &io->addr;
+ sun = (const struct sockaddr_un *) addr;
return sun->sun_path[0] != '\0';
}

@@ -1860,7 +1858,7 @@ int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
struct sockaddr __user *uaddr;
- struct io_async_msghdr *io;
+ struct sockaddr_storage *addr;
int ret;

if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
@@ -1869,21 +1867,23 @@ int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
bind->addr_len = READ_ONCE(sqe->addr2);

- io = io_msg_alloc_async(req);
- if (unlikely(!io))
+ addr = io_uring_alloc_async_data(NULL, req);
+ if (unlikely(!addr))
return -ENOMEM;
- ret = move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
+
+ ret = move_addr_to_kernel(uaddr, bind->addr_len, addr);
if (unlikely(ret))
return ret;
- if (io_bind_file_create(io, bind->addr_len))
+ if (io_bind_file_create(addr, bind->addr_len))
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}

+
int io_bind(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
- struct io_async_msghdr *io = req->async_data;
+ struct sockaddr_storage *addr = req->async_data;
struct socket *sock;
int ret;

@@ -1891,7 +1891,7 @@ int io_bind(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;

- ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
+ ret = __sys_bind_socket(sock, addr, bind->addr_len);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 932319633eac..a57c820567f7 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -207,7 +207,7 @@ const struct io_issue_def io_issue_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
#if defined(CONFIG_NET)
- .async_size = sizeof(struct io_async_msghdr),
+ .async_size = sizeof(struct sockaddr_storage),
.prep = io_connect_prep,
.issue = io_connect,
#else
@@ -504,7 +504,7 @@ const struct io_issue_def io_issue_defs[] = {
.needs_file = 1,
.prep = io_bind_prep,
.issue = io_bind,
- .async_size = sizeof(struct io_async_msghdr),
+ .async_size = sizeof(struct sockaddr_storage),
#else
.prep = io_eopnotsupp_prep,
#endif
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index e4b7f77ece3b..17f116247bb4 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -711,6 +711,50 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket
return NULL;
}

+static inline bool debug_objects_is_pi_blocked_on(void)
+{
+#ifdef CONFIG_RT_MUTEXES
+ return current->pi_blocked_on != NULL;
+#else
+ return false;
+#endif
+}
+
+static inline bool can_fill_pool(void)
+{
+ /*
+ * On !RT enabled kernels there are no restrictions and spinlock_t and
+ * raw_spinlock_t are the same types.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ return true;
+
+ /*
+ * On RT enabled kernels, the task must not be blocked on a lock as
+ * that could corrupt the PI state when blocking on a lock in the
+ * allocation path.
+ */
+ if (debug_objects_is_pi_blocked_on())
+ return false;
+
+ /*
+ * On RT enabled kernels the pool refill should happen in preemptible
+ * context.
+ */
+ if (preemptible())
+ return true;
+
+ /*
+ * Though during system boot before scheduling is set up, preemption is
+ * disabled and the pool can get exhausted. Before scheduling is active
+ * a task cannot be blocked on a sleeping lock, but it might hold a lock
+ * and if interrupted then hard interrupt context might run into a lock
+ * inversion. So exclude hard interrupt context from allocations before
+ * scheduling is active.
+ */
+ return system_state < SYSTEM_SCHEDULING && !in_hardirq();
+}
+
static void debug_objects_fill_pool(void)
{
if (!static_branch_likely(&obj_cache_enabled))
@@ -725,19 +769,13 @@ static void debug_objects_fill_pool(void)
if (likely(!pool_should_refill(&pool_global)))
return;

- /*
- * On RT enabled kernels the pool refill must happen in preemptible
- * context -- for !RT kernels we rely on the fact that spinlock_t and
- * raw_spinlock_t are basically the same type and this lock-type
- * inversion works just fine.
- */
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+ if (can_fill_pool()) {
/*
* Annotate away the spinlock_t inside raw_spinlock_t warning
- * by temporarily raising the wait-type to WAIT_SLEEP, matching
- * the preemptible() condition above.
+ * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching
+ * the preemptible() condition in can_fill_pool().
*/
- static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP);
+ static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_CONFIG);
lock_map_acquire_try(&fill_pool_map);
fill_pool();
lock_map_release(&fill_pool_map);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index abe54f0043c7..3dcd884c844e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1715,6 +1715,43 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
return result;
}

+/* Can we retract page tables for this file-backed VMA? */
+static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
+{
+ /*
+ * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+ * got written to. These VMAs are likely not worth removing
+ * page tables from, as PMD-mapping is likely to be split later.
+ */
+ if (READ_ONCE(vma->anon_vma))
+ return false;
+
+ /*
+ * When a vma is registered with uffd-wp, we cannot recycle
+ * the page table because there may be pte markers installed.
+ * Other vmas can still have the same file mapped hugely, but
+ * skip this one: it will always be mapped in small page size
+ * for uffd-wp registered ranges.
+ */
+ if (userfaultfd_wp(vma))
+ return false;
+
+ /*
+ * If the VMA contains guard regions then we can't collapse it.
+ *
+ * This is set atomically on guard marker installation under mmap/VMA
+ * read lock, and here we may not hold any VMA or mmap lock at all.
+ *
+ * This is therefore serialised on the PTE page table lock, which is
+ * obtained on guard region installation after the flag is set, so this
+ * check being performed under this lock excludes races.
+ */
+ if (vma_flag_test_atomic(vma, VM_MAYBE_GUARD_BIT))
+ return false;
+
+ return true;
+}
+
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
@@ -1729,14 +1766,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spinlock_t *ptl;
bool success = false;

- /*
- * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
- * got written to. These VMAs are likely not worth removing
- * page tables from, as PMD-mapping is likely to be split later.
- */
- if (READ_ONCE(vma->anon_vma))
- continue;
-
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (addr & ~HPAGE_PMD_MASK ||
vma->vm_end < addr + HPAGE_PMD_SIZE)
@@ -1748,14 +1777,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)

if (hpage_collapse_test_exit(mm))
continue;
- /*
- * When a vma is registered with uffd-wp, we cannot recycle
- * the page table because there may be pte markers installed.
- * Other vmas can still have the same file mapped hugely, but
- * skip this one: it will always be mapped in small page size
- * for uffd-wp registered ranges.
- */
- if (userfaultfd_wp(vma))
+
+ if (!file_backed_vma_is_retractable(vma))
continue;

/* PTEs were notified when unmapped; but now for the PMD? */
@@ -1782,15 +1805,15 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

/*
- * Huge page lock is still held, so normally the page table
- * must remain empty; and we have already skipped anon_vma
- * and userfaultfd_wp() vmas. But since the mmap_lock is not
- * held, it is still possible for a racing userfaultfd_ioctl()
- * to have inserted ptes or markers. Now that we hold ptlock,
- * repeating the anon_vma check protects from one category,
- * and repeating the userfaultfd_wp() check from another.
+ * Huge page lock is still held, so normally the page table must
+ * remain empty; and we have already skipped anon_vma and
+ * userfaultfd_wp() vmas. But since the mmap_lock is not held,
+ * it is still possible for a racing userfaultfd_ioctl() or
+ * madvise() to have inserted ptes or markers. Now that we hold
+ * ptlock, repeating the retractable checks protects us from
+ * races against the prior checks.
*/
- if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
+ if (likely(file_backed_vma_is_retractable(vma))) {
pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
pmdp_get_lockless_sync();
success = true;
diff --git a/mm/madvise.c b/mm/madvise.c
index fb1c86e630b6..5dbe40be7c65 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -167,7 +167,7 @@ static int madvise_update_vma(vm_flags_t new_flags,
range->start, range->end, anon_name);
else
vma = vma_modify_flags(&vmi, madv_behavior->prev, vma,
- range->start, range->end, new_flags);
+ range->start, range->end, &new_flags);

if (IS_ERR(vma))
return PTR_ERR(vma);
@@ -1141,15 +1141,21 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
return -EINVAL;

/*
- * If we install guard markers, then the range is no longer
- * empty from a page table perspective and therefore it's
- * appropriate to have an anon_vma.
- *
- * This ensures that on fork, we copy page tables correctly.
+ * Set atomically under read lock. All pertinent readers will need to
+ * acquire an mmap/VMA write lock to read it. All remaining readers may
+ * or may not see the flag set, but we don't care.
+ */
+ vma_flag_set_atomic(vma, VM_MAYBE_GUARD_BIT);
+
+ /*
+ * If anonymous and we are establishing page tables the VMA ought to
+ * have an anon_vma associated with it.
*/
- err = anon_vma_prepare(vma);
- if (err)
- return err;
+ if (vma_is_anonymous(vma)) {
+ err = anon_vma_prepare(vma);
+ if (err)
+ return err;
+ }

/*
* Optimistically try to install the guard marker pages first. If any
diff --git a/mm/memory.c b/mm/memory.c
index 42b733b78523..9b071e269d1e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1480,17 +1480,15 @@ static bool
vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
/*
- * Always copy pgtables when dst_vma has uffd-wp enabled even if it's
- * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable
- * contains uffd-wp protection information, that's something we can't
- * retrieve from page cache, and skip copying will lose those info.
+ * We check against dst_vma as while sane VMA flags will have been
+ * copied, VM_UFFD_WP may be set only on dst_vma.
*/
- if (userfaultfd_wp(dst_vma))
+ if (dst_vma->vm_flags & VM_COPY_ON_FORK)
return true;
-
- if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
- return true;
-
+ /*
+ * The presence of an anon_vma indicates an anonymous VMA has page
+ * tables which naturally cannot be reconstituted on page fault.
+ */
if (src_vma->anon_vma)
return true;

diff --git a/mm/mlock.c b/mm/mlock.c
index f59c6d8d376f..73551c71cebf 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -480,7 +480,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
*/
goto out;

- vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
+ vma = vma_modify_flags(vmi, *prev, vma, start, end, &newflags);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 988c366137d5..fa818cd58201 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -813,7 +813,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
newflags &= ~VM_ACCOUNT;
}

- vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags);
+ vma = vma_modify_flags(vmi, *pprev, vma, start, end, &newflags);
if (IS_ERR(vma)) {
error = PTR_ERR(vma);
goto fail;
diff --git a/mm/mseal.c b/mm/mseal.c
index c561f0ea93e8..3d2f06046e90 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -69,9 +69,10 @@ static int mseal_apply(struct mm_struct *mm,
const unsigned long curr_end = MIN(vma->vm_end, end);

if (!(vma->vm_flags & VM_SEALED)) {
- vma = vma_modify_flags(&vmi, prev, vma,
- curr_start, curr_end,
- vma->vm_flags | VM_SEALED);
+ vm_flags_t vm_flags = vma->vm_flags | VM_SEALED;
+
+ vma = vma_modify_flags(&vmi, prev, vma, curr_start,
+ curr_end, &vm_flags);
if (IS_ERR(vma))
return PTR_ERR(vma);
vm_flags_set(vma, VM_SEALED);
diff --git a/mm/vma.c b/mm/vma.c
index eeb6a187c3d8..30148e6af775 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -82,15 +82,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex

if (!mpol_equal(vmg->policy, vma_policy(vma)))
return false;
- /*
- * VM_SOFTDIRTY should not prevent from VMA merging, if we
- * match the flags but dirty bit -- the caller should mark
- * merged VMA as dirty. If dirty bit won't be excluded from
- * comparison, we increase pressure on the memory system forcing
- * the kernel to generate new VMAs when old one could be
- * extended instead.
- */
- if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY)
+ if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE)
return false;
if (vma->vm_file != vmg->file)
return false;
@@ -810,6 +802,7 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma)
static __must_check struct vm_area_struct *vma_merge_existing_range(
struct vma_merge_struct *vmg)
{
+ vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY;
struct vm_area_struct *middle = vmg->middle;
struct vm_area_struct *prev = vmg->prev;
struct vm_area_struct *next;
@@ -904,11 +897,13 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
if (merge_right) {
vma_start_write(next);
vmg->target = next;
+ sticky_flags |= (next->vm_flags & VM_STICKY);
}

if (merge_left) {
vma_start_write(prev);
vmg->target = prev;
+ sticky_flags |= (prev->vm_flags & VM_STICKY);
}

if (merge_both) {
@@ -978,6 +973,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
if (err || commit_merge(vmg))
goto abort;

+ vm_flags_set(vmg->target, sticky_flags);
khugepaged_enter_vma(vmg->target, vmg->vm_flags);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
@@ -1156,14 +1152,20 @@ int vma_expand(struct vma_merge_struct *vmg)
struct vm_area_struct *target = vmg->target;
struct vm_area_struct *next = vmg->next;
int ret = 0;
+ vm_flags_t sticky_flags;
+
+ sticky_flags = vmg->vm_flags & VM_STICKY;
+ sticky_flags |= target->vm_flags & VM_STICKY;

VM_WARN_ON_VMG(!target, vmg);

mmap_assert_write_locked(vmg->mm);
vma_start_write(target);

- if (next && target != next && vmg->end == next->vm_end)
+ if (next && target != next && vmg->end == next->vm_end) {
+ sticky_flags |= next->vm_flags & VM_STICKY;
remove_next = true;
+ }

/* We must have a target. */
VM_WARN_ON_VMG(!target, vmg);
@@ -1197,6 +1199,7 @@ int vma_expand(struct vma_merge_struct *vmg)
if (commit_merge(vmg))
goto nomem;

+ vm_flags_set(target, sticky_flags);
return 0;

nomem:
@@ -1676,25 +1679,35 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
return vma;
}

-struct vm_area_struct *vma_modify_flags(
- struct vma_iterator *vmi, struct vm_area_struct *prev,
- struct vm_area_struct *vma, unsigned long start, unsigned long end,
- vm_flags_t vm_flags)
+struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ vm_flags_t *vm_flags_ptr)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
+ const vm_flags_t vm_flags = *vm_flags_ptr;
+ struct vm_area_struct *ret;

vmg.vm_flags = vm_flags;

- return vma_modify(&vmg);
+ ret = vma_modify(&vmg);
+ if (IS_ERR(ret))
+ return ret;
+
+ /*
+ * For a merge to succeed, the flags must match those
+ * requested. However, sticky flags may have been retained, so propagate
+ * them to the caller.
+ */
+ if (vmg.state == VMA_MERGE_SUCCESS)
+ *vm_flags_ptr = ret->vm_flags;
+ return ret;
}

-struct vm_area_struct
-*vma_modify_name(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- struct anon_vma_name *new_name)
+struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct anon_vma_name *new_name)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);

@@ -1703,12 +1716,10 @@ struct vm_area_struct
return vma_modify(&vmg);
}

-struct vm_area_struct
-*vma_modify_policy(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct mempolicy *new_pol)
+struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct mempolicy *new_pol)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);

@@ -1717,14 +1728,10 @@ struct vm_area_struct
return vma_modify(&vmg);
}

-struct vm_area_struct
-*vma_modify_flags_uffd(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- vm_flags_t vm_flags,
- struct vm_userfaultfd_ctx new_ctx,
- bool give_up_on_oom)
+struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, vm_flags_t vm_flags,
+ struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);

@@ -1955,7 +1962,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
return a->vm_end == b->vm_start &&
mpol_equal(vma_policy(a), vma_policy(b)) &&
a->vm_file == b->vm_file &&
- !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) &&
b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}

diff --git a/mm/vma.h b/mm/vma.h
index d73e1b324bfd..4c7ea8b893ab 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -266,47 +266,113 @@ void remove_vma(struct vm_area_struct *vma);
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);

-/* We are about to modify the VMA's flags. */
-__must_check struct vm_area_struct
-*vma_modify_flags(struct vma_iterator *vmi,
+/**
+ * vma_modify_flags() - Peform any necessary split/merge in preparation for
+ * setting VMA flags to *@vm_flags in the range @start to @end contained within
+ * @vma.
+ * @vmi: Valid VMA iterator positioned at @vma.
+ * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
+ * @vma: The VMA containing the range @start to @end to be updated.
+ * @start: The start of the range to update. May be offset within @vma.
+ * @end: The exclusive end of the range to update, may be offset within @vma.
+ * @vm_flags_ptr: A pointer to the VMA flags that the @start to @end range is
+ * about to be set to. On merge, this will be updated to include sticky flags.
+ *
+ * IMPORTANT: The actual modification being requested here is NOT applied,
+ * rather the VMA is perhaps split, perhaps merged to accommodate the change,
+ * and the caller is expected to perform the actual modification.
+ *
+ * In order to account for sticky VMA flags, the @vm_flags_ptr parameter points
+ * to the requested flags which are then updated so the caller, should they
+ * overwrite any existing flags, correctly retains these.
+ *
+ * Returns: A VMA which contains the range @start to @end ready to have its
+ * flags altered to *@vm_flags.
+ */
+__must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ vm_flags_t *vm_flags_ptr);
+
+/**
+ * vma_modify_name() - Peform any necessary split/merge in preparation for
+ * setting anonymous VMA name to @new_name in the range @start to @end contained
+ * within @vma.
+ * @vmi: Valid VMA iterator positioned at @vma.
+ * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
+ * @vma: The VMA containing the range @start to @end to be updated.
+ * @start: The start of the range to update. May be offset within @vma.
+ * @end: The exclusive end of the range to update, may be offset within @vma.
+ * @new_name: The anonymous VMA name that the @start to @end range is about to
+ * be set to.
+ *
+ * IMPORTANT: The actual modification being requested here is NOT applied,
+ * rather the VMA is perhaps split, perhaps merged to accommodate the change,
+ * and the caller is expected to perform the actual modification.
+ *
+ * Returns: A VMA which contains the range @start to @end ready to have its
+ * anonymous VMA name changed to @new_name.
+ */
+__must_check struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi,
struct vm_area_struct *prev, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- vm_flags_t vm_flags);
-
-/* We are about to modify the VMA's anon_name. */
-__must_check struct vm_area_struct
-*vma_modify_name(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- struct anon_vma_name *new_name);
-
-/* We are about to modify the VMA's memory policy. */
-__must_check struct vm_area_struct
-*vma_modify_policy(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
+ struct anon_vma_name *new_name);
+
+/**
+ * vma_modify_policy() - Peform any necessary split/merge in preparation for
+ * setting NUMA policy to @new_pol in the range @start to @end contained
+ * within @vma.
+ * @vmi: Valid VMA iterator positioned at @vma.
+ * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
+ * @vma: The VMA containing the range @start to @end to be updated.
+ * @start: The start of the range to update. May be offset within @vma.
+ * @end: The exclusive end of the range to update, may be offset within @vma.
+ * @new_pol: The NUMA policy that the @start to @end range is about to be set
+ * to.
+ *
+ * IMPORTANT: The actual modification being requested here is NOT applied,
+ * rather the VMA is perhaps split, perhaps merged to accommodate the change,
+ * and the caller is expected to perform the actual modification.
+ *
+ * Returns: A VMA which contains the range @start to @end ready to have its
+ * NUMA policy changed to @new_pol.
+ */
+__must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct mempolicy *new_pol);

-/* We are about to modify the VMA's flags and/or uffd context. */
-__must_check struct vm_area_struct
-*vma_modify_flags_uffd(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- vm_flags_t vm_flags,
- struct vm_userfaultfd_ctx new_ctx,
- bool give_up_on_oom);
-
-__must_check struct vm_area_struct
-*vma_merge_new_range(struct vma_merge_struct *vmg);
-
-__must_check struct vm_area_struct
-*vma_merge_extend(struct vma_iterator *vmi,
- struct vm_area_struct *vma,
- unsigned long delta);
+/**
+ * vma_modify_flags_uffd() - Peform any necessary split/merge in preparation for
+ * setting VMA flags to @vm_flags and UFFD context to @new_ctx in the range
+ * @start to @end contained within @vma.
+ * @vmi: Valid VMA iterator positioned at @vma.
+ * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
+ * @vma: The VMA containing the range @start to @end to be updated.
+ * @start: The start of the range to update. May be offset within @vma.
+ * @end: The exclusive end of the range to update, may be offset within @vma.
+ * @vm_flags: The VMA flags that the @start to @end range is about to be set to.
+ * @new_ctx: The userfaultfd context that the @start to @end range is about to
+ * be set to.
+ * @give_up_on_oom: If an out of memory condition occurs on merge, simply give
+ * up on it and treat the merge as best-effort.
+ *
+ * IMPORTANT: The actual modification being requested here is NOT applied,
+ * rather the VMA is perhaps split, perhaps merged to accommodate the change,
+ * and the caller is expected to perform the actual modification.
+ *
+ * Returns: A VMA which contains the range @start to @end ready to have its VMA
+ * flags changed to @vm_flags and its userfaultfd context changed to @new_ctx.
+ */
+__must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, vm_flags_t vm_flags,
+ struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom);
+
+__must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg);
+
+__must_check struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
+ struct vm_area_struct *vma, unsigned long delta);

void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb);

diff --git a/net/core/dev.c b/net/core/dev.c
index 681d7de89c50..5853aada258c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1724,6 +1724,7 @@ int netif_open(struct net_device *dev, struct netlink_ext_ack *extack)

return ret;
}
+EXPORT_SYMBOL(netif_open);

static void __dev_close_many(struct list_head *head)
{
diff --git a/net/core/failover.c b/net/core/failover.c
index 2a140b3ea669..7fc3b9d8ab2a 100644
--- a/net/core/failover.c
+++ b/net/core/failover.c
@@ -12,6 +12,7 @@
#include <uapi/linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/if_vlan.h>
+#include <net/netdev_lock.h>
#include <net/failover.h>

static LIST_HEAD(failover_list);
@@ -221,8 +222,11 @@ failover_existing_slave_register(struct net_device *failover_dev)
for_each_netdev(net, dev) {
if (netif_is_failover(dev))
continue;
- if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
+ if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) {
+ netdev_lock_ops(dev);
failover_slave_register(dev);
+ netdev_unlock_ops(dev);
+ }
}
rtnl_unlock();
}
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 53c9bc71f813..38bfe60665c0 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -211,8 +211,21 @@ static void rose_kill_by_device(struct net_device *dev)
spin_lock_bh(&rose_list_lock);
if (rose->device == dev) {
rose_disconnect(sk, ENETUNREACH, ROSE_OUT_OF_ORDER, 0);
- if (rose->neighbour)
+ /* Mark for destruction so rose_heartbeat_expiry()
+ * cleans up the socket at its next tick rather than
+ * looping forever in ROSE_STATE_0 with no owner.
+ */
+ sock_set_flag(sk, SOCK_DESTROY);
+ if (rose->neighbour) {
rose_neigh_put(rose->neighbour);
+ /* Clear the pointer after dropping the reference, as
+ * every other rose_neigh_put() site does. Otherwise
+ * rose_heartbeat_expiry() (STATE_0 reaping) sees a stale
+ * rose->neighbour and puts it a second time -> rose_neigh
+ * refcount underflow / use-after-free.
+ */
+ rose->neighbour = NULL;
+ }
netdev_put(rose->device, &rose->dev_tracker);
rose->device = NULL;
}
@@ -358,6 +371,7 @@ static void rose_destroy_timer(struct timer_list *t)
*/
void rose_destroy_socket(struct sock *sk)
{
+ struct rose_sock *rose = rose_sk(sk);
struct sk_buff *skb;

rose_remove_socket(sk);
@@ -365,6 +379,14 @@ void rose_destroy_socket(struct sock *sk)
rose_stop_idletimer(sk);
rose_stop_timer(sk);

+ /* Drop any device reference not already released by rose_kill_by_device()
+ * or rose_release() -- e.g. incoming sockets that were never accepted.
+ */
+ if (rose->device) {
+ netdev_put(rose->device, &rose->dev_tracker);
+ rose->device = NULL;
+ }
+
rose_clear_queues(sk); /* Flush the queues */

while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
@@ -626,9 +648,7 @@ static struct sock *rose_make_new(struct sock *osk)
rose->hb = orose->hb;
rose->idle = orose->idle;
rose->defer = orose->defer;
- rose->device = orose->device;
- if (rose->device)
- netdev_hold(rose->device, &rose->dev_tracker, GFP_ATOMIC);
+ rose->device = NULL; /* rose_rx_call_request() sets this */
rose->qbitincl = orose->qbitincl;

return sk;
@@ -1078,9 +1098,11 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros
make_rose->source_digis[n] = facilities.source_digis[n];
make_rose->neighbour = neigh;
make_rose->device = dev;
- /* Caller got a reference for us. */
- netdev_tracker_alloc(make_rose->device, &make_rose->dev_tracker,
- GFP_ATOMIC);
+ /* Take an independent reference for this socket; callers keep their
+ * own reference (from rose_dev_get / dev_hold) and will release it
+ * themselves via dev_put().
+ */
+ netdev_hold(make_rose->device, &make_rose->dev_tracker, GFP_ATOMIC);
make_rose->facilities = facilities;

rose_neigh_hold(make_rose->neighbour);
@@ -1667,19 +1689,28 @@ static void __exit rose_exit(void)
#ifdef CONFIG_SYSCTL
rose_unregister_sysctl();
#endif
- unregister_netdevice_notifier(&rose_dev_notifier);
-
sock_unregister(PF_ROSE);

for (i = 0; i < rose_ndevs; i++) {
struct net_device *dev = dev_rose[i];

if (dev) {
+ /* unregister_netdev() fires NETDEV_DOWN, which -- while the
+ * notifier is still registered below -- invokes
+ * rose_kill_by_device(dev). That releases every socket's
+ * netdev reference and disconnects all active circuits.
+ * Unregistering the notifier before this loop was the
+ * original bug: NETDEV_DOWN was never delivered, leaving
+ * 165 netdev_tracker entries leaked and stale timers live.
+ */
unregister_netdev(dev);
free_netdev(dev);
}
}

+ /* Now safe to remove the notifier -- all ROSE devices are gone. */
+ unregister_netdevice_notifier(&rose_dev_notifier);
+
kfree(dev_rose);
proto_unregister(&rose_proto);
}
diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c
index 0276b393f0e5..622527f1354f 100644
--- a/net/rose/rose_in.c
+++ b/net/rose/rose_in.c
@@ -57,6 +57,7 @@ static int rose_state1_machine(struct sock *sk, struct sk_buff *skb, int framety
rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
rose_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]);
rose_neigh_put(rose->neighbour);
+ rose->neighbour = NULL;
break;

default:
@@ -80,11 +81,13 @@ static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int framety
rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
rose_neigh_put(rose->neighbour);
+ rose->neighbour = NULL;
break;

case ROSE_CLEAR_CONFIRMATION:
rose_disconnect(sk, 0, -1, -1);
rose_neigh_put(rose->neighbour);
+ rose->neighbour = NULL;
break;

default:
@@ -122,6 +125,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety
rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
rose_neigh_put(rose->neighbour);
+ rose->neighbour = NULL;
break;

case ROSE_RR:
@@ -235,6 +239,7 @@ static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int framety
rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
rose_neigh_put(rose->neighbour);
+ rose->neighbour = NULL;
break;

default:
@@ -255,6 +260,7 @@ static int rose_state5_machine(struct sock *sk, struct sk_buff *skb, int framety
rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
rose_neigh_put(rose_sk(sk)->neighbour);
+ rose_sk(sk)->neighbour = NULL;
}

return 0;
diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c
index b538e39b3df5..fd4e4795776a 100644
--- a/net/rose/rose_loopback.c
+++ b/net/rose/rose_loopback.c
@@ -12,13 +12,15 @@
#include <net/rose.h>
#include <linux/init.h>

-static struct sk_buff_head loopback_queue;
#define ROSE_LOOPBACK_LIMIT 1000
-static struct timer_list loopback_timer;

+static struct timer_list loopback_timer;
+static struct sk_buff_head loopback_queue;
static void rose_set_loopback_timer(void);
static void rose_loopback_timer(struct timer_list *unused);

+static atomic_t loopback_stopping = ATOMIC_INIT(0);
+
void rose_loopback_init(void)
{
skb_queue_head_init(&loopback_queue);
@@ -66,10 +68,25 @@ static void rose_loopback_timer(struct timer_list *unused)
unsigned int lci_i, lci_o;
int count;

+ if (atomic_read(&loopback_stopping))
+ return;
+
+ if (rose_loopback_neigh)
+ rose_neigh_hold(rose_loopback_neigh);
+ else
+ return;
+
for (count = 0; count < ROSE_LOOPBACK_LIMIT; count++) {
skb = skb_dequeue(&loopback_queue);
if (!skb)
- return;
+ goto out;
+
+ if (atomic_read(&loopback_stopping)) {
+ kfree_skb(skb);
+ skb_queue_purge(&loopback_queue);
+ goto out;
+ }
+
if (skb->len < ROSE_MIN_LEN) {
kfree_skb(skb);
continue;
@@ -96,27 +113,34 @@ static void rose_loopback_timer(struct timer_list *unused)
}

if (frametype == ROSE_CALL_REQUEST) {
- if (!rose_loopback_neigh->dev &&
- !rose_loopback_neigh->loopback) {
- kfree_skb(skb);
- continue;
- }
-
dev = rose_dev_get(dest);
if (!dev) {
kfree_skb(skb);
continue;
}
-
- if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) {
+ /* rose_kill_by_device() runs on NETDEV_DOWN (IFF_UP cleared)
+ * before the device is unregistered. If we create a new
+ * socket here after that cleanup, the ref never gets released
+ * because NETDEV_DOWN fires only once. Drop the call instead.
+ */
+ if (!netif_running(dev)) {
dev_put(dev);
kfree_skb(skb);
+ continue;
}
+
+ if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0)
+ kfree_skb(skb);
+ dev_put(dev);
} else {
kfree_skb(skb);
}
}
- if (!skb_queue_empty(&loopback_queue))
+
+out:
+ rose_neigh_put(rose_loopback_neigh);
+
+ if (!atomic_read(&loopback_stopping) && !skb_queue_empty(&loopback_queue))
mod_timer(&loopback_timer, jiffies + 1);
}

@@ -124,10 +148,15 @@ void __exit rose_loopback_clear(void)
{
struct sk_buff *skb;

- timer_delete(&loopback_timer);
+ atomic_set(&loopback_stopping, 1);
+ /* Pairs with atomic_read() in rose_loopback_timer(): ensure the
+ * stopping flag is visible before we cancel, so a concurrent
+ * callback aborts its loop early rather than re-arming the timer.
+ */
+ smp_mb();

- while ((skb = skb_dequeue(&loopback_queue)) != NULL) {
- skb->sk = NULL;
+ timer_delete_sync(&loopback_timer);
+
+ while ((skb = skb_dequeue(&loopback_queue)) != NULL)
kfree_skb(skb);
- }
}
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
index bb60a1654d61..d967cbc1db3d 100644
--- a/net/rose/rose_timer.c
+++ b/net/rose/rose_timer.c
@@ -126,12 +126,68 @@ static void rose_heartbeat_expiry(struct timer_list *t)
sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ/20);
goto out;
}
+
+ /* The bound device went down while we still hold a reference on it.
+ * This catches the narrow race where rose_loopback_timer() created a
+ * socket in the window after rose_kill_by_device()'s NETDEV_DOWN sweep
+ * but before rose_insert_socket() -- leaving a STATE_3 socket that no
+ * other branch reaps. A down device means the link is dead, so tear
+ * the socket down regardless of state. rose_destroy_socket() releases
+ * the held netdev reference (rose->device still set).
+ */
+ if (rose->device && !netif_running(rose->device)) {
+ if (rose->neighbour) {
+ rose_neigh_put(rose->neighbour);
+ rose->neighbour = NULL;
+ }
+ rose_disconnect(sk, ENETDOWN, -1, -1);
+
+ /* Only reap the socket if userspace no longer holds it. A socket
+ * still attached to a struct socket (sk->sk_socket != NULL -- e.g.
+ * a connection an fpad client has accepted and kept open) is owned
+ * by that fd: rose_release() will destroy it on close(). Dropping
+ * the last reference here leaves the open fd dangling, so the
+ * eventual close() touches freed memory -> slab-use-after-free in
+ * rose_release(). Unaccepted incoming sockets and post-close
+ * orphans have sk->sk_socket == NULL and stay safe to reap here.
+ */
+ if (!sk->sk_socket) {
+ sock_set_flag(sk, SOCK_DESTROY);
+ bh_unlock_sock(sk);
+ rose_destroy_socket(sk);
+ sock_put(sk);
+ return;
+ }
+
+ /* Owned by userspace: the link is down and the socket is now
+ * disconnected (rose_disconnect() moved it to STATE_0). Fall
+ * through to the switch, which re-arms the heartbeat; the close()
+ * will tear the socket down. */
+ }
+
switch (rose->state) {
case ROSE_STATE_0:
- /* Magic here: If we listen() and a new link dies before it
- is accepted() it isn't 'dead' so doesn't get removed. */
- if (sock_flag(sk, SOCK_DESTROY) ||
- (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) {
+ /* Destroy any orphaned STATE_0 socket: either explicitly
+ * flagged SOCK_DESTROY, or SOCK_DEAD (covers both unaccepted
+ * incoming connections and listening sockets whose link died).
+ */
+ if ((sock_flag(sk, SOCK_DESTROY) || sock_flag(sk, SOCK_DEAD)) &&
+ !sk->sk_socket) {
+ /* Reap only orphaned sockets (sk->sk_socket == NULL). A
+ * socket still owned by a userspace fd reaches here via the
+ * STATE_2 device-gone branch, which sets SOCK_DESTROY without
+ * knowing about the fd; freeing it would race rose_release()
+ * at close() -> use-after-free. Leave it for close().
+ *
+ * Orphaned incoming sockets (rose_rx_call_request) hold a
+ * neighbour reference; release it before teardown, as the
+ * STATE_2 and device-down branches do. rose_destroy_socket()
+ * does not drop it.
+ */
+ if (rose->neighbour) {
+ rose_neigh_put(rose->neighbour);
+ rose->neighbour = NULL;
+ }
bh_unlock_sock(sk);
rose_destroy_socket(sk);
sock_put(sk);
@@ -139,6 +195,20 @@ static void rose_heartbeat_expiry(struct timer_list *t)
}
break;

+ case ROSE_STATE_2:
+ /* Device gone before CLEAR CONFIRM arrived: stop waiting for T3
+ * and disconnect now instead of blocking rmmod for up to 180s. */
+ if (!rose->device) {
+ rose_stop_timer(sk);
+ if (rose->neighbour) {
+ rose_neigh_put(rose->neighbour);
+ rose->neighbour = NULL;
+ }
+ rose_disconnect(sk, ENETDOWN, -1, -1);
+ sock_set_flag(sk, SOCK_DESTROY);
+ }
+ break;
+
case ROSE_STATE_3:
/*
* Check for the state of the receive buffer.
@@ -180,7 +250,10 @@ static void rose_timer_expiry(struct timer_list *t)
break;

case ROSE_STATE_2: /* T3 */
- rose_neigh_put(rose->neighbour);
+ if (rose->neighbour) {
+ rose_neigh_put(rose->neighbour);
+ rose->neighbour = NULL;
+ }
rose_disconnect(sk, ETIMEDOUT, -1, -1);
break;

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index d725b2158758..7434309785cc 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -261,9 +261,11 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t)
skb_set_inner_ipproto(skb, IPPROTO_SCTP);
label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6);

+ local_bh_disable();
udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr,
tclass, ip6_dst_hoplimit(dst), label,
sctp_sk(sk)->udp_port, t->encap_port, false, 0);
+ local_bh_enable();
return 0;
}

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 9dbc24af749b..6ce58fc95ef5 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1102,10 +1102,12 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
skb_reset_inner_mac_header(skb);
skb_reset_inner_transport_header(skb);
skb_set_inner_ipproto(skb, IPPROTO_SCTP);
+ local_bh_disable();
udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr,
fl4->daddr, dscp, ip4_dst_hoplimit(dst), df,
sctp_sk(sk)->udp_port, t->encap_port, false, false,
0);
+ local_bh_enable();
return 0;
}

diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c
index 4ee4db3750c1..c3a9585de98c 100644
--- a/tools/testing/selftests/mm/soft-dirty.c
+++ b/tools/testing/selftests/mm/soft-dirty.c
@@ -184,6 +184,130 @@ static void test_mprotect(int pagemap_fd, int pagesize, bool anon)
close(test_fd);
}

+static void test_merge(int pagemap_fd, int pagesize)
+{
+ char *reserved, *map, *map2;
+
+ /*
+ * Reserve space for tests:
+ *
+ * ---padding to ---
+ * | avoid adj. |
+ * v merge v
+ * |---|---|---|---|---|
+ * | | 1 | 2 | 3 | |
+ * |---|---|---|---|---|
+ */
+ reserved = mmap(NULL, 5 * pagesize, PROT_NONE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ if (reserved == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+ munmap(reserved, 4 * pagesize);
+
+ /*
+ * Establish initial VMA:
+ *
+ * S/D
+ * |---|---|---|---|---|
+ * | | 1 | | | |
+ * |---|---|---|---|---|
+ */
+ map = mmap(&reserved[pagesize], pagesize, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ /* This will clear VM_SOFTDIRTY too. */
+ clear_softdirty();
+
+ /*
+ * Now place a new mapping which will be marked VM_SOFTDIRTY. Away from
+ * map:
+ *
+ * - S/D
+ * |---|---|---|---|---|
+ * | | 1 | | 2 | |
+ * |---|---|---|---|---|
+ */
+ map2 = mmap(&reserved[3 * pagesize], pagesize, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ if (map2 == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ /*
+ * Now remap it immediately adjacent to map, if the merge correctly
+ * propagates VM_SOFTDIRTY, we should then observe the VMA as a whole
+ * being marked soft-dirty:
+ *
+ * merge
+ * S/D
+ * |---|-------|---|---|
+ * | | 1 | | |
+ * |---|-------|---|---|
+ */
+ map2 = mremap(map2, pagesize, pagesize, MREMAP_FIXED | MREMAP_MAYMOVE,
+ &reserved[2 * pagesize]);
+ if (map2 == MAP_FAILED)
+ ksft_exit_fail_msg("mremap failed\n");
+ ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+ "Test %s-anon soft-dirty after remap merge 1st pg\n",
+ __func__);
+ ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1,
+ "Test %s-anon soft-dirty after remap merge 2nd pg\n",
+ __func__);
+
+ munmap(map, 2 * pagesize);
+
+ /*
+ * Now establish another VMA:
+ *
+ * S/D
+ * |---|---|---|---|---|
+ * | | 1 | | | |
+ * |---|---|---|---|---|
+ */
+ map = mmap(&reserved[pagesize], pagesize, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ /* Clear VM_SOFTDIRTY... */
+ clear_softdirty();
+ /* ...and establish incompatible adjacent VMA:
+ *
+ * - S/D
+ * |---|---|---|---|---|
+ * | | 1 | 2 | | |
+ * |---|---|---|---|---|
+ */
+ map2 = mmap(&reserved[2 * pagesize], pagesize,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ if (map2 == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ /*
+ * Now mprotect() VMA 1 so it's compatible with 2 and therefore merges:
+ *
+ * merge
+ * S/D
+ * |---|-------|---|---|
+ * | | 1 | | |
+ * |---|-------|---|---|
+ */
+ if (mprotect(map, pagesize, PROT_READ | PROT_WRITE | PROT_EXEC))
+ ksft_exit_fail_msg("mprotect failed\n");
+
+ ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+ "Test %s-anon soft-dirty after mprotect merge 1st pg\n",
+ __func__);
+ ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1,
+ "Test %s-anon soft-dirty after mprotect merge 2nd pg\n",
+ __func__);
+
+ munmap(map, 2 * pagesize);
+}
+
static void test_mprotect_anon(int pagemap_fd, int pagesize)
{
test_mprotect(pagemap_fd, pagesize, true);
@@ -204,7 +328,7 @@ int main(int argc, char **argv)
if (!softdirty_supported())
ksft_exit_skip("soft-dirty is not support\n");

- ksft_set_plan(15);
+ ksft_set_plan(19);
pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY);
if (pagemap_fd < 0)
ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH);
@@ -216,6 +340,7 @@ int main(int argc, char **argv)
test_hugepage(pagemap_fd, pagesize);
test_mprotect_anon(pagemap_fd, pagesize);
test_mprotect_file(pagemap_fd, pagesize);
+ test_merge(pagemap_fd, pagesize);

close(pagemap_fd);

diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
index 656e1c75b711..fd37ce3b2628 100644
--- a/tools/testing/vma/vma.c
+++ b/tools/testing/vma/vma.c
@@ -339,6 +339,7 @@ static bool test_simple_modify(void)
struct mm_struct mm = {};
struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags);
VMA_ITERATOR(vmi, &mm, 0x1000);
+ vm_flags_t flags = VM_READ | VM_MAYREAD;

ASSERT_FALSE(attach_vma(&mm, init_vma));

@@ -347,7 +348,7 @@ static bool test_simple_modify(void)
* performs the merge/split only.
*/
vma = vma_modify_flags(&vmi, init_vma, init_vma,
- 0x1000, 0x2000, VM_READ | VM_MAYREAD);
+ 0x1000, 0x2000, &flags);
ASSERT_NE(vma, NULL);
/* We modify the provided VMA, and on split allocate new VMAs. */
ASSERT_EQ(vma, init_vma);
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 9f724954a0f6..f64064ee4858 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -56,6 +56,7 @@ extern unsigned long dac_mmap_min_addr;
#define VM_MAYEXEC 0x00000040
#define VM_GROWSDOWN 0x00000100
#define VM_PFNMAP 0x00000400
+#define VM_MAYBE_GUARD 0x00000800
#define VM_LOCKED 0x00002000
#define VM_IO 0x00004000
#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */
@@ -116,6 +117,54 @@ extern unsigned long dac_mmap_min_addr;
#define VM_SEALED VM_NONE
#endif

+/*
+ * Flags which should be 'sticky' on merge - that is, flags which, when one VMA
+ * possesses it but the other does not, the merged VMA should nonetheless have
+ * applied to it:
+ *
+ * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its
+ * references cleared via /proc/$pid/clear_refs, any merged VMA
+ * should be considered soft-dirty also as it operates at a VMA
+ * granularity.
+ */
+#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD)
+
+/*
+ * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
+ * of these flags and the other not does not preclude a merge.
+ *
+ * VM_STICKY - When merging VMAs, VMA flags must match, unless they are
+ * 'sticky'. If any sticky flags exist in either VMA, we simply
+ * set all of them on the merged VMA.
+ */
+#define VM_IGNORE_MERGE VM_STICKY
+
+/*
+ * Flags which should result in page tables being copied on fork. These are
+ * flags which indicate that the VMA maps page tables which cannot be
+ * reconsistuted upon page fault, so necessitate page table copying upon
+ *
+ * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be
+ * reasonably reconstructed on page fault.
+ *
+ * VM_UFFD_WP - Encodes metadata about an installed uffd
+ * write protect handler, which cannot be
+ * reconstructed on page fault.
+ *
+ * We always copy pgtables when dst_vma has uffd-wp
+ * enabled even if it's file-backed
+ * (e.g. shmem). Because when uffd-wp is enabled,
+ * pgtable contains uffd-wp protection information,
+ * that's something we can't retrieve from page cache,
+ * and skip copying will lose those info.
+ *
+ * VM_MAYBE_GUARD - Could contain page guard region markers which
+ * by design are a property of the page tables
+ * only and thus cannot be reconstructed on page
+ * fault.
+ */
+#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
+
#define FIRST_USER_ADDRESS 0UL
#define USER_PGTABLES_CEILING 0UL