Re: [PATCH v2 01/11] splice: copy_splice_read: do the I/O with IOCB_NOWAIT

From: Ahelenia Ziemiańska
Date: Thu Dec 21 2023 - 11:31:18 EST


On Thu, Dec 21, 2023 at 12:27:12AM -0800, Christoph Hellwig wrote:
> On Thu, Dec 21, 2023 at 04:08:45AM +0100, Ahelenia Ziemiańska wrote:
> > Otherwise we risk sleeping with the pipe locked for indeterminate
> You can't just assume that any ->read_iter support IOCB_NOWAIT.
Let's see.

zero_fops drivers/char/mem.c: .splice_read = copy_splice_read,
full_fops drivers/char/mem.c: .splice_read = copy_splice_read,

random_fops drivers/char/random.c: .splice_read = copy_splice_read,
random_read_iter checks
urandom_fops drivers/char/random.c: .splice_read = copy_splice_read,
urandom_read_iter returns instantly

if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host))
fs/splice.c: return copy_splice_read(in, ppos, pipe, len, flags);
FMODE_CAN_ODIRECT is set by filesystems and blockdevs, so trusted

fs/9p/vfs_file.c: return copy_splice_read(in, ppos, pipe, len, flags);
fs/ceph/file.c: return copy_splice_read(in, ppos, pipe, len, flags);
fs/ceph/file.c: return copy_splice_read(in, ppos, pipe, len, flags);
fs/gfs2/file.c: .splice_read = copy_splice_read,
fs/gfs2/file.c: .splice_read = copy_splice_read,
fs/kernfs/file.c: .splice_read = copy_splice_read,
fs/smb/client/cifsfs.c: .splice_read = copy_splice_read,
fs/smb/client/cifsfs.c: .splice_read = copy_splice_read,
fs/proc/inode.c: .splice_read = copy_splice_read,
fs/proc/inode.c: .splice_read = copy_splice_read,
fs/proc/proc_sysctl.c: .splice_read = copy_splice_read,
fs/proc_namespace.c: .splice_read = copy_splice_read,
fs/proc_namespace.c: .splice_read = copy_splice_read,
fs/proc_namespace.c: .splice_read = copy_splice_read,
filesystems => trusted

tracing_fops
kernel/trace/trace.c: .splice_read = copy_splice_read,
used in /sys/kernel/debug/tracing/per_cpu/cpu*/trace
and /sys/kernel/debug/tracing/trace
which are seq_read_iter and even if they did block,
it's in tracefs so same logic as tracing_buffers_splice_read applies.

net/socket.c: return copy_splice_read(file, ppos, pipe, len, flags);
this is the default implementation for protocols without explicit
splice_reads, and sock_read_iter translates IOCB_NOWAIT into
MSG_DONTAIT.


So I think I can, because the ~three implementations that we want
to constrain do support it. If anything, this hints to me that to
yield a more consistent API that doesn't arbitrarily distinguish
between O_DIRECT files with and without IOCB_NOWAIT support, something
to the effect of the following diff may be used.

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 11cd8d23f6f2..dc42837ee0af 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -392,7 +392,7 @@ static ssize_t v9fs_file_splice_read(struct file *in, loff_t *ppos,
fid->fid, len, *ppos);

if (fid->mode & P9L_DIRECT)
- return copy_splice_read(in, ppos, pipe, len, flags);
+ return copy_splice_read_sleepok(in, ppos, pipe, len, flags);
return filemap_splice_read(in, ppos, pipe, len, flags);
}

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b5aae29e944..9a4679013135 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2209,7 +2209,7 @@ static ssize_t ceph_splice_read(struct file *in, loff_t *ppos,

if (ceph_has_inline_data(ci) ||
(fi->flags & CEPH_F_SYNC))
- return copy_splice_read(in, ppos, pipe, len, flags);
+ return copy_splice_read_sleepok(in, ppos, pipe, len, flags);

ceph_start_io_read(inode);

@@ -2228,7 +2228,7 @@ static ssize_t ceph_splice_read(struct file *in, loff_t *ppos,

ceph_put_cap_refs(ci, got);
ceph_end_io_read(inode);
- return copy_splice_read(in, ppos, pipe, len, flags);
+ return copy_splice_read_sleepok(in, ppos, pipe, len, flags);
}

dout("splice_read %p %llx.%llx %llu~%zu got cap refs on %s\n",
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4b66efc1a82a..5b0cbb6b95c4 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1581,7 +1581,7 @@ const struct file_operations gfs2_file_fops = {
.fsync = gfs2_fsync,
.lock = gfs2_lock,
.flock = gfs2_flock,
- .splice_read = copy_splice_read,
+ .splice_read = copy_splice_read_sleepok,
.splice_write = gfs2_file_splice_write,
.setlease = simple_nosetlease,
.fallocate = gfs2_fallocate,
@@ -1612,7 +1612,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
- .splice_read = copy_splice_read,
+ .splice_read = copy_splice_read_sleepok,
.splice_write = gfs2_file_splice_write,
.setlease = generic_setlease,
.fallocate = gfs2_fallocate,
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index f0cb729e9a97..f0b6e85b2c5b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -989,7 +989,7 @@ const struct file_operations kernfs_file_fops = {
.release = kernfs_fop_release,
.poll = kernfs_fop_poll,
.fsync = noop_fsync,
- .splice_read = copy_splice_read,
+ .splice_read = copy_splice_read_sleepok,
.splice_write = iter_file_splice_write,
};

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index b33e490e3fd9..7ec2f4653299 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -588,7 +588,7 @@ static const struct file_operations proc_iter_file_ops = {
.llseek = proc_reg_llseek,
.read_iter = proc_reg_read_iter,
.write = proc_reg_write,
- .splice_read = copy_splice_read,
+ .splice_read = copy_splice_read_sleepok,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
.mmap = proc_reg_mmap,
@@ -614,7 +614,7 @@ static const struct file_operations proc_reg_file_ops_compat = {
static const struct file_operations proc_iter_file_ops_compat = {
.llseek = proc_reg_llseek,
.read_iter = proc_reg_read_iter,
- .splice_read = copy_splice_read,
+ .splice_read = copy_splice_read_sleepok,
.write = proc_reg_write,
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 8064ea76f80b..11d26fd14e7d 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -864,7 +864,7 @@ static const struct file_operations proc_sys_file_operations = {
.poll = proc_sys_poll,
.read_iter = proc_sys_read,
.write_iter = proc_sys_write,
- .splice_read = copy_splice_read,
+ .splice_read = copy_splice_read_sleepok,
.splice_write = iter_file_splice_write,
.llseek = default_llseek,
};
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 250eb5bf7b52..e9d19a856dd7 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -324,7 +324,7 @@ static int mountstats_open(struct inode *inode, struct file *file)
const struct file_operations proc_mounts_operations = {
.open = mounts_open,
.read_iter = seq_read_iter,
- .splice_read = copy_splice_read,
+ .splice_read = copy_splice_read_sleepok,
.llseek = seq_lseek,
.release = mounts_release,
.poll = mounts_poll,
@@ -333,7 +333,7 @@ const struct file_operations proc_mounts_operations = {
const struct file_operations proc_mountinfo_operations = {
.open = mountinfo_open,
.read_iter = seq_read_iter,
- .splice_read = copy_splice_read,
+ .splice_read = copy_splice_read_sleepok,
.llseek = seq_lseek,
.release = mounts_release,
.poll = mounts_poll,
@@ -342,7 +342,7 @@ const struct file_operations proc_mountinfo_operations = {
const struct file_operations proc_mountstats_operations = {
.open = mountstats_open,
.read_iter = seq_read_iter,
- .splice_read = copy_splice_read,
+ .splice_read = copy_splice_read_sleepok,
.llseek = seq_lseek,
.release = mounts_release,
};
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 2131638f26d0..5f9fb3ce3bcb 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1561,7 +1561,7 @@ const struct file_operations cifs_file_direct_ops = {
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
- .splice_read = copy_splice_read,
+ .splice_read = copy_splice_read_sleepok,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
@@ -1615,7 +1615,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
- .splice_read = copy_splice_read,
+ .splice_read = copy_splice_read_sleepok,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
diff --git a/fs/splice.c b/fs/splice.c
index 2871c6f9366f..90ebcf236c05 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -298,12 +298,14 @@ void splice_shrink_spd(struct splice_pipe_desc *spd)
}

/**
- * copy_splice_read - Copy data from a file and splice the copy into a pipe
+ * __copy_splice_read - Copy data from a file and splice the copy into a pipe
* @in: The file to read from
* @ppos: Pointer to the file position to read from
* @pipe: The pipe to splice into
* @len: The amount to splice
* @flags: The SPLICE_F_* flags
+ * @sleepok: Set if splicing from a trusted filesystem,
+ * don't set if splicing from an IPC mechanism
*
* This function allocates a bunch of pages sufficient to hold the requested
* amount of data (but limited by the remaining pipe capacity), passes it to
@@ -317,10 +319,11 @@ void splice_shrink_spd(struct splice_pipe_desc *spd)
* if the pipe has insufficient space, we reach the end of the data or we hit a
* hole.
*/
-ssize_t copy_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len, unsigned int flags)
+static ssize_t __copy_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags, bool sleepok)
{
+ printk("__copy_splice_read(%d)\n", sleepok);
struct iov_iter to;
struct bio_vec *bv;
struct kiocb kiocb;
@@ -361,7 +364,8 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos,
iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
- kiocb.ki_flags |= IOCB_NOWAIT;
+ if (!sleepok)
+ kiocb.ki_flags |= IOCB_NOWAIT;
ret = call_read_iter(in, &kiocb, &to);

if (ret > 0) {
@@ -399,8 +403,21 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos,
kfree(bv);
return ret;
}
+
+ssize_t copy_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags) {
+ return __copy_splice_read(in, ppos, pipe, len, flags, false);
+}
EXPORT_SYMBOL(copy_splice_read);

+ssize_t copy_splice_read_sleepok(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags) {
+ return __copy_splice_read(in, ppos, pipe, len, flags, true);
+}
+EXPORT_SYMBOL(copy_splice_read_sleepok);
+
const struct pipe_buf_operations default_pipe_buf_ops = {
.release = generic_pipe_buf_release,
.try_steal = generic_pipe_buf_try_steal,
@@ -988,7 +1005,7 @@ long vfs_splice_read(struct file *in, loff_t *ppos,
* buffer, copy into it and splice that into the pipe.
*/
if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host))
- return copy_splice_read(in, ppos, pipe, len, flags);
+ return copy_splice_read_sleepok(in, ppos, pipe, len, flags);
return in->f_op->splice_read(in, ppos, pipe, len, flags);
}
EXPORT_SYMBOL_GPL(vfs_splice_read);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..0980bf6ba8fd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2989,6 +2989,9 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
ssize_t copy_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags);
+ssize_t copy_splice_read_sleepok(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags);
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
struct file *, loff_t *, size_t, unsigned int);
extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,

Attachment: signature.asc
Description: PGP signature