Re: [PATCH 1/3] lib, include/linux: add usercopy failure capability

From: Dmitry Vyukov
Date: Fri Aug 21 2020 - 07:54:14 EST


On Fri, Aug 21, 2020 at 12:50 PM <albert.linde@xxxxxxxxx> wrote:
>
> From: Albert van der Linde <alinde@xxxxxxxxxx>
>
> Add a failure injection capability to improve testing of fault-tolerance
> in usages of user memory access functions.
>
> Adds CONFIG_FAULT_INJECTION_USERCOPY to enable faults in usercopy
> functions. By default functions are to either fail with -EFAULT or
> return that no bytes were copied. To use partial copies (e.g.,
> copy_from_user permits partial failures) the number of bytes not to copy
> must be written to /sys/kernel/debug/fail_usercopy/failsize.
>
> Signed-off-by: Albert van der Linde <alinde@xxxxxxxxxx>
> ---
> .../fault-injection/fault-injection.rst | 64 ++++++++++++++++++
> include/linux/fault-inject-usercopy.h | 20 ++++++
> lib/Kconfig.debug | 7 ++
> lib/Makefile | 1 +
> lib/fault-inject-usercopy.c | 66 +++++++++++++++++++
> 5 files changed, 158 insertions(+)
> create mode 100644 include/linux/fault-inject-usercopy.h
> create mode 100644 lib/fault-inject-usercopy.c
>
> diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst
> index f850ad018b70..cf52eabde332 100644
> --- a/Documentation/fault-injection/fault-injection.rst
> +++ b/Documentation/fault-injection/fault-injection.rst
> @@ -16,6 +16,10 @@ Available fault injection capabilities
>
> injects page allocation failures. (alloc_pages(), get_free_pages(), ...)
>
> +- fail_usercopy
> +
> + injects failures in user memory access functions. (copy_from_user(), get_user(), ...)
> +
> - fail_futex
>
> injects futex deadlock and uaddr fault errors.
> @@ -138,6 +142,12 @@ configuration of fault-injection capabilities.
> specifies the minimum page allocation order to be injected
> failures.
>
> +- /sys/kernel/debug/fail_usercopy/failsize:
> +
> + specifies the error code to return or amount of bytes not to copy.
> + If set to 0 then -EFAULT is returned instead. Positive values can be
> + used to inject partial copies (copy_from_user(), ...)
> +
> - /sys/kernel/debug/fail_futex/ignore-private:
>
> Format: { 'Y' | 'N' }
> @@ -177,6 +187,7 @@ use the boot option::
>
> failslab=
> fail_page_alloc=
> + fail_usercopy=
> fail_make_request=
> fail_futex=
> mmc_core.fail_request=<interval>,<probability>,<space>,<times>
> @@ -383,6 +394,59 @@ allocation failure::
> ./tools/testing/fault-injection/failcmd.sh --times=100 \
> -- make -C tools/testing/selftests/ run_tests
>
> +
> +Fail usercopy functions
> +---------------------------------
> +
> +The following code fails the usercopy call in stat: copy_to_user is only
> +partially completed.
> +
> +#include <errno.h>
> +#include <fcntl.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/sendfile.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <time.h>
> +#include <unistd.h>
> +
> +void inject_size()
> +{
> + int fd = open("/sys/kernel/debug/fail_usercopy/failsize", O_RDWR);
> + char buf[16];
> + sprintf(buf, "%d", 60);
> + write(fd, buf, strlen(buf));
> +}
> +
> +void inject_fault()
> +{
> + int fd = open("/proc/thread-self/fail-nth", O_RDWR);
> + char buf[16];
> + sprintf(buf, "%d", 4);
> + write(fd, buf, strlen(buf));
> +}
> +
> +int main()
> +{
> + struct stat sb;
> + inject_size();
> + inject_fault();
> + stat(".", &sb);
> + printf("Stat error code: %d\n", errno);
> + printf("Last status change: %s", ctime(&sb.st_ctime));
> + printf("Last file access: %s", ctime(&sb.st_atime));
> + printf("Last file modification: %s", ctime(&sb.st_mtime));
> + return 0;
> +}
> +
> +Example output:
> +Stat error code: 14
> +Last status change: Thu Jan 1 00:00:00 1970
> +Last file access: Tue Aug 18 14:09:34 2020
> +Last file modification: Sun Dec 12 00:19:57 2989041
> +
> Systematic faults using fail-nth
> ---------------------------------
>
> diff --git a/include/linux/fault-inject-usercopy.h b/include/linux/fault-inject-usercopy.h
> new file mode 100644
> index 000000000000..95dad8ddb56f
> --- /dev/null
> +++ b/include/linux/fault-inject-usercopy.h
> @@ -0,0 +1,20 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * This header provides a wrapper for partial failures on usercopy functions.
> + * For usage see Documentation/fault-injection/fault-injection.rst
> + */
> +#ifndef __LINUX_FAULT_INJECT_USERCOPY_H__
> +#define __LINUX_FAULT_INJECT_USERCOPY_H__
> +
> +#ifdef CONFIG_FAULT_INJECTION_USERCOPY
> +
> +long should_fail_usercopy(unsigned long n);
> +
> +#else
> +
> +static inline long should_fail_usercopy(unsigned long n) { return 0; }
> +
> +#endif /* CONFIG_FAULT_INJECTION_USERCOPY */
> +
> +#endif /* __LINUX_FAULT_INJECT_USERCOPY_H__ */
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index e068c3c7189a..bd0ec3ddb459 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -1770,6 +1770,13 @@ config FAIL_PAGE_ALLOC
> help
> Provide fault-injection capability for alloc_pages().
>
> +config FAULT_INJECTION_USERCOPY
> + bool "Fault injection capability for usercopy functions"
> + depends on FAULT_INJECTION
> + help
> + Provides fault-injection capability to inject failures and
> + partial copies in usercopy functions.
> +
> config FAIL_MAKE_REQUEST
> bool "Fault-injection capability for disk IO"
> depends on FAULT_INJECTION && BLOCK
> diff --git a/lib/Makefile b/lib/Makefile
> index a4a4c6864f51..18daad2bc606 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -207,6 +207,7 @@ obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o
>
> obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
> obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
> +obj-$(CONFIG_FAULT_INJECTION_USERCOPY) += fault-inject-usercopy.o
> obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o
> obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o
> obj-$(CONFIG_NETDEV_NOTIFIER_ERROR_INJECT) += netdev-notifier-error-inject.o
> diff --git a/lib/fault-inject-usercopy.c b/lib/fault-inject-usercopy.c
> new file mode 100644
> index 000000000000..c151e0817ca7
> --- /dev/null
> +++ b/lib/fault-inject-usercopy.c
> @@ -0,0 +1,66 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/fault-inject.h>
> +#include <linux/fault-inject-usercopy.h>
> +#include <linux/random.h>
> +
> +static struct {
> + struct fault_attr attr;
> + u32 failsize;
> +} fail_usercopy = {
> + .attr = FAULT_ATTR_INITIALIZER,
> + .failsize = 0,
> +};
> +
> +static int __init setup_fail_usercopy(char *str)
> +{
> + return setup_fault_attr(&fail_usercopy.attr, str);
> +}
> +__setup("fail_usercopy=", setup_fail_usercopy);
> +
> +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
> +
> +static int __init fail_usercopy_debugfs(void)
> +{
> + umode_t mode = S_IFREG | 0600;
> + struct dentry *dir;
> +
> + dir = fault_create_debugfs_attr("fail_usercopy", NULL,
> + &fail_usercopy.attr);
> + if (IS_ERR(dir))
> + return PTR_ERR(dir);
> +
> + debugfs_create_u32("failsize", mode, dir,
> + &fail_usercopy.failsize);

Marco, what's the right way to annotate these concurrent accesses for KCSAN?

> + return 0;
> +}
> +
> +late_initcall(fail_usercopy_debugfs);
> +
> +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
> +
> +/**
> + * should_fail_usercopy() - Failure code or amount of bytes not to copy.
> + * @n: Size of the original copy call.
> + *
> + * The general idea is to have a method which returns the amount of bytes not
> + * to copy, a failure to return, or 0 if the calling function should progress
> + * without a failure. E.g., copy_{to,from}_user should NOT copy the amount of
> + * bytes returned by should_fail_usercopy, returning this value (in addition
> + * to any bytes that could actually not be copied) or a failure.
> + *
> + * Return: one of:
> + * negative, failure to return;
> + * 0, progress normally;
> + * a number in ]0, n], the number of bytes not to copy.
> + *
> + */
> +long should_fail_usercopy(unsigned long n)
> +{
> + if (should_fail(&fail_usercopy.attr, n)) {
> + if (fail_usercopy.failsize > 0)
> + return fail_usercopy.failsize % (n + 1);

What's the use-case for this fail_usercopy.failsize? How is it
supposed to be used?
It seems that setting an exact value only makes sense if you know the
exact failure site (and size it will use). But it's global and not
per-task.
It does not allow systematic enumeration of all return values because
there is no way to understand that we tried all possible return values
for a particular usercopy. And it also can "undo" failure if %(n+1)
yields 0, right? Maybe it should saturate?


> + return -EFAULT;
> + }
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(should_fail_usercopy);
> --
> 2.28.0.297.g1956fa8f8d-goog
>