[PATCH 01/14] resource limits: foundation for resource highwater tracking

From: Topi Miettinen
Date: Fri Jul 15 2016 - 06:37:15 EST


There are many basic ways to control processes, including capabilities,
cgroups and resource limits. However, there are few ways to find out
useful values for those limits other than blind trial and error.

Prepare a foundation for resource highwater tracking.

The collected highwater marks for the resources can be seen using
the taskstats netlink interface.

This depends on CONFIG_TASK_XACCT.

Signed-off-by: Topi Miettinen <toiwoton@xxxxxxxxx>
---
Documentation/accounting/getdelays.c | 52 +++++++++++++++++++++++++++++++++---
include/linux/sched.h | 31 +++++++++++++++++++++
include/linux/tsacct_kern.h | 3 +++
include/uapi/linux/taskstats.h | 10 ++++++-
kernel/taskstats.c | 4 +++
kernel/tsacct.c | 26 ++++++++++++++++++
6 files changed, 122 insertions(+), 4 deletions(-)

diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index b5ca536..489f1b7 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -8,7 +8,7 @@
* Copyright (c) Jay Lan, SGI. 2006
*
* Compile with
- * gcc -I/usr/src/linux/include getdelays.c -o getdelays
+ * gcc -I/usr/src/linux getdelays.c -o getdelays
*/

#include <stdio.h>
@@ -22,10 +22,11 @@
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/wait.h>
+#include <sys/resource.h>
#include <signal.h>

#include <linux/genetlink.h>
-#include <linux/taskstats.h>
+#include "include/uapi/linux/taskstats.h"
#include <linux/cgroupstats.h>

/*
@@ -50,6 +51,7 @@ char name[100];
int dbg;
int print_delays;
int print_io_accounting;
+int print_resource_accounting;
int print_task_context_switch_counts;

#define PRINTF(fmt, arg...) { \
@@ -63,6 +65,8 @@ int print_task_context_switch_counts;
/* Maximum number of cpus expected to be specified in a cpumask */
#define MAX_CPUS 32

+#define TASKSTATS_VERSION_WITH_RESOURCE 9
+
struct msgtemplate {
struct nlmsghdr n;
struct genlmsghdr g;
@@ -77,6 +81,7 @@ static void usage(void)
"[-m cpumask] [-t tgid] [-p pid]\n");
fprintf(stderr, " -d: print delayacct stats\n");
fprintf(stderr, " -i: print IO accounting (works only with -p)\n");
+ fprintf(stderr, " -R: print resource accounting stats\n");
fprintf(stderr, " -l: listen forever\n");
fprintf(stderr, " -v: debug on\n");
fprintf(stderr, " -C: container path\n");
@@ -232,6 +237,25 @@ static void task_context_switch_counts(struct taskstats *t)
(unsigned long long)t->nvcsw, (unsigned long long)t->nivcsw);
}

+static const char *const rlimit_names[] = { /* printable name for each resource, indexed by its RLIMIT_* constant; print_racct() labels values with these */
+ [RLIMIT_CPU] = "RLIMIT_CPU",
+ [RLIMIT_FSIZE] = "RLIMIT_FSIZE",
+ [RLIMIT_DATA] = "RLIMIT_DATA",
+ [RLIMIT_STACK] = "RLIMIT_STACK",
+ [RLIMIT_CORE] = "RLIMIT_CORE",
+ [RLIMIT_RSS] = "RLIMIT_RSS",
+ [RLIMIT_NPROC] = "RLIMIT_NPROC",
+ [RLIMIT_NOFILE] = "RLIMIT_NOFILE",
+ [RLIMIT_MEMLOCK] = "RLIMIT_MEMLOCK",
+ [RLIMIT_AS] = "RLIMIT_AS",
+ [RLIMIT_LOCKS] = "RLIMIT_LOCKS",
+ [RLIMIT_SIGPENDING] = "RLIMIT_SIGPENDING",
+ [RLIMIT_MSGQUEUE] = "RLIMIT_MSGQUEUE",
+ [RLIMIT_NICE] = "RLIMIT_NICE",
+ [RLIMIT_RTPRIO] = "RLIMIT_RTPRIO",
+ [RLIMIT_RTTIME] = "RLIMIT_RTTIME",
+};
+
static void print_cgroupstats(struct cgroupstats *c)
{
printf("sleeping %llu, blocked %llu, running %llu, stopped %llu, "
@@ -252,6 +276,22 @@ static void print_ioacct(struct taskstats *t)
(unsigned long long)t->cancelled_write_bytes);
}

+/*
+ * Print the resource high-water marks carried in a taskstats reply, one
+ * "NAME=value" line per RLIMIT_* index from 0 to RLIM_NLIMITS - 1.
+ *
+ * A reply whose version is below TASKSTATS_VERSION_WITH_RESOURCE is not
+ * decoded; only a "kernel too old" diagnostic is printed for it.
+ */
+static void print_racct(const struct taskstats *t)
+{
+	const int nr_names = (int)(sizeof(rlimit_names) / sizeof(rlimit_names[0]));
+	int i;
+
+	if (t->version < TASKSTATS_VERSION_WITH_RESOURCE) {
+		printf("kernel too old (%d < %d)\n", t->version,
+		       TASKSTATS_VERSION_WITH_RESOURCE);
+		return;
+	}
+
+	for (i = 0; i < RLIM_NLIMITS; i++) {
+		/*
+		 * rlimit_names[] is sized by its initializers, not by
+		 * RLIM_NLIMITS, so never index past it or hand printf()
+		 * a missing (NULL) name; fall back to the numeric index.
+		 */
+		if (i < nr_names && rlimit_names[i])
+			printf("%s=%llu\n",
+			       rlimit_names[i],
+			       (unsigned long long)t->resource_hiwater[i]);
+		else
+			printf("rlimit[%d]=%llu\n",
+			       i,
+			       (unsigned long long)t->resource_hiwater[i]);
+	}
+}
+
int main(int argc, char *argv[])
{
int c, rc, rep_len, aggr_len, len2;
@@ -280,7 +320,7 @@ int main(int argc, char *argv[])
struct msgtemplate msg;

while (!forking) {
- c = getopt(argc, argv, "qdiw:r:m:t:p:vlC:c:");
+ c = getopt(argc, argv, "qdiw:r:m:t:p:vlC:c:R");
if (c < 0)
break;

@@ -297,6 +337,10 @@ int main(int argc, char *argv[])
printf("printing task/process context switch rates\n");
print_task_context_switch_counts = 1;
break;
+ case 'R':
+ printf("printing resource accounting\n");
+ print_resource_accounting = 1;
+ break;
case 'C':
containerset = 1;
containerpath = optarg;
@@ -497,6 +541,8 @@ int main(int argc, char *argv[])
print_ioacct((struct taskstats *) NLA_DATA(na));
if (print_task_context_switch_counts)
task_context_switch_counts((struct taskstats *) NLA_DATA(na));
+ if (print_resource_accounting)
+ print_racct((struct taskstats *) NLA_DATA(na));
if (fd) {
if (write(fd, NLA_DATA(na), na->nla_len) < 0) {
err(1,"write error\n");
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 253538f..e4d7482 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -763,6 +763,9 @@ struct signal_struct {
unsigned long inblock, oublock, cinblock, coublock;
unsigned long maxrss, cmaxrss;
struct task_io_accounting ioac;
+#ifdef CONFIG_TASK_XACCT
+ unsigned long resource_highwatermark[RLIM_NLIMITS];
+#endif

/*
* Cumulative ns of schedule CPU time fo dead threads in the
@@ -3323,6 +3326,24 @@ static inline void inc_syscw(struct task_struct *tsk)
{
tsk->ioac.syscw++;
}
+
+/*
+ * Raise the recorded high-water mark of resource @limit in @tsk->signal
+ * to @r if @r is larger than the value stored so far.
+ *
+ * @limit is used directly as an index into resource_highwatermark[] and
+ * is not range-checked here; callers must pass a value below RLIM_NLIMITS.
+ *
+ * The update is made under sig->stats_lock. The reader side
+ * (racct_add_tsk()) takes that lock with the _irqsave variant, so the
+ * writer disables interrupts as well: the lock is then never held with
+ * interrupts enabled, whatever context the caller runs in.
+ */
+static inline void task_update_resource_highwatermark(struct task_struct *tsk,
+						      unsigned int limit,
+						      unsigned long r)
+{
+	struct signal_struct *sig = tsk->signal;
+	unsigned long flags;
+
+	write_seqlock_irqsave(&sig->stats_lock, flags);
+	if (sig->resource_highwatermark[limit] < r)
+		sig->resource_highwatermark[limit] = r;
+	write_sequnlock_irqrestore(&sig->stats_lock, flags);
+}
+
+static inline void update_resource_highwatermark(unsigned int limit,
+ unsigned long r)
+{
+ task_update_resource_highwatermark(current, limit, r); /* same update, applied to the current task */
+}
#else
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
{
@@ -3339,6 +3360,16 @@ static inline void inc_syscr(struct task_struct *tsk)
static inline void inc_syscw(struct task_struct *tsk)
{
}
+static inline void task_update_resource_highwatermark(struct task_struct *tsk,
+ unsigned int limit,
+ unsigned long r)
+{ /* stub for the #else branch (presumably !CONFIG_TASK_XACCT -- confirm): nothing is recorded */
+}
+
+static inline void update_resource_highwatermark(unsigned int limit,
+ unsigned long r)
+{ /* stub for the #else branch (presumably !CONFIG_TASK_XACCT -- confirm): nothing is recorded */
+}
#endif

#ifndef TASK_SIZE_OF
diff --git a/include/linux/tsacct_kern.h b/include/linux/tsacct_kern.h
index 3251965..bcf1301 100644
--- a/include/linux/tsacct_kern.h
+++ b/include/linux/tsacct_kern.h
@@ -25,6 +25,7 @@ extern void xacct_add_tsk(struct taskstats *stats, struct task_struct *p);
extern void acct_update_integrals(struct task_struct *tsk);
extern void acct_account_cputime(struct task_struct *tsk);
extern void acct_clear_integrals(struct task_struct *tsk);
+extern void racct_add_tsk(struct taskstats *stats, struct task_struct *tsk);
#else
static inline void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
{}
@@ -34,6 +35,8 @@ static inline void acct_account_cputime(struct task_struct *tsk)
{}
static inline void acct_clear_integrals(struct task_struct *tsk)
{}
+static inline void racct_add_tsk(struct taskstats *stats, struct task_struct *p)
+{} /* !CONFIG_TASK_XACCT: leaves @stats untouched */
#endif /* CONFIG_TASK_XACCT */

#endif
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 2466e55..8c65194 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -33,7 +33,7 @@
*/


-#define TASKSTATS_VERSION 8
+#define TASKSTATS_VERSION 9
#define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
* in linux/sched.h */

@@ -163,6 +163,14 @@ struct taskstats {
/* Delay waiting for memory reclaim */
__u64 freepages_count;
__u64 freepages_delay_total;
+ /* Per-task storage I/O accounting ends */
+
+#define TASKSTATS_HAS_LIMIT_ACCOUNTING
+ /* Per-task resource accounting starts */
+ __u64 resource_hiwater[RLIM_NLIMITS]; /* high-watermark of
+ RLIMIT
+ resources */
+ /* Per-task resource accounting ends */
};


diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b3f05ee..9a03e6b 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -17,6 +17,7 @@
*/

#include <linux/kernel.h>
+#include <linux/resource.h>
#include <linux/taskstats_kern.h>
#include <linux/tsacct_kern.h>
#include <linux/delayacct.h>
@@ -188,6 +189,9 @@ static void fill_stats(struct user_namespace *user_ns,

/* fill in extended acct fields */
xacct_add_tsk(stats, tsk);
+
+ /* fill in resource acct fields */
+ racct_add_tsk(stats, tsk);
}

static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index f8e26ab..231bae3 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -183,4 +183,30 @@ void acct_clear_integrals(struct task_struct *tsk)
tsk->acct_rss_mem1 = 0;
tsk->acct_vm_mem1 = 0;
}
+
+/*
+ * fill in resource accounting fields
+ *
+ * Copies each of the RLIM_NLIMITS entries of tsk->signal's
+ * resource_highwatermark[] into stats->resource_hiwater[], cast to __u64.
+ *
+ * The copy is made under sig->stats_lock and is repeated for as long as
+ * need_seqretry() requests it: the first pass starts with seq == 0, every
+ * later pass with seq == 1 (see the comments inside the loop).
+ *
+ * NOTE(review): nothing visible in this function walks an RCU-protected
+ * structure; confirm whether the rcu_read_lock()/rcu_read_unlock() pair is
+ * actually required here.
+ */
+void racct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{
+ struct signal_struct *sig = tsk->signal;
+ int i;
+ unsigned int seq, nextseq;
+ unsigned long flags;
+
+ rcu_read_lock();
+ /* Attempt a lockless read on the first round. */
+ nextseq = 0;
+ do {
+ seq = nextseq;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+ for (i = 0; i < RLIM_NLIMITS; i++)
+ stats->resource_hiwater[i] = (__u64)sig->resource_highwatermark[i];
+
+ /* If lockless access failed, take the lock. */
+ nextseq = 1;
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+ rcu_read_unlock();
+}
#endif
--
2.8.1