aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBalbir Singh <balbir@linux.vnet.ibm.com>2007-10-19 02:39:44 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-19 14:53:36 -0400
commit846c7bb055747989891f5cd2bb6e8d56243ba1e7 (patch)
treee044041366efa8298157c4ae86615d68d30dd6d2
parentc2e2c7fa1cb2cf2b114a6c9bc132b6601db5a7c8 (diff)
Add cgroupstats
This patch is inspired by the discussion at http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263. The patch is on top of 2.6.21-mm1 with Paul's cgroups v9 patches (forward ported) This patch implements per cgroup statistics infrastructure and re-uses code from the taskstats interface. A new set of cgroup operations are registered with commands and attributes. It should be very easy to *extend* per cgroup statistics, by adding members to the cgroupstats structure. The current model for cgroupstats is a pull, a push model (to post statistics on interesting events), should be very easy to add. Currently user space requests for statistics by passing the cgroup file descriptor. Statistics about the state of all the tasks in the cgroup is returned to user space. TODO's/NOTE: This patch provides an infrastructure for implementing cgroup statistics. Based on the needs of each controller, we can incrementally add more statistics, event based support for notification of statistics, accumulation of taskstats into cgroup statistics in the future. Sample output # ./cgroupstats -C /cgroup/a sleeping 2, blocked 0, running 1, stopped 0, uninterruptible 0 # ./cgroupstats -C /cgroup/ sleeping 154, blocked 0, running 0, stopped 0, uninterruptible 0 If the approach looks good, I'll enhance and post the user space utility for the same Feedback, comments, test results are always welcome! [akpm@linux-foundation.org: build fix] Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Paul Menage <menage@google.com> Cc: Jay Lan <jlan@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/accounting/cgroupstats.txt27
-rw-r--r--include/linux/Kbuild1
-rw-r--r--include/linux/cgroup.h8
-rw-r--r--include/linux/cgroupstats.h70
-rw-r--r--include/linux/delayacct.h13
-rw-r--r--kernel/cgroup.c55
-rw-r--r--kernel/taskstats.c67
7 files changed, 241 insertions, 0 deletions
diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt
new file mode 100644
index 000000000000..eda40fd39cad
--- /dev/null
+++ b/Documentation/accounting/cgroupstats.txt
@@ -0,0 +1,27 @@
1Control Groupstats is inspired by the discussion at
2http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as
3suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263.
4
5The per-cgroup statistics infrastructure re-uses code from the taskstats
6interface. A new set of cgroup operations is registered with commands
7and attributes specific to cgroups. It should be very easy to
8extend per-cgroup statistics by adding members to the cgroupstats
9structure.
10
11The current model for cgroupstats is a pull model; a push model (to post
12statistics on interesting events) should be very easy to add. Currently
13user space requests statistics by passing a file descriptor opened on the
14cgroup directory. Statistics about the state of all the tasks in the cgroup
15are returned to user space.
16
17NOTE: We currently rely on delay accounting for extracting information
18about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this
19information will not be available.
20
21To extract cgroup statistics, a utility very similar to getdelays.c
22has been developed; sample output of the utility is shown below:
23
24~/balbir/cgroupstats # ./getdelays -C "/cgroup/a"
25sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0
26~/balbir/cgroupstats # ./getdelays -C "/cgroup"
27sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 7ac8303c8471..e3ffd14a3f0b 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -47,6 +47,7 @@ header-y += coda_psdev.h
47header-y += coff.h 47header-y += coff.h
48header-y += comstats.h 48header-y += comstats.h
49header-y += const.h 49header-y += const.h
50header-y += cgroupstats.h
50header-y += cycx_cfm.h 51header-y += cycx_cfm.h
51header-y += dlm_device.h 52header-y += dlm_device.h
52header-y += dlm_netlink.h 53header-y += dlm_netlink.h
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9e9b7efa180b..87479328d46d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -13,6 +13,7 @@
13#include <linux/cpumask.h> 13#include <linux/cpumask.h>
14#include <linux/nodemask.h> 14#include <linux/nodemask.h>
15#include <linux/rcupdate.h> 15#include <linux/rcupdate.h>
16#include <linux/cgroupstats.h>
16 17
17#ifdef CONFIG_CGROUPS 18#ifdef CONFIG_CGROUPS
18 19
@@ -29,6 +30,8 @@ extern void cgroup_fork(struct task_struct *p);
29extern void cgroup_fork_callbacks(struct task_struct *p); 30extern void cgroup_fork_callbacks(struct task_struct *p);
30extern void cgroup_post_fork(struct task_struct *p); 31extern void cgroup_post_fork(struct task_struct *p);
31extern void cgroup_exit(struct task_struct *p, int run_callbacks); 32extern void cgroup_exit(struct task_struct *p, int run_callbacks);
33extern int cgroupstats_build(struct cgroupstats *stats,
34 struct dentry *dentry);
32 35
33extern struct file_operations proc_cgroup_operations; 36extern struct file_operations proc_cgroup_operations;
34 37
@@ -313,6 +316,11 @@ static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
313 316
314static inline void cgroup_lock(void) {} 317static inline void cgroup_lock(void) {}
315static inline void cgroup_unlock(void) {} 318static inline void cgroup_unlock(void) {}
319static inline int cgroupstats_build(struct cgroupstats *stats,
320 struct dentry *dentry)
321{
322 return -EINVAL;
323}
316 324
317#endif /* !CONFIG_CGROUPS */ 325#endif /* !CONFIG_CGROUPS */
318 326
diff --git a/include/linux/cgroupstats.h b/include/linux/cgroupstats.h
new file mode 100644
index 000000000000..4f53abf6855d
--- /dev/null
+++ b/include/linux/cgroupstats.h
@@ -0,0 +1,70 @@
1/* cgroupstats.h - exporting per-cgroup statistics
2 *
3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of version 2.1 of the GNU Lesser General Public License
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 */
14
15#ifndef _LINUX_CGROUPSTATS_H
16#define _LINUX_CGROUPSTATS_H
17
18#include <linux/taskstats.h>
19
20/*
21 * Data shared between user space and kernel space on a per cgroup
22 * basis. This data is shared using taskstats.
23 *
24 * Most of these states are derived by looking at the task->state value
25 * For the nr_io_wait state, a flag in the delay accounting structure
26 * indicates that the task is waiting on IO
27 *
28 * Each member is aligned to an 8-byte boundary.
29 */
30struct cgroupstats {
31 __u64 nr_sleeping; /* Number of tasks sleeping */
32 __u64 nr_running; /* Number of tasks running */
33 __u64 nr_stopped; /* Number of tasks in stopped state */
34 __u64 nr_uninterruptible; /* Number of tasks in uninterruptible */
35 /* state */
36 __u64 nr_io_wait; /* Number of tasks waiting on IO */
37};
38
39/*
40 * Commands sent from userspace
41 * Not versioned. New commands should only be inserted at the enum's end
42 * prior to __CGROUPSTATS_CMD_MAX
43 */
44
45enum {
46 CGROUPSTATS_CMD_UNSPEC = __TASKSTATS_CMD_MAX, /* Reserved; numbering continues after the taskstats commands */
47 CGROUPSTATS_CMD_GET, /* user->kernel request/get-response */
48 CGROUPSTATS_CMD_NEW, /* kernel->user event; also the command used in the reply to CGROUPSTATS_CMD_GET */
49 __CGROUPSTATS_CMD_MAX,
50};
51
52#define CGROUPSTATS_CMD_MAX (__CGROUPSTATS_CMD_MAX - 1)
53
54enum {
55 CGROUPSTATS_TYPE_UNSPEC = 0, /* Reserved */
56 CGROUPSTATS_TYPE_CGROUP_STATS, /* payload is one struct cgroupstats */
57 __CGROUPSTATS_TYPE_MAX,
58};
59
60#define CGROUPSTATS_TYPE_MAX (__CGROUPSTATS_TYPE_MAX - 1)
61
62enum {
63 CGROUPSTATS_CMD_ATTR_UNSPEC = 0,
64 CGROUPSTATS_CMD_ATTR_FD,
65 __CGROUPSTATS_CMD_ATTR_MAX,
66};
67
68#define CGROUPSTATS_CMD_ATTR_MAX (__CGROUPSTATS_CMD_ATTR_MAX - 1)
69
70#endif /* _LINUX_CGROUPSTATS_H */
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 55d1ca5e60f5..ab94bc083558 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -26,6 +26,7 @@
26 * Used to set current->delays->flags 26 * Used to set current->delays->flags
27 */ 27 */
28#define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */ 28#define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */
29#define DELAYACCT_PF_BLKIO 0x00000002 /* I am waiting on IO */
29 30
30#ifdef CONFIG_TASK_DELAY_ACCT 31#ifdef CONFIG_TASK_DELAY_ACCT
31 32
@@ -39,6 +40,14 @@ extern void __delayacct_blkio_end(void);
39extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); 40extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
40extern __u64 __delayacct_blkio_ticks(struct task_struct *); 41extern __u64 __delayacct_blkio_ticks(struct task_struct *);
41 42
43static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
44{
45 if (p->delays)
46 return (p->delays->flags & DELAYACCT_PF_BLKIO);
47 else
48 return 0;
49}
50
42static inline void delayacct_set_flag(int flag) 51static inline void delayacct_set_flag(int flag)
43{ 52{
44 if (current->delays) 53 if (current->delays)
@@ -71,6 +80,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk)
71 80
72static inline void delayacct_blkio_start(void) 81static inline void delayacct_blkio_start(void)
73{ 82{
83 delayacct_set_flag(DELAYACCT_PF_BLKIO);
74 if (current->delays) 84 if (current->delays)
75 __delayacct_blkio_start(); 85 __delayacct_blkio_start();
76} 86}
@@ -79,6 +89,7 @@ static inline void delayacct_blkio_end(void)
79{ 89{
80 if (current->delays) 90 if (current->delays)
81 __delayacct_blkio_end(); 91 __delayacct_blkio_end();
92 delayacct_clear_flag(DELAYACCT_PF_BLKIO);
82} 93}
83 94
84static inline int delayacct_add_tsk(struct taskstats *d, 95static inline int delayacct_add_tsk(struct taskstats *d,
@@ -116,6 +127,8 @@ static inline int delayacct_add_tsk(struct taskstats *d,
116{ return 0; } 127{ return 0; }
117static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) 128static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
118{ return 0; } 129{ return 0; }
130static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
131{ return 0; }
119#endif /* CONFIG_TASK_DELAY_ACCT */ 132#endif /* CONFIG_TASK_DELAY_ACCT */
120 133
121#endif 134#endif
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d65a1246829f..ca38db223f84 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -44,6 +44,9 @@
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/sort.h> 45#include <linux/sort.h>
46#include <linux/kmod.h> 46#include <linux/kmod.h>
47#include <linux/delayacct.h>
48#include <linux/cgroupstats.h>
49
47#include <asm/atomic.h> 50#include <asm/atomic.h>
48 51
49static DEFINE_MUTEX(cgroup_mutex); 52static DEFINE_MUTEX(cgroup_mutex);
@@ -1766,6 +1769,58 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cont)
1766 return n; 1769 return n;
1767} 1770}
1768 1771
1772/**
1773 * Build and fill cgroupstats so that taskstats can export it to user
1774 * space.
1775 *
1776 * @stats: cgroupstats to fill information into
1777 * @dentry: A dentry entry belonging to the cgroup for which stats have
1778 * been requested.
1779 */
1780int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
1781{
1782 int ret = -EINVAL;
1783 struct cgroup *cont;
1784 struct cgroup_iter it;
1785 struct task_struct *tsk;
1786 /*
1787 * Validate dentry by checking the superblock operations
1788 */
1789 if (dentry->d_sb->s_op != &cgroup_ops)
1790 goto err;
1791
1792 ret = 0;
1793 cont = dentry->d_fsdata;
1794 rcu_read_lock();
1795
1796 cgroup_iter_start(cont, &it);
1797 while ((tsk = cgroup_iter_next(cont, &it))) {
1798 switch (tsk->state) {
1799 case TASK_RUNNING:
1800 stats->nr_running++;
1801 break;
1802 case TASK_INTERRUPTIBLE:
1803 stats->nr_sleeping++;
1804 break;
1805 case TASK_UNINTERRUPTIBLE:
1806 stats->nr_uninterruptible++;
1807 break;
1808 case TASK_STOPPED:
1809 stats->nr_stopped++;
1810 break;
1811 default:
1812 if (delayacct_is_task_waiting_on_io(tsk))
1813 stats->nr_io_wait++;
1814 break;
1815 }
1816 }
1817 cgroup_iter_end(cont, &it);
1818
1819 rcu_read_unlock();
1820err:
1821 return ret;
1822}
1823
1769static int cmppid(const void *a, const void *b) 1824static int cmppid(const void *a, const void *b)
1770{ 1825{
1771 return *(pid_t *)a - *(pid_t *)b; 1826 return *(pid_t *)a - *(pid_t *)b;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 7d4d7f9c1bb2..9f360f68aad6 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -22,6 +22,10 @@
22#include <linux/delayacct.h> 22#include <linux/delayacct.h>
23#include <linux/cpumask.h> 23#include <linux/cpumask.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/cgroupstats.h>
26#include <linux/cgroup.h>
27#include <linux/fs.h>
28#include <linux/file.h>
25#include <net/genetlink.h> 29#include <net/genetlink.h>
26#include <asm/atomic.h> 30#include <asm/atomic.h>
27 31
@@ -49,6 +53,11 @@ __read_mostly = {
49 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
50 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
51 55
56static struct nla_policy
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59};
60
52struct listener { 61struct listener {
53 struct list_head list; 62 struct list_head list;
54 pid_t pid; 63 pid_t pid;
@@ -372,6 +381,51 @@ err:
372 return NULL; 381 return NULL;
373} 382}
374 383
384static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
385{
386 int rc = 0;
387 struct sk_buff *rep_skb;
388 struct cgroupstats *stats;
389 struct nlattr *na;
390 size_t size;
391 u32 fd;
392 struct file *file;
393 int fput_needed;
394
395 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
396 if (!na)
397 return -EINVAL;
398
399 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
400 file = fget_light(fd, &fput_needed);
401 if (file) {
402 size = nla_total_size(sizeof(struct cgroupstats));
403
404 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
405 size);
406 if (rc < 0)
407 goto err;
408
409 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
410 sizeof(struct cgroupstats));
411 stats = nla_data(na);
412 memset(stats, 0, sizeof(*stats));
413
414 rc = cgroupstats_build(stats, file->f_dentry);
415 if (rc < 0)
416 goto err;
417
418 fput_light(file, fput_needed);
419 return send_reply(rep_skb, info->snd_pid);
420 }
421
422err:
423 if (file)
424 fput_light(file, fput_needed);
425 nlmsg_free(rep_skb);
426 return rc;
427}
428
375static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 429static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
376{ 430{
377 int rc = 0; 431 int rc = 0;
@@ -522,6 +576,12 @@ static struct genl_ops taskstats_ops = {
522 .policy = taskstats_cmd_get_policy, 576 .policy = taskstats_cmd_get_policy,
523}; 577};
524 578
579static struct genl_ops cgroupstats_ops = {
580 .cmd = CGROUPSTATS_CMD_GET,
581 .doit = cgroupstats_user_cmd,
582 .policy = cgroupstats_cmd_get_policy,
583};
584
525/* Needed early in initialization */ 585/* Needed early in initialization */
526void __init taskstats_init_early(void) 586void __init taskstats_init_early(void)
527{ 587{
@@ -546,8 +606,15 @@ static int __init taskstats_init(void)
546 if (rc < 0) 606 if (rc < 0)
547 goto err; 607 goto err;
548 608
609 rc = genl_register_ops(&family, &cgroupstats_ops);
610 if (rc < 0)
611 goto err_cgroup_ops;
612
549 family_registered = 1; 613 family_registered = 1;
614 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
550 return 0; 615 return 0;
616err_cgroup_ops:
617 genl_unregister_ops(&family, &taskstats_ops);
551err: 618err:
552 genl_unregister_family(&family); 619 genl_unregister_family(&family);
553 return rc; 620 return rc;