diff options
| -rw-r--r-- | Documentation/accounting/taskstats.txt | 146 | ||||
| -rw-r--r-- | include/linux/taskstats.h | 84 | ||||
| -rw-r--r-- | include/linux/taskstats_kern.h | 57 | ||||
| -rw-r--r-- | init/Kconfig | 13 | ||||
| -rw-r--r-- | init/main.c | 2 | ||||
| -rw-r--r-- | kernel/Makefile | 1 | ||||
| -rw-r--r-- | kernel/exit.c | 7 | ||||
| -rw-r--r-- | kernel/taskstats.c | 336 |
8 files changed, 646 insertions, 0 deletions
diff --git a/Documentation/accounting/taskstats.txt b/Documentation/accounting/taskstats.txt new file mode 100644 index 000000000000..ad9b6997e162 --- /dev/null +++ b/Documentation/accounting/taskstats.txt | |||
| @@ -0,0 +1,146 @@ | |||
| 1 | Per-task statistics interface | ||
| 2 | ----------------------------- | ||
| 3 | |||
| 4 | |||
| 5 | Taskstats is a netlink-based interface for sending per-task and | ||
| 6 | per-process statistics from the kernel to userspace. | ||
| 7 | |||
| 8 | Taskstats was designed for the following benefits: | ||
| 9 | |||
| 10 | - efficiently provide statistics during lifetime of a task and on its exit | ||
| 11 | - unified interface for multiple accounting subsystems | ||
| 12 | - extensibility for use by future accounting patches | ||
| 13 | |||
| 14 | Terminology | ||
| 15 | ----------- | ||
| 16 | |||
| 17 | "pid", "tid" and "task" are used interchangeably and refer to the standard | ||
| 18 | Linux task defined by struct task_struct. per-pid stats are the same as | ||
| 19 | per-task stats. | ||
| 20 | |||
| 21 | "tgid", "process" and "thread group" are used interchangeably and refer to the | ||
| 22 | tasks that share an mm_struct i.e. the traditional Unix process. Despite the | ||
| 23 | use of tgid, there is no special treatment for the task that is thread group | ||
| 24 | leader - a process is deemed alive as long as it has any task belonging to it. | ||
| 25 | |||
| 26 | Usage | ||
| 27 | ----- | ||
| 28 | |||
| 29 | To get statistics during task's lifetime, userspace opens a unicast netlink | ||
| 30 | socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid. | ||
| 31 | The response contains statistics for a task (if pid is specified) or the sum of | ||
| 32 | statistics for all tasks of the process (if tgid is specified). | ||
| 33 | |||
| 34 | To obtain statistics for tasks which are exiting, userspace opens a multicast | ||
| 35 | netlink socket. Each time a task exits, two records are sent by the kernel to | ||
| 36 | each listener on the multicast socket. The first is the per-pid task's statistics | ||
| 37 | and the second is the sum for all tasks of the process to which the task | ||
| 38 | belongs (the task does not need to be the thread group leader). The need for | ||
| 39 | per-tgid stats to be sent for each exiting task is explained in the per-tgid | ||
| 40 | stats section below. | ||
| 41 | |||
| 42 | |||
| 43 | Interface | ||
| 44 | --------- | ||
| 45 | |||
| 46 | The user-kernel interface is encapsulated in include/linux/taskstats.h | ||
| 47 | |||
| 48 | To avoid this documentation becoming obsolete as the interface evolves, only | ||
| 49 | an outline of the current version is given. taskstats.h always overrides the | ||
| 50 | description here. | ||
| 51 | |||
| 52 | struct taskstats is the common accounting structure for both per-pid and | ||
| 53 | per-tgid data. It is versioned and can be extended by each accounting subsystem | ||
| 54 | that is added to the kernel. The fields and their semantics are defined in the | ||
| 55 | taskstats.h file. | ||
| 56 | |||
| 57 | The data exchanged between user and kernel space is a netlink message belonging | ||
| 58 | to the NETLINK_GENERIC family and using the netlink attributes interface. | ||
| 59 | The messages are in the format | ||
| 60 | |||
| 61 | +----------+- - -+-------------+-------------------+ | ||
| 62 | | nlmsghdr | Pad | genlmsghdr | taskstats payload | | ||
| 63 | +----------+- - -+-------------+-------------------+ | ||
| 64 | |||
| 65 | |||
| 66 | The taskstats payload is one of the following three kinds: | ||
| 67 | |||
| 68 | 1. Commands: Sent from user to kernel. The payload is one attribute, of type | ||
| 69 | TASKSTATS_CMD_ATTR_PID/TGID, containing a u32 pid or tgid in the attribute | ||
| 70 | payload. The pid/tgid denotes the task/process for which userspace wants | ||
| 71 | statistics. | ||
| 72 | |||
| 73 | 2. Response for a command: sent from the kernel in response to a userspace | ||
| 74 | command. The payload is a series of three attributes of type: | ||
| 75 | |||
| 76 | a) TASKSTATS_TYPE_AGGR_PID/TGID : attribute containing no payload but indicates | ||
| 77 | a pid/tgid will be followed by some stats. | ||
| 78 | |||
| 79 | b) TASKSTATS_TYPE_PID/TGID: attribute whose payload is the pid/tgid whose stats | ||
| 80 | is being returned. | ||
| 81 | |||
| 82 | c) TASKSTATS_TYPE_STATS: attribute with a struct taskstats as payload. The | ||
| 83 | same structure is used for both per-pid and per-tgid stats. | ||
| 84 | |||
| 85 | 3. New message sent by kernel whenever a task exits. The payload consists of a | ||
| 86 | series of attributes of the following type: | ||
| 87 | |||
| 88 | a) TASKSTATS_TYPE_AGGR_PID: indicates next two attributes will be pid+stats | ||
| 89 | b) TASKSTATS_TYPE_PID: contains exiting task's pid | ||
| 90 | c) TASKSTATS_TYPE_STATS: contains the exiting task's per-pid stats | ||
| 91 | d) TASKSTATS_TYPE_AGGR_TGID: indicates next two attributes will be tgid+stats | ||
| 92 | e) TASKSTATS_TYPE_TGID: contains tgid of process to which task belongs | ||
| 93 | f) TASKSTATS_TYPE_STATS: contains the per-tgid stats for exiting task's process | ||
| 94 | |||
| 95 | |||
| 96 | per-tgid stats | ||
| 97 | -------------- | ||
| 98 | |||
| 99 | Taskstats provides per-process stats, in addition to per-task stats, since | ||
| 100 | resource management is often done at a process granularity and aggregating task | ||
| 101 | stats in userspace alone is inefficient and potentially inaccurate (due to lack | ||
| 102 | of atomicity). | ||
| 103 | |||
| 104 | However, maintaining per-process, in addition to per-task stats, within the | ||
| 105 | kernel has space and time overheads. Hence the taskstats implementation | ||
| 106 | dynamically sums up the per-task stats for each task belonging to a process | ||
| 107 | whenever per-process stats are needed. | ||
| 108 | |||
| 109 | Not maintaining per-tgid stats creates a problem when userspace is interested | ||
| 110 | in getting these stats when the process dies i.e. the last thread of | ||
| 111 | a process exits. It isn't possible to simply return some aggregated per-process | ||
| 112 | statistic from the kernel. | ||
| 113 | |||
| 114 | The approach taken by taskstats is to return the per-tgid stats *each* time | ||
| 115 | a task exits, in addition to the per-pid stats for that task. Userspace can | ||
| 116 | maintain task<->process mappings and use them to maintain the per-process stats | ||
| 117 | in userspace, updating the aggregate appropriately as the tasks of a process | ||
| 118 | exit. | ||
| 119 | |||
| 120 | Extending taskstats | ||
| 121 | ------------------- | ||
| 122 | |||
| 123 | There are two ways to extend the taskstats interface to export more | ||
| 124 | per-task/process stats as patches to collect them get added to the kernel | ||
| 125 | in future: | ||
| 126 | |||
| 127 | 1. Adding more fields to the end of the existing struct taskstats. Backward | ||
| 128 | compatibility is ensured by the version number within the | ||
| 129 | structure. Userspace will use only the fields of the struct that correspond | ||
| 130 | to the version it is using. | ||
| 131 | |||
| 132 | 2. Defining separate statistic structs and using the netlink attributes | ||
| 133 | interface to return them. Since userspace processes each netlink attribute | ||
| 134 | independently, it can always ignore attributes whose type it does not | ||
| 135 | understand (because it is using an older version of the interface). | ||
| 136 | |||
| 137 | |||
| 138 | Choosing between 1. and 2. is a matter of trading off flexibility and | ||
| 139 | overhead. If only a few fields need to be added, then 1. is the preferable | ||
| 140 | path since the kernel and userspace don't need to incur the overhead of | ||
| 141 | processing new netlink attributes. But if the new fields expand the existing | ||
| 142 | struct too much, requiring disparate userspace accounting utilities to | ||
| 143 | unnecessarily receive large structures whose fields are of no interest, then | ||
| 144 | extending the attributes structure would be worthwhile. | ||
| 145 | |||
| 146 | ---- | ||
diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h new file mode 100644 index 000000000000..51f62759bea9 --- /dev/null +++ b/include/linux/taskstats.h | |||
| @@ -0,0 +1,84 @@ | |||
| 1 | /* taskstats.h - exporting per-task statistics | ||
| 2 | * | ||
| 3 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 | ||
| 4 | * (C) Balbir Singh, IBM Corp. 2006 | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of version 2.1 of the GNU Lesser General Public License | ||
| 8 | * as published by the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it would be useful, but | ||
| 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
| 13 | */ | ||
| 14 | |||
| 15 | #ifndef _LINUX_TASKSTATS_H | ||
| 16 | #define _LINUX_TASKSTATS_H | ||
| 17 | |||
| 18 | /* Format for per-task data returned to userland when | ||
| 19 | * - a task exits | ||
| 20 | * - listener requests stats for a task | ||
| 21 | * | ||
| 22 | * The struct is versioned. Newer versions should only add fields to | ||
| 23 | * the bottom of the struct to maintain backward compatibility. | ||
| 24 | * | ||
| 25 | * | ||
| 26 | * To add new fields | ||
| 27 | * a) bump up TASKSTATS_VERSION | ||
| 28 | * b) add comment indicating new version number at end of struct | ||
| 29 | * c) add new fields after version comment; maintain 64-bit alignment | ||
| 30 | */ | ||
| 31 | |||
| 32 | #define TASKSTATS_VERSION 1 | ||
| 33 | |||
| 34 | struct taskstats { | ||
| 35 | |||
| 36 | /* Version 1 */ | ||
| 37 | __u64 version; | ||
| 38 | }; | ||
| 39 | |||
| 40 | |||
| 41 | #define TASKSTATS_LISTEN_GROUP 0x1 | ||
| 42 | |||
| 43 | /* | ||
| 44 | * Commands sent from userspace | ||
| 45 | * Not versioned. New commands should only be inserted at the enum's end | ||
| 46 | * prior to __TASKSTATS_CMD_MAX | ||
| 47 | */ | ||
| 48 | |||
enum {
	TASKSTATS_CMD_UNSPEC	= 0,	/* Reserved */
	TASKSTATS_CMD_GET	= 1,	/* user->kernel request/get-response */
	TASKSTATS_CMD_NEW	= 2,	/* kernel->user event */
	__TASKSTATS_CMD_MAX,		/* sentinel, keep last */
};

/* Highest valid command number */
#define TASKSTATS_CMD_MAX (__TASKSTATS_CMD_MAX - 1)
| 57 | |||
/* Attribute types used in messages sent from the kernel */
enum {
	TASKSTATS_TYPE_UNSPEC		= 0,	/* Reserved */
	TASKSTATS_TYPE_PID		= 1,	/* Process id */
	TASKSTATS_TYPE_TGID		= 2,	/* Thread group id */
	TASKSTATS_TYPE_STATS		= 3,	/* taskstats structure */
	TASKSTATS_TYPE_AGGR_PID		= 4,	/* contains pid + stats */
	TASKSTATS_TYPE_AGGR_TGID	= 5,	/* contains tgid + stats */
	__TASKSTATS_TYPE_MAX,			/* sentinel, keep last */
};

/* Highest valid attribute type */
#define TASKSTATS_TYPE_MAX (__TASKSTATS_TYPE_MAX - 1)
| 69 | |||
/* Attribute types carried by commands sent from userspace */
enum {
	TASKSTATS_CMD_ATTR_UNSPEC	= 0,
	TASKSTATS_CMD_ATTR_PID		= 1,
	TASKSTATS_CMD_ATTR_TGID		= 2,
	__TASKSTATS_CMD_ATTR_MAX,		/* sentinel, keep last */
};

/* Highest valid command attribute type */
#define TASKSTATS_CMD_ATTR_MAX (__TASKSTATS_CMD_ATTR_MAX - 1)
| 78 | |||
| 79 | /* NETLINK_GENERIC related info */ | ||
| 80 | |||
| 81 | #define TASKSTATS_GENL_NAME "TASKSTATS" | ||
| 82 | #define TASKSTATS_GENL_VERSION 0x1 | ||
| 83 | |||
| 84 | #endif /* _LINUX_TASKSTATS_H */ | ||
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h new file mode 100644 index 000000000000..bd0ecb969c26 --- /dev/null +++ b/include/linux/taskstats_kern.h | |||
| @@ -0,0 +1,57 @@ | |||
| 1 | /* taskstats_kern.h - kernel header for per-task statistics interface | ||
| 2 | * | ||
| 3 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 | ||
| 4 | * (C) Balbir Singh, IBM Corp. 2006 | ||
| 5 | */ | ||
| 6 | |||
| 7 | #ifndef _LINUX_TASKSTATS_KERN_H | ||
| 8 | #define _LINUX_TASKSTATS_KERN_H | ||
| 9 | |||
| 10 | #include <linux/taskstats.h> | ||
| 11 | #include <linux/sched.h> | ||
| 12 | |||
/* Delivery mode selector passed to the reply-sending code */
enum {
	TASKSTATS_MSG_UNICAST	= 0,	/* send data only to requester */
	TASKSTATS_MSG_MULTICAST	= 1,	/* send data to a group */
};
| 17 | |||
| 18 | #ifdef CONFIG_TASKSTATS | ||
| 19 | extern kmem_cache_t *taskstats_cache; | ||
| 20 | |||
| 21 | static inline void taskstats_exit_alloc(struct taskstats **ptidstats, | ||
| 22 | struct taskstats **ptgidstats) | ||
| 23 | { | ||
| 24 | *ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); | ||
| 25 | *ptgidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); | ||
| 26 | } | ||
| 27 | |||
| 28 | static inline void taskstats_exit_free(struct taskstats *tidstats, | ||
| 29 | struct taskstats *tgidstats) | ||
| 30 | { | ||
| 31 | if (tidstats) | ||
| 32 | kmem_cache_free(taskstats_cache, tidstats); | ||
| 33 | if (tgidstats) | ||
| 34 | kmem_cache_free(taskstats_cache, tgidstats); | ||
| 35 | } | ||
| 36 | |||
| 37 | extern void taskstats_exit_send(struct task_struct *, struct taskstats *, | ||
| 38 | struct taskstats *); | ||
| 39 | extern void taskstats_init_early(void); | ||
| 40 | |||
| 41 | #else | ||
/*
 * CONFIG_TASKSTATS is not set: every hook becomes an empty inline so that
 * callers need no #ifdefs of their own.
 */
static inline void taskstats_exit_alloc(struct taskstats **ptidstats,
					struct taskstats **ptgidstats)
{
}

static inline void taskstats_exit_free(struct taskstats *ptidstats,
					struct taskstats *ptgidstats)
{
}

static inline void taskstats_exit_send(struct task_struct *tsk,
					struct taskstats *tidstats,
					struct taskstats *tgidstats)
{
}

static inline void taskstats_init_early(void)
{
}
| 54 | #endif /* CONFIG_TASKSTATS */ | ||
| 55 | |||
| 56 | #endif | ||
| 57 | |||
diff --git a/init/Kconfig b/init/Kconfig index 90498a3e53da..56a7093b4e4c 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
| @@ -158,6 +158,19 @@ config BSD_PROCESS_ACCT_V3 | |||
| 158 | for processing it. A preliminary version of these tools is available | 158 | for processing it. A preliminary version of these tools is available |
| 159 | at <http://www.physik3.uni-rostock.de/tim/kernel/utils/acct/>. | 159 | at <http://www.physik3.uni-rostock.de/tim/kernel/utils/acct/>. |
| 160 | 160 | ||
| 161 | config TASKSTATS | ||
| 162 | bool "Export task/process statistics through netlink (EXPERIMENTAL)" | ||
| 163 | depends on NET | ||
| 164 | default n | ||
| 165 | help | ||
| 166 | Export selected statistics for tasks/processes through the | ||
| 167 | generic netlink interface. Unlike BSD process accounting, the | ||
| 168 | statistics are available during the lifetime of tasks/processes as | ||
| 169 | responses to commands. Like BSD accounting, they are sent to user | ||
| 170 | space on task exit. | ||
| 171 | |||
| 172 | Say N if unsure. | ||
| 173 | |||
| 161 | config TASK_DELAY_ACCT | 174 | config TASK_DELAY_ACCT |
| 162 | bool "Enable per-task delay accounting (EXPERIMENTAL)" | 175 | bool "Enable per-task delay accounting (EXPERIMENTAL)" |
| 163 | help | 176 | help |
diff --git a/init/main.c b/init/main.c index 9e8e8c152142..8651a720a092 100644 --- a/init/main.c +++ b/init/main.c | |||
| @@ -41,6 +41,7 @@ | |||
| 41 | #include <linux/cpu.h> | 41 | #include <linux/cpu.h> |
| 42 | #include <linux/cpuset.h> | 42 | #include <linux/cpuset.h> |
| 43 | #include <linux/efi.h> | 43 | #include <linux/efi.h> |
| 44 | #include <linux/taskstats_kern.h> | ||
| 44 | #include <linux/delayacct.h> | 45 | #include <linux/delayacct.h> |
| 45 | #include <linux/unistd.h> | 46 | #include <linux/unistd.h> |
| 46 | #include <linux/rmap.h> | 47 | #include <linux/rmap.h> |
| @@ -575,6 +576,7 @@ asmlinkage void __init start_kernel(void) | |||
| 575 | proc_root_init(); | 576 | proc_root_init(); |
| 576 | #endif | 577 | #endif |
| 577 | cpuset_init(); | 578 | cpuset_init(); |
| 579 | taskstats_init_early(); | ||
| 578 | delayacct_init(); | 580 | delayacct_init(); |
| 579 | 581 | ||
| 580 | check_bugs(); | 582 | check_bugs(); |
diff --git a/kernel/Makefile b/kernel/Makefile index 87bb34cc8938..d62ec66c1af2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -49,6 +49,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o | |||
| 49 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 49 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
| 50 | obj-$(CONFIG_RELAY) += relay.o | 50 | obj-$(CONFIG_RELAY) += relay.o |
| 51 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 51 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
| 52 | obj-$(CONFIG_TASKSTATS) += taskstats.o | ||
| 52 | 53 | ||
| 53 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 54 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
| 54 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 55 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/exit.c b/kernel/exit.c index 3c2cf91defa7..9852ed8c2988 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/mount.h> | 25 | #include <linux/mount.h> |
| 26 | #include <linux/proc_fs.h> | 26 | #include <linux/proc_fs.h> |
| 27 | #include <linux/mempolicy.h> | 27 | #include <linux/mempolicy.h> |
| 28 | #include <linux/taskstats_kern.h> | ||
| 28 | #include <linux/delayacct.h> | 29 | #include <linux/delayacct.h> |
| 29 | #include <linux/cpuset.h> | 30 | #include <linux/cpuset.h> |
| 30 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
| @@ -844,6 +845,7 @@ static void exit_notify(struct task_struct *tsk) | |||
| 844 | fastcall NORET_TYPE void do_exit(long code) | 845 | fastcall NORET_TYPE void do_exit(long code) |
| 845 | { | 846 | { |
| 846 | struct task_struct *tsk = current; | 847 | struct task_struct *tsk = current; |
| 848 | struct taskstats *tidstats, *tgidstats; | ||
| 847 | int group_dead; | 849 | int group_dead; |
| 848 | 850 | ||
| 849 | profile_task_exit(tsk); | 851 | profile_task_exit(tsk); |
| @@ -882,6 +884,8 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 882 | current->comm, current->pid, | 884 | current->comm, current->pid, |
| 883 | preempt_count()); | 885 | preempt_count()); |
| 884 | 886 | ||
| 887 | taskstats_exit_alloc(&tidstats, &tgidstats); | ||
| 888 | |||
| 885 | acct_update_integrals(tsk); | 889 | acct_update_integrals(tsk); |
| 886 | if (tsk->mm) { | 890 | if (tsk->mm) { |
| 887 | update_hiwater_rss(tsk->mm); | 891 | update_hiwater_rss(tsk->mm); |
| @@ -901,7 +905,10 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 901 | #endif | 905 | #endif |
| 902 | if (unlikely(tsk->audit_context)) | 906 | if (unlikely(tsk->audit_context)) |
| 903 | audit_free(tsk); | 907 | audit_free(tsk); |
| 908 | taskstats_exit_send(tsk, tidstats, tgidstats); | ||
| 909 | taskstats_exit_free(tidstats, tgidstats); | ||
| 904 | delayacct_tsk_exit(tsk); | 910 | delayacct_tsk_exit(tsk); |
| 911 | |||
| 905 | exit_mm(tsk); | 912 | exit_mm(tsk); |
| 906 | 913 | ||
| 907 | if (group_dead) | 914 | if (group_dead) |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c new file mode 100644 index 000000000000..82ec9137d908 --- /dev/null +++ b/kernel/taskstats.c | |||
| @@ -0,0 +1,336 @@ | |||
| 1 | /* | ||
| 2 | * taskstats.c - Export per-task statistics to userland | ||
| 3 | * | ||
| 4 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 | ||
| 5 | * (C) Balbir Singh, IBM Corp. 2006 | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License as published by | ||
| 9 | * the Free Software Foundation; either version 2 of the License, or | ||
| 10 | * (at your option) any later version. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 15 | * GNU General Public License for more details. | ||
| 16 | * | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include <linux/kernel.h> | ||
| 20 | #include <linux/taskstats_kern.h> | ||
| 21 | #include <net/genetlink.h> | ||
| 22 | #include <asm/atomic.h> | ||
| 23 | |||
| 24 | static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; | ||
| 25 | static int family_registered; | ||
| 26 | kmem_cache_t *taskstats_cache; | ||
| 27 | static DEFINE_MUTEX(taskstats_exit_mutex); | ||
| 28 | |||
| 29 | static struct genl_family family = { | ||
| 30 | .id = GENL_ID_GENERATE, | ||
| 31 | .name = TASKSTATS_GENL_NAME, | ||
| 32 | .version = TASKSTATS_GENL_VERSION, | ||
| 33 | .maxattr = TASKSTATS_CMD_ATTR_MAX, | ||
| 34 | }; | ||
| 35 | |||
| 36 | static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] | ||
| 37 | __read_mostly = { | ||
| 38 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, | ||
| 39 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, | ||
| 40 | }; | ||
| 41 | |||
| 42 | |||
| 43 | static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | ||
| 44 | void **replyp, size_t size) | ||
| 45 | { | ||
| 46 | struct sk_buff *skb; | ||
| 47 | void *reply; | ||
| 48 | |||
| 49 | /* | ||
| 50 | * If new attributes are added, please revisit this allocation | ||
| 51 | */ | ||
| 52 | skb = nlmsg_new(size); | ||
| 53 | if (!skb) | ||
| 54 | return -ENOMEM; | ||
| 55 | |||
| 56 | if (!info) { | ||
| 57 | int seq = get_cpu_var(taskstats_seqnum)++; | ||
| 58 | put_cpu_var(taskstats_seqnum); | ||
| 59 | |||
| 60 | reply = genlmsg_put(skb, 0, seq, | ||
| 61 | family.id, 0, 0, | ||
| 62 | cmd, family.version); | ||
| 63 | } else | ||
| 64 | reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, | ||
| 65 | family.id, 0, 0, | ||
| 66 | cmd, family.version); | ||
| 67 | if (reply == NULL) { | ||
| 68 | nlmsg_free(skb); | ||
| 69 | return -EINVAL; | ||
| 70 | } | ||
| 71 | |||
| 72 | *skbp = skb; | ||
| 73 | *replyp = reply; | ||
| 74 | return 0; | ||
| 75 | } | ||
| 76 | |||
| 77 | static int send_reply(struct sk_buff *skb, pid_t pid, int event) | ||
| 78 | { | ||
| 79 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); | ||
| 80 | void *reply; | ||
| 81 | int rc; | ||
| 82 | |||
| 83 | reply = genlmsg_data(genlhdr); | ||
| 84 | |||
| 85 | rc = genlmsg_end(skb, reply); | ||
| 86 | if (rc < 0) { | ||
| 87 | nlmsg_free(skb); | ||
| 88 | return rc; | ||
| 89 | } | ||
| 90 | |||
| 91 | if (event == TASKSTATS_MSG_MULTICAST) | ||
| 92 | return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP); | ||
| 93 | return genlmsg_unicast(skb, pid); | ||
| 94 | } | ||
| 95 | |||
| 96 | static int fill_pid(pid_t pid, struct task_struct *pidtsk, | ||
| 97 | struct taskstats *stats) | ||
| 98 | { | ||
| 99 | int rc; | ||
| 100 | struct task_struct *tsk = pidtsk; | ||
| 101 | |||
| 102 | if (!pidtsk) { | ||
| 103 | read_lock(&tasklist_lock); | ||
| 104 | tsk = find_task_by_pid(pid); | ||
| 105 | if (!tsk) { | ||
| 106 | read_unlock(&tasklist_lock); | ||
| 107 | return -ESRCH; | ||
| 108 | } | ||
| 109 | get_task_struct(tsk); | ||
| 110 | read_unlock(&tasklist_lock); | ||
| 111 | } else | ||
| 112 | get_task_struct(tsk); | ||
| 113 | |||
| 114 | /* | ||
| 115 | * Each accounting subsystem adds calls to its functions to | ||
| 116 | * fill in relevant parts of struct taskstsats as follows | ||
| 117 | * | ||
| 118 | * rc = per-task-foo(stats, tsk); | ||
| 119 | * if (rc) | ||
| 120 | * goto err; | ||
| 121 | */ | ||
| 122 | |||
| 123 | err: | ||
| 124 | put_task_struct(tsk); | ||
| 125 | return rc; | ||
| 126 | |||
| 127 | } | ||
| 128 | |||
| 129 | static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, | ||
| 130 | struct taskstats *stats) | ||
| 131 | { | ||
| 132 | int rc; | ||
| 133 | struct task_struct *tsk, *first; | ||
| 134 | |||
| 135 | first = tgidtsk; | ||
| 136 | read_lock(&tasklist_lock); | ||
| 137 | if (!first) { | ||
| 138 | first = find_task_by_pid(tgid); | ||
| 139 | if (!first) { | ||
| 140 | read_unlock(&tasklist_lock); | ||
| 141 | return -ESRCH; | ||
| 142 | } | ||
| 143 | } | ||
| 144 | tsk = first; | ||
| 145 | do { | ||
| 146 | /* | ||
| 147 | * Each accounting subsystem adds calls its functions to | ||
| 148 | * fill in relevant parts of struct taskstsats as follows | ||
| 149 | * | ||
| 150 | * rc = per-task-foo(stats, tsk); | ||
| 151 | * if (rc) | ||
| 152 | * break; | ||
| 153 | */ | ||
| 154 | |||
| 155 | } while_each_thread(first, tsk); | ||
| 156 | read_unlock(&tasklist_lock); | ||
| 157 | |||
| 158 | /* | ||
| 159 | * Accounting subsytems can also add calls here if they don't | ||
| 160 | * wish to aggregate statistics for per-tgid stats | ||
| 161 | */ | ||
| 162 | |||
| 163 | return rc; | ||
| 164 | } | ||
| 165 | |||
| 166 | static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) | ||
| 167 | { | ||
| 168 | int rc = 0; | ||
| 169 | struct sk_buff *rep_skb; | ||
| 170 | struct taskstats stats; | ||
| 171 | void *reply; | ||
| 172 | size_t size; | ||
| 173 | struct nlattr *na; | ||
| 174 | |||
| 175 | /* | ||
| 176 | * Size includes space for nested attributes | ||
| 177 | */ | ||
| 178 | size = nla_total_size(sizeof(u32)) + | ||
| 179 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
| 180 | |||
| 181 | memset(&stats, 0, sizeof(stats)); | ||
| 182 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | ||
| 183 | if (rc < 0) | ||
| 184 | return rc; | ||
| 185 | |||
| 186 | if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { | ||
| 187 | u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); | ||
| 188 | rc = fill_pid(pid, NULL, &stats); | ||
| 189 | if (rc < 0) | ||
| 190 | goto err; | ||
| 191 | |||
| 192 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | ||
| 193 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); | ||
| 194 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
| 195 | stats); | ||
| 196 | } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { | ||
| 197 | u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | ||
| 198 | rc = fill_tgid(tgid, NULL, &stats); | ||
| 199 | if (rc < 0) | ||
| 200 | goto err; | ||
| 201 | |||
| 202 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | ||
| 203 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); | ||
| 204 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
| 205 | stats); | ||
| 206 | } else { | ||
| 207 | rc = -EINVAL; | ||
| 208 | goto err; | ||
| 209 | } | ||
| 210 | |||
| 211 | nla_nest_end(rep_skb, na); | ||
| 212 | |||
| 213 | return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST); | ||
| 214 | |||
| 215 | nla_put_failure: | ||
| 216 | return genlmsg_cancel(rep_skb, reply); | ||
| 217 | err: | ||
| 218 | nlmsg_free(rep_skb); | ||
| 219 | return rc; | ||
| 220 | } | ||
| 221 | |||
| 222 | /* Send pid data out on exit */ | ||
| 223 | void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, | ||
| 224 | struct taskstats *tgidstats) | ||
| 225 | { | ||
| 226 | int rc; | ||
| 227 | struct sk_buff *rep_skb; | ||
| 228 | void *reply; | ||
| 229 | size_t size; | ||
| 230 | int is_thread_group; | ||
| 231 | struct nlattr *na; | ||
| 232 | |||
| 233 | if (!family_registered || !tidstats) | ||
| 234 | return; | ||
| 235 | |||
| 236 | mutex_lock(&taskstats_exit_mutex); | ||
| 237 | |||
| 238 | is_thread_group = !thread_group_empty(tsk); | ||
| 239 | rc = 0; | ||
| 240 | |||
| 241 | /* | ||
| 242 | * Size includes space for nested attributes | ||
| 243 | */ | ||
| 244 | size = nla_total_size(sizeof(u32)) + | ||
| 245 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
| 246 | |||
| 247 | if (is_thread_group) | ||
| 248 | size = 2 * size; /* PID + STATS + TGID + STATS */ | ||
| 249 | |||
| 250 | rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | ||
| 251 | if (rc < 0) | ||
| 252 | goto ret; | ||
| 253 | |||
| 254 | rc = fill_pid(tsk->pid, tsk, tidstats); | ||
| 255 | if (rc < 0) | ||
| 256 | goto err_skb; | ||
| 257 | |||
| 258 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | ||
| 259 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); | ||
| 260 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
| 261 | *tidstats); | ||
| 262 | nla_nest_end(rep_skb, na); | ||
| 263 | |||
| 264 | if (!is_thread_group || !tgidstats) { | ||
| 265 | send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); | ||
| 266 | goto ret; | ||
| 267 | } | ||
| 268 | |||
| 269 | rc = fill_tgid(tsk->pid, tsk, tgidstats); | ||
| 270 | /* | ||
| 271 | * If fill_tgid() failed then one probable reason could be that the | ||
| 272 | * thread group leader has exited. fill_tgid() will fail, send out | ||
| 273 | * the pid statistics collected earlier. | ||
| 274 | */ | ||
| 275 | if (rc < 0) { | ||
| 276 | send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); | ||
| 277 | goto ret; | ||
| 278 | } | ||
| 279 | |||
| 280 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | ||
| 281 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); | ||
| 282 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
| 283 | *tgidstats); | ||
| 284 | nla_nest_end(rep_skb, na); | ||
| 285 | |||
| 286 | send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); | ||
| 287 | goto ret; | ||
| 288 | |||
| 289 | nla_put_failure: | ||
| 290 | genlmsg_cancel(rep_skb, reply); | ||
| 291 | goto ret; | ||
| 292 | err_skb: | ||
| 293 | nlmsg_free(rep_skb); | ||
| 294 | ret: | ||
| 295 | mutex_unlock(&taskstats_exit_mutex); | ||
| 296 | return; | ||
| 297 | } | ||
| 298 | |||
| 299 | static struct genl_ops taskstats_ops = { | ||
| 300 | .cmd = TASKSTATS_CMD_GET, | ||
| 301 | .doit = taskstats_send_stats, | ||
| 302 | .policy = taskstats_cmd_get_policy, | ||
| 303 | }; | ||
| 304 | |||
| 305 | /* Needed early in initialization */ | ||
| 306 | void __init taskstats_init_early(void) | ||
| 307 | { | ||
| 308 | taskstats_cache = kmem_cache_create("taskstats_cache", | ||
| 309 | sizeof(struct taskstats), | ||
| 310 | 0, SLAB_PANIC, NULL, NULL); | ||
| 311 | } | ||
| 312 | |||
| 313 | static int __init taskstats_init(void) | ||
| 314 | { | ||
| 315 | int rc; | ||
| 316 | |||
| 317 | rc = genl_register_family(&family); | ||
| 318 | if (rc) | ||
| 319 | return rc; | ||
| 320 | |||
| 321 | rc = genl_register_ops(&family, &taskstats_ops); | ||
| 322 | if (rc < 0) | ||
| 323 | goto err; | ||
| 324 | |||
| 325 | family_registered = 1; | ||
| 326 | return 0; | ||
| 327 | err: | ||
| 328 | genl_unregister_family(&family); | ||
| 329 | return rc; | ||
| 330 | } | ||
| 331 | |||
| 332 | /* | ||
| 333 | * late initcall ensures initialization of statistics collection | ||
| 334 | * mechanisms precedes initialization of the taskstats interface | ||
| 335 | */ | ||
| 336 | late_initcall(taskstats_init); | ||
