aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/accounting/delay-accounting.txt13
-rw-r--r--Documentation/accounting/taskstats.txt33
-rw-r--r--MAINTAINERS12
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/linux/taskstats_kern.h71
-rw-r--r--kernel/exit.c8
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/taskstats.c98
8 files changed, 162 insertions, 81 deletions
diff --git a/Documentation/accounting/delay-accounting.txt b/Documentation/accounting/delay-accounting.txt
index f3dc0ca04fa4..be215e58423b 100644
--- a/Documentation/accounting/delay-accounting.txt
+++ b/Documentation/accounting/delay-accounting.txt
@@ -48,9 +48,10 @@ counter (say cpu_delay_total) for a task will give the delay
48experienced by the task waiting for the corresponding resource 48experienced by the task waiting for the corresponding resource
49in that interval. 49in that interval.
50 50
51When a task exits, records containing the per-task and per-process statistics 51When a task exits, records containing the per-task statistics
52are sent to userspace without requiring a command. More details are given in 52are sent to userspace without requiring a command. If it is the last exiting
53the taskstats interface description. 53task of a thread group, the per-tgid statistics are also sent. More details
54are given in the taskstats interface description.
54 55
55The getdelays.c userspace utility in this directory allows simple commands to 56The getdelays.c userspace utility in this directory allows simple commands to
56be run and the corresponding delay statistics to be displayed. It also serves 57be run and the corresponding delay statistics to be displayed. It also serves
@@ -107,9 +108,3 @@ IO count delay total
107 0 0 108 0 0
108MEM count delay total 109MEM count delay total
109 0 0 110 0 0
110
111
112
113
114
115
diff --git a/Documentation/accounting/taskstats.txt b/Documentation/accounting/taskstats.txt
index acc6b4f37fc7..efd8f605bcd5 100644
--- a/Documentation/accounting/taskstats.txt
+++ b/Documentation/accounting/taskstats.txt
@@ -32,12 +32,11 @@ The response contains statistics for a task (if pid is specified) or the sum of
32statistics for all tasks of the process (if tgid is specified). 32statistics for all tasks of the process (if tgid is specified).
33 33
34To obtain statistics for tasks which are exiting, userspace opens a multicast 34To obtain statistics for tasks which are exiting, userspace opens a multicast
35netlink socket. Each time a task exits, two records are sent by the kernel to 35netlink socket. Each time a task exits, its per-pid statistics are always sent
36each listener on the multicast socket. The first the per-pid task's statistics 36by the kernel to each listener on the multicast socket. In addition, if it is
37and the second is the sum for all tasks of the process to which the task 37the last thread exiting its thread group, an additional record containing the
38belongs (the task does not need to be the thread group leader). The need for 38per-tgid stats is also sent. The latter contains the sum of per-pid stats for
39per-tgid stats to be sent for each exiting task is explained in the per-tgid 39all threads in the thread group, both past and present.
40stats section below.
41 40
42getdelays.c is a simple utility demonstrating usage of the taskstats interface 41getdelays.c is a simple utility demonstrating usage of the taskstats interface
43for reporting delay accounting statistics. 42for reporting delay accounting statistics.
@@ -104,20 +103,14 @@ stats in userspace alone is inefficient and potentially inaccurate (due to lack
104of atomicity). 103of atomicity).
105 104
106However, maintaining per-process, in addition to per-task stats, within the 105However, maintaining per-process, in addition to per-task stats, within the
107kernel has space and time overheads. Hence the taskstats implementation 106kernel has space and time overheads. To address this, the taskstats code
108dynamically sums up the per-task stats for each task belonging to a process 107accumulates each exiting task's statistics into a process-wide data structure.
109whenever per-process stats are needed. 108When the last task of a process exits, the process level data accumulated also
110 109gets sent to userspace (along with the per-task data).
111Not maintaining per-tgid stats creates a problem when userspace is interested 110
112in getting these stats when the process dies i.e. the last thread of 111When a user queries to get per-tgid data, the sum of all other live threads in
113a process exits. It isn't possible to simply return some aggregated per-process 112the group is added up and added to the accumulated total for previously exited
114statistic from the kernel. 113threads of the same thread group.
115
116The approach taken by taskstats is to return the per-tgid stats *each* time
117a task exits, in addition to the per-pid stats for that task. Userspace can
118maintain task<->process mappings and use them to maintain the per-process stats
119in userspace, updating the aggregate appropriately as the tasks of a process
120exit.
121 114
122Extending taskstats 115Extending taskstats
123------------------- 116-------------------
diff --git a/MAINTAINERS b/MAINTAINERS
index 0557cfde053d..e99028ca2f7c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2240,6 +2240,12 @@ M: tsbogend@alpha.franken.de
2240L: netdev@vger.kernel.org 2240L: netdev@vger.kernel.org
2241S: Maintained 2241S: Maintained
2242 2242
2243PER-TASK DELAY ACCOUNTING
2244P: Shailabh Nagar
2245M: nagar@watson.ibm.com
2246L: linux-kernel@vger.kernel.org
2247S: Maintained
2248
2243PERSONALITY HANDLING 2249PERSONALITY HANDLING
2244P: Christoph Hellwig 2250P: Christoph Hellwig
2245M: hch@infradead.org 2251M: hch@infradead.org
@@ -2767,6 +2773,12 @@ P: Deepak Saxena
2767M: dsaxena@plexity.net 2773M: dsaxena@plexity.net
2768S: Maintained 2774S: Maintained
2769 2775
2776TASKSTATS STATISTICS INTERFACE
2777P: Shailabh Nagar
2778M: nagar@watson.ibm.com
2779L: linux-kernel@vger.kernel.org
2780S: Maintained
2781
2770TI PARALLEL LINK CABLE DRIVER 2782TI PARALLEL LINK CABLE DRIVER
2771P: Romain Lievin 2783P: Romain Lievin
2772M: roms@lpg.ticalc.org 2784M: roms@lpg.ticalc.org
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3c5610ca0c92..6afa72e080cb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -463,6 +463,10 @@ struct signal_struct {
463#ifdef CONFIG_BSD_PROCESS_ACCT 463#ifdef CONFIG_BSD_PROCESS_ACCT
464 struct pacct_struct pacct; /* per-process accounting information */ 464 struct pacct_struct pacct; /* per-process accounting information */
465#endif 465#endif
466#ifdef CONFIG_TASKSTATS
467 spinlock_t stats_lock;
468 struct taskstats *stats;
469#endif
466}; 470};
467 471
468/* Context switch must be unlocked if interrupts are to be enabled */ 472/* Context switch must be unlocked if interrupts are to be enabled */
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index fc9da2e26443..0ae8f67af1fd 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -19,36 +19,75 @@ enum {
19extern kmem_cache_t *taskstats_cache; 19extern kmem_cache_t *taskstats_cache;
20extern struct mutex taskstats_exit_mutex; 20extern struct mutex taskstats_exit_mutex;
21 21
22static inline void taskstats_exit_alloc(struct taskstats **ptidstats, 22static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
23 struct taskstats **ptgidstats)
24{ 23{
25 *ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); 24 *ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
26 *ptgidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
27} 25}
28 26
29static inline void taskstats_exit_free(struct taskstats *tidstats, 27static inline void taskstats_exit_free(struct taskstats *tidstats)
30 struct taskstats *tgidstats)
31{ 28{
32 if (tidstats) 29 if (tidstats)
33 kmem_cache_free(taskstats_cache, tidstats); 30 kmem_cache_free(taskstats_cache, tidstats);
34 if (tgidstats)
35 kmem_cache_free(taskstats_cache, tgidstats);
36} 31}
37 32
38extern void taskstats_exit_send(struct task_struct *, struct taskstats *, 33static inline void taskstats_tgid_init(struct signal_struct *sig)
39 struct taskstats *); 34{
40extern void taskstats_init_early(void); 35 spin_lock_init(&sig->stats_lock);
36 sig->stats = NULL;
37}
38
39static inline void taskstats_tgid_alloc(struct signal_struct *sig)
40{
41 struct taskstats *stats;
42 unsigned long flags;
43
44 stats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
45 if (!stats)
46 return;
47
48 spin_lock_irqsave(&sig->stats_lock, flags);
49 if (!sig->stats) {
50 sig->stats = stats;
51 stats = NULL;
52 }
53 spin_unlock_irqrestore(&sig->stats_lock, flags);
54
55 if (stats)
56 kmem_cache_free(taskstats_cache, stats);
57}
41 58
59static inline void taskstats_tgid_free(struct signal_struct *sig)
60{
61 struct taskstats *stats = NULL;
62 unsigned long flags;
63
64 spin_lock_irqsave(&sig->stats_lock, flags);
65 if (sig->stats) {
66 stats = sig->stats;
67 sig->stats = NULL;
68 }
69 spin_unlock_irqrestore(&sig->stats_lock, flags);
70 if (stats)
71 kmem_cache_free(taskstats_cache, stats);
72}
73
74extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int);
75extern void taskstats_init_early(void);
76extern void taskstats_tgid_alloc(struct signal_struct *);
42#else 77#else
43static inline void taskstats_exit_alloc(struct taskstats **ptidstats, 78static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
44 struct taskstats **ptgidstats)
45{} 79{}
46static inline void taskstats_exit_free(struct taskstats *ptidstats, 80static inline void taskstats_exit_free(struct taskstats *ptidstats)
47 struct taskstats *ptgidstats)
48{} 81{}
49static inline void taskstats_exit_send(struct task_struct *tsk, 82static inline void taskstats_exit_send(struct task_struct *tsk,
50 struct taskstats *tidstats, 83 struct taskstats *tidstats,
51 struct taskstats *tgidstats) 84 int group_dead)
85{}
86static inline void taskstats_tgid_init(struct signal_struct *sig)
87{}
88static inline void taskstats_tgid_alloc(struct signal_struct *sig)
89{}
90static inline void taskstats_tgid_free(struct signal_struct *sig)
52{} 91{}
53static inline void taskstats_init_early(void) 92static inline void taskstats_init_early(void)
54{} 93{}
diff --git a/kernel/exit.c b/kernel/exit.c
index 9852ed8c2988..67c1e9a4f812 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -845,7 +845,7 @@ static void exit_notify(struct task_struct *tsk)
845fastcall NORET_TYPE void do_exit(long code) 845fastcall NORET_TYPE void do_exit(long code)
846{ 846{
847 struct task_struct *tsk = current; 847 struct task_struct *tsk = current;
848 struct taskstats *tidstats, *tgidstats; 848 struct taskstats *tidstats;
849 int group_dead; 849 int group_dead;
850 850
851 profile_task_exit(tsk); 851 profile_task_exit(tsk);
@@ -884,7 +884,7 @@ fastcall NORET_TYPE void do_exit(long code)
884 current->comm, current->pid, 884 current->comm, current->pid,
885 preempt_count()); 885 preempt_count());
886 886
887 taskstats_exit_alloc(&tidstats, &tgidstats); 887 taskstats_exit_alloc(&tidstats);
888 888
889 acct_update_integrals(tsk); 889 acct_update_integrals(tsk);
890 if (tsk->mm) { 890 if (tsk->mm) {
@@ -905,8 +905,8 @@ fastcall NORET_TYPE void do_exit(long code)
905#endif 905#endif
906 if (unlikely(tsk->audit_context)) 906 if (unlikely(tsk->audit_context))
907 audit_free(tsk); 907 audit_free(tsk);
908 taskstats_exit_send(tsk, tidstats, tgidstats); 908 taskstats_exit_send(tsk, tidstats, group_dead);
909 taskstats_exit_free(tidstats, tgidstats); 909 taskstats_exit_free(tidstats);
910 delayacct_tsk_exit(tsk); 910 delayacct_tsk_exit(tsk);
911 911
912 exit_mm(tsk); 912 exit_mm(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 451cfd35bf22..1b0f7b1e0881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -44,6 +44,7 @@
44#include <linux/acct.h> 44#include <linux/acct.h>
45#include <linux/cn_proc.h> 45#include <linux/cn_proc.h>
46#include <linux/delayacct.h> 46#include <linux/delayacct.h>
47#include <linux/taskstats_kern.h>
47 48
48#include <asm/pgtable.h> 49#include <asm/pgtable.h>
49#include <asm/pgalloc.h> 50#include <asm/pgalloc.h>
@@ -819,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
819 if (clone_flags & CLONE_THREAD) { 820 if (clone_flags & CLONE_THREAD) {
820 atomic_inc(&current->signal->count); 821 atomic_inc(&current->signal->count);
821 atomic_inc(&current->signal->live); 822 atomic_inc(&current->signal->live);
823 taskstats_tgid_alloc(current->signal);
822 return 0; 824 return 0;
823 } 825 }
824 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 826 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -863,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
863 INIT_LIST_HEAD(&sig->cpu_timers[0]); 865 INIT_LIST_HEAD(&sig->cpu_timers[0]);
864 INIT_LIST_HEAD(&sig->cpu_timers[1]); 866 INIT_LIST_HEAD(&sig->cpu_timers[1]);
865 INIT_LIST_HEAD(&sig->cpu_timers[2]); 867 INIT_LIST_HEAD(&sig->cpu_timers[2]);
868 taskstats_tgid_init(sig);
866 869
867 task_lock(current->group_leader); 870 task_lock(current->group_leader);
868 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 871 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -884,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
884void __cleanup_signal(struct signal_struct *sig) 887void __cleanup_signal(struct signal_struct *sig)
885{ 888{
886 exit_thread_group_keys(sig); 889 exit_thread_group_keys(sig);
890 taskstats_tgid_free(sig);
887 kmem_cache_free(signal_cachep, sig); 891 kmem_cache_free(signal_cachep, sig);
888} 892}
889 893
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea9506de3b85..4a0a5022b299 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -132,46 +132,79 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
132static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, 132static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
133 struct taskstats *stats) 133 struct taskstats *stats)
134{ 134{
135 int rc;
136 struct task_struct *tsk, *first; 135 struct task_struct *tsk, *first;
136 unsigned long flags;
137 137
138 /*
139 * Add additional stats from live tasks except zombie thread group
140 * leaders who are already counted with the dead tasks
141 */
138 first = tgidtsk; 142 first = tgidtsk;
139 read_lock(&tasklist_lock);
140 if (!first) { 143 if (!first) {
144 read_lock(&tasklist_lock);
141 first = find_task_by_pid(tgid); 145 first = find_task_by_pid(tgid);
142 if (!first) { 146 if (!first) {
143 read_unlock(&tasklist_lock); 147 read_unlock(&tasklist_lock);
144 return -ESRCH; 148 return -ESRCH;
145 } 149 }
146 } 150 get_task_struct(first);
151 read_unlock(&tasklist_lock);
152 } else
153 get_task_struct(first);
154
155 /* Start with stats from dead tasks */
156 spin_lock_irqsave(&first->signal->stats_lock, flags);
157 if (first->signal->stats)
158 memcpy(stats, first->signal->stats, sizeof(*stats));
159 spin_unlock_irqrestore(&first->signal->stats_lock, flags);
160
147 tsk = first; 161 tsk = first;
162 read_lock(&tasklist_lock);
148 do { 163 do {
164 if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
165 continue;
149 /* 166 /*
150 * Each accounting subsystem adds calls its functions to 167 * Accounting subsystem can call its functions here to
151 * fill in relevant parts of struct taskstats as follows 168 * fill in relevant parts of struct taskstats as follows
152 * 169 *
153 * rc = per-task-foo(stats, tsk); 170 * per-task-foo(stats, tsk);
154 * if (rc)
155 * break;
156 */ 171 */
157 172 delayacct_add_tsk(stats, tsk);
158 rc = delayacct_add_tsk(stats, tsk);
159 if (rc)
160 break;
161 173
162 } while_each_thread(first, tsk); 174 } while_each_thread(first, tsk);
163 read_unlock(&tasklist_lock); 175 read_unlock(&tasklist_lock);
164 stats->version = TASKSTATS_VERSION; 176 stats->version = TASKSTATS_VERSION;
165 177
166
167 /* 178 /*
168 * Accounting subsystems can also add calls here if they don't 179 * Accounting subsystems can also add calls here to modify
169 * wish to aggregate statistics for per-tgid stats 180 * fields of taskstats.
170 */ 181 */
171 182
172 return rc; 183 return 0;
184}
185
186
187static void fill_tgid_exit(struct task_struct *tsk)
188{
189 unsigned long flags;
190
191 spin_lock_irqsave(&tsk->signal->stats_lock, flags);
192 if (!tsk->signal->stats)
193 goto ret;
194
195 /*
196 * Each accounting subsystem calls its functions here to
197 * accumulate its per-task stats for tsk, into the per-tgid structure
198 *
199 * per-task-foo(tsk->signal->stats, tsk);
200 */
201 delayacct_add_tsk(tsk->signal->stats, tsk);
202ret:
203 spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
204 return;
173} 205}
174 206
207
175static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) 208static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
176{ 209{
177 int rc = 0; 210 int rc = 0;
@@ -230,7 +263,7 @@ err:
230 263
231/* Send pid data out on exit */ 264/* Send pid data out on exit */
232void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, 265void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
233 struct taskstats *tgidstats) 266 int group_dead)
234{ 267{
235 int rc; 268 int rc;
236 struct sk_buff *rep_skb; 269 struct sk_buff *rep_skb;
@@ -238,13 +271,16 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
238 size_t size; 271 size_t size;
239 int is_thread_group; 272 int is_thread_group;
240 struct nlattr *na; 273 struct nlattr *na;
274 unsigned long flags;
241 275
242 if (!family_registered || !tidstats) 276 if (!family_registered || !tidstats)
243 return; 277 return;
244 278
245 is_thread_group = !thread_group_empty(tsk); 279 spin_lock_irqsave(&tsk->signal->stats_lock, flags);
246 rc = 0; 280 is_thread_group = tsk->signal->stats ? 1 : 0;
281 spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
247 282
283 rc = 0;
248 /* 284 /*
249 * Size includes space for nested attributes 285 * Size includes space for nested attributes
250 */ 286 */
@@ -268,30 +304,28 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
268 *tidstats); 304 *tidstats);
269 nla_nest_end(rep_skb, na); 305 nla_nest_end(rep_skb, na);
270 306
271 if (!is_thread_group || !tgidstats) { 307 if (!is_thread_group)
272 send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); 308 goto send;
273 goto ret;
274 }
275 309
276 rc = fill_tgid(tsk->pid, tsk, tgidstats);
277 /* 310 /*
278 * If fill_tgid() failed then one probable reason could be that the 311 * tsk has/had a thread group so fill the tsk->signal->stats structure
279 * thread group leader has exited. fill_tgid() will fail, send out 312 * Doesn't matter if tsk is the leader or the last group member leaving
280 * the pid statistics collected earlier.
281 */ 313 */
282 if (rc < 0) { 314
283 send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); 315 fill_tgid_exit(tsk);
284 goto ret; 316 if (!group_dead)
285 } 317 goto send;
286 318
287 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 319 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
288 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); 320 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
321 /* No locking needed for tsk->signal->stats since group is dead */
289 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 322 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
290 *tgidstats); 323 *tsk->signal->stats);
291 nla_nest_end(rep_skb, na); 324 nla_nest_end(rep_skb, na);
292 325
326send:
293 send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); 327 send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
294 goto ret; 328 return;
295 329
296nla_put_failure: 330nla_put_failure:
297 genlmsg_cancel(rep_skb, reply); 331 genlmsg_cancel(rep_skb, reply);