diff options
Diffstat (limited to 'kernel/taskstats.c')
-rw-r--r-- | kernel/taskstats.c | 564 |
1 files changed, 564 insertions, 0 deletions
diff --git a/kernel/taskstats.c b/kernel/taskstats.c new file mode 100644 index 000000000000..e78187657330 --- /dev/null +++ b/kernel/taskstats.c | |||
@@ -0,0 +1,564 @@ | |||
1 | /* | ||
2 | * taskstats.c - Export per-task statistics to userland | ||
3 | * | ||
4 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 | ||
5 | * (C) Balbir Singh, IBM Corp. 2006 | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2 of the License, or | ||
10 | * (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | */ | ||
18 | |||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/taskstats_kern.h> | ||
21 | #include <linux/delayacct.h> | ||
22 | #include <linux/cpumask.h> | ||
23 | #include <linux/percpu.h> | ||
24 | #include <net/genetlink.h> | ||
25 | #include <asm/atomic.h> | ||
26 | |||
27 | /* | ||
28 | * Maximum length of a cpumask that can be specified in | ||
29 | * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute | ||
30 | */ | ||
31 | #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) | ||
32 | |||
33 | static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; | ||
34 | static int family_registered; | ||
35 | kmem_cache_t *taskstats_cache; | ||
36 | |||
37 | static struct genl_family family = { | ||
38 | .id = GENL_ID_GENERATE, | ||
39 | .name = TASKSTATS_GENL_NAME, | ||
40 | .version = TASKSTATS_GENL_VERSION, | ||
41 | .maxattr = TASKSTATS_CMD_ATTR_MAX, | ||
42 | }; | ||
43 | |||
44 | static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] | ||
45 | __read_mostly = { | ||
46 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, | ||
47 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, | ||
48 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, | ||
49 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; | ||
50 | |||
51 | struct listener { | ||
52 | struct list_head list; | ||
53 | pid_t pid; | ||
54 | char valid; | ||
55 | }; | ||
56 | |||
57 | struct listener_list { | ||
58 | struct rw_semaphore sem; | ||
59 | struct list_head list; | ||
60 | }; | ||
61 | static DEFINE_PER_CPU(struct listener_list, listener_array); | ||
62 | |||
/* Operation selector passed to add_del_listener() */
enum actions {
	REGISTER = 0,
	DEREGISTER,
	CPU_DONT_CARE,
};
68 | |||
69 | static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | ||
70 | void **replyp, size_t size) | ||
71 | { | ||
72 | struct sk_buff *skb; | ||
73 | void *reply; | ||
74 | |||
75 | /* | ||
76 | * If new attributes are added, please revisit this allocation | ||
77 | */ | ||
78 | skb = nlmsg_new(size); | ||
79 | if (!skb) | ||
80 | return -ENOMEM; | ||
81 | |||
82 | if (!info) { | ||
83 | int seq = get_cpu_var(taskstats_seqnum)++; | ||
84 | put_cpu_var(taskstats_seqnum); | ||
85 | |||
86 | reply = genlmsg_put(skb, 0, seq, | ||
87 | family.id, 0, 0, | ||
88 | cmd, family.version); | ||
89 | } else | ||
90 | reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, | ||
91 | family.id, 0, 0, | ||
92 | cmd, family.version); | ||
93 | if (reply == NULL) { | ||
94 | nlmsg_free(skb); | ||
95 | return -EINVAL; | ||
96 | } | ||
97 | |||
98 | *skbp = skb; | ||
99 | *replyp = reply; | ||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * Send taskstats data in @skb to listener with nl_pid @pid | ||
105 | */ | ||
106 | static int send_reply(struct sk_buff *skb, pid_t pid) | ||
107 | { | ||
108 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); | ||
109 | void *reply = genlmsg_data(genlhdr); | ||
110 | int rc; | ||
111 | |||
112 | rc = genlmsg_end(skb, reply); | ||
113 | if (rc < 0) { | ||
114 | nlmsg_free(skb); | ||
115 | return rc; | ||
116 | } | ||
117 | |||
118 | return genlmsg_unicast(skb, pid); | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * Send taskstats data in @skb to listeners registered for @cpu's exit data | ||
123 | */ | ||
124 | static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) | ||
125 | { | ||
126 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); | ||
127 | struct listener_list *listeners; | ||
128 | struct listener *s, *tmp; | ||
129 | struct sk_buff *skb_next, *skb_cur = skb; | ||
130 | void *reply = genlmsg_data(genlhdr); | ||
131 | int rc, delcount = 0; | ||
132 | |||
133 | rc = genlmsg_end(skb, reply); | ||
134 | if (rc < 0) { | ||
135 | nlmsg_free(skb); | ||
136 | return; | ||
137 | } | ||
138 | |||
139 | rc = 0; | ||
140 | listeners = &per_cpu(listener_array, cpu); | ||
141 | down_read(&listeners->sem); | ||
142 | list_for_each_entry(s, &listeners->list, list) { | ||
143 | skb_next = NULL; | ||
144 | if (!list_is_last(&s->list, &listeners->list)) { | ||
145 | skb_next = skb_clone(skb_cur, GFP_KERNEL); | ||
146 | if (!skb_next) | ||
147 | break; | ||
148 | } | ||
149 | rc = genlmsg_unicast(skb_cur, s->pid); | ||
150 | if (rc == -ECONNREFUSED) { | ||
151 | s->valid = 0; | ||
152 | delcount++; | ||
153 | } | ||
154 | skb_cur = skb_next; | ||
155 | } | ||
156 | up_read(&listeners->sem); | ||
157 | |||
158 | if (skb_cur) | ||
159 | nlmsg_free(skb_cur); | ||
160 | |||
161 | if (!delcount) | ||
162 | return; | ||
163 | |||
164 | /* Delete invalidated entries */ | ||
165 | down_write(&listeners->sem); | ||
166 | list_for_each_entry_safe(s, tmp, &listeners->list, list) { | ||
167 | if (!s->valid) { | ||
168 | list_del(&s->list); | ||
169 | kfree(s); | ||
170 | } | ||
171 | } | ||
172 | up_write(&listeners->sem); | ||
173 | } | ||
174 | |||
175 | static int fill_pid(pid_t pid, struct task_struct *pidtsk, | ||
176 | struct taskstats *stats) | ||
177 | { | ||
178 | int rc = 0; | ||
179 | struct task_struct *tsk = pidtsk; | ||
180 | |||
181 | if (!pidtsk) { | ||
182 | read_lock(&tasklist_lock); | ||
183 | tsk = find_task_by_pid(pid); | ||
184 | if (!tsk) { | ||
185 | read_unlock(&tasklist_lock); | ||
186 | return -ESRCH; | ||
187 | } | ||
188 | get_task_struct(tsk); | ||
189 | read_unlock(&tasklist_lock); | ||
190 | } else | ||
191 | get_task_struct(tsk); | ||
192 | |||
193 | /* | ||
194 | * Each accounting subsystem adds calls to its functions to | ||
195 | * fill in relevant parts of struct taskstsats as follows | ||
196 | * | ||
197 | * per-task-foo(stats, tsk); | ||
198 | */ | ||
199 | |||
200 | delayacct_add_tsk(stats, tsk); | ||
201 | stats->version = TASKSTATS_VERSION; | ||
202 | |||
203 | /* Define err: label here if needed */ | ||
204 | put_task_struct(tsk); | ||
205 | return rc; | ||
206 | |||
207 | } | ||
208 | |||
209 | static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, | ||
210 | struct taskstats *stats) | ||
211 | { | ||
212 | struct task_struct *tsk, *first; | ||
213 | unsigned long flags; | ||
214 | |||
215 | /* | ||
216 | * Add additional stats from live tasks except zombie thread group | ||
217 | * leaders who are already counted with the dead tasks | ||
218 | */ | ||
219 | first = tgidtsk; | ||
220 | if (!first) { | ||
221 | read_lock(&tasklist_lock); | ||
222 | first = find_task_by_pid(tgid); | ||
223 | if (!first) { | ||
224 | read_unlock(&tasklist_lock); | ||
225 | return -ESRCH; | ||
226 | } | ||
227 | get_task_struct(first); | ||
228 | read_unlock(&tasklist_lock); | ||
229 | } else | ||
230 | get_task_struct(first); | ||
231 | |||
232 | /* Start with stats from dead tasks */ | ||
233 | spin_lock_irqsave(&first->signal->stats_lock, flags); | ||
234 | if (first->signal->stats) | ||
235 | memcpy(stats, first->signal->stats, sizeof(*stats)); | ||
236 | spin_unlock_irqrestore(&first->signal->stats_lock, flags); | ||
237 | |||
238 | tsk = first; | ||
239 | read_lock(&tasklist_lock); | ||
240 | do { | ||
241 | if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) | ||
242 | continue; | ||
243 | /* | ||
244 | * Accounting subsystem can call its functions here to | ||
245 | * fill in relevant parts of struct taskstsats as follows | ||
246 | * | ||
247 | * per-task-foo(stats, tsk); | ||
248 | */ | ||
249 | delayacct_add_tsk(stats, tsk); | ||
250 | |||
251 | } while_each_thread(first, tsk); | ||
252 | read_unlock(&tasklist_lock); | ||
253 | stats->version = TASKSTATS_VERSION; | ||
254 | |||
255 | /* | ||
256 | * Accounting subsytems can also add calls here to modify | ||
257 | * fields of taskstats. | ||
258 | */ | ||
259 | |||
260 | return 0; | ||
261 | } | ||
262 | |||
263 | |||
264 | static void fill_tgid_exit(struct task_struct *tsk) | ||
265 | { | ||
266 | unsigned long flags; | ||
267 | |||
268 | spin_lock_irqsave(&tsk->signal->stats_lock, flags); | ||
269 | if (!tsk->signal->stats) | ||
270 | goto ret; | ||
271 | |||
272 | /* | ||
273 | * Each accounting subsystem calls its functions here to | ||
274 | * accumalate its per-task stats for tsk, into the per-tgid structure | ||
275 | * | ||
276 | * per-task-foo(tsk->signal->stats, tsk); | ||
277 | */ | ||
278 | delayacct_add_tsk(tsk->signal->stats, tsk); | ||
279 | ret: | ||
280 | spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); | ||
281 | return; | ||
282 | } | ||
283 | |||
284 | static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) | ||
285 | { | ||
286 | struct listener_list *listeners; | ||
287 | struct listener *s, *tmp; | ||
288 | unsigned int cpu; | ||
289 | cpumask_t mask = *maskp; | ||
290 | |||
291 | if (!cpus_subset(mask, cpu_possible_map)) | ||
292 | return -EINVAL; | ||
293 | |||
294 | if (isadd == REGISTER) { | ||
295 | for_each_cpu_mask(cpu, mask) { | ||
296 | s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, | ||
297 | cpu_to_node(cpu)); | ||
298 | if (!s) | ||
299 | goto cleanup; | ||
300 | s->pid = pid; | ||
301 | INIT_LIST_HEAD(&s->list); | ||
302 | s->valid = 1; | ||
303 | |||
304 | listeners = &per_cpu(listener_array, cpu); | ||
305 | down_write(&listeners->sem); | ||
306 | list_add(&s->list, &listeners->list); | ||
307 | up_write(&listeners->sem); | ||
308 | } | ||
309 | return 0; | ||
310 | } | ||
311 | |||
312 | /* Deregister or cleanup */ | ||
313 | cleanup: | ||
314 | for_each_cpu_mask(cpu, mask) { | ||
315 | listeners = &per_cpu(listener_array, cpu); | ||
316 | down_write(&listeners->sem); | ||
317 | list_for_each_entry_safe(s, tmp, &listeners->list, list) { | ||
318 | if (s->pid == pid) { | ||
319 | list_del(&s->list); | ||
320 | kfree(s); | ||
321 | break; | ||
322 | } | ||
323 | } | ||
324 | up_write(&listeners->sem); | ||
325 | } | ||
326 | return 0; | ||
327 | } | ||
328 | |||
329 | static int parse(struct nlattr *na, cpumask_t *mask) | ||
330 | { | ||
331 | char *data; | ||
332 | int len; | ||
333 | int ret; | ||
334 | |||
335 | if (na == NULL) | ||
336 | return 1; | ||
337 | len = nla_len(na); | ||
338 | if (len > TASKSTATS_CPUMASK_MAXLEN) | ||
339 | return -E2BIG; | ||
340 | if (len < 1) | ||
341 | return -EINVAL; | ||
342 | data = kmalloc(len, GFP_KERNEL); | ||
343 | if (!data) | ||
344 | return -ENOMEM; | ||
345 | nla_strlcpy(data, na, len); | ||
346 | ret = cpulist_parse(data, *mask); | ||
347 | kfree(data); | ||
348 | return ret; | ||
349 | } | ||
350 | |||
351 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | ||
352 | { | ||
353 | int rc = 0; | ||
354 | struct sk_buff *rep_skb; | ||
355 | struct taskstats stats; | ||
356 | void *reply; | ||
357 | size_t size; | ||
358 | struct nlattr *na; | ||
359 | cpumask_t mask; | ||
360 | |||
361 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); | ||
362 | if (rc < 0) | ||
363 | return rc; | ||
364 | if (rc == 0) | ||
365 | return add_del_listener(info->snd_pid, &mask, REGISTER); | ||
366 | |||
367 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); | ||
368 | if (rc < 0) | ||
369 | return rc; | ||
370 | if (rc == 0) | ||
371 | return add_del_listener(info->snd_pid, &mask, DEREGISTER); | ||
372 | |||
373 | /* | ||
374 | * Size includes space for nested attributes | ||
375 | */ | ||
376 | size = nla_total_size(sizeof(u32)) + | ||
377 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
378 | |||
379 | memset(&stats, 0, sizeof(stats)); | ||
380 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | ||
381 | if (rc < 0) | ||
382 | return rc; | ||
383 | |||
384 | if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { | ||
385 | u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); | ||
386 | rc = fill_pid(pid, NULL, &stats); | ||
387 | if (rc < 0) | ||
388 | goto err; | ||
389 | |||
390 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | ||
391 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); | ||
392 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
393 | stats); | ||
394 | } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { | ||
395 | u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | ||
396 | rc = fill_tgid(tgid, NULL, &stats); | ||
397 | if (rc < 0) | ||
398 | goto err; | ||
399 | |||
400 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | ||
401 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); | ||
402 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
403 | stats); | ||
404 | } else { | ||
405 | rc = -EINVAL; | ||
406 | goto err; | ||
407 | } | ||
408 | |||
409 | nla_nest_end(rep_skb, na); | ||
410 | |||
411 | return send_reply(rep_skb, info->snd_pid); | ||
412 | |||
413 | nla_put_failure: | ||
414 | return genlmsg_cancel(rep_skb, reply); | ||
415 | err: | ||
416 | nlmsg_free(rep_skb); | ||
417 | return rc; | ||
418 | } | ||
419 | |||
420 | void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) | ||
421 | { | ||
422 | struct listener_list *listeners; | ||
423 | struct taskstats *tmp; | ||
424 | /* | ||
425 | * This is the cpu on which the task is exiting currently and will | ||
426 | * be the one for which the exit event is sent, even if the cpu | ||
427 | * on which this function is running changes later. | ||
428 | */ | ||
429 | *mycpu = raw_smp_processor_id(); | ||
430 | |||
431 | *ptidstats = NULL; | ||
432 | tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); | ||
433 | if (!tmp) | ||
434 | return; | ||
435 | |||
436 | listeners = &per_cpu(listener_array, *mycpu); | ||
437 | down_read(&listeners->sem); | ||
438 | if (!list_empty(&listeners->list)) { | ||
439 | *ptidstats = tmp; | ||
440 | tmp = NULL; | ||
441 | } | ||
442 | up_read(&listeners->sem); | ||
443 | kfree(tmp); | ||
444 | } | ||
445 | |||
446 | /* Send pid data out on exit */ | ||
447 | void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, | ||
448 | int group_dead, unsigned int mycpu) | ||
449 | { | ||
450 | int rc; | ||
451 | struct sk_buff *rep_skb; | ||
452 | void *reply; | ||
453 | size_t size; | ||
454 | int is_thread_group; | ||
455 | struct nlattr *na; | ||
456 | unsigned long flags; | ||
457 | |||
458 | if (!family_registered || !tidstats) | ||
459 | return; | ||
460 | |||
461 | spin_lock_irqsave(&tsk->signal->stats_lock, flags); | ||
462 | is_thread_group = tsk->signal->stats ? 1 : 0; | ||
463 | spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); | ||
464 | |||
465 | rc = 0; | ||
466 | /* | ||
467 | * Size includes space for nested attributes | ||
468 | */ | ||
469 | size = nla_total_size(sizeof(u32)) + | ||
470 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
471 | |||
472 | if (is_thread_group) | ||
473 | size = 2 * size; /* PID + STATS + TGID + STATS */ | ||
474 | |||
475 | rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | ||
476 | if (rc < 0) | ||
477 | goto ret; | ||
478 | |||
479 | rc = fill_pid(tsk->pid, tsk, tidstats); | ||
480 | if (rc < 0) | ||
481 | goto err_skb; | ||
482 | |||
483 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | ||
484 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); | ||
485 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
486 | *tidstats); | ||
487 | nla_nest_end(rep_skb, na); | ||
488 | |||
489 | if (!is_thread_group) | ||
490 | goto send; | ||
491 | |||
492 | /* | ||
493 | * tsk has/had a thread group so fill the tsk->signal->stats structure | ||
494 | * Doesn't matter if tsk is the leader or the last group member leaving | ||
495 | */ | ||
496 | |||
497 | fill_tgid_exit(tsk); | ||
498 | if (!group_dead) | ||
499 | goto send; | ||
500 | |||
501 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | ||
502 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); | ||
503 | /* No locking needed for tsk->signal->stats since group is dead */ | ||
504 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
505 | *tsk->signal->stats); | ||
506 | nla_nest_end(rep_skb, na); | ||
507 | |||
508 | send: | ||
509 | send_cpu_listeners(rep_skb, mycpu); | ||
510 | return; | ||
511 | |||
512 | nla_put_failure: | ||
513 | genlmsg_cancel(rep_skb, reply); | ||
514 | goto ret; | ||
515 | err_skb: | ||
516 | nlmsg_free(rep_skb); | ||
517 | ret: | ||
518 | return; | ||
519 | } | ||
520 | |||
521 | static struct genl_ops taskstats_ops = { | ||
522 | .cmd = TASKSTATS_CMD_GET, | ||
523 | .doit = taskstats_user_cmd, | ||
524 | .policy = taskstats_cmd_get_policy, | ||
525 | }; | ||
526 | |||
527 | /* Needed early in initialization */ | ||
528 | void __init taskstats_init_early(void) | ||
529 | { | ||
530 | unsigned int i; | ||
531 | |||
532 | taskstats_cache = kmem_cache_create("taskstats_cache", | ||
533 | sizeof(struct taskstats), | ||
534 | 0, SLAB_PANIC, NULL, NULL); | ||
535 | for_each_possible_cpu(i) { | ||
536 | INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); | ||
537 | init_rwsem(&(per_cpu(listener_array, i).sem)); | ||
538 | } | ||
539 | } | ||
540 | |||
541 | static int __init taskstats_init(void) | ||
542 | { | ||
543 | int rc; | ||
544 | |||
545 | rc = genl_register_family(&family); | ||
546 | if (rc) | ||
547 | return rc; | ||
548 | |||
549 | rc = genl_register_ops(&family, &taskstats_ops); | ||
550 | if (rc < 0) | ||
551 | goto err; | ||
552 | |||
553 | family_registered = 1; | ||
554 | return 0; | ||
555 | err: | ||
556 | genl_unregister_family(&family); | ||
557 | return rc; | ||
558 | } | ||
559 | |||
560 | /* | ||
561 | * late initcall ensures initialization of statistics collection | ||
562 | * mechanisms precedes initialization of the taskstats interface | ||
563 | */ | ||
564 | late_initcall(taskstats_init); | ||