about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorShailabh Nagar <nagar@watson.ibm.com>2006-07-14 03:24:47 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-07-15 00:53:57 -0400
commitf9fd8914c1acca0d98b69d831b128d5b52f03c51 (patch)
tree18c2506bb31e49a9e2c4a483d37b0394c815fe9d
parentc8924363da07aec213e5d359f23eeae1fff91951 (diff)
[PATCH] per-task delay accounting taskstats interface: control exit data through cpumasks
On systems with a large number of cpus, with even a modest rate of tasks exiting per cpu, the volume of taskstats data sent on thread exit can overflow a userspace listener's buffers. One approach to avoiding overflow is to allow listeners to get data for a limited and specific set of cpus. By scaling the number of listeners and/or the cpus they monitor, userspace can handle the statistical data overload more gracefully. In this patch, each listener registers to listen to a specific set of cpus by specifying a cpumask. The interest is recorded per-cpu. When a task exits on a cpu, its taskstats data is unicast to each listener interested in that cpu. Thanks to Andrew Morton for pointing out the various scalability and general concerns of previous attempts and for suggesting this design. [akpm@osdl.org: build fix] Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com> Signed-off-by: Balbir Singh <balbir@in.ibm.com> Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--include/linux/taskstats.h4
-rw-r--r--include/linux/taskstats_kern.h27
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/taskstats.c200
4 files changed, 198 insertions, 38 deletions
diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h
index c6aeca32348e..f1cb6cddd19d 100644
--- a/include/linux/taskstats.h
+++ b/include/linux/taskstats.h
@@ -91,8 +91,6 @@ struct taskstats {
91}; 91};
92 92
93 93
94#define TASKSTATS_LISTEN_GROUP 0x1
95
96/* 94/*
97 * Commands sent from userspace 95 * Commands sent from userspace
98 * Not versioned. New commands should only be inserted at the enum's end 96 * Not versioned. New commands should only be inserted at the enum's end
@@ -124,6 +122,8 @@ enum {
124 TASKSTATS_CMD_ATTR_UNSPEC = 0, 122 TASKSTATS_CMD_ATTR_UNSPEC = 0,
125 TASKSTATS_CMD_ATTR_PID, 123 TASKSTATS_CMD_ATTR_PID,
126 TASKSTATS_CMD_ATTR_TGID, 124 TASKSTATS_CMD_ATTR_TGID,
125 TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
126 TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK,
127 __TASKSTATS_CMD_ATTR_MAX, 127 __TASKSTATS_CMD_ATTR_MAX,
128}; 128};
129 129
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h
index 2b6adec3a2e4..16894b7edcc8 100644
--- a/include/linux/taskstats_kern.h
+++ b/include/linux/taskstats_kern.h
@@ -11,30 +11,10 @@
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <net/genetlink.h> 12#include <net/genetlink.h>
13 13
14enum {
15 TASKSTATS_MSG_UNICAST, /* send data only to requester */
16 TASKSTATS_MSG_MULTICAST, /* send data to a group */
17};
18
19#ifdef CONFIG_TASKSTATS 14#ifdef CONFIG_TASKSTATS
20extern kmem_cache_t *taskstats_cache; 15extern kmem_cache_t *taskstats_cache;
21extern struct mutex taskstats_exit_mutex; 16extern struct mutex taskstats_exit_mutex;
22 17
23static inline int taskstats_has_listeners(void)
24{
25 if (!genl_sock)
26 return 0;
27 return netlink_has_listeners(genl_sock, TASKSTATS_LISTEN_GROUP);
28}
29
30
31static inline void taskstats_exit_alloc(struct taskstats **ptidstats)
32{
33 *ptidstats = NULL;
34 if (taskstats_has_listeners())
35 *ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
36}
37
38static inline void taskstats_exit_free(struct taskstats *tidstats) 18static inline void taskstats_exit_free(struct taskstats *tidstats)
39{ 19{
40 if (tidstats) 20 if (tidstats)
@@ -82,17 +62,18 @@ static inline void taskstats_tgid_free(struct signal_struct *sig)
82 kmem_cache_free(taskstats_cache, stats); 62 kmem_cache_free(taskstats_cache, stats);
83} 63}
84 64
85extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int); 65extern void taskstats_exit_alloc(struct taskstats **, unsigned int *);
66extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int, unsigned int);
86extern void taskstats_init_early(void); 67extern void taskstats_init_early(void);
87extern void taskstats_tgid_alloc(struct signal_struct *); 68extern void taskstats_tgid_alloc(struct signal_struct *);
88#else 69#else
89static inline void taskstats_exit_alloc(struct taskstats **ptidstats) 70static inline void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
90{} 71{}
91static inline void taskstats_exit_free(struct taskstats *ptidstats) 72static inline void taskstats_exit_free(struct taskstats *ptidstats)
92{} 73{}
93static inline void taskstats_exit_send(struct task_struct *tsk, 74static inline void taskstats_exit_send(struct task_struct *tsk,
94 struct taskstats *tidstats, 75 struct taskstats *tidstats,
95 int group_dead) 76 int group_dead, unsigned int cpu)
96{} 77{}
97static inline void taskstats_tgid_init(struct signal_struct *sig) 78static inline void taskstats_tgid_init(struct signal_struct *sig)
98{} 79{}
diff --git a/kernel/exit.c b/kernel/exit.c
index 67c1e9a4f812..dba194a8d416 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -847,6 +847,7 @@ fastcall NORET_TYPE void do_exit(long code)
847 struct task_struct *tsk = current; 847 struct task_struct *tsk = current;
848 struct taskstats *tidstats; 848 struct taskstats *tidstats;
849 int group_dead; 849 int group_dead;
850 unsigned int mycpu;
850 851
851 profile_task_exit(tsk); 852 profile_task_exit(tsk);
852 853
@@ -884,7 +885,7 @@ fastcall NORET_TYPE void do_exit(long code)
884 current->comm, current->pid, 885 current->comm, current->pid,
885 preempt_count()); 886 preempt_count());
886 887
887 taskstats_exit_alloc(&tidstats); 888 taskstats_exit_alloc(&tidstats, &mycpu);
888 889
889 acct_update_integrals(tsk); 890 acct_update_integrals(tsk);
890 if (tsk->mm) { 891 if (tsk->mm) {
@@ -905,7 +906,7 @@ fastcall NORET_TYPE void do_exit(long code)
905#endif 906#endif
906 if (unlikely(tsk->audit_context)) 907 if (unlikely(tsk->audit_context))
907 audit_free(tsk); 908 audit_free(tsk);
908 taskstats_exit_send(tsk, tidstats, group_dead); 909 taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
909 taskstats_exit_free(tidstats); 910 taskstats_exit_free(tidstats);
910 delayacct_tsk_exit(tsk); 911 delayacct_tsk_exit(tsk);
911 912
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4a0a5022b299..abb59e323544 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -19,9 +19,17 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/taskstats_kern.h> 20#include <linux/taskstats_kern.h>
21#include <linux/delayacct.h> 21#include <linux/delayacct.h>
22#include <linux/cpumask.h>
23#include <linux/percpu.h>
22#include <net/genetlink.h> 24#include <net/genetlink.h>
23#include <asm/atomic.h> 25#include <asm/atomic.h>
24 26
27/*
28 * Maximum length of a cpumask that can be specified in
29 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
30 */
31#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
32
25static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 33static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
26static int family_registered; 34static int family_registered;
27kmem_cache_t *taskstats_cache; 35kmem_cache_t *taskstats_cache;
@@ -37,8 +45,25 @@ static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
37__read_mostly = { 45__read_mostly = {
38 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 46 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
39 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 47 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
48 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
49 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
50
51struct listener {
52 struct list_head list;
53 pid_t pid;
40}; 54};
41 55
56struct listener_list {
57 struct rw_semaphore sem;
58 struct list_head list;
59};
60static DEFINE_PER_CPU(struct listener_list, listener_array);
61
62enum actions {
63 REGISTER,
64 DEREGISTER,
65 CPU_DONT_CARE
66};
42 67
43static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 68static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
44 void **replyp, size_t size) 69 void **replyp, size_t size)
@@ -74,25 +99,68 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
74 return 0; 99 return 0;
75} 100}
76 101
77static int send_reply(struct sk_buff *skb, pid_t pid, int event) 102/*
103 * Send taskstats data in @skb to listener with nl_pid @pid
104 */
105static int send_reply(struct sk_buff *skb, pid_t pid)
78{ 106{
79 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 107 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
80 void *reply; 108 void *reply = genlmsg_data(genlhdr);
81 int rc; 109 int rc;
82 110
83 reply = genlmsg_data(genlhdr);
84
85 rc = genlmsg_end(skb, reply); 111 rc = genlmsg_end(skb, reply);
86 if (rc < 0) { 112 if (rc < 0) {
87 nlmsg_free(skb); 113 nlmsg_free(skb);
88 return rc; 114 return rc;
89 } 115 }
90 116
91 if (event == TASKSTATS_MSG_MULTICAST)
92 return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
93 return genlmsg_unicast(skb, pid); 117 return genlmsg_unicast(skb, pid);
94} 118}
95 119
120/*
121 * Send taskstats data in @skb to listeners registered for @cpu's exit data
122 */
123static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
124{
125 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
126 struct listener_list *listeners;
127 struct listener *s, *tmp;
128 struct sk_buff *skb_next, *skb_cur = skb;
129 void *reply = genlmsg_data(genlhdr);
130 int rc, ret;
131
132 rc = genlmsg_end(skb, reply);
133 if (rc < 0) {
134 nlmsg_free(skb);
135 return rc;
136 }
137
138 rc = 0;
139 listeners = &per_cpu(listener_array, cpu);
140 down_write(&listeners->sem);
141 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
142 skb_next = NULL;
143 if (!list_is_last(&s->list, &listeners->list)) {
144 skb_next = skb_clone(skb_cur, GFP_KERNEL);
145 if (!skb_next) {
146 nlmsg_free(skb_cur);
147 rc = -ENOMEM;
148 break;
149 }
150 }
151 ret = genlmsg_unicast(skb_cur, s->pid);
152 if (ret == -ECONNREFUSED) {
153 list_del(&s->list);
154 kfree(s);
155 rc = ret;
156 }
157 skb_cur = skb_next;
158 }
159 up_write(&listeners->sem);
160
161 return rc;
162}
163
96static int fill_pid(pid_t pid, struct task_struct *pidtsk, 164static int fill_pid(pid_t pid, struct task_struct *pidtsk,
97 struct taskstats *stats) 165 struct taskstats *stats)
98{ 166{
@@ -204,8 +272,73 @@ ret:
204 return; 272 return;
205} 273}
206 274
275static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
276{
277 struct listener_list *listeners;
278 struct listener *s, *tmp;
279 unsigned int cpu;
280 cpumask_t mask = *maskp;
207 281
208static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) 282 if (!cpus_subset(mask, cpu_possible_map))
283 return -EINVAL;
284
285 if (isadd == REGISTER) {
286 for_each_cpu_mask(cpu, mask) {
287 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
288 cpu_to_node(cpu));
289 if (!s)
290 goto cleanup;
291 s->pid = pid;
292 INIT_LIST_HEAD(&s->list);
293
294 listeners = &per_cpu(listener_array, cpu);
295 down_write(&listeners->sem);
296 list_add(&s->list, &listeners->list);
297 up_write(&listeners->sem);
298 }
299 return 0;
300 }
301
302 /* Deregister or cleanup */
303cleanup:
304 for_each_cpu_mask(cpu, mask) {
305 listeners = &per_cpu(listener_array, cpu);
306 down_write(&listeners->sem);
307 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
308 if (s->pid == pid) {
309 list_del(&s->list);
310 kfree(s);
311 break;
312 }
313 }
314 up_write(&listeners->sem);
315 }
316 return 0;
317}
318
319static int parse(struct nlattr *na, cpumask_t *mask)
320{
321 char *data;
322 int len;
323 int ret;
324
325 if (na == NULL)
326 return 1;
327 len = nla_len(na);
328 if (len > TASKSTATS_CPUMASK_MAXLEN)
329 return -E2BIG;
330 if (len < 1)
331 return -EINVAL;
332 data = kmalloc(len, GFP_KERNEL);
333 if (!data)
334 return -ENOMEM;
335 nla_strlcpy(data, na, len);
336 ret = cpulist_parse(data, *mask);
337 kfree(data);
338 return ret;
339}
340
341static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
209{ 342{
210 int rc = 0; 343 int rc = 0;
211 struct sk_buff *rep_skb; 344 struct sk_buff *rep_skb;
@@ -213,6 +346,19 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
213 void *reply; 346 void *reply;
214 size_t size; 347 size_t size;
215 struct nlattr *na; 348 struct nlattr *na;
349 cpumask_t mask;
350
351 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
352 if (rc < 0)
353 return rc;
354 if (rc == 0)
355 return add_del_listener(info->snd_pid, &mask, REGISTER);
356
357 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
358 if (rc < 0)
359 return rc;
360 if (rc == 0)
361 return add_del_listener(info->snd_pid, &mask, DEREGISTER);
216 362
217 /* 363 /*
218 * Size includes space for nested attributes 364 * Size includes space for nested attributes
@@ -252,7 +398,7 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
252 398
253 nla_nest_end(rep_skb, na); 399 nla_nest_end(rep_skb, na);
254 400
255 return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST); 401 return send_reply(rep_skb, info->snd_pid);
256 402
257nla_put_failure: 403nla_put_failure:
258 return genlmsg_cancel(rep_skb, reply); 404 return genlmsg_cancel(rep_skb, reply);
@@ -261,9 +407,35 @@ err:
261 return rc; 407 return rc;
262} 408}
263 409
410void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
411{
412 struct listener_list *listeners;
413 struct taskstats *tmp;
414 /*
415 * This is the cpu on which the task is exiting currently and will
416 * be the one for which the exit event is sent, even if the cpu
417 * on which this function is running changes later.
418 */
419 *mycpu = raw_smp_processor_id();
420
421 *ptidstats = NULL;
422 tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
423 if (!tmp)
424 return;
425
426 listeners = &per_cpu(listener_array, *mycpu);
427 down_read(&listeners->sem);
428 if (!list_empty(&listeners->list)) {
429 *ptidstats = tmp;
430 tmp = NULL;
431 }
432 up_read(&listeners->sem);
433 kfree(tmp);
434}
435
264/* Send pid data out on exit */ 436/* Send pid data out on exit */
265void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, 437void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
266 int group_dead) 438 int group_dead, unsigned int mycpu)
267{ 439{
268 int rc; 440 int rc;
269 struct sk_buff *rep_skb; 441 struct sk_buff *rep_skb;
@@ -324,7 +496,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
324 nla_nest_end(rep_skb, na); 496 nla_nest_end(rep_skb, na);
325 497
326send: 498send:
327 send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); 499 send_cpu_listeners(rep_skb, mycpu);
328 return; 500 return;
329 501
330nla_put_failure: 502nla_put_failure:
@@ -338,16 +510,22 @@ ret:
338 510
339static struct genl_ops taskstats_ops = { 511static struct genl_ops taskstats_ops = {
340 .cmd = TASKSTATS_CMD_GET, 512 .cmd = TASKSTATS_CMD_GET,
341 .doit = taskstats_send_stats, 513 .doit = taskstats_user_cmd,
342 .policy = taskstats_cmd_get_policy, 514 .policy = taskstats_cmd_get_policy,
343}; 515};
344 516
345/* Needed early in initialization */ 517/* Needed early in initialization */
346void __init taskstats_init_early(void) 518void __init taskstats_init_early(void)
347{ 519{
520 unsigned int i;
521
348 taskstats_cache = kmem_cache_create("taskstats_cache", 522 taskstats_cache = kmem_cache_create("taskstats_cache",
349 sizeof(struct taskstats), 523 sizeof(struct taskstats),
350 0, SLAB_PANIC, NULL, NULL); 524 0, SLAB_PANIC, NULL, NULL);
525 for_each_possible_cpu(i) {
526 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
527 init_rwsem(&(per_cpu(listener_array, i).sem));
528 }
351} 529}
352 530
353static int __init taskstats_init(void) 531static int __init taskstats_init(void)