aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/accounting
diff options
context:
space:
mode:
authorShailabh Nagar <nagar@watson.ibm.com>2006-07-14 03:24:45 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-07-15 00:53:57 -0400
commit9e06d3f9f6b14f6e3120923ed215032726246c98 (patch)
treedf0509fedb0cf62bc59edc0038e55880bbc6a592 /Documentation/accounting
parentad4ecbcba72855a2b5319b96e2a3a65ed1ca3bfd (diff)
[PATCH] per task delay accounting taskstats interface: documentation fix
Change documentation and example program to reflect the flow control issues being addressed by the cpumask changes. Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'Documentation/accounting')
-rw-r--r--Documentation/accounting/getdelays.c606
-rw-r--r--Documentation/accounting/taskstats.txt64
2 files changed, 365 insertions, 305 deletions
diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index 33de89e56a3d..795ca3911cc5 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -5,6 +5,7 @@
5 * 5 *
6 * Copyright (C) Shailabh Nagar, IBM Corp. 2005 6 * Copyright (C) Shailabh Nagar, IBM Corp. 2005
7 * Copyright (C) Balbir Singh, IBM Corp. 2006 7 * Copyright (C) Balbir Singh, IBM Corp. 2006
8 * Copyright (c) Jay Lan, SGI. 2006
8 * 9 *
9 */ 10 */
10 11
@@ -36,341 +37,360 @@
36 37
37#define err(code, fmt, arg...) do { printf(fmt, ##arg); exit(code); } while (0) 38#define err(code, fmt, arg...) do { printf(fmt, ##arg); exit(code); } while (0)
38int done = 0; 39int done = 0;
40int rcvbufsz=0;
41
42 char name[100];
43int dbg=0, print_delays=0;
44__u64 stime, utime;
45#define PRINTF(fmt, arg...) { \
46 if (dbg) { \
47 printf(fmt, ##arg); \
48 } \
49 }
50
51/* Maximum size of response requested or message sent */
52#define MAX_MSG_SIZE 256
53/* Maximum number of cpus expected to be specified in a cpumask */
54#define MAX_CPUS 32
55/* Maximum length of pathname to log file */
56#define MAX_FILENAME 256
57
58struct msgtemplate {
59 struct nlmsghdr n;
60 struct genlmsghdr g;
61 char buf[MAX_MSG_SIZE];
62};
63
64char cpumask[100+6*MAX_CPUS];
39 65
40/* 66/*
41 * Create a raw netlink socket and bind 67 * Create a raw netlink socket and bind
42 */ 68 */
43static int create_nl_socket(int protocol, int groups) 69static int create_nl_socket(int protocol)
44{ 70{
45 socklen_t addr_len; 71 int fd;
46 int fd; 72 struct sockaddr_nl local;
47 struct sockaddr_nl local; 73
48 74 fd = socket(AF_NETLINK, SOCK_RAW, protocol);
49 fd = socket(AF_NETLINK, SOCK_RAW, protocol); 75 if (fd < 0)
50 if (fd < 0) 76 return -1;
51 return -1; 77
78 if (rcvbufsz)
79 if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
80 &rcvbufsz, sizeof(rcvbufsz)) < 0) {
81 printf("Unable to set socket rcv buf size to %d\n",
82 rcvbufsz);
83 return -1;
84 }
52 85
53 memset(&local, 0, sizeof(local)); 86 memset(&local, 0, sizeof(local));
54 local.nl_family = AF_NETLINK; 87 local.nl_family = AF_NETLINK;
55 local.nl_groups = groups;
56 88
57 if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) 89 if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
58 goto error; 90 goto error;
59 91
60 return fd; 92 return fd;
61 error: 93error:
62 close(fd); 94 close(fd);
63 return -1; 95 return -1;
64} 96}
65 97
66int sendto_fd(int s, const char *buf, int bufLen) 98
99int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
100 __u8 genl_cmd, __u16 nla_type,
101 void *nla_data, int nla_len)
67{ 102{
68 struct sockaddr_nl nladdr; 103 struct nlattr *na;
69 int r; 104 struct sockaddr_nl nladdr;
70 105 int r, buflen;
71 memset(&nladdr, 0, sizeof(nladdr)); 106 char *buf;
72 nladdr.nl_family = AF_NETLINK; 107
73 108 struct msgtemplate msg;
74 while ((r = sendto(s, buf, bufLen, 0, (struct sockaddr *) &nladdr, 109
75 sizeof(nladdr))) < bufLen) { 110 msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
76 if (r > 0) { 111 msg.n.nlmsg_type = nlmsg_type;
77 buf += r; 112 msg.n.nlmsg_flags = NLM_F_REQUEST;
78 bufLen -= r; 113 msg.n.nlmsg_seq = 0;
79 } else if (errno != EAGAIN) 114 msg.n.nlmsg_pid = nlmsg_pid;
80 return -1; 115 msg.g.cmd = genl_cmd;
81 } 116 msg.g.version = 0x1;
82 return 0; 117 na = (struct nlattr *) GENLMSG_DATA(&msg);
118 na->nla_type = nla_type;
119 na->nla_len = nla_len + 1 + NLA_HDRLEN;
120 memcpy(NLA_DATA(na), nla_data, nla_len);
121 msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
122
123 buf = (char *) &msg;
124 buflen = msg.n.nlmsg_len ;
125 memset(&nladdr, 0, sizeof(nladdr));
126 nladdr.nl_family = AF_NETLINK;
127 while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
128 sizeof(nladdr))) < buflen) {
129 if (r > 0) {
130 buf += r;
131 buflen -= r;
132 } else if (errno != EAGAIN)
133 return -1;
134 }
135 return 0;
83} 136}
84 137
138
85/* 139/*
86 * Probe the controller in genetlink to find the family id 140 * Probe the controller in genetlink to find the family id
87 * for the TASKSTATS family 141 * for the TASKSTATS family
88 */ 142 */
89int get_family_id(int sd) 143int get_family_id(int sd)
90{ 144{
91 struct { 145 struct {
92 struct nlmsghdr n; 146 struct nlmsghdr n;
93 struct genlmsghdr g; 147 struct genlmsghdr g;
94 char buf[256]; 148 char buf[256];
95 } family_req; 149 } ans;
96 struct { 150
97 struct nlmsghdr n; 151 int id, rc;
98 struct genlmsghdr g; 152 struct nlattr *na;
99 char buf[256]; 153 int rep_len;
100 } ans; 154
101 155 strcpy(name, TASKSTATS_GENL_NAME);
102 int id; 156 rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
103 struct nlattr *na; 157 CTRL_ATTR_FAMILY_NAME, (void *)name,
104 int rep_len; 158 strlen(TASKSTATS_GENL_NAME)+1);
105 159
106 /* Get family name */ 160 rep_len = recv(sd, &ans, sizeof(ans), 0);
107 family_req.n.nlmsg_type = GENL_ID_CTRL; 161 if (ans.n.nlmsg_type == NLMSG_ERROR ||
108 family_req.n.nlmsg_flags = NLM_F_REQUEST; 162 (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len))
109 family_req.n.nlmsg_seq = 0; 163 return 0;
110 family_req.n.nlmsg_pid = getpid();
111 family_req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
112 family_req.g.cmd = CTRL_CMD_GETFAMILY;
113 family_req.g.version = 0x1;
114 na = (struct nlattr *) GENLMSG_DATA(&family_req);
115 na->nla_type = CTRL_ATTR_FAMILY_NAME;
116 na->nla_len = strlen(TASKSTATS_GENL_NAME) + 1 + NLA_HDRLEN;
117 strcpy(NLA_DATA(na), TASKSTATS_GENL_NAME);
118 family_req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
119
120 if (sendto_fd(sd, (char *) &family_req, family_req.n.nlmsg_len) < 0)
121 err(1, "error sending message via Netlink\n");
122
123 rep_len = recv(sd, &ans, sizeof(ans), 0);
124
125 if (rep_len < 0)
126 err(1, "error receiving reply message via Netlink\n");
127
128
129 /* Validate response message */
130 if (!NLMSG_OK((&ans.n), rep_len))
131 err(1, "invalid reply message received via Netlink\n");
132
133 if (ans.n.nlmsg_type == NLMSG_ERROR) { /* error */
134 printf("error received NACK - leaving\n");
135 exit(1);
136 }
137
138
139 na = (struct nlattr *) GENLMSG_DATA(&ans);
140 na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
141 if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
142 id = *(__u16 *) NLA_DATA(na);
143 }
144 return id;
145}
146 164
147void print_taskstats(struct taskstats *t) 165 na = (struct nlattr *) GENLMSG_DATA(&ans);
148{ 166 na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
149 printf("\n\nCPU %15s%15s%15s%15s\n" 167 if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
150 " %15llu%15llu%15llu%15llu\n" 168 id = *(__u16 *) NLA_DATA(na);
151 "IO %15s%15s\n" 169 }
152 " %15llu%15llu\n" 170 return id;
153 "MEM %15s%15s\n"
154 " %15llu%15llu\n\n",
155 "count", "real total", "virtual total", "delay total",
156 t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total,
157 t->cpu_delay_total,
158 "count", "delay total",
159 t->blkio_count, t->blkio_delay_total,
160 "count", "delay total", t->swapin_count, t->swapin_delay_total);
161} 171}
162 172
163void sigchld(int sig) 173void print_delayacct(struct taskstats *t)
164{ 174{
165 done = 1; 175 printf("\n\nCPU %15s%15s%15s%15s\n"
176 " %15llu%15llu%15llu%15llu\n"
177 "IO %15s%15s\n"
178 " %15llu%15llu\n"
179 "MEM %15s%15s\n"
180 " %15llu%15llu\n\n",
181 "count", "real total", "virtual total", "delay total",
182 t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total,
183 t->cpu_delay_total,
184 "count", "delay total",
185 t->blkio_count, t->blkio_delay_total,
186 "count", "delay total", t->swapin_count, t->swapin_delay_total);
166} 187}
167 188
168int main(int argc, char *argv[]) 189int main(int argc, char *argv[])
169{ 190{
170 int rc; 191 int c, rc, rep_len, aggr_len, len2, cmd_type;
171 int sk_nl; 192 __u16 id;
172 struct nlmsghdr *nlh; 193 __u32 mypid;
173 struct genlmsghdr *genlhdr; 194
174 char *buf; 195 struct nlattr *na;
175 struct taskstats_cmd_param *param; 196 int nl_sd = -1;
176 __u16 id; 197 int len = 0;
177 struct nlattr *na; 198 pid_t tid = 0;
178 199 pid_t rtid = 0;
179 /* For receiving */ 200
180 struct sockaddr_nl kern_nla, from_nla; 201 int fd = 0;
181 socklen_t from_nla_len; 202 int count = 0;
182 int recv_len; 203 int write_file = 0;
183 struct taskstats_reply *reply; 204 int maskset = 0;
184 205 char logfile[128];
185 struct { 206 int loop = 0;
186 struct nlmsghdr n; 207
187 struct genlmsghdr g; 208 struct msgtemplate msg;
188 char buf[256]; 209
189 } req; 210 while (1) {
211 c = getopt(argc, argv, "dw:r:m:t:p:v:l");
212 if (c < 0)
213 break;
190 214
191 struct { 215 switch (c) {
192 struct nlmsghdr n; 216 case 'd':
193 struct genlmsghdr g; 217 printf("print delayacct stats ON\n");
194 char buf[256]; 218 print_delays = 1;
195 } ans; 219 break;
196 220 case 'w':
197 int nl_sd = -1; 221 strncpy(logfile, optarg, MAX_FILENAME);
198 int rep_len; 222 printf("write to file %s\n", logfile);
199 int len = 0; 223 write_file = 1;
200 int aggr_len, len2; 224 break;
201 struct sockaddr_nl nladdr; 225 case 'r':
202 pid_t tid = 0; 226 rcvbufsz = atoi(optarg);
203 pid_t rtid = 0; 227 printf("receive buf size %d\n", rcvbufsz);
204 int cmd_type = TASKSTATS_TYPE_TGID; 228 if (rcvbufsz < 0)
205 int c, status; 229 err(1, "Invalid rcv buf size\n");
206 int forking = 0; 230 break;
207 struct sigaction act = { 231 case 'm':
208 .sa_handler = SIG_IGN, 232 strncpy(cpumask, optarg, sizeof(cpumask));
209 .sa_mask = SA_NOMASK, 233 maskset = 1;
210 }; 234 printf("cpumask %s maskset %d\n", cpumask, maskset);
211 struct sigaction tact ; 235 break;
212 236 case 't':
213 if (argc < 3) { 237 tid = atoi(optarg);
214 printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]); 238 if (!tid)
215 exit(-1); 239 err(1, "Invalid tgid\n");
216 } 240 cmd_type = TASKSTATS_CMD_ATTR_TGID;
217 241 print_delays = 1;
218 tact.sa_handler = sigchld; 242 break;
219 sigemptyset(&tact.sa_mask); 243 case 'p':
220 if (sigaction(SIGCHLD, &tact, NULL) < 0) 244 tid = atoi(optarg);
221 err(1, "sigaction failed for SIGCHLD\n"); 245 if (!tid)
222 246 err(1, "Invalid pid\n");
223 while (1) { 247 cmd_type = TASKSTATS_CMD_ATTR_PID;
224 248 print_delays = 1;
225 c = getopt(argc, argv, "t:p:c:"); 249 break;
226 if (c < 0) 250 case 'v':
227 break; 251 printf("debug on\n");
228 252 dbg = 1;
229 switch (c) { 253 break;
230 case 't': 254 case 'l':
231 tid = atoi(optarg); 255 printf("listen forever\n");
232 if (!tid) 256 loop = 1;
233 err(1, "Invalid tgid\n"); 257 break;
234 cmd_type = TASKSTATS_CMD_ATTR_TGID; 258 default:
235 break; 259 printf("Unknown option %d\n", c);
236 case 'p': 260 exit(-1);
237 tid = atoi(optarg);
238 if (!tid)
239 err(1, "Invalid pid\n");
240 cmd_type = TASKSTATS_CMD_ATTR_TGID;
241 break;
242 case 'c':
243 opterr = 0;
244 tid = fork();
245 if (tid < 0)
246 err(1, "fork failed\n");
247
248 if (tid == 0) { /* child process */
249 if (execvp(argv[optind - 1], &argv[optind - 1]) < 0) {
250 exit(-1);
251 } 261 }
252 }
253 forking = 1;
254 break;
255 default:
256 printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]);
257 exit(-1);
258 break;
259 } 262 }
260 if (c == 'c')
261 break;
262 }
263
264 /* Construct Netlink request message */
265
266 /* Send Netlink request message & get reply */
267 263
268 if ((nl_sd = 264 if (write_file) {
269 create_nl_socket(NETLINK_GENERIC, TASKSTATS_LISTEN_GROUP)) < 0) 265 fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC,
270 err(1, "error creating Netlink socket\n"); 266 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
271 267 if (fd == -1) {
272 268 perror("Cannot open output file\n");
273 id = get_family_id(nl_sd); 269 exit(1);
274 270 }
275 /* Send command needed */ 271 }
276 req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
277 req.n.nlmsg_type = id;
278 req.n.nlmsg_flags = NLM_F_REQUEST;
279 req.n.nlmsg_seq = 0;
280 req.n.nlmsg_pid = tid;
281 req.g.cmd = TASKSTATS_CMD_GET;
282 na = (struct nlattr *) GENLMSG_DATA(&req);
283 na->nla_type = cmd_type;
284 na->nla_len = sizeof(unsigned int) + NLA_HDRLEN;
285 *(__u32 *) NLA_DATA(na) = tid;
286 req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
287
288
289 if (!forking && sendto_fd(nl_sd, (char *) &req, req.n.nlmsg_len) < 0)
290 err(1, "error sending message via Netlink\n");
291 272
292 act.sa_handler = SIG_IGN; 273 if ((nl_sd = create_nl_socket(NETLINK_GENERIC)) < 0)
293 sigemptyset(&act.sa_mask); 274 err(1, "error creating Netlink socket\n");
294 if (sigaction(SIGINT, &act, NULL) < 0)
295 err(1, "sigaction failed for SIGINT\n");
296 275
297 do {
298 int i;
299 struct pollfd pfd;
300 int pollres;
301 276
302 pfd.events = 0xffff & ~POLLOUT; 277 mypid = getpid();
303 pfd.fd = nl_sd; 278 id = get_family_id(nl_sd);
304 pollres = poll(&pfd, 1, 5000); 279 if (!id) {
305 if (pollres < 0 || done) { 280 printf("Error getting family id, errno %d", errno);
306 break; 281 goto err;
307 } 282 }
308 283 PRINTF("family id %d\n", id);
309 rep_len = recv(nl_sd, &ans, sizeof(ans), 0); 284
310 nladdr.nl_family = AF_NETLINK; 285 if (maskset) {
311 nladdr.nl_groups = TASKSTATS_LISTEN_GROUP; 286 rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
312 287 TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
313 if (ans.n.nlmsg_type == NLMSG_ERROR) { /* error */ 288 &cpumask, sizeof(cpumask));
314 printf("error received NACK - leaving\n"); 289 PRINTF("Sent register cpumask, retval %d\n", rc);
315 exit(1); 290 if (rc < 0) {
291 printf("error sending register cpumask\n");
292 goto err;
293 }
316 } 294 }
317 295
318 if (rep_len < 0) { 296 if (tid) {
319 err(1, "error receiving reply message via Netlink\n"); 297 rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
320 break; 298 cmd_type, &tid, sizeof(__u32));
299 PRINTF("Sent pid/tgid, retval %d\n", rc);
300 if (rc < 0) {
301 printf("error sending tid/tgid cmd\n");
302 goto done;
303 }
321 } 304 }
322 305
323 /* Validate response message */ 306 do {
324 if (!NLMSG_OK((&ans.n), rep_len)) 307 int i;
325 err(1, "invalid reply message received via Netlink\n");
326 308
327 rep_len = GENLMSG_PAYLOAD(&ans.n); 309 rep_len = recv(nl_sd, &msg, sizeof(msg), 0);
310 PRINTF("received %d bytes\n", rep_len);
328 311
329 na = (struct nlattr *) GENLMSG_DATA(&ans); 312 if (rep_len < 0) {
330 len = 0; 313 printf("nonfatal reply error: errno %d\n", errno);
331 i = 0; 314 continue;
332 while (len < rep_len) { 315 }
333 len += NLA_ALIGN(na->nla_len); 316 if (msg.n.nlmsg_type == NLMSG_ERROR ||
334 switch (na->nla_type) { 317 !NLMSG_OK((&msg.n), rep_len)) {
335 case TASKSTATS_TYPE_AGGR_PID: 318 printf("fatal reply error, errno %d\n", errno);
336 /* Fall through */ 319 goto done;
337 case TASKSTATS_TYPE_AGGR_TGID: 320 }
338 aggr_len = NLA_PAYLOAD(na->nla_len); 321
339 len2 = 0; 322 PRINTF("nlmsghdr size=%d, nlmsg_len=%d, rep_len=%d\n",
340 /* For nested attributes, na follows */ 323 sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len);
341 na = (struct nlattr *) NLA_DATA(na); 324
342 done = 0; 325
343 while (len2 < aggr_len) { 326 rep_len = GENLMSG_PAYLOAD(&msg.n);
344 switch (na->nla_type) { 327
345 case TASKSTATS_TYPE_PID: 328 na = (struct nlattr *) GENLMSG_DATA(&msg);
346 rtid = *(int *) NLA_DATA(na); 329 len = 0;
347 break; 330 i = 0;
348 case TASKSTATS_TYPE_TGID: 331 while (len < rep_len) {
349 rtid = *(int *) NLA_DATA(na); 332 len += NLA_ALIGN(na->nla_len);
350 break; 333 switch (na->nla_type) {
351 case TASKSTATS_TYPE_STATS: 334 case TASKSTATS_TYPE_AGGR_TGID:
352 if (rtid == tid) { 335 /* Fall through */
353 print_taskstats((struct taskstats *) 336 case TASKSTATS_TYPE_AGGR_PID:
354 NLA_DATA(na)); 337 aggr_len = NLA_PAYLOAD(na->nla_len);
355 done = 1; 338 len2 = 0;
339 /* For nested attributes, na follows */
340 na = (struct nlattr *) NLA_DATA(na);
341 done = 0;
342 while (len2 < aggr_len) {
343 switch (na->nla_type) {
344 case TASKSTATS_TYPE_PID:
345 rtid = *(int *) NLA_DATA(na);
346 if (print_delays)
347 printf("PID\t%d\n", rtid);
348 break;
349 case TASKSTATS_TYPE_TGID:
350 rtid = *(int *) NLA_DATA(na);
351 if (print_delays)
352 printf("TGID\t%d\n", rtid);
353 break;
354 case TASKSTATS_TYPE_STATS:
355 count++;
356 if (print_delays)
357 print_delayacct((struct taskstats *) NLA_DATA(na));
358 if (fd) {
359 if (write(fd, NLA_DATA(na), na->nla_len) < 0) {
360 err(1,"write error\n");
361 }
362 }
363 if (!loop)
364 goto done;
365 break;
366 default:
367 printf("Unknown nested nla_type %d\n", na->nla_type);
368 break;
369 }
370 len2 += NLA_ALIGN(na->nla_len);
371 na = (struct nlattr *) ((char *) na + len2);
372 }
373 break;
374
375 default:
376 printf("Unknown nla_type %d\n", na->nla_type);
377 break;
356 } 378 }
357 break; 379 na = (struct nlattr *) (GENLMSG_DATA(&msg) + len);
358 }
359 len2 += NLA_ALIGN(na->nla_len);
360 na = (struct nlattr *) ((char *) na + len2);
361 if (done)
362 break;
363 } 380 }
364 } 381 } while (loop);
365 na = (struct nlattr *) (GENLMSG_DATA(&ans) + len); 382done:
366 if (done) 383 if (maskset) {
367 break; 384 rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
385 TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK,
386 &cpumask, sizeof(cpumask));
387 printf("Sent deregister mask, retval %d\n", rc);
388 if (rc < 0)
389 err(rc, "error sending deregister cpumask\n");
368 } 390 }
369 if (done) 391err:
370 break; 392 close(nl_sd);
371 } 393 if (fd)
372 while (1); 394 close(fd);
373 395 return 0;
374 close(nl_sd);
375 return 0;
376} 396}
diff --git a/Documentation/accounting/taskstats.txt b/Documentation/accounting/taskstats.txt
index efd8f605bcd5..92ebf29e9041 100644
--- a/Documentation/accounting/taskstats.txt
+++ b/Documentation/accounting/taskstats.txt
@@ -26,20 +26,28 @@ leader - a process is deemed alive as long as it has any task belonging to it.
26Usage 26Usage
27----- 27-----
28 28
29To get statistics during task's lifetime, userspace opens a unicast netlink 29To get statistics during a task's lifetime, userspace opens a unicast netlink
30socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid. 30socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid.
31The response contains statistics for a task (if pid is specified) or the sum of 31The response contains statistics for a task (if pid is specified) or the sum of
32statistics for all tasks of the process (if tgid is specified). 32statistics for all tasks of the process (if tgid is specified).
33 33
34To obtain statistics for tasks which are exiting, userspace opens a multicast 34To obtain statistics for tasks which are exiting, the userspace listener
35netlink socket. Each time a task exits, its per-pid statistics is always sent 35sends a register command and specifies a cpumask. Whenever a task exits on
36by the kernel to each listener on the multicast socket. In addition, if it is 36one of the cpus in the cpumask, its per-pid statistics are sent to the
37the last thread exiting its thread group, an additional record containing the 37registered listener. Using cpumasks allows the data received by one listener
38per-tgid stats are also sent. The latter contains the sum of per-pid stats for 38to be limited and assists in flow control over the netlink interface and is
39all threads in the thread group, both past and present. 39explained in more detail below.
40
41If the exiting task is the last thread exiting its thread group,
42an additional record containing the per-tgid stats is also sent to userspace.
43The latter contains the sum of per-pid stats for all threads in the thread
44group, both past and present.
40 45
41getdelays.c is a simple utility demonstrating usage of the taskstats interface 46getdelays.c is a simple utility demonstrating usage of the taskstats interface
42for reporting delay accounting statistics. 47for reporting delay accounting statistics. Users can register cpumasks,
48send commands and process responses, listen for per-tid/tgid exit data,
49write the data received to a file and do basic flow control by increasing
50receive buffer sizes.
43 51
44Interface 52Interface
45--------- 53---------
@@ -66,10 +74,20 @@ The messages are in the format
66 74
67The taskstats payload is one of the following three kinds: 75The taskstats payload is one of the following three kinds:
68 76
691. Commands: Sent from user to kernel. The payload is one attribute, of type 771. Commands: Sent from user to kernel. Commands to get data on
70TASKSTATS_CMD_ATTR_PID/TGID, containing a u32 pid or tgid in the attribute 78a pid/tgid consist of one attribute, of type TASKSTATS_CMD_ATTR_PID/TGID,
71payload. The pid/tgid denotes the task/process for which userspace wants 79containing a u32 pid or tgid in the attribute payload. The pid/tgid denotes
72statistics. 80the task/process for which userspace wants statistics.
81
82Commands to register/deregister interest in exit data from a set of cpus
83consist of one attribute, of type
84TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK and contain a cpumask in the
85attribute payload. The cpumask is specified as an ascii string of
86comma-separated cpu ranges e.g. to listen to exit data from cpus 1,2,3,5,7,8
87the cpumask would be "1-3,5,7-8". If userspace forgets to deregister interest
88in cpus before closing the listening socket, the kernel cleans up its interest
89set over time. However, for the sake of efficiency, an explicit deregistration
90is advisable.
73 91
742. Response for a command: sent from the kernel in response to a userspace 922. Response for a command: sent from the kernel in response to a userspace
75command. The payload is a series of three attributes of type: 93command. The payload is a series of three attributes of type:
@@ -138,4 +156,26 @@ struct too much, requiring disparate userspace accounting utilities to
138unnecessarily receive large structures whose fields are of no interest, then 156unnecessarily receive large structures whose fields are of no interest, then
139extending the attributes structure would be worthwhile. 157extending the attributes structure would be worthwhile.
140 158
159Flow control for taskstats
160--------------------------
161
162When the rate of task exits becomes large, a listener may not be able to keep
163up with the kernel's rate of sending per-tid/tgid exit data leading to data
164loss. This possibility gets compounded when the taskstats structure gets
165extended and the number of cpus grows large.
166
167To avoid losing statistics, userspace should do one or more of the following:
168
169- increase the receive buffer sizes for the netlink sockets opened by
170listeners to receive exit data.
171
172- create more listeners and reduce the number of cpus being listened to by
173each listener. In the extreme case, there could be one listener for each cpu.
174Users may also consider setting the cpu affinity of the listener to the subset
175of cpus to which it listens, especially if they are listening to just one cpu.
176
177Despite these measures, if userspace receives ENOBUFS error messages
178indicating overflow of receive buffers, it should take measures to handle the
179loss of data.
180
141---- 181----